diff --git a/3rdparty/carotene/hal/tegra_hal.hpp b/3rdparty/carotene/hal/tegra_hal.hpp index cb658e8af0..31182a029a 100644 --- a/3rdparty/carotene/hal/tegra_hal.hpp +++ b/3rdparty/carotene/hal/tegra_hal.hpp @@ -1932,4 +1932,34 @@ inline int TEGRA_GaussianBlurBinomial(const uchar* src_data, size_t src_step, uc #endif // OPENCV_IMGPROC_HAL_INTERFACE_H +// The optimized branch was developed for old armv7 processors +#if defined(__ARM_ARCH) && (__ARM_ARCH == 7) +inline int TEGRA_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_step, + const short* prev_deriv_data, size_t prev_deriv_step, + const uchar* next_data, size_t next_step, + int width, int height, int cn, + const float *prev_points, float *next_points, size_t point_count, + uchar *status, float *err, + const int win_width, const int win_height, + int termination_count, double termination_epsilon, + bool get_min_eigen_vals, + float min_eigen_vals_threshold) +{ + if (!CAROTENE_NS::isSupportedConfiguration()) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + CAROTENE_NS::pyrLKOptFlowLevel(CAROTENE_NS::Size2D(width, height), cn, + prev_data, prev_data_step, prev_deriv_data, prev_deriv_step, + next_data, next_step, + point_count, prev_points, next_points, + status, err, CAROTENE_NS::Size2D(win_width, win_height), + termination_count, termination_epsilon, + get_min_eigen_vals, min_eigen_vals_threshold); + return CV_HAL_ERROR_OK; +} + +#undef cv_hal_LKOpticalFlowLevel +#define cv_hal_LKOpticalFlowLevel TEGRA_LKOpticalFlowLevel +#endif // __ARM_ARCH=7 + #endif diff --git a/3rdparty/carotene/include/carotene/functions.hpp b/3rdparty/carotene/include/carotene/functions.hpp index 76d1328194..8a4fa3efdd 100644 --- a/3rdparty/carotene/include/carotene/functions.hpp +++ b/3rdparty/carotene/include/carotene/functions.hpp @@ -2485,7 +2485,7 @@ namespace CAROTENE_NS { u8 *status, f32 *err, const Size2D &winSize, u32 terminationCount, f64 terminationEpsilon, - u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + bool getMinEigenVals, f32 minEigThreshold); } diff --git a/3rdparty/carotene/src/opticalflow.cpp b/3rdparty/carotene/src/opticalflow.cpp index 7b29742c84..463ba77fa0 100644 --- a/3rdparty/carotene/src/opticalflow.cpp +++ b/3rdparty/carotene/src/opticalflow.cpp @@ -58,7 +58,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, u8 *status, f32 *err, const Size2D &winSize, u32 terminationCount, f64 terminationEpsilon, - u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + bool getMinEigenVals, f32 minEigThreshold) { internal::assertSupportedConfiguration(); @@ -74,32 +74,11 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, for( u32 ptidx = 0; ptidx < ptCount; ptidx++ ) { - f32 levscale = (1./(1 << level)); u32 ptref = ptidx << 1; - f32 prevPtX = prevPts[ptref+0]*levscale; - f32 prevPtY = prevPts[ptref+1]*levscale; - f32 nextPtX; - f32 nextPtY; - if( level == maxLevel ) - { - if( useInitialFlow ) - { - nextPtX = nextPts[ptref+0]*levscale; - nextPtY = nextPts[ptref+1]*levscale; - } - else - { - nextPtX = prevPtX; - nextPtY = prevPtY; - } - } - else - { - nextPtX = nextPts[ptref+0]*2.f; - nextPtY = nextPts[ptref+1]*2.f; - } - nextPts[ptref+0] = nextPtX; - nextPts[ptref+1] = nextPtY; + f32 prevPtX = prevPts[ptref+0]; + f32 prevPtY = prevPts[ptref+1]; + f32 nextPtX = nextPts[ptref+0]; + f32 nextPtY = nextPts[ptref+1]; s32 iprevPtX, iprevPtY; s32 inextPtX, inextPtY; @@ -111,13 +90,10 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width || 
iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height ) { - if( level == 0 ) - { - if( status ) - status[ptidx] = false; - if( err ) - err[ptidx] = 0; - } + if( status ) + status[ptidx] = false; + if( err ) + err[ptidx] = 0; continue; } @@ -333,7 +309,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, if( minEig < minEigThreshold || D < FLT_EPSILON ) { - if( level == 0 && status ) + if( status ) status[ptidx] = false; continue; } @@ -353,7 +329,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width || inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height ) { - if( level == 0 && status ) + if( status ) status[ptidx] = false; break; } @@ -469,8 +445,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, prevDeltaX = deltaX; prevDeltaY = deltaY; } - - if( status && status[ptidx] && err && level == 0 && !getMinEigenVals ) + if( status && status[ptidx] && err && !getMinEigenVals ) { f32 nextPointX = nextPts[ptref+0] - halfWinX; f32 nextPointY = nextPts[ptref+1] - halfWinY; @@ -526,9 +501,6 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, (void)winSize; (void)terminationCount; (void)terminationEpsilon; - (void)level; - (void)maxLevel; - (void)useInitialFlow; (void)getMinEigenVals; (void)minEigThreshold; (void)ptCount; @@ -536,4 +508,3 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn, } }//CAROTENE_NS - diff --git a/3rdparty/ndsrvp/include/imgproc.hpp b/3rdparty/ndsrvp/include/imgproc.hpp index 94104f0b71..db0ee05132 100644 --- a/3rdparty/ndsrvp/include/imgproc.hpp +++ b/3rdparty/ndsrvp/include/imgproc.hpp @@ -5,6 +5,8 @@ #ifndef OPENCV_NDSRVP_IMGPROC_HPP #define OPENCV_NDSRVP_IMGPROC_HPP +struct cvhalFilter2D; + namespace cv { namespace ndsrvp { @@ -71,6 +73,34 @@ int threshold(const uchar* src_data, size_t src_step, #undef cv_hal_threshold #define cv_hal_threshold (cv::ndsrvp::threshold) +// ################ filter ################ + +int filterInit(cvhalFilter2D **context, + uchar *kernel_data, size_t kernel_step, + int kernel_type, int kernel_width, + int kernel_height, int max_width, int max_height, + int src_type, int dst_type, int borderType, + double delta, int anchor_x, int anchor_y, + bool allowSubmatrix, bool allowInplace); + +#undef cv_hal_filterInit +#define cv_hal_filterInit (cv::ndsrvp::filterInit) + +int filter(cvhalFilter2D *context, + const uchar *src_data, size_t src_step, + uchar *dst_data, size_t dst_step, + int width, int height, + int full_width, int full_height, + int offset_x, int offset_y); + +#undef cv_hal_filter +#define cv_hal_filter (cv::ndsrvp::filter) + +int filterFree(cvhalFilter2D *context); + +#undef cv_hal_filterFree +#define cv_hal_filterFree (cv::ndsrvp::filterFree) + } // namespace ndsrvp } // namespace cv diff --git a/3rdparty/ndsrvp/src/cvutils.cpp b/3rdparty/ndsrvp/src/cvutils.cpp index 48e025488f..6afac5136d 100644 --- a/3rdparty/ndsrvp/src/cvutils.cpp +++ b/3rdparty/ndsrvp/src/cvutils.cpp @@ -73,6 +73,40 @@ int borderInterpolate(int p, int len, int borderType) return p; } +int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType) +{ + int16x4_t vzero = (int16x4_t){0, 0, 0, 0}; + int16x4_t vone = (int16x4_t){1, 1, 1, 1}; + int16x4_t vlen = (int16x4_t){len, len, len, len}; + if(borderType == CV_HAL_BORDER_REPLICATE) + vp = (int16x4_t)__nds__bpick(0, __nds__bpick((long)(vlen - 1), (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); + else if(borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101) + { + int16x4_t vdelta 
= (borderType == CV_HAL_BORDER_REFLECT_101) ? vone : vzero; + if(len == 1) + return vzero; + do + { + int16x4_t vneg = -vp - 1 + vdelta; + int16x4_t vpos = vlen - 1 - (vp - vlen) - vdelta; + vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); + } + while( (long)(vp >= vlen) || (long)(vp < 0) ); + } + else if(borderType == CV_HAL_BORDER_WRAP) + { + ndsrvp_assert(len > 0); + int16x4_t vneg = vp - ((vp - vlen + 1) / vlen) * vlen; + int16x4_t vpos = vp % vlen; + vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); + } + else if(borderType == CV_HAL_BORDER_CONSTANT) + vp = (int16x4_t)__nds__bpick((long)-vone, (long)vp, (long)(vp < 0 || vp >= vlen)); + else + ndsrvp_error(Error::StsBadArg, "borderInterpolate_vector(): Unknown/unsupported border type"); + return vp; +} + } // namespace ndsrvp } // namespace cv diff --git a/3rdparty/ndsrvp/src/cvutils.hpp b/3rdparty/ndsrvp/src/cvutils.hpp index 8cf1476ed6..78bb11d95f 100644 --- a/3rdparty/ndsrvp/src/cvutils.hpp +++ b/3rdparty/ndsrvp/src/cvutils.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -26,16 +27,26 @@ namespace ndsrvp { void* fastMalloc(size_t size); void fastFree(void* ptr); int borderInterpolate(int p, int len, int borderType); +int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType); #ifndef MAX # define MAX(a,b) ((a) < (b) ? (b) : (a)) #endif +#ifndef MIN +# define MIN(a,b) ((a) > (b) ? (b) : (a)) +#endif + #define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT) #define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1) +#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15) +#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type)) + #define CV_MALLOC_ALIGN 64 +inline size_t getElemSize(int type) { return (size_t)CV_ELEM_SIZE(type); } + // error codes enum Error{ @@ -69,6 +80,135 @@ inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b) return (int32x2_t)__nds__bpick((long)a, __nds__bpick((long)(b - 1), (long)x, (long)(x < b)), (long)(x >= a)); } +// expand + +/* + [0] [1] [2] [3] [4] [5] [6] [7] +810 [ 0 ] [ 1 ] [ 4 ] [ 5 ] +832 [ 2 ] [ 3 ] [ 6 ] [ 7 ] +bb [ 0 ] [ 1 ] [ 2 ] [ 3 ] +tt [ 4 ] [ 5 ] [ 6 ] [ 7 ] +*/ + +inline void ndsrvp_u8_u16_expand8(const unsigned long vs, ushort* dst) +{ + unsigned long vs810 = __nds__zunpkd810(vs); + unsigned long vs832 = __nds__zunpkd832(vs); + *(unsigned long*)dst = __nds__pkbb32(vs832, vs810); + *(unsigned long*)(dst + 4) = __nds__pktt32(vs832, vs810); +} + +/* + [0] [1] [2] [3] [4] [5] [6] [7] +820 [ 0 ] [ 2 ] [ 4 ] [ 6 ] +831 [ 1 ] [ 3 ] [ 5 ] [ 7 ] +bb [ 0 ] [ 2 ] [ 1 ] [ 3 ] +tt [ 4 ] [ 6 ] [ 5 ] [ 7 ] +*/ + +inline void ndsrvp_u8_u16_eswap8(const unsigned long vs, ushort* dst) +{ + unsigned long vs820 = __nds__zunpkd820(vs); + unsigned long vs831 = __nds__zunpkd831(vs); + *(unsigned long*)dst = __nds__pkbb32(vs831, vs820); + *(unsigned long*)(dst + 4) = __nds__pktt32(vs831, vs820); +} + +/* + [0] [1] [2] [3] [4] [5] [6] [7] +820 [ 0 ] [ 2 ] [ 4 ] [ 6 ] +831 [ 1 ] [ 3 ] [ 5 ] [ 7 ] +bb [ 0 ] [ 2 ] [ 1 ] [ 3 ] +tt [ 4 ] [ 6 ] [ 5 ] [ 7 ] +bbbb[ 0 ] [ 1 ] +bbtt[ 2 ] [ 3 ] +ttbb[ 4 ] [ 5 ] +tttt[ 6 ] [ 7 ] +*/ + + +inline void ndsrvp_u8_u32_expand8(const unsigned long vs, uint* dst) +{ + unsigned long vs820 = __nds__zunpkd820(vs); + unsigned long vs831 = __nds__zunpkd831(vs); + unsigned long vsbb = __nds__pkbb32(vs831, vs820); + unsigned long vstt = __nds__pktt32(vs831, vs820); + 
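/* pkbb16/pktt16 with a zero first operand pair each u16 lane with a zero halfword, i.e. they zero-extend the lanes to u32; the four stores below then write lanes 0-1, 2-3, 4-5 and 6-7 in order. */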
*(unsigned long*)dst = __nds__pkbb16(0, vsbb);
+    *(unsigned long*)(dst + 2) = __nds__pktt16(0, vsbb);
+    *(unsigned long*)(dst + 4) = __nds__pkbb16(0, vstt);
+    *(unsigned long*)(dst + 6) = __nds__pktt16(0, vstt);
+}
+
+// float replacement
+
+inline void ndsrvp_f32_add8(const float* a, const float* b, float* c)
+{
+    c[0] = a[0] + b[0];
+    c[1] = a[1] + b[1];
+    c[2] = a[2] + b[2];
+    c[3] = a[3] + b[3];
+    c[4] = a[4] + b[4];
+    c[5] = a[5] + b[5];
+    c[6] = a[6] + b[6];
+    c[7] = a[7] + b[7];
+}
+
+/*
+    [1] [8] [23]
+    [24] [8]
+*/
+
+inline void ndsrvp_f32_u8_mul8(const float* a, const unsigned long b, float* c) // experimental, not bit exact
+{
+    const int mask_frac = 0x007FFFFF;
+    const int mask_sign = 0x7FFFFFFF;
+    const int mask_lead = 0x40000000;
+    const int ofs_exp = 23;
+
+    uint32x2_t va01 = *(uint32x2_t*)a;
+    uint32x2_t va23 = *(uint32x2_t*)(a + 2);
+    uint32x2_t va45 = *(uint32x2_t*)(a + 4);
+    uint32x2_t va67 = *(uint32x2_t*)(a + 6);
+
+    uint32x2_t vaexp01 = va01 >> ofs_exp;
+    uint32x2_t vaexp23 = va23 >> ofs_exp;
+    uint32x2_t vaexp45 = va45 >> ofs_exp;
+    uint32x2_t vaexp67 = va67 >> ofs_exp;
+
+    uint32x2_t vafrac01 = ((va01 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac23 = ((va23 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac45 = ((va45 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac67 = ((va67 << 7) & mask_sign) | mask_lead;
+
+    int16x4_t vb[2]; // fake signed for signed multiply
+    ndsrvp_u8_u16_eswap8(b, (ushort*)vb);
+
+    vafrac01 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac01, (unsigned long)vb[0]);
+    vafrac23 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac23, (unsigned long)vb[0]);
+    vafrac45 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac45, (unsigned long)vb[1]);
+    vafrac67 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac67, (unsigned long)vb[1]);
+
+    uint32x2_t vaclz01 = __nds__v_clz32(vafrac01) - 8;
+    uint32x2_t vaclz23 = __nds__v_clz32(vafrac23) - 8;
+    uint32x2_t vaclz45 = __nds__v_clz32(vafrac45) - 8;
+    uint32x2_t vaclz67 = __nds__v_clz32(vafrac67) - 8;
+
+    vaexp01 += 8 - vaclz01;
+    vaexp23 += 8 - vaclz23;
+    vaexp45 += 8 - vaclz45;
+    vaexp67 += 8 - vaclz67;
+
+    vafrac01 <<= vaclz01;
+    vafrac23 <<= vaclz23;
+    vafrac45 <<= vaclz45;
+    vafrac67 <<= vaclz67;
+
+    *(uint32x2_t*)c = (vaexp01 << ofs_exp) | (vafrac01 & mask_frac);
+    *(uint32x2_t*)(c + 2) = (vaexp23 << ofs_exp) | (vafrac23 & mask_frac);
+    *(uint32x2_t*)(c + 4) = (vaexp45 << ofs_exp) | (vafrac45 & mask_frac);
+    *(uint32x2_t*)(c + 6) = (vaexp67 << ofs_exp) | (vafrac67 & mask_frac);
+}
+
 // saturate
 
 template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
@@ -94,6 +94,26 @@ template<> inline short saturate_cast<short>(double v) { return saturate_cas
 template<> inline int saturate_cast<int>(float v) { return (int)lrintf(v); }
 template<> inline int saturate_cast<int>(double v) { return (int)lrint(v); }
 
+inline double cast_ptr_to_double(const uchar* v, int depth) {
+    switch (depth) {
+        case CV_8U: return (double)*(uchar*)v;
+        case CV_8S: return (double)*(char*)v;
+        case CV_16U: return (double)*(ushort*)v;
+        case CV_16S: return (double)*(short*)v;
+        case CV_32S: return (double)*(int*)v;
+        case CV_32F: return (double)*(float*)v;
+        case CV_64F: return (double)*(double*)v;
+        case CV_16F: return (double)*(float*)v;
+        default: return 0;
+    }
+}
+
+template <typename _Tp>
+inline _Tp data_at(const uchar* data, int step, int y, int x, int cn)
+{
+    return ((_Tp*)(data + y * step))[x * cn];
+}
+
 // align
 
 inline long align(size_t v, int n)
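A scalar model makes the lane bookkeeping above easier to check. The sketch below is not part of the patch; it assumes a 64-bit little-endian unsigned long (as on RV64, and as the lane diagrams above imply) and mirrors ndsrvp_u8_u16_expand8:

    /* reference model: widen eight packed u8 lanes to eight u16 lanes, in lane order */
    #include <string.h>
    typedef unsigned char uchar;
    typedef unsigned short ushort;

    static void ref_u8_u16_expand8(unsigned long vs, ushort* dst)
    {
        uchar lane[8];
        memcpy(lane, &vs, 8);      /* lane[0] is the lowest byte of vs */
        for (int i = 0; i < 8; i++)
            dst[i] = lane[i];      /* zero-extend; same result as zunpkd810/832 + pkbb32/pktt32 */
    }

Comparing its output with the intrinsic version on an RVP target is a quick way to validate the pack/unpack shuffles; the eswap8 variant differs only in its interleaved lane order.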
diff --git a/3rdparty/ndsrvp/src/filter.cpp b/3rdparty/ndsrvp/src/filter.cpp
new file mode 100644
index 0000000000..89508eea11
--- /dev/null
+++ b/3rdparty/ndsrvp/src/filter.cpp
@@ -0,0 +1,321 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+class FilterData
+{
+public:
+    FilterData(uchar *_kernel_data, size_t _kernel_step, int _kernel_type, int _src_type, int _dst_type, int _borderType,
+        int _kernel_width, int _kernel_height, int _max_width, int _max_height, double _delta, int _anchor_x, int _anchor_y)
+        : kernel_data(_kernel_data), kernel_step(_kernel_step), kernel_type(_kernel_type), src_type(_src_type), dst_type(_dst_type), borderType(_borderType),
+        kernel_width(_kernel_width), kernel_height(_kernel_height), max_width(_max_width), max_height(_max_height), delta(_delta), anchor_x(_anchor_x), anchor_y(_anchor_y)
+    {
+    }
+
+    uchar *kernel_data;
+    size_t kernel_step; // bytes between rows(height)
+    int kernel_type, src_type, dst_type, borderType;
+    int kernel_width, kernel_height;
+    int max_width, max_height;
+    double delta;
+    int anchor_x, anchor_y;
+    std::vector<uchar> coords;
+    std::vector<float> coeffs;
+    int nz;
+    std::vector<uchar> padding;
+};
+
+static int countNonZero(const FilterData* ctx)
+{
+    int i, j, nz = 0;
+    const uchar* ker_row = ctx->kernel_data;
+    for( i = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
+    {
+        for( j = 0; j < ctx->kernel_width; j++ )
+        {
+            if( ((float*)ker_row)[j] != 0.0 )
+                nz++;
+        }
+    }
+    return nz;
+}
+
+static void preprocess2DKernel(FilterData* ctx)
+{
+    int i, j, k, nz = countNonZero(ctx), ktype = ctx->kernel_type;
+    if(nz == 0)
+        nz = 1; // (0, 0) == 0 by default
+    ndsrvp_assert( ktype == CV_32F );
+
+    ctx->coords.resize(nz * 2);
+    ctx->coeffs.resize(nz);
+
+    const uchar* ker_row = ctx->kernel_data;
+    for( i = k = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
+    {
+        for( j = 0; j < ctx->kernel_width; j++ )
+        {
+            float val = ((float*)ker_row)[j];
+            if( val == 0.0 )
+                continue;
+            ctx->coords[k * 2] = j;
+            ctx->coords[k * 2 + 1] = i;
+            ctx->coeffs[k++] = val;
+        }
+    }
+
+    ctx->nz = k;
+}
+
+int filterInit(cvhalFilter2D **context,
+    uchar *kernel_data, size_t kernel_step,
+    int kernel_type, int kernel_width,
+    int kernel_height, int max_width, int max_height,
+    int src_type, int dst_type, int borderType,
+    double delta, int anchor_x, int anchor_y,
+    bool allowSubmatrix, bool allowInplace)
+{
+    int sdepth = CV_MAT_DEPTH(src_type), ddepth = CV_MAT_DEPTH(dst_type);
+    int cn = CV_MAT_CN(src_type), kdepth = kernel_type;
+
+    (void)allowSubmatrix;
+    (void)allowInplace;
+
+    if(delta - (int)delta != 0.0)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    if(kdepth != CV_32F || (sdepth != CV_8U && sdepth != CV_16U) || ddepth != sdepth)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    FilterData *ctx = new FilterData(kernel_data, kernel_step, kernel_type, src_type, dst_type, borderType,
+        kernel_width, kernel_height, max_width, max_height, delta, anchor_x, anchor_y);
+
+    *context = (cvhalFilter2D*)ctx;
+
+    ndsrvp_assert(cn == CV_MAT_CN(dst_type) && ddepth >= sdepth);
+
+    preprocess2DKernel(ctx);
+
+    return CV_HAL_ERROR_OK;
+}
+
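+// Note: filterInit above only accepts CV_32F kernels over CV_8U or CV_16U images with
+// matching source and destination depth, and it rejects a fractional delta; filter()
+// below re-saturates delta to the destination depth before accumulating.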
+int filter(cvhalFilter2D *context,
+    const uchar *src_data, size_t src_step,
+    uchar *dst_data, size_t dst_step,
+    int width, int height,
+    int full_width, int full_height,
+    int offset_x, int offset_y)
+{
+    FilterData *ctx = (FilterData*)context;
+
+    int cn = CV_MAT_CN(ctx->src_type);
+    int cnes = CV_ELEM_SIZE(ctx->src_type);
+    int ddepth = CV_MAT_DEPTH(ctx->dst_type);
+    float delta_sat = (uchar)(ctx->delta);
+    if(ddepth == CV_8U)
+        delta_sat = (float)saturate_cast<uchar>(ctx->delta);
+    else if(ddepth == CV_16U)
+        delta_sat = (float)saturate_cast<ushort>(ctx->delta);
+
+    // fetch original image data
+    const uchar *ogn_data = src_data - offset_y * src_step - offset_x * cnes;
+    int ogn_step = src_step;
+
+    // ROI fully used in the computation
+    int cal_width = width + ctx->kernel_width - 1;
+    int cal_height = height + ctx->kernel_height - 1;
+    int cal_x = offset_x - ctx->anchor_x;
+    int cal_y = offset_y - ctx->anchor_y;
+
+    // calculate source border
+    ctx->padding.resize(cal_width * cal_height * cnes);
+    uchar* pad_data = &ctx->padding[0];
+    int pad_step = cal_width * cnes;
+
+    uchar* pad_ptr;
+    const uchar* ogn_ptr;
+    std::vector<uchar> vec_zeros(cnes, 0);
+    for(int i = 0; i < cal_height; i++)
+    {
+        int y = borderInterpolate(i + cal_y, full_height, ctx->borderType);
+        if(y < 0) {
+            memset(pad_data + i * pad_step, 0, cnes * cal_width);
+            continue;
+        }
+
+        // left border
+        int j = 0;
+        int16x4_t vj = {0, 1, 2, 3};
+        vj += saturate_cast<short>(cal_x);
+        for(; j + cal_x < -4; j += 4, vj += 4)
+        {
+            int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
+            for(int k = 0; k < 4; k++) {
+                if(vx[k] < 0) // border constant return value -1
+                    ogn_ptr = &vec_zeros[0];
+                else
+                    ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
+                pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
+                memcpy(pad_ptr, ogn_ptr, cnes);
+            }
+        }
+        for(; j + cal_x < 0; j++)
+        {
+            int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
+            if(x < 0) // border constant return value -1
+                ogn_ptr = &vec_zeros[0];
+            else
+                ogn_ptr = ogn_data + y * ogn_step + x * cnes;
+            pad_ptr = pad_data + i * pad_step + j * cnes;
+            memcpy(pad_ptr, ogn_ptr, cnes);
+        }
+
+        // center
+        int rborder = MIN(cal_width, full_width - cal_x);
+        ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cnes;
+        pad_ptr = pad_data + i * pad_step + j * cnes;
+        memcpy(pad_ptr, ogn_ptr, cnes * (rborder - j));
+
+        // right border
+        j = rborder;
+        vj = (int16x4_t){0, 1, 2, 3} + saturate_cast<short>(cal_x + rborder);
+        for(; j <= cal_width - 4; j += 4, vj += 4)
+        {
+            int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
+            for(int k = 0; k < 4; k++) {
+                if(vx[k] < 0) // border constant return value -1
+                    ogn_ptr = &vec_zeros[0];
+                else
+                    ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
+                pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
+                memcpy(pad_ptr, ogn_ptr, cnes);
+            }
+        }
+        for(; j < cal_width; j++)
+        {
+            int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
+            if(x < 0) // border constant return value -1
+                ogn_ptr = &vec_zeros[0];
+            else
+                ogn_ptr = ogn_data + y * ogn_step + x * cnes;
+            pad_ptr = pad_data + i * pad_step + j * cnes;
+            memcpy(pad_ptr, ogn_ptr, cnes);
+        }
+    }
+
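+    // At this point ctx->padding holds a fully bordered copy of the ROI,
+    // (kernel_width - 1) columns wider and (kernel_height - 1) rows taller, so the
+    // convolution loops below can read through the ker_pts offsets without bounds checks.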
+    // prepare the pointers
+    int i, k, count, nz = ctx->nz;
+    const uchar* ker_pts = &ctx->coords[0];
+    const float* ker_cfs = &ctx->coeffs[0];
+
+    if( ddepth == CV_8U )
+    {
+        std::vector<uchar*> src_ptrarr;
+        src_ptrarr.resize(nz);
+        uchar** src_ptrs = &src_ptrarr[0];
+        uchar* dst_row = dst_data;
+        uchar* pad_row = pad_data;
+
+        for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
+        {
+            for( k = 0; k < nz; k++ )
+                src_ptrs[k] = (uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes;
+
+            i = 0;
+            for( ; i <= width * cnes - 8; i += 8 )
+            {
+                float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat};
+                for( k = 0; k < nz; k++ ) {
+                    float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
+                    // experimental code
+                    // ndsrvp_f32_u8_mul8(vker_cfs, *(unsigned long*)(src_ptrs[k] + i), vker_cfs);
+                    // ndsrvp_f32_add8(vs0, vker_cfs, vs0);
+                    vs0[0] += vker_cfs[0] * src_ptrs[k][i];
+                    vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
+                    vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
+                    vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
+                    vs0[4] += vker_cfs[4] * src_ptrs[k][i + 4];
+                    vs0[5] += vker_cfs[5] * src_ptrs[k][i + 5];
+                    vs0[6] += vker_cfs[6] * src_ptrs[k][i + 6];
+                    vs0[7] += vker_cfs[7] * src_ptrs[k][i + 7];
+                }
+                dst_row[i] = saturate_cast<uchar>(vs0[0]);
+                dst_row[i + 1] = saturate_cast<uchar>(vs0[1]);
+                dst_row[i + 2] = saturate_cast<uchar>(vs0[2]);
+                dst_row[i + 3] = saturate_cast<uchar>(vs0[3]);
+                dst_row[i + 4] = saturate_cast<uchar>(vs0[4]);
+                dst_row[i + 5] = saturate_cast<uchar>(vs0[5]);
+                dst_row[i + 6] = saturate_cast<uchar>(vs0[6]);
+                dst_row[i + 7] = saturate_cast<uchar>(vs0[7]);
+            }
+            for( ; i < width * cnes; i++ )
+            {
+                float s0 = delta_sat;
+                for( k = 0; k < nz; k++ ) {
+                    s0 += ker_cfs[k] * src_ptrs[k][i];
+                }
+                dst_row[i] = saturate_cast<uchar>(s0);
+            }
+        }
+    }
+    else if( ddepth == CV_16U )
+    {
+        std::vector<ushort*> src_ptrarr;
+        src_ptrarr.resize(nz);
+        ushort** src_ptrs = &src_ptrarr[0];
+        uchar* dst_row = dst_data;
+        uchar* pad_row = pad_data;
+
+        for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
+        {
+            for( k = 0; k < nz; k++ )
+                src_ptrs[k] = (ushort*)((uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes);
+
+            i = 0;
+            for( ; i <= width * cn - 4; i += 4 )
+            {
+                float vs0[4] = {delta_sat, delta_sat, delta_sat, delta_sat};
+                for( k = 0; k < nz; k++ ) {
+                    float vker_cfs[4] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
+                    vs0[0] += vker_cfs[0] * src_ptrs[k][i];
+                    vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
+                    vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
+                    vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
+                }
+                ushort* dst_row_ptr = (ushort*)dst_row;
+                dst_row_ptr[i] = saturate_cast<ushort>(vs0[0]);
+                dst_row_ptr[i + 1] = saturate_cast<ushort>(vs0[1]);
+                dst_row_ptr[i + 2] = saturate_cast<ushort>(vs0[2]);
+                dst_row_ptr[i + 3] = saturate_cast<ushort>(vs0[3]);
+            }
+            for( ; i < width * cn; i++ )
+            {
+                float s0 = delta_sat;
+                for( k = 0; k < nz; k++ ) {
+                    s0 += ker_cfs[k] * src_ptrs[k][i];
+                }
+                ((ushort*)dst_row)[i] = saturate_cast<ushort>(s0);
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+int filterFree(cvhalFilter2D *context) {
+    FilterData *ctx = (FilterData*)context;
+    delete ctx;
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/zlib-ng/CMakeLists.txt b/3rdparty/zlib-ng/CMakeLists.txt
index c05511ca87..83e6dac542 100644
--- a/3rdparty/zlib-ng/CMakeLists.txt
+++ b/3rdparty/zlib-ng/CMakeLists.txt
@@ -1,12 +1,38 @@
-project(${ZLIB_LIBRARY} LANGUAGES C)
-
-if("c_std_11" IN_LIST CMAKE_C_COMPILE_FEATURES)
-  set(CMAKE_C_STANDARD 11) # The C standard whose features are requested to build this target
+cmake_minimum_required(VERSION 3.5.1)
+if(CMAKE_VERSION VERSION_LESS 3.12)
+  cmake_policy(VERSION ${CMAKE_VERSION})
 else()
-  set(CMAKE_C_STANDARD 99)
+  cmake_policy(VERSION 3.5.1...3.29.0)
 endif()
-set(CMAKE_C_STANDARD_REQUIRED ON) # Boolean describing whether the value of C_STANDARD is a requirement
-set(CMAKE_C_EXTENSIONS OFF) # Boolean specifying whether compiler specific extensions are requested
+message(STATUS "Using CMake version ${CMAKE_VERSION}")
+
+# If not specified on the command line, 
enable C11 as the default +# Configuration items that affect the global compiler environment standards +# should be issued before the "project" command. +if(NOT CMAKE_C_STANDARD) + set(CMAKE_C_STANDARD 11) # The C standard whose features are requested to build this target +endif() +if(NOT CMAKE_C_STANDARD_REQUIRED) + set(CMAKE_C_STANDARD_REQUIRED ON) # Boolean describing whether the value of C_STANDARD is a requirement +endif() +if(NOT CMAKE_C_EXTENSIONS) + set(CMAKE_C_EXTENSIONS OFF) # Boolean specifying whether compiler specific extensions are requested +endif() +set(VALID_C_STANDARDS "99" "11") +if(NOT CMAKE_C_STANDARD IN_LIST VALID_C_STANDARDS) + MESSAGE(FATAL_ERROR "CMAKE_C_STANDARD:STRING=${CMAKE_C_STANDARD} not in known standards list\n ${VALID_C_STANDARDS}") +endif() + +# Parse the full version number from zlib.h.in and include in ZLIB_FULL_VERSION +file(READ ${CMAKE_CURRENT_SOURCE_DIR}/zlib.h.in _zlib_h_contents) +string(REGEX REPLACE ".*#define[ \t]+ZLIB_VERSION[ \t]+\"([0-9]+.[0-9]+.[0-9]+).*\".*" + "\\1" ZLIB_HEADER_VERSION ${_zlib_h_contents}) +string(REGEX REPLACE ".*#define[ \t]+ZLIBNG_VERSION[ \t]+\"([-0-9A-Za-z.]+)\".*" + "\\1" ZLIBNG_HEADER_VERSION ${_zlib_h_contents}) +message(STATUS "ZLIB_HEADER_VERSION: ${ZLIB_HEADER_VERSION}") +message(STATUS "ZLIBNG_HEADER_VERSION: ${ZLIBNG_HEADER_VERSION}") + +project(zlib VERSION ${ZLIB_HEADER_VERSION} LANGUAGES C) include(CheckTypeSize) include(CheckSymbolExists) @@ -16,142 +42,325 @@ include(CheckCSourceCompiles) include(CheckCSourceRuns) include(CheckCCompilerFlag) include(CMakeDependentOption) +include(CMakePackageConfigHelpers) +include(FeatureSummary) -if(X86_64 OR X86) - set(BASEARCH_X86_FOUND TRUE) -endif() -if(AARCH64 OR ARM) - set(BASEARCH_ARM_FOUND TRUE) -endif() -if(PPC64LE OR PPC64) - set(BASEARCH_PPC_FOUND TRUE) -endif() -if(RISCV) - set(BASEARCH_RISCV_FOUND TRUE) -endif() - +include(cmake/detect-arch.cmake) +include(cmake/detect-install-dirs.cmake) +include(cmake/detect-coverage.cmake) include(cmake/detect-intrinsics.cmake) +include(cmake/detect-sanitizer.cmake) include(cmake/fallback-macros.cmake) -set(ZLIB_SYMBOL_PREFIX "") - -if(BASEARCH_X86_FOUND) - set(WITH_AVX2 ON) - set(WITH_AVX512 ON) - set(WITH_AVX512VNNI ON) - set(WITH_SSE2 ON) - set(WITH_SSSE3 ON) - set(WITH_SSE42 ON) - set(WITH_PCLMULQDQ ON) - set(WITH_VPCLMULQDQ ON) +if(CMAKE_TOOLCHAIN_FILE) + message(STATUS "Using CMake toolchain: ${CMAKE_TOOLCHAIN_FILE}") endif() + +# Make sure we use an appropriate BUILD_TYPE by default, "Release" to be exact +# this should select the maximum generic optimisation on the current platform (i.e. -O3 for gcc/clang) +get_property(GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if(NOT GENERATOR_IS_MULTI_CONFIG) + if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, standard options are: Debug Release RelWithDebInfo MinSizeRel." 
+ FORCE) + add_feature_info(CMAKE_BUILD_TYPE 1 "Build type: ${CMAKE_BUILD_TYPE} (default)") + else() + add_feature_info(CMAKE_BUILD_TYPE 1 "Build type: ${CMAKE_BUILD_TYPE} (selected)") + endif() +endif() + +# +# Options parsing +# +option(WITH_GZFILEOP "Compile with support for gzFile related functions" ON) +option(ZLIB_COMPAT "Compile with zlib compatible API" ON) +option(ZLIB_ENABLE_TESTS "Build test binaries" OFF) +option(ZLIBNG_ENABLE_TESTS "Test zlib-ng specific API" OFF) +option(WITH_GTEST "Build gtest_zlib" OFF) +option(WITH_FUZZERS "Build test/fuzz" OFF) +option(WITH_BENCHMARKS "Build test/benchmarks" OFF) +option(WITH_BENCHMARK_APPS "Build application benchmarks" OFF) +option(WITH_OPTIM "Build with optimisation" ON) +option(WITH_REDUCED_MEM "Reduced memory usage for special cases (reduces performance)" OFF) +option(WITH_NEW_STRATEGIES "Use new strategies" ON) +option(WITH_NATIVE_INSTRUCTIONS + "Instruct the compiler to use the full instruction set on this host (gcc/clang -march=native)" OFF) +option(WITH_RUNTIME_CPU_DETECTION "Build with runtime detection of CPU architecture" ON) +option(WITH_MAINTAINER_WARNINGS "Build with project maintainer warnings" OFF) +option(WITH_CODE_COVERAGE "Enable code coverage reporting" OFF) +option(WITH_INFLATE_STRICT "Build with strict inflate distance checking" OFF) +option(WITH_INFLATE_ALLOW_INVALID_DIST "Build with zero fill for inflate invalid distances" OFF) +option(WITH_UNALIGNED "Support unaligned reads on platforms that support it" ON) + +set(ZLIB_SYMBOL_PREFIX "" CACHE STRING "Give this prefix to all publicly exported symbols. +Useful when embedding into a larger library. +Default is no prefix (empty prefix).") + +# Add multi-choice option +set(WITH_SANITIZER AUTO CACHE STRING "Enable sanitizer support") +set_property(CACHE WITH_SANITIZER PROPERTY STRINGS "Memory" "Address" "Undefined" "Thread") + if(BASEARCH_ARM_FOUND) - set(WITH_ACLE ON) - set(WITH_NEON ON) - if(ARM) - set(WITH_ARMV6 ON) - else() - set(WITH_ARMV6 OFF) - endif() -endif() -if(BASEARCH_PPC_FOUND) - set(WITH_ALTIVEC ON) - set(WITH_POWER8 ON) - set(WITH_POWER9 ON) -endif() -if(BASEARCH_RISCV_FOUND) - set(WITH_RVV ON) + option(WITH_ACLE "Build with ACLE" ON) + option(WITH_NEON "Build with NEON intrinsics" ON) + cmake_dependent_option(WITH_ARMV6 "Build with ARMv6 SIMD" ON "NOT ARCH STREQUAL \"aarch64\"" OFF) +elseif(BASEARCH_PPC_FOUND) + option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON) + option(WITH_POWER8 "Build with optimisations for POWER8" ON) + option(WITH_POWER9 "Build with optimisations for POWER9" ON) +elseif(BASEARCH_RISCV_FOUND) + option(WITH_RVV "Build with RVV intrinsics" ON) +elseif(BASEARCH_S360_FOUND) + option(WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z" OFF) + option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF) + option(WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z" ON) +elseif(BASEARCH_X86_FOUND) + option(WITH_SSE2 "Build with SSE2" ON) + cmake_dependent_option(WITH_SSSE3 "Build with SSSE3" ON "WITH_SSE2" OFF) + cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSSE3" OFF) + cmake_dependent_option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON "WITH_SSE42" OFF) + cmake_dependent_option(WITH_AVX2 "Build with AVX2" ON "WITH_SSE42" OFF) + cmake_dependent_option(WITH_AVX512 "Build with AVX512" ON "WITH_AVX2" OFF) + cmake_dependent_option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON "WITH_AVX512" OFF) + cmake_dependent_option(WITH_VPCLMULQDQ "Build 
with VPCLMULQDQ" ON "WITH_PCLMULQDQ;WITH_AVX512" OFF) endif() +option(INSTALL_UTILS "Copy minigzip and minideflate during install" OFF) -add_definitions(-DZLIB_COMPAT) +set(ZLIB_BUILD_SHARED_LIBS OFF) +set(SKIP_INSTALL_ALL ON) +ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes -Wmissing-declarations -Wundef -Wstrict-prototypes -Wtype-limits) +ocv_warnings_disable(CMAKE_C_FLAGS /wd4819 /wd4244 /wd4334) -add_definitions(-DWITH_GZFILEOP) +mark_as_advanced(FORCE + ZLIB_SYMBOL_PREFIX + WITH_REDUCED_MEM + WITH_ACLE WITH_NEON + WITH_ARMV6 + WITH_DFLTCC_DEFLATE + WITH_DFLTCC_INFLATE + WITH_CRC32_VX + WITH_AVX2 WITH_SSE2 + WITH_SSSE3 WITH_SSE42 + WITH_PCLMULQDQ + WITH_ALTIVEC + WITH_POWER8 + WITH_POWER9 + WITH_RVV + WITH_INFLATE_STRICT + WITH_INFLATE_ALLOW_INVALID_DIST + WITH_UNALIGNED + INSTALL_UTILS + ) + +if(ZLIB_COMPAT) + add_definitions(-DZLIB_COMPAT) + set(WITH_GZFILEOP ON) + set(SUFFIX "") + set(ZLIB_FULL_VERSION ${ZLIB_HEADER_VERSION}.zlib-ng) + set(EXPORT_NAME ZLIB) +else() + set(SUFFIX "-ng") + set(ZLIB_FULL_VERSION ${ZLIBNG_HEADER_VERSION}) + set(EXPORT_NAME zlib-ng) +endif() + +if(WITH_GZFILEOP) + add_definitions(-DWITH_GZFILEOP) +endif() if(CMAKE_C_COMPILER_ID MATCHES "^Intel") - set(WARNFLAGS_DISABLE) + if(CMAKE_HOST_UNIX) + set(WARNFLAGS -Wall) + set(WARNFLAGS_MAINTAINER -Wall -Wcheck -Wremarks) + set(WARNFLAGS_DISABLE) + else() + set(WARNFLAGS /Wall) + set(WARNFLAGS_MAINTAINER /W5) + set(WARNFLAGS_DISABLE) + endif() + check_c_compiler_flag(-diag-disable=10441 HAVE_DIAG_10441) + if(HAVE_DIAG_10441) + list(APPEND WARNFLAGS_DISABLE "-diag-disable=10441") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -diag-disable=10441") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -diag-disable=10441") + endif() elseif(MSVC) - # Minimum supported MSVC version is 1800 = Visual Studio 12.0/2013 - # See also https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html - if(MSVC_VERSION VERSION_LESS 1800) - message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).") - endif() - # TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination - # (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should - # avoid mistakes. - # /Oi ? 
- set(WARNFLAGS_DISABLE) - if(BASEARCH_ARM_FOUND) - add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE) - if(NOT "${ARCH}" MATCHES "aarch64") - set(NEONFLAG "/arch:VFPv4") - endif() - endif() -elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - set(WARNFLAGS_DISABLE) - # Check whether -fno-lto is available - set(CMAKE_REQUIRED_FLAGS "-fno-lto") - check_c_source_compiles( - "int main() { return 0; }" - FNO_LTO_AVAILABLE FAIL_REGEX "not supported") - set(CMAKE_REQUIRED_FLAGS) - if(FNO_LTO_AVAILABLE) - set(ZNOLTOFLAG "-fno-lto") - endif() - if(BASEARCH_ARM_FOUND) - if(ARM AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi") - # Auto-detect support for ARM floating point ABI - check_include_file(features.h HAVE_FEATURES_H) - if(HAVE_FEATURES_H) - set(CMAKE_REQUIRED_FLAGS -mfloat-abi=softfp) - check_c_source_compiles( - "#include - int main() { return 0; }" - HAVE_FLOATABI_SOFTFP) - if(HAVE_FLOATABI_SOFTFP) - set(FLOATABI -mfloat-abi=softfp) - else() - set(CMAKE_REQUIRED_FLAGS -mfloat-abi=hard) - check_c_source_compiles( - "#include - int main() { return 0; }" - HAVE_FLOATABI_HARD) - if(HAVE_FLOATABI_HARD) - set(FLOATABI -mfloat-abi=hard) - endif() + # Minimum supported MSVC version is 1800 = Visual Studio 12.0/2013 + # See also https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html + if(MSVC_VERSION VERSION_LESS 1800) + message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).") + endif() + # TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination + # (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should + # avoid mistakes. + # /Oi ? + set(WARNFLAGS /W3) + set(WARNFLAGS_MAINTAINER /W4) + set(WARNFLAGS_DISABLE) + if(BASEARCH_ARM_FOUND) + add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE) + if(NOT "${ARCH}" MATCHES "aarch64") + set(NEONFLAG "/arch:VFPv4") endif() - set(CMAKE_REQUIRED_FLAGS) - endif() - if(FLOATABI) - message(STATUS "${ZLIB_LIBRARY} ARM floating point arch: ${FLOATABI}") - add_compile_options(${FLOATABI}) - else() - message(STATUS "${ZLIB_LIBRARY} ARM floating point arch not auto-detected") - endif() endif() - endif() - if(FNO_LTO_AVAILABLE) - set(NOLTOFLAG ${ZNOLTOFLAG}) - endif() - if(MINGW) - # Add `-Wno-pedantic-ms-format` only if the toolchain supports it - check_c_compiler_flag(-Wno-pedantic-ms-format HAVE_NO_PEDANTIC_MS_FORMAT) - if(HAVE_NO_PEDANTIC_MS_FORMAT) - list(APPEND WARNFLAGS_DISABLE -Wno-pedantic-ms-format) +elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + # Enable warnings in GCC and Clang + set(WARNFLAGS -Wall) + set(WARNFLAGS_MAINTAINER -Wextra) + set(WARNFLAGS_DISABLE) + # Check whether -fno-lto is available + set(CMAKE_REQUIRED_FLAGS "-fno-lto") + check_c_source_compiles( + "int main() { return 0; }" + FNO_LTO_AVAILABLE FAIL_REGEX "not supported") + set(CMAKE_REQUIRED_FLAGS) + if(FNO_LTO_AVAILABLE) + set(ZNOLTOFLAG "-fno-lto") + endif() + if(NOT WITH_NATIVE_INSTRUCTIONS) + if(BASEARCH_ARM_FOUND) + if("${ARCH}" MATCHES "arm" AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi") + # Auto-detect support for ARM floating point ABI + check_include_file(features.h HAVE_FEATURES_H) + if(HAVE_FEATURES_H) + set(CMAKE_REQUIRED_FLAGS -mfloat-abi=softfp) + check_c_source_compiles( + "#include + int main() { return 0; }" + HAVE_FLOATABI_SOFTFP) + if(HAVE_FLOATABI_SOFTFP) + set(FLOATABI -mfloat-abi=softfp) + else() + set(CMAKE_REQUIRED_FLAGS -mfloat-abi=hard) + check_c_source_compiles( + "#include + int 
main() { return 0; }" + HAVE_FLOATABI_HARD) + if(HAVE_FLOATABI_HARD) + set(FLOATABI -mfloat-abi=hard) + endif() + endif() + set(CMAKE_REQUIRED_FLAGS) + endif() + if(FLOATABI) + message(STATUS "ARM floating point arch: ${FLOATABI}") + add_compile_options(${FLOATABI}) + else() + message(STATUS "ARM floating point arch not auto-detected") + endif() + endif() + endif() + # Disable LTO unless Native Instructions are enabled + if(FNO_LTO_AVAILABLE) + set(NOLTOFLAG ${ZNOLTOFLAG}) + endif() + endif() + if(MINGW) + # Add `-Wno-pedantic-ms-format` only if the toolchain supports it + check_c_compiler_flag(-Wno-pedantic-ms-format HAVE_NO_PEDANTIC_MS_FORMAT) + if(HAVE_NO_PEDANTIC_MS_FORMAT) + list(APPEND WARNFLAGS_DISABLE -Wno-pedantic-ms-format) + endif() endif() - endif() endif() -# Force disable LTO -set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) +# Set native march/mcpu +if(WITH_NATIVE_INSTRUCTIONS) + if(NATIVE_ARCH_OVERRIDE) + message(STATUS "WARNING: WITH_NATIVE_INSTRUCTIONS enabled, but running with NATIVE_ARCH_OVERRIDE: ${NATIVE_ARCH_OVERRIDE}") + set(NATIVEFLAG "${NATIVE_ARCH_OVERRIDE}") + else() + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + check_c_compiler_flag(-march=native HAVE_MARCH_NATIVE) + if(HAVE_MARCH_NATIVE) + set(NATIVEFLAG "-march=native") + else() + check_c_compiler_flag(-mcpu=native HAVE_MCPU_NATIVE) + if(HAVE_MCPU_NATIVE) + set(NATIVEFLAG "-mcpu=native") + endif() + endif() + # Fall through + endif() + endif() + if(NATIVEFLAG) + # Apply flags to all source files and compilation checks + if(WIN32) + separate_arguments(NATIVEOPTIONS WINDOWS_COMMAND "${NATIVEFLAG}") + else() + separate_arguments(NATIVEOPTIONS UNIX_COMMAND "${NATIVEFLAG}") + endif() + add_compile_options(${NATIVEOPTIONS}) + set(WITH_RUNTIME_CPU_DETECTION OFF) + else() + message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not implemented yet on this configuration") + set(WITH_NATIVE_INSTRUCTIONS OFF) + endif() +endif() + +# Compile without functable or CPU detection +if(NOT WITH_RUNTIME_CPU_DETECTION) + if(MSVC AND BASEARCH_X86_FOUND) + message(STATUS "WARNING: Microsoft Visual Studio does not support compile time detection of CPU features for \"/arch\" before \"AVX\"") + # Workaround for MSVC. By default MSVC does not define the __SSE*__ macros. + # Fix it if AVX is enabled. 
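+        # The probe below simply fails to compile unless the compiler already defines
+        # __AVX__, i.e. unless an /arch:AVX (or higher) option is in effect.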
+ set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") + check_c_source_compiles( + "#ifndef __AVX__ + # error \"AVX is not enabled.\" + #endif + int main(void) { return 0; }" + MSVC_IS_ENABLED_AVX + ) + set(CMAKE_REQUIRED_FLAGS) + if(MSVC_IS_ENABLED_AVX) + add_definitions( + -D__SSE__=1 + -D__SSE2__=1 + -D__SSE3__=1 + -D__SSSE3__=1 + -D__SSE4_1__=1 + -D__SSE4_2__=1 + -D__PCLMUL__=1 + ) + endif() + endif() + add_definitions(-DDISABLE_RUNTIME_CPU_DETECTION) +endif() + +# Force disable LTO if WITH_NATIVE_INSTRUCTIONS is not active +if(NOT WITH_NATIVE_INSTRUCTIONS) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) + foreach(_cfg_name IN LISTS CMAKE_CONFIGURATION_TYPES) + string(TOUPPER "${_cfg_name}" _cfg_name_uc) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_${_cfg_name_uc} OFF) + endforeach() +endif() + +# Set architecture alignment requirements +if(NOT WITH_UNALIGNED) + add_definitions(-DNO_UNALIGNED) + message(STATUS "Unaligned reads manually disabled") +endif() # Apply warning compiler flags -add_compile_options(${WARNFLAGS_DISABLE}) +if(WITH_MAINTAINER_WARNINGS) + add_compile_options(${WARNFLAGS} ${WARNFLAGS_MAINTAINER} ${WARNFLAGS_DISABLE}) +else() + add_compile_options(${WARNFLAGS} ${WARNFLAGS_DISABLE}) +endif() + +# Set code coverage compiler flags +if(WITH_CODE_COVERAGE) + add_code_coverage() +endif() # Replace optimization level 3 added by default with level 2 -if(NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3") - string(REGEX REPLACE "([\\/\\-]O)3" "\\12" - CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") +if(NOT WITH_CODE_COVERAGE AND NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3") + string(REGEX REPLACE "([\\/\\-]O)3" "\\12" + CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") endif() # @@ -159,32 +368,40 @@ endif() # check_include_file(arm_acle.h HAVE_ARM_ACLE_H) if(HAVE_ARM_ACLE_H) - add_definitions(-DHAVE_ARM_ACLE_H) + add_definitions(-DHAVE_ARM_ACLE_H) endif() check_include_file(sys/auxv.h HAVE_SYS_AUXV_H) if(HAVE_SYS_AUXV_H) - add_definitions(-DHAVE_SYS_AUXV_H) + add_definitions(-DHAVE_SYS_AUXV_H) endif() check_include_file(sys/sdt.h HAVE_SYS_SDT_H) if(HAVE_SYS_SDT_H) - add_definitions(-DHAVE_SYS_SDT_H) + add_definitions(-DHAVE_SYS_SDT_H) endif() check_include_file(unistd.h HAVE_UNISTD_H) +# +# Check for Linux includes +# +check_include_file(linux/auxvec.h HAVE_LINUX_AUXVEC_H) +if(HAVE_LINUX_AUXVEC_H) + add_definitions(-DHAVE_LINUX_AUXVEC_H) +endif() + # # Check to see if we have large file support # set(CMAKE_REQUIRED_DEFINITIONS -D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) check_type_size(off64_t OFF64_T) if(HAVE_OFF64_T) - add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) -else() - check_type_size(_off64_t _OFF64_T) - if(HAVE__OFF64_T) add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) - else() - check_type_size(__off64_t __OFF64_T) - endif() +else() + check_type_size(_off64_t _OFF64_T) + if(HAVE__OFF64_T) + add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64) + else() + check_type_size(__off64_t __OFF64_T) + endif() endif() set(CMAKE_REQUIRED_DEFINITIONS) # clear variable @@ -193,499 +410,676 @@ set(CMAKE_REQUIRED_DEFINITIONS) # clear variable # check_function_exists(fseeko HAVE_FSEEKO) if(NOT HAVE_FSEEKO) - add_definitions(-DNO_FSEEKO) + add_definitions(-DNO_FSEEKO) endif() check_function_exists(strerror HAVE_STRERROR) if(NOT HAVE_STRERROR) - add_definitions(-DNO_STRERROR) + add_definitions(-DNO_STRERROR) endif() set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112L) check_symbol_exists(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN) 
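# posix_memalign is only declared when _POSIX_C_SOURCE is at least 200112L, hence the
# temporary CMAKE_REQUIRED_DEFINITIONS above; the aligned_alloc probe below repeats the
# same pattern with _ISOC11_SOURCE.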
if(HAVE_POSIX_MEMALIGN) - add_definitions(-DHAVE_POSIX_MEMALIGN) + add_definitions(-DHAVE_POSIX_MEMALIGN) endif() set(CMAKE_REQUIRED_DEFINITIONS) set(CMAKE_REQUIRED_DEFINITIONS -D_ISOC11_SOURCE=1) check_symbol_exists(aligned_alloc stdlib.h HAVE_ALIGNED_ALLOC) if(HAVE_ALIGNED_ALLOC) - add_definitions(-DHAVE_ALIGNED_ALLOC) + add_definitions(-DHAVE_ALIGNED_ALLOC) endif() set(CMAKE_REQUIRED_DEFINITIONS) +if(WITH_SANITIZER STREQUAL "Address") + add_address_sanitizer() +elseif(WITH_SANITIZER STREQUAL "Memory") + add_memory_sanitizer() +elseif(WITH_SANITIZER STREQUAL "Thread") + add_thread_sanitizer() +elseif(WITH_SANITIZER STREQUAL "Undefined") + add_undefined_sanitizer() +endif() + +# +# Check whether compiler supports -fno-semantic-interposition parameter +# +check_c_compiler_flag(-fno-semantic-interposition HAVE_NO_INTERPOSITION) + # # Check if we can hide zlib internal symbols that are linked between separate source files using hidden # check_c_source_compiles( - "#define Z_INTERNAL __attribute__((visibility (\"hidden\"))) - int Z_INTERNAL foo; - int main() { - return 0; - }" - HAVE_ATTRIBUTE_VISIBILITY_HIDDEN FAIL_REGEX "visibility") + "#define Z_INTERNAL __attribute__((visibility (\"hidden\"))) + int Z_INTERNAL foo; + int main() { + return 0; + }" + HAVE_ATTRIBUTE_VISIBILITY_HIDDEN FAIL_REGEX "visibility") if(HAVE_ATTRIBUTE_VISIBILITY_HIDDEN) - add_definitions(-DHAVE_VISIBILITY_HIDDEN) + add_definitions(-DHAVE_VISIBILITY_HIDDEN) endif() # # Check if we can hide zlib internal symbols that are linked between separate source files using internal # check_c_source_compiles( - "#define Z_INTERNAL __attribute__((visibility (\"internal\"))) - int Z_INTERNAL foo; - int main() { - return 0; - }" - HAVE_ATTRIBUTE_VISIBILITY_INTERNAL FAIL_REGEX "visibility") + "#define Z_INTERNAL __attribute__((visibility (\"internal\"))) + int Z_INTERNAL foo; + int main() { + return 0; + }" + HAVE_ATTRIBUTE_VISIBILITY_INTERNAL FAIL_REGEX "visibility") if(HAVE_ATTRIBUTE_VISIBILITY_INTERNAL) - add_definitions(-DHAVE_VISIBILITY_INTERNAL) + add_definitions(-DHAVE_VISIBILITY_INTERNAL) endif() # # Check for __attribute__((aligned(x))) support in the compiler # check_c_source_compiles( - "int main(void) { - __attribute__((aligned(8))) int test = 0; - (void)test; - return 0; - }" - HAVE_ATTRIBUTE_ALIGNED FAIL_REGEX "aligned") + "int main(void) { + __attribute__((aligned(8))) int test = 0; + (void)test; + return 0; + }" + HAVE_ATTRIBUTE_ALIGNED FAIL_REGEX "aligned") if(HAVE_ATTRIBUTE_ALIGNED) - add_definitions(-DHAVE_ATTRIBUTE_ALIGNED) + add_definitions(-DHAVE_ATTRIBUTE_ALIGNED) +endif() + +# +# Check for __builtin_assume_aligned(x,n) support in the compiler +# +check_c_source_compiles( + "char *test(char *buffer) { + char *abuffer = __builtin_assume_aligned(buffer,64); + return abuffer; + } + int main() { + return 0; + }" + HAVE_BUILTIN_ASSUME_ALIGNED) +if(HAVE_BUILTIN_ASSUME_ALIGNED) + add_definitions(-DHAVE_BUILTIN_ASSUME_ALIGNED) endif() # # check for __builtin_ctz() support in the compiler # check_c_source_compiles( - "int main(void) { - unsigned int zero = 0; - long test = __builtin_ctz(zero); - (void)test; - return 0; - }" - HAVE_BUILTIN_CTZ + "int main(void) { + unsigned int zero = 0; + long test = __builtin_ctz(zero); + (void)test; + return 0; + }" + HAVE_BUILTIN_CTZ ) if(HAVE_BUILTIN_CTZ) - add_definitions(-DHAVE_BUILTIN_CTZ) + add_definitions(-DHAVE_BUILTIN_CTZ) endif() # # check for __builtin_ctzll() support in the compiler # check_c_source_compiles( - "int main(void) { - unsigned int zero = 0; - long test = 
__builtin_ctzll(zero); - (void)test; - return 0; - }" - HAVE_BUILTIN_CTZLL + "int main(void) { + unsigned int zero = 0; + long test = __builtin_ctzll(zero); + (void)test; + return 0; + }" + HAVE_BUILTIN_CTZLL ) if(HAVE_BUILTIN_CTZLL) - add_definitions(-DHAVE_BUILTIN_CTZLL) + add_definitions(-DHAVE_BUILTIN_CTZLL) endif() # # check for ptrdiff_t support # check_c_source_compiles( - "#include - int main() { - ptrdiff_t *a; - (void)a; - return 0; - }" - HAVE_PTRDIFF_T + "#include + int main() { + ptrdiff_t *a; + (void)a; + return 0; + }" + HAVE_PTRDIFF_T ) if(NOT HAVE_PTRDIFF_T) - set(NEED_PTRDIFF_T 1) + set(NEED_PTRDIFF_T 1) - check_type_size("void *" SIZEOF_DATA_PTR) - message(STATUS "sizeof(void *) is ${SIZEOF_DATA_PTR} bytes") + check_type_size("void *" SIZEOF_DATA_PTR) + message(STATUS "sizeof(void *) is ${SIZEOF_DATA_PTR} bytes") - if(${SIZEOF_DATA_PTR} MATCHES "4") - set(PTRDIFF_TYPE "uint32_t") - elseif(${SIZEOF_DATA_PTR} MATCHES "8") - set(PTRDIFF_TYPE "uint64_t") - else() - message(FATAL_ERROR "sizeof(void *) is neither 32 nor 64 bit") - endif() + if(${SIZEOF_DATA_PTR} MATCHES "4") + set(PTRDIFF_TYPE "uint32_t") + elseif(${SIZEOF_DATA_PTR} MATCHES "8") + set(PTRDIFF_TYPE "uint64_t") + else() + message(FATAL_ERROR "sizeof(void *) is neither 32 nor 64 bit") + endif() endif() +add_compile_options($<$:-DZLIB_DEBUG>) + if(MSVC) - add_definitions(-D_CRT_SECURE_NO_DEPRECATE) - add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) + set(CMAKE_DEBUG_POSTFIX "d") + add_definitions(-D_CRT_SECURE_NO_DEPRECATE) + add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) endif() +if(BASEARCH_X86_FOUND) + # FORCE_SSE2 option will only be shown if HAVE_SSE2_INTRIN is true + if("${ARCH}" MATCHES "i[3-6]86") + cmake_dependent_option(FORCE_SSE2 "Always assume CPU is SSE2 capable" OFF "HAVE_SSE2_INTRIN" OFF) + endif() +endif() + +# +# Enable deflate_quick at level 1 +# +if(NOT WITH_NEW_STRATEGIES) + add_definitions(-DNO_QUICK_STRATEGY) +endif() +# +# Enable deflate_medium at level 4-6 +# +if(NOT WITH_NEW_STRATEGIES) + add_definitions(-DNO_MEDIUM_STRATEGY) +endif() +# +# Enable inflate compilation options +# +if(WITH_INFLATE_STRICT) + add_definitions(-DINFLATE_STRICT) + message(STATUS "Inflate strict distance checking enabled") +endif() +if(WITH_INFLATE_ALLOW_INVALID_DIST) + add_definitions(-DINFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR) + message(STATUS "Inflate zero data for invalid distances enabled") +endif() +# +# Enable reduced memory configuration +# +if(WITH_REDUCED_MEM) + add_definitions(-DHASH_SIZE=32768u -DGZBUFSIZE=8192 -DNO_LIT_MEM) + message(STATUS "Configured for reduced memory environment") +endif() + +set(GENERIC_ARCHDIR "arch/generic") + set(ZLIB_ARCH_SRCS) -set(ZLIB_ARCH_HDRS) -set(ARCHDIR "arch/generic") -if(BASEARCH_X86_FOUND) - set(ARCHDIR "arch/x86") -endif() +set(ZLIB_ARCH_HDRS ${GENERIC_ARCHDIR}/generic_functions.h) + if(BASEARCH_ARM_FOUND) - set(ARCHDIR "arch/arm") -endif() -if(BASEARCH_PPC_FOUND) - set(ARCHDIR "arch/power") -endif() -if(BASEARCH_RISCV_FOUND) - set(ARCHDIR "arch/riscv") + set(ARCHDIR "arch/arm") +elseif(BASEARCH_PPC_FOUND) + set(ARCHDIR "arch/power") +elseif(BASEARCH_RISCV_FOUND) + set(ARCHDIR "arch/riscv") +elseif(BASEARCH_S360_FOUND) + set(ARCHDIR "arch/s390") +elseif(BASEARCH_X86_FOUND) + set(ARCHDIR "arch/x86") + if(NOT ${ARCH} MATCHES "x86_64") + add_feature_info(SSE2 1 "Support the SSE2 instruction set, using \"${SSE2FLAG}\"") + endif() +else() + set(ARCHDIR ${GENERIC_ARCHDIR}) + message(STATUS "No optimized architecture: using ${ARCHDIR}") endif() -if(NOT 
CV_DISABLE_OPTIMIZATION) - if(BASEARCH_ARM_FOUND) - add_definitions(-DARM_FEATURES) - if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if("${ARCH}" MATCHES "aarch64") - check_c_source_compiles( - "#include - int main() { - return (getauxval(AT_HWCAP) & HWCAP_CRC32); - }" - ARM_AUXV_HAS_CRC32 - ) - if(ARM_AUXV_HAS_CRC32) - add_definitions(-DARM_AUXV_HAS_CRC32) - else() - message(STATUS "HWCAP_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") +if(WITH_OPTIM) + if(BASEARCH_ARM_FOUND) + add_definitions(-DARM_FEATURES) + if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + if("${ARCH}" MATCHES "aarch64") + check_c_source_compiles( + "#include + int main() { + return (getauxval(AT_HWCAP) & HWCAP_CRC32); + }" + ARM_AUXV_HAS_CRC32 + ) + if(ARM_AUXV_HAS_CRC32) + add_definitions(-DARM_AUXV_HAS_CRC32) + else() + message(STATUS "HWCAP_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") + endif() + else() + check_c_source_compiles( + "#include + int main() { + return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); + }" + ARM_AUXV_HAS_CRC32 + ) + if(ARM_AUXV_HAS_CRC32) + add_definitions(-DARM_AUXV_HAS_CRC32) + else() + check_c_source_compiles( + "#include + #include + int main() { + return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); + }" + ARM_HWCAP_HAS_CRC32 + ) + if (ARM_HWCAP_HAS_CRC32) + add_definitions(-DARM_AUXV_HAS_CRC32 -DARM_ASM_HWCAP) + else() + message(STATUS "HWCAP2_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") + endif() + endif() + check_c_source_compiles( + "#include + int main() { + return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON); + }" + ARM_AUXV_HAS_NEON + ) + if(ARM_AUXV_HAS_NEON) + add_definitions(-DARM_AUXV_HAS_NEON) + else() + check_c_source_compiles( + "#include + int main() { + return (getauxval(AT_HWCAP) & HWCAP_NEON); + }" + ARM_AUXV_HAS_NEON + ) + if (ARM_AUXV_HAS_NEON) + add_definitions(-DARM_AUXV_HAS_NEON) + else() + message(STATUS "Neither HWCAP_ARM_NEON or HWCAP_NEON present in sys/auxv.h; cannot detect support at runtime.") + endif() + endif() + endif() endif() - else() - check_c_source_compiles( - "#include - int main() { - return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); - }" - ARM_AUXV_HAS_CRC32 - ) - if(ARM_AUXV_HAS_CRC32) - add_definitions(-DARM_AUXV_HAS_CRC32) - else() - check_c_source_compiles( - "#include - #include - int main() { - return (getauxval(AT_HWCAP2) & HWCAP2_CRC32); - }" - ARM_HWCAP_HAS_CRC32 - ) - if(ARM_HWCAP_HAS_CRC32) - add_definitions(-DARM_AUXV_HAS_CRC32 -DARM_ASM_HWCAP) - else() - message(STATUS "HWCAP2_CRC32 not present in sys/auxv.h; cannot detect support at runtime.") - endif() + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/arm_features.c) endif() - check_c_source_compiles( - "#include - int main() { - return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON); - }" - ARM_AUXV_HAS_NEON - ) - if(ARM_AUXV_HAS_NEON) - add_definitions(-DARM_AUXV_HAS_NEON) + + if(WITH_ACLE) + check_acle_compiler_flag() + if(HAVE_ACLE_FLAG) + add_definitions(-DARM_ACLE) + set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c) + set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}") + list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS}) + add_feature_info(ACLE_CRC 1 "Support ACLE optimized CRC hash generation, using \"${ACLEFLAG}\"") + else() + set(WITH_ACLE OFF) + endif() else() - check_c_source_compiles( - "#include - int main() { - return (getauxval(AT_HWCAP) & HWCAP_NEON); - }" - ARM_AUXV_HAS_NEON - ) - if 
(ARM_AUXV_HAS_NEON) - add_definitions(-DARM_AUXV_HAS_NEON) - else() - message(STATUS "Neither HWCAP_ARM_NEON or HWCAP_NEON present in sys/auxv.h; cannot detect support at runtime.") - endif() + set(WITH_ACLE OFF) + endif() + if(WITH_NEON) + check_neon_compiler_flag() + if(NEON_AVAILABLE) + add_definitions(-DARM_NEON) + set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c + ${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c) + list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS}) + set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}") + if(MSVC) + add_definitions(-D__ARM_NEON__) + endif() + add_feature_info(NEON_ADLER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"") + add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"") + check_neon_ld4_intrinsics() + if(NEON_HAS_LD4) + add_definitions(-DARM_NEON_HASLD4) + endif() + else() + set(WITH_NEON OFF) + endif() + endif() + if(WITH_ARMV6) + check_armv6_compiler_flag() + if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN) + add_definitions(-DARM_SIMD) + set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c) + set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}") + list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS}) + add_feature_info(ARMV6 1 "Support ARMv6 SIMD instructions in slide_hash, using \"${ARMV6FLAG}\"") + if(HAVE_ARMV6_INTRIN) + add_definitions(-DARM_SIMD_INTRIN) + endif() + else() + set(WITH_ARMV6 OFF) + endif() + else() + set(WITH_ARMV6 OFF) + endif() + elseif(BASEARCH_PPC_FOUND) + # Common arch detection code + if(WITH_ALTIVEC) + check_ppc_intrinsics() + endif() + if(WITH_POWER8) + check_power8_intrinsics() + endif() + if(WITH_POWER9) + check_power9_intrinsics() + endif() + if(POWER8_NEED_AUXVEC_H OR POWER9_NEED_AUXVEC_H) + add_definitions(-DPOWER_NEED_AUXVEC_H) + endif() + if(HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN) + add_definitions(-DPOWER_FEATURES) + endif() + if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c) + endif() + endif() + # VMX specific options and files + if(WITH_ALTIVEC) + if(HAVE_VMX) + add_definitions(-DPPC_FEATURES) + if(HAVE_ALTIVEC) + add_definitions(-DPPC_VMX) + set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c) + list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS}) + add_feature_info(ALTIVEC 1 "Support the AltiVec instruction set, using \"-maltivec\"") + set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}") + else() + set(WITH_ALTIVEC OFF) + endif() + endif() + endif() + # Power8 specific options and files + if(WITH_POWER8) + if(HAVE_POWER8_INTRIN) + add_definitions(-DPOWER8_VSX) + set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c) + if("${ARCH}" MATCHES "powerpc64(le)?") + add_definitions(-DPOWER8_VSX_CRC32) + list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c) + endif() + list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS}) + set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}") + else() + set(WITH_POWER8 OFF) + endif() + endif() + # Power9 specific options and files + if(WITH_POWER9) + if(HAVE_POWER9_INTRIN) + add_definitions(-DPOWER9) + set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c) + list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS}) + set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS 
"${POWER9FLAG} ${NOLTOFLAG}") + else() + set(WITH_POWER9 OFF) + endif() + endif() + elseif(BASEARCH_RISCV_FOUND) + if(WITH_RVV) + check_rvv_intrinsics() + if(HAVE_RVV_INTRIN) + add_definitions(-DRISCV_FEATURES) + add_definitions(-DRISCV_RVV) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c) + endif() + # FIXME: we will not set compile flags for riscv_features.c when + # the kernels update hwcap or hwprobe for riscv + set(RVV_SRCS ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND RVV_SRCS ${ARCHDIR}/riscv_features.c) + endif() + list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS}) + set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}") + else() + set(WITH_RVV OFF) + endif() + endif() + elseif(BASEARCH_S360_FOUND) + check_s390_intrinsics() + if(HAVE_S390_INTRIN) + add_definitions(-DS390_FEATURES) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/s390_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/s390_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/s390_features.c) + endif() + endif() + if(WITH_DFLTCC_DEFLATE) + add_definitions(-DS390_DFLTCC_DEFLATE) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/dfltcc_deflate.c) + endif() + if(WITH_DFLTCC_INFLATE) + add_definitions(-DS390_DFLTCC_INFLATE) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/dfltcc_inflate.c) + endif() + if(WITH_CRC32_VX) + check_vgfma_intrinsics() + if(HAVE_VGFMA_INTRIN) + add_definitions(-DS390_CRC32_VX) + set(CRC32_VX_SRCS ${ARCHDIR}/crc32-vx.c) + list(APPEND ZLIB_ARCH_SRCS ${CRC32_VX_SRCS}) + set_property(SOURCE ${CRC32_VX_SRCS} PROPERTY COMPILE_FLAGS "${VGFMAFLAG} ${NOLTOFLAG}") + else() + set(WITH_CRC32_VX OFF) + endif() + endif() + elseif(BASEARCH_X86_FOUND) + add_definitions(-DX86_FEATURES) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_functions.h) + if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c) endif() - endif() - endif() - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_features.h) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/arm_features.c) - if(WITH_ACLE) - check_acle_compiler_flag() - if(HAVE_ACLE_FLAG) - add_definitions(-DARM_ACLE) - set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c) - set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}") - list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS}) - else() - set(WITH_ACLE OFF) - endif() - else() - set(WITH_ACLE OFF) - endif() - if(WITH_NEON) - check_neon_compiler_flag() - if(NEON_AVAILABLE) - add_definitions(-DARM_NEON) - set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c - ${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c) - list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS}) - set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}") if(MSVC) - add_definitions(-D__ARM_NEON__) + list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) endif() - check_neon_ld4_intrinsics() - if(NEON_HAS_LD4) - add_definitions(-DARM_NEON_HASLD4) + check_xsave_intrinsics() + if(HAVE_XSAVE_INTRIN) + add_feature_info(XSAVE 1 "Support XSAVE intrinsics using \"${XSAVEFLAG}\"") + if(WITH_RUNTIME_CPU_DETECTION) + set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}") + endif() + if(NOT (CMAKE_C_COMPILER_ID MATCHES "GNU" AND 
CMAKE_C_COMPILER_VERSION VERSION_LESS 8.2)) + add_definitions(-DX86_HAVE_XSAVE_INTRIN) + endif() endif() - else() - set(WITH_NEON OFF) - endif() - endif() - if(WITH_ARMV6) - check_armv6_compiler_flag() - if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN) - add_definitions(-DARM_SIMD) - set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c) - set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}") - list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS}) - if(HAVE_ARMV6_INTRIN) - add_definitions(-DARM_SIMD_INTRIN) + if(WITH_SSE2) + check_sse2_intrinsics() + if(HAVE_SSE2_INTRIN) + add_definitions(-DX86_SSE2) + set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c) + list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS}) + if(NOT ${ARCH} MATCHES "x86_64") + set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}") + add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable") + if(FORCE_SSE2) + add_definitions(-DX86_NOCHECK_SSE2) + endif() + endif() + else() + set(WITH_SSE2 OFF) + endif() endif() - else() - set(WITH_ARMV6 OFF) - endif() - else() - set(WITH_ARMV6 OFF) - endif() - endif() - if(BASEARCH_PPC_FOUND) - # Common arch detection code - if(WITH_ALTIVEC) - check_ppc_intrinsics() - endif() - if(WITH_POWER8) - check_power8_intrinsics() - endif() - if(WITH_POWER9) - check_power9_intrinsics() - endif() - if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN) - add_definitions(-DPOWER_FEATURES) - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c) - endif() - # VMX specific options and files - if(WITH_ALTIVEC) - if(HAVE_VMX) - add_definitions(-DPPC_FEATURES) - if(HAVE_ALTIVEC) - add_definitions(-DPPC_VMX) - set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c) - list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS}) - set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}") - else() - set(WITH_ALTIVEC OFF) + if(WITH_SSSE3) + check_ssse3_intrinsics() + if(HAVE_SSSE3_INTRIN AND WITH_SSE2) + add_definitions(-DX86_SSSE3) + set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c) + add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS}) + set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}") + else() + set(WITH_SSSE3 OFF) + endif() endif() - endif() - endif() - # Power8 specific options and files - if(WITH_POWER8) - if(HAVE_POWER8_INTRIN) - add_definitions(-DPOWER8_VSX) - set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c) - if("${ARCH}" MATCHES "powerpc64(le)?") - add_definitions(-DPOWER8_VSX_CRC32) - list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c) + if(WITH_SSE42) + check_sse42_intrinsics() + if(HAVE_SSE42_INTRIN AND WITH_SSSE3) + add_definitions(-DX86_SSE42) + set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c) + add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized adler32 hash generation, using \"${SSE42FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS}) + set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}") + else() + set(WITH_SSE42 OFF) + endif() endif() - list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS}) - set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}") - else() - set(WITH_POWER8 OFF) - endif() - endif() - # Power9 specific options and files - if(WITH_POWER9) - if(HAVE_POWER9_INTRIN) - 
add_definitions(-DPOWER9) - set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c) - list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS}) - set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}") - else() - set(WITH_POWER9 OFF) - endif() - endif() - endif() - if(BASEARCH_RISCV_FOUND) - if(WITH_RVV) - check_rvv_intrinsics() - if(HAVE_RVV_INTRIN) - add_definitions(-DRISCV_FEATURES) - add_definitions(-DRISCV_RVV) - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c) - # FIXME: we will not set compile flags for riscv_features.c when - # the kernels update hwcap or hwprobe for riscv - set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c) - list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS}) - set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}") - else() - set(WITH_RVV OFF) - endif() - endif() - endif() - if(BASEARCH_X86_FOUND) - add_definitions(-DX86_FEATURES) - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c) - if(MSVC) - list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) - endif() - if(WITH_AVX2) - check_avx2_intrinsics() - if(HAVE_AVX2_INTRIN) - add_definitions(-DX86_AVX2) - set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c) - list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c) - list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c) - list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c) - list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS}) - set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}") - else() - set(WITH_AVX2 OFF) - endif() - endif() - if(WITH_AVX512) - check_avx512_intrinsics() - if(HAVE_AVX512_INTRIN) - add_definitions(-DX86_AVX512) - list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) - list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS}) - list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h) - if(HAVE_MASK_INTRIN) - add_definitions(-DX86_MASK_INTRIN) + if(WITH_PCLMULQDQ) + check_pclmulqdq_intrinsics() + if(HAVE_PCLMULQDQ_INTRIN AND WITH_SSE42) + add_definitions(-DX86_PCLMULQDQ_CRC) + set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c) + add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSE42FLAG} ${PCLMULFLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS}) + set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}") + else() + set(WITH_PCLMULQDQ OFF) + endif() endif() - set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}") - else() - set(WITH_AVX512 OFF) - endif() - endif() - if(WITH_AVX512VNNI) - check_avx512vnni_intrinsics() - if(HAVE_AVX512VNNI_INTRIN) - add_definitions(-DX86_AVX512VNNI) - list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c) - list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS}) - set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}") - else() - set(WITH_AVX512VNNI OFF) - endif() - endif() - if(WITH_SSE42) - check_sse42_intrinsics() - if(HAVE_SSE42_INTRIN) - add_definitions(-DX86_SSE42) - set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c) - list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS}) - set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}") - else() - set(WITH_SSE42 OFF) - endif() - endif() - if(WITH_SSE2) - check_sse2_intrinsics() - if(HAVE_SSE2_INTRIN) - add_definitions(-DX86_SSE2) - set(SSE2_SRCS 
${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c) - list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS}) - if(NOT ${ARCH} MATCHES "x86_64") - set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}") - add_definitions(-DX86_NOCHECK_SSE2) + if(WITH_AVX2) + check_avx2_intrinsics() + if(HAVE_AVX2_INTRIN AND WITH_SSE42) + add_definitions(-DX86_AVX2) + set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c) + add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"") + list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c) + add_feature_info(AVX2_CHUNKSET 1 "Support AVX2 optimized chunkset, using \"${AVX2FLAG}\"") + list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c) + add_feature_info(AVX2_COMPARE256 1 "Support AVX2 optimized compare256, using \"${AVX2FLAG}\"") + list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c) + add_feature_info(AVX2_ADLER32 1 "Support AVX2-accelerated adler32, using \"${AVX2FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS}) + set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX2 OFF) + endif() endif() - else() - set(WITH_SSE2 OFF) - endif() - endif() - if(WITH_SSSE3) - check_ssse3_intrinsics() - if(HAVE_SSSE3_INTRIN) - add_definitions(-DX86_SSSE3) - set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c) - list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS}) - set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}") - else() - set(WITH_SSSE3 OFF) - endif() - endif() - if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE42) - check_pclmulqdq_intrinsics() - if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN) - add_definitions(-DX86_PCLMULQDQ_CRC) - set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c) - list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS}) - set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}") - - if(WITH_VPCLMULQDQ AND WITH_AVX512) - check_vpclmulqdq_intrinsics() - if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN) - add_definitions(-DX86_VPCLMULQDQ_CRC) - set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c) - list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS}) - set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}") - else() - set(WITH_VPCLMULQDQ OFF) - endif() - else() - set(WITH_VPCLMULQDQ OFF) + if(WITH_AVX512) + check_avx512_intrinsics() + if(HAVE_AVX512_INTRIN AND WITH_AVX2) + add_definitions(-DX86_AVX512) + list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c) + add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS}) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h) + set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX512 OFF) + endif() + endif() + if(WITH_AVX512VNNI) + check_avx512vnni_intrinsics() + if(HAVE_AVX512VNNI_INTRIN AND WITH_AVX2) + add_definitions(-DX86_AVX512VNNI) + add_feature_info(AVX512VNNI_ADLER32 1 "Support AVX512VNNI adler32, using \"${AVX512VNNIFLAG}\"") + list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c) + list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS}) + set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}") + else() + set(WITH_AVX512VNNI OFF) + endif() + endif() + if(WITH_VPCLMULQDQ) + check_vpclmulqdq_intrinsics() + if(HAVE_VPCLMULQDQ_INTRIN AND 
WITH_PCLMULQDQ AND WITH_AVX512) + add_definitions(-DX86_VPCLMULQDQ_CRC) + set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c) + add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG}\"") + list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS}) + set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}") + else() + set(WITH_VPCLMULQDQ OFF) + endif() endif() - else() - set(WITH_PCLMULQDQ OFF) - set(WITH_VPCLMULQDQ OFF) - endif() - else() - set(WITH_PCLMULQDQ OFF) - set(WITH_VPCLMULQDQ OFF) endif() - check_xsave_intrinsics() - if(HAVE_XSAVE_INTRIN) - set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}") - endif() - endif() endif() +message(STATUS "Architecture-specific source files: ${ZLIB_ARCH_SRCS}") + #============================================================================ # zconf.h #============================================================================ macro(generate_cmakein input output) - file(REMOVE ${output}) - file(STRINGS ${input} _lines) - foreach(_line IN LISTS _lines) - string(REGEX REPLACE "#ifdef HAVE_UNISTD_H.*" "@ZCONF_UNISTD_LINE@" _line "${_line}") - string(REGEX REPLACE "#ifdef NEED_PTRDIFF_T.*" "@ZCONF_PTRDIFF_LINE@" _line "${_line}") - if(NEED_PTRDIFF_T) - string(REGEX REPLACE "typedef PTRDIFF_TYPE" "typedef @PTRDIFF_TYPE@" _line "${_line}") - endif() - file(APPEND ${output} "${_line}\n") - endforeach() + file(REMOVE ${output}) + file(STRINGS ${input} _lines) + foreach(_line IN LISTS _lines) + string(REGEX REPLACE "#ifdef HAVE_UNISTD_H.*" "@ZCONF_UNISTD_LINE@" _line "${_line}") + string(REGEX REPLACE "#ifdef NEED_PTRDIFF_T.*" "@ZCONF_PTRDIFF_LINE@" _line "${_line}") + if(NEED_PTRDIFF_T) + string(REGEX REPLACE "typedef PTRDIFF_TYPE" "typedef @PTRDIFF_TYPE@" _line "${_line}") + endif() + file(APPEND ${output} "${_line}\n") + endforeach() endmacro(generate_cmakein) -generate_cmakein( ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.in ${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein ) +generate_cmakein( ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.in ${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h.cmakein ) + +if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR) + # If we're doing an out of source build and the user has a zconf.h + # in their source tree... + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h) + message(STATUS "Renaming") + message(STATUS " ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h") + message(STATUS "to 'zconf${SUFFIX}.h.included' because this file is included with zlib") + message(STATUS "but CMake generates it automatically in the build directory.") + file(RENAME ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.included) + endif() + + # If we're doing an out of source build and the user has a zconf.h.cmakein + # in their source tree... + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.cmakein) + message(STATUS "Renaming") + message(STATUS " ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.cmakein") + message(STATUS "to 'zconf${SUFFIX}.h.cmakeincluded' because this file is included with zlib") + message(STATUS "but CMake generates it automatically in the build directory.") + file(RENAME ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.cmakein ${CMAKE_CURRENT_SOURCE_DIR}/zconf${SUFFIX}.h.cmakeincluded) + endif() +endif() + +# The user is allowed (but discouraged) to set absolute CMAKE_INSTALL_*DIR paths. 
+# If they do, we copy these non-relocatable paths into the pkg-config file. +if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}") + set(PC_INC_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") +else() + set(PC_INC_INSTALL_DIR "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") +endif() + +if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}") + set(PC_LIB_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}") +else() + set(PC_LIB_INSTALL_DIR "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}") +endif() #============================================================================ # zlib #============================================================================ set(ZLIB_PUBLIC_HDRS - ${CMAKE_CURRENT_BINARY_DIR}/zconf.h - ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling.h - ${CMAKE_CURRENT_BINARY_DIR}/zlib.h + ${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h + ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h + ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h ) set(ZLIB_PRIVATE_HDRS adler32_p.h chunkset_tpl.h compare256_rle.h - cpu_features.h + arch_functions.h crc32_braid_p.h crc32_braid_comb_p.h crc32_braid_tbl.h - crc32_fold.h deflate.h deflate_p.h functable.h @@ -704,15 +1098,17 @@ set(ZLIB_PRIVATE_HDRS zutil.h ) set(ZLIB_SRCS + arch/generic/adler32_c.c + arch/generic/adler32_fold_c.c + arch/generic/chunkset_c.c + arch/generic/compare256_c.c + arch/generic/crc32_braid_c.c + arch/generic/crc32_fold_c.c + arch/generic/slide_hash_c.c adler32.c - adler32_fold.c - chunkset.c - compare256.c compress.c - cpu_features.c - crc32_braid.c + crc32.c crc32_braid_comb.c - crc32_fold.c deflate.c deflate_fast.c deflate_huff.c @@ -727,12 +1123,16 @@ set(ZLIB_SRCS inftrees.c insert_string.c insert_string_roll.c - slide_hash.c trees.c uncompr.c zutil.c ) +if(WITH_RUNTIME_CPU_DETECTION) + list(APPEND ZLIB_PRIVATE_HDRS cpu_features.h) + list(APPEND ZLIB_SRCS cpu_features.c) +endif() + set(ZLIB_GZFILE_PRIVATE_HDRS gzguts.h ) @@ -743,13 +1143,124 @@ set(ZLIB_GZFILE_SRCS ) set(ZLIB_ALL_SRCS ${ZLIB_SRCS} ${ZLIB_ARCH_HDRS} ${ZLIB_ARCH_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) -list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS}) +if(WITH_GZFILEOP) + list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS}) +endif() -add_library(zlib STATIC ${ZLIB_ALL_SRCS}) +if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) + set(ZLIB_DLL_SRCS win32/zlib${SUFFIX}1.rc) +endif() -target_include_directories(zlib PUBLIC - "$" - "$") +if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS) + add_library(zlib SHARED ${ZLIB_ALL_SRCS} ${ZLIB_DLL_SRCS}) + add_library(zlibstatic STATIC ${ZLIB_ALL_SRCS}) + + set(ZLIB_INSTALL_LIBRARIES zlib zlibstatic) +else() + + if(ZLIB_BUILD_SHARED_LIBS) + add_library(zlib SHARED ${ZLIB_ALL_SRCS} ${ZLIB_DLL_SRCS}) + target_sources(zlib PRIVATE ${ZLIB_DLL_SRCS}) + else() + add_library(zlib STATIC ${ZLIB_ALL_SRCS}) + add_library(zlibstatic ALIAS zlib) + endif() + + set(ZLIB_INSTALL_LIBRARIES zlib) +endif() + +# INFO: Mimics official zlib CMake target +# Generates ZLIB.cmake in case ZLIB_COMPAT=ON and always exports the CMake target ZLIB::ZLIB +# In case ZLIB_COMPAT=OFF, the CMake target and file follows zlib-ng naming convention +if (ZLIB_COMPAT) + if (TARGET zlib) + set_target_properties(zlib PROPERTIES EXPORT_NAME ZLIB) + else() + set_target_properties(zlibstatic PROPERTIES EXPORT_NAME ZLIB) + endif() +endif() + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${ARCHDIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/arch/generic) + +foreach(ZLIB_INSTALL_LIBRARY ${ZLIB_INSTALL_LIBRARIES}) + if(NOT 
ZLIB_COMPAT) + target_compile_definitions(${ZLIB_INSTALL_LIBRARY} PUBLIC ZLIBNG_NATIVE_API) + endif() + target_include_directories(${ZLIB_INSTALL_LIBRARY} PUBLIC + "$${CMAKE_CURRENT_SOURCE_DIR}>" + "$") +endforeach() + +if(WIN32) + # Shared library + if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) + set_target_properties(zlib PROPERTIES OUTPUT_NAME zlib${SUFFIX}) + endif() + # Static library + if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS) + if(MSVC) + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME zlibstatic${SUFFIX}) + else() + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME z${SUFFIX}) + endif() + elseif(NOT ZLIB_BUILD_SHARED_LIBS) + if(MSVC) + set_target_properties(zlib PROPERTIES OUTPUT_NAME zlibstatic${SUFFIX}) + else() + set_target_properties(zlib PROPERTIES OUTPUT_NAME z${SUFFIX}) + endif() + endif() +else() + # On unix-like platforms the library is almost always called libz + set_target_properties(${ZLIB_INSTALL_LIBRARIES} PROPERTIES OUTPUT_NAME z${SUFFIX}) +endif() + +if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) + set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) + + if(ZLIB_COMPAT) + set_target_properties(zlib PROPERTIES SOVERSION 1) + else() + set_target_properties(zlib PROPERTIES SOVERSION 2) + endif() + + if(NOT CYGWIN) + # This property causes shared libraries on Linux to have the full version + # encoded into their final filename. We disable this on Cygwin because + # it causes cygz-${ZLIB_FULL_VERSION}.dll to be created when cygz.dll + # seems to be the default. + # + # This has no effect with MSVC, on that platform the version info for + # the DLL comes from the resource file win32/zlib1.rc + set_target_properties(zlib PROPERTIES VERSION ${ZLIB_FULL_VERSION}) + endif() + + if(UNIX) + if(HAVE_NO_INTERPOSITION) + set_target_properties(zlib PROPERTIES COMPILE_FLAGS "-fno-semantic-interposition") + endif() + if(NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL AIX) + if(NOT ZLIB_COMPAT) + add_definitions(-DHAVE_SYMVER) + endif() + set_target_properties(zlib PROPERTIES LINK_FLAGS + "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib${SUFFIX}.map\"") + endif() + endif() + if(MSYS) + # Suppress version number from shared library name + set(CMAKE_SHARED_LIBRARY_NAME_WITH_VERSION 0) + elseif(WIN32) + # Creates zlib1.dll when building shared library version + if(ZLIB_COMPAT) + set_target_properties(zlib PROPERTIES SUFFIX "1.dll") + else() + set_target_properties(zlib PROPERTIES SUFFIX "2.dll") + endif() + endif() +endif() if(HAVE_UNISTD_H) SET(ZCONF_UNISTD_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") @@ -757,40 +1268,143 @@ else() SET(ZCONF_UNISTD_LINE "#if 0 /* was set to #if 0 by configure/cmake/etc */") endif() if(NEED_PTRDIFF_T) - SET(ZCONF_PTRDIFF_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") + SET(ZCONF_PTRDIFF_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */") else() - SET(ZCONF_PTRDIFF_LINE "#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */") + SET(ZCONF_PTRDIFF_LINE "#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */") endif() -configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein - ${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib.h.in - ${CMAKE_CURRENT_BINARY_DIR}/zlib.h @ONLY) +set(ZLIB_PC ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.pc) +if(WITH_GZFILEOP) + set(PKG_CONFIG_CFLAGS "-DWITH_GZFILEOP") +endif() +configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h.cmakein + 
${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib${SUFFIX}.h.in + ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gzread.c.in - ${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY) + ${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling.h.empty - ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h COPYONLY) +if (NOT ZLIB_SYMBOL_PREFIX STREQUAL "") + add_feature_info(ZLIB_SYMBOL_PREFIX ON "Publicly exported symbols have a custom prefix") + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling${SUFFIX}.h.in + ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h @ONLY) +else() + add_feature_info(ZLIB_SYMBOL_PREFIX OFF "Publicly exported symbols DO NOT have a custom prefix") + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling.h.empty + ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h COPYONLY) +endif() +# add_definitions(-DZLIB_SYMBOL_PREFIX=${ZLIB_SYMBOL_PREFIX}) # not needed -ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes - -Wundef - -Wmissing-declarations -) -set_target_properties(${ZLIB_LIBRARY} PROPERTIES - OUTPUT_NAME ${ZLIB_LIBRARY} - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - COMPILE_PDB_NAME ${ZLIB_LIBRARY} - COMPILE_PDB_NAME_DEBUG "${ZLIB_LIBRARY}${OPENCV_DEBUG_POSTFIX}" - ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH} -) +if(NOT SKIP_INSTALL_LIBRARIES AND NOT SKIP_INSTALL_ALL) + install(TARGETS ${ZLIB_INSTALL_LIBRARIES} + EXPORT ${EXPORT_NAME} + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}") +endif() +if(NOT SKIP_INSTALL_HEADERS AND NOT SKIP_INSTALL_ALL) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" RENAME zlib${SUFFIX}.h) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" RENAME zlib_name_mangling${SUFFIX}.h) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" RENAME zconf${SUFFIX}.h) +endif() +if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL) + install(FILES ${ZLIB_PC} DESTINATION "${PKGCONFIG_INSTALL_DIR}") + install(EXPORT ${EXPORT_NAME} + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${EXPORT_NAME}" + NAMESPACE ${EXPORT_NAME}::) + # Use GNU-style variable names + set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}) + set(LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}) + if (ZLIB_COMPAT) + set(PACKAGE_CONFIGNAME zlib) + set(PACKAGE_VERSION ${ZLIB_HEADER_VERSION}) + else() + set(PACKAGE_CONFIGNAME zlib-ng) + set(PACKAGE_VERSION ${ZLIBNG_HEADER_VERSION}) + endif() +endif() + +#============================================================================ +# Example binaries +#============================================================================ + +if(ZLIB_ENABLE_TESTS) + enable_testing() + + if(ZLIB_BUILD_SHARED_LIBS) + if(ZLIBNG_ENABLE_TESTS) + message(STATUS "Disabling zlib-ng tests because shared libraries are enabled") + set(ZLIBNG_ENABLE_TESTS OFF) + endif() + + if(WITH_BENCHMARKS OR WITH_BENCHMARK_APPS) + message(STATUS "Disabling benchmarks because shared libraries are enabled") + set(WITH_BENCHMARKS OFF) + set(WITH_BENCHMARK_APPS OFF) + endif() + endif() + + add_subdirectory(test) +endif() + +add_feature_info(WITH_GZFILEOP WITH_GZFILEOP "Compile with support for gzFile related functions") +add_feature_info(ZLIB_COMPAT 
ZLIB_COMPAT "Compile with zlib compatible API") +add_feature_info(ZLIB_ENABLE_TESTS ZLIB_ENABLE_TESTS "Build test binaries") +add_feature_info(ZLIBNG_ENABLE_TESTS ZLIBNG_ENABLE_TESTS "Test zlib-ng specific API") +add_feature_info(WITH_SANITIZER WITH_SANITIZER "Enable sanitizer support") +add_feature_info(WITH_GTEST WITH_GTEST "Build gtest_zlib") +add_feature_info(WITH_FUZZERS WITH_FUZZERS "Build test/fuzz") +add_feature_info(WITH_BENCHMARKS WITH_BENCHMARKS "Build test/benchmarks") +add_feature_info(WITH_BENCHMARK_APPS WITH_BENCHMARK_APPS "Build application benchmarks") +add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation") +add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies") +add_feature_info(WITH_NATIVE_INSTRUCTIONS WITH_NATIVE_INSTRUCTIONS + "Instruct the compiler to use the full instruction set on this host (gcc/clang -march=native)") +add_feature_info(WITH_RUNTIME_CPU_DETECTION WITH_RUNTIME_CPU_DETECTION "Build with runtime CPU detection") +add_feature_info(WITH_MAINTAINER_WARNINGS WITH_MAINTAINER_WARNINGS "Build with project maintainer warnings") +add_feature_info(WITH_CODE_COVERAGE WITH_CODE_COVERAGE "Enable code coverage reporting") +add_feature_info(WITH_INFLATE_STRICT WITH_INFLATE_STRICT "Build with strict inflate distance checking") +add_feature_info(WITH_INFLATE_ALLOW_INVALID_DIST WITH_INFLATE_ALLOW_INVALID_DIST "Build with zero fill for inflate invalid distances") + +if(BASEARCH_ARM_FOUND) + add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE") + add_feature_info(WITH_NEON WITH_NEON "Build with NEON intrinsics") + add_feature_info(WITH_ARMV6 WITH_ARMV6 "Build with ARMv6 SIMD") +elseif(BASEARCH_PPC_FOUND) + add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations") + add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8") + add_feature_info(WITH_POWER9 WITH_POWER9 "Build with optimisations for POWER9") +elseif(BASEARCH_RISCV_FOUND) + add_feature_info(WITH_RVV WITH_RVV "Build with RVV intrinsics") +elseif(BASEARCH_S360_FOUND) + add_feature_info(WITH_DFLTCC_DEFLATE WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z") + add_feature_info(WITH_DFLTCC_INFLATE WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z") + add_feature_info(WITH_CRC32_VX WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z") +elseif(BASEARCH_X86_FOUND) + add_feature_info(WITH_AVX2 WITH_AVX2 "Build with AVX2") + add_feature_info(WITH_AVX512 WITH_AVX512 "Build with AVX512") + add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI") + add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2") + add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3") + add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42") + add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ") + add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ") +endif() + +add_feature_info(INSTALL_UTILS INSTALL_UTILS "Copy minigzip and minideflate during install") + +FEATURE_SUMMARY(WHAT ALL INCLUDE_QUIET_PACKAGES) if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${ZLIB_LIBRARY} PROPERTIES FOLDER "3rdparty") + set_target_properties(${ZLIB_INSTALL_LIBRARIES} PROPERTIES FOLDER "3rdparty") endif() if(NOT BUILD_SHARED_LIBS) - ocv_install_target(${ZLIB_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) + ocv_install_target(${ZLIB_INSTALL_LIBRARIES} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT 
dev) endif() -ocv_install_3rdparty_licenses(${ZLIB_LIBRARY} LICENSE.md) +ocv_install_3rdparty_licenses(${ZLIB_INSTALL_LIBRARIES} LICENSE.md) diff --git a/3rdparty/zlib-ng/LICENSE.md b/3rdparty/zlib-ng/LICENSE.md index adb48d4729..e866d7ac18 100644 --- a/3rdparty/zlib-ng/LICENSE.md +++ b/3rdparty/zlib-ng/LICENSE.md @@ -1,4 +1,4 @@ -(C) 1995-2013 Jean-loup Gailly and Mark Adler +(C) 1995-2024 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages diff --git a/3rdparty/zlib-ng/README.md b/3rdparty/zlib-ng/README.md index 4f9fe09c69..411621b52f 100644 --- a/3rdparty/zlib-ng/README.md +++ b/3rdparty/zlib-ng/README.md @@ -21,7 +21,6 @@ Features * Support for CPU intrinsics when available * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z - * Hash table implementation using CRC32-C intrinsics on x86 and ARM * Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX * Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX @@ -95,20 +94,21 @@ make test Build Options ------------- -| CMake | configure | Description | Default | -|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------| -| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF | -| ZLIB_ENABLE_TESTS | | Build test binaries | ON | -| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON | -| WITH_OPTIM | --without-optimizations | Build with optimisations | ON | -| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON | -| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF | -| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF | -| WITH_GTEST | | Build gtest_zlib | ON | -| WITH_FUZZERS | | Build test/fuzz | OFF | -| WITH_BENCHMARKS | | Build test/benchmarks | OFF | -| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF | -| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF | +| CMake | configure | Description | Default | +|:---------------------------|:-------------------------|:------------------------------------------------------------------------------------|---------| +| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF | +| ZLIB_ENABLE_TESTS | | Build test binaries | ON | +| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON | +| WITH_OPTIM | --without-optimizations | Build with optimisations | ON | +| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON | +| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF | +| WITH_RUNTIME_CPU_DETECTION | | Compiles with runtime CPU detection | ON | +| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF | +| WITH_GTEST | | Build gtest_zlib | ON | +| WITH_FUZZERS | | Build test/fuzz | OFF | +| WITH_BENCHMARKS | | Build test/benchmarks | OFF | +| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF | +| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF | Install diff --git a/3rdparty/zlib-ng/adler32.c 
b/3rdparty/zlib-ng/adler32.c index 95ac13c304..1a643ed53b 100644 --- a/3rdparty/zlib-ng/adler32.c +++ b/3rdparty/zlib-ng/adler32.c @@ -7,70 +7,24 @@ #include "functable.h" #include "adler32_p.h" -/* ========================================================================= */ -Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { - uint32_t sum2; - unsigned n; - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* in case user likes doing a byte at a time, keep it fast */ - if (UNLIKELY(len == 1)) - return adler32_len_1(adler, buf, sum2); - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (UNLIKELY(buf == NULL)) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (UNLIKELY(len < 16)) - return adler32_len_16(adler, buf, len, sum2); - - /* do length NMAX blocks -- requires just one modulo operation */ - while (len >= NMAX) { - len -= NMAX; -#ifdef UNROLL_MORE - n = NMAX / 16; /* NMAX is divisible by 16 */ -#else - n = NMAX / 8; /* NMAX is divisible by 8 */ -#endif - do { -#ifdef UNROLL_MORE - DO16(adler, sum2, buf); /* 16 sums unrolled */ - buf += 16; -#else - DO8(adler, sum2, buf, 0); /* 8 sums unrolled */ - buf += 8; -#endif - } while (--n); - adler %= BASE; - sum2 %= BASE; - } - - /* do remaining bytes (less than NMAX, still just one modulo) */ - return adler32_len_64(adler, buf, len, sum2); -} - #ifdef ZLIB_COMPAT unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) { - return (unsigned long)functable.adler32((uint32_t)adler, buf, len); + return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len); } #else uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) { - return functable.adler32(adler, buf, len); + return FUNCTABLE_CALL(adler32)(adler, buf, len); } #endif /* ========================================================================= */ #ifdef ZLIB_COMPAT unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) { - return (unsigned long)functable.adler32((uint32_t)adler, buf, len); + return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len); } #else uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) { - return functable.adler32(adler, buf, len); + return FUNCTABLE_CALL(adler32)(adler, buf, len); } #endif diff --git a/3rdparty/zlib-ng/adler32_fold.h b/3rdparty/zlib-ng/adler32_fold.h deleted file mode 100644 index 20aa1c7400..0000000000 --- a/3rdparty/zlib-ng/adler32_fold.h +++ /dev/null @@ -1,11 +0,0 @@ -/* adler32_fold.h -- adler32 folding interface - * Copyright (C) 2022 Adam Stylinski - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#ifndef ADLER32_FOLD_H_ -#define ADLER32_FOLD_H_ - -Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); - -#endif diff --git a/3rdparty/zlib-ng/arch/.gitignore b/3rdparty/zlib-ng/arch/.gitignore deleted file mode 100644 index 2c3af0a08c..0000000000 --- a/3rdparty/zlib-ng/arch/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# ignore Makefiles; they're all automatically generated -Makefile diff --git a/3rdparty/zlib-ng/arch/arm/Makefile.in b/3rdparty/zlib-ng/arch/arm/Makefile.in index 9d05b00b54..b6f0aaf211 100644 --- a/3rdparty/zlib-ng/arch/arm/Makefile.in +++ b/3rdparty/zlib-ng/arch/arm/Makefile.in @@ -25,7 +25,6 @@ all: \ crc32_acle.o crc32_acle.lo \ 
slide_hash_neon.o slide_hash_neon.lo \ slide_hash_armv6.o slide_hash_armv6.lo \ - insert_string_acle.o insert_string_acle.lo adler32_neon.o: $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c @@ -69,12 +68,6 @@ slide_hash_armv6.o: slide_hash_armv6.lo: $(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c -insert_string_acle.o: - $(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c - -insert_string_acle.lo: - $(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c - mostlyclean: clean clean: rm -f *.o *.lo *~ diff --git a/3rdparty/zlib-ng/arch/arm/adler32_neon.c b/3rdparty/zlib-ng/arch/arm/adler32_neon.c index f1c43ff047..8e46b38017 100644 --- a/3rdparty/zlib-ng/arch/arm/adler32_neon.c +++ b/3rdparty/zlib-ng/arch/arm/adler32_neon.c @@ -7,8 +7,8 @@ */ #ifdef ARM_NEON #include "neon_intrins.h" -#include "../../zbuild.h" -#include "../../adler32_p.h" +#include "zbuild.h" +#include "adler32_p.h" static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) { static const uint16_t ALIGNED_(16) taps[64] = { diff --git a/3rdparty/zlib-ng/arch/arm/arm_features.c b/3rdparty/zlib-ng/arch/arm/arm_features.c index a0e070ba95..d0d49764f4 100644 --- a/3rdparty/zlib-ng/arch/arm/arm_features.c +++ b/3rdparty/zlib-ng/arch/arm/arm_features.c @@ -1,4 +1,4 @@ -#include "../../zbuild.h" +#include "zbuild.h" #include "arm_features.h" #if defined(__linux__) && defined(HAVE_SYS_AUXV_H) @@ -11,6 +11,11 @@ # ifndef ID_AA64ISAR0_CRC32_VAL # define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32 # endif +#elif defined(__OpenBSD__) && defined(__aarch64__) +# include +# include +# include +# include #elif defined(__APPLE__) # if !defined(_DARWIN_C_SOURCE) # define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */ @@ -30,6 +35,16 @@ static int arm_has_crc32() { #elif defined(__FreeBSD__) && defined(__aarch64__) return getenv("QEMU_EMULATING") == NULL && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE; +#elif defined(__OpenBSD__) && defined(__aarch64__) + int hascrc32 = 0; + int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0 = 0; + size_t len = sizeof(isar0); + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE) + hascrc32 = 1; + } + return hascrc32; #elif defined(__APPLE__) int hascrc32; size_t size = sizeof(hascrc32); diff --git a/3rdparty/zlib-ng/arch/arm/arm_features.h b/3rdparty/zlib-ng/arch/arm/arm_features.h index eca078e310..d968e02fbb 100644 --- a/3rdparty/zlib-ng/arch/arm/arm_features.h +++ b/3rdparty/zlib-ng/arch/arm/arm_features.h @@ -2,8 +2,8 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef ARM_H_ -#define ARM_H_ +#ifndef ARM_FEATURES_H_ +#define ARM_FEATURES_H_ struct arm_cpu_features { int has_simd; @@ -13,4 +13,4 @@ struct arm_cpu_features { void Z_INTERNAL arm_check_features(struct arm_cpu_features *features); -#endif /* ARM_H_ */ +#endif /* ARM_FEATURES_H_ */ diff --git a/3rdparty/zlib-ng/arch/arm/arm_functions.h b/3rdparty/zlib-ng/arch/arm/arm_functions.h new file mode 100644 index 0000000000..61c682710a --- /dev/null +++ b/3rdparty/zlib-ng/arch/arm/arm_functions.h @@ -0,0 +1,65 @@ +/* arm_functions.h -- ARM implementations for arch-specific functions. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARM_FUNCTIONS_H_ +#define ARM_FUNCTIONS_H_ + +#ifdef ARM_NEON +uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t chunksize_neon(void); +uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left); + +# ifdef HAVE_BUILTIN_CTZLL +uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); +uint32_t longest_match_neon(deflate_state *const s, Pos cur_match); +uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match); +# endif +void slide_hash_neon(deflate_state *s); +void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef ARM_ACLE +uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len); +#endif + +#ifdef ARM_SIMD +void slide_hash_armv6(deflate_state *s); +#endif + + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// ARM - SIMD +# if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD) +# undef native_slide_hash +# define native_slide_hash slide_hash_armv6 +# endif +// ARM - NEON +# if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON +# undef native_adler32 +# define native_adler32 adler32_neon +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_neon +# undef native_chunksize +# define native_chunksize chunksize_neon +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_neon +# undef native_slide_hash +# define native_slide_hash slide_hash_neon +# ifdef HAVE_BUILTIN_CTZLL +# undef native_compare256 +# define native_compare256 compare256_neon +# undef native_longest_match +# define native_longest_match longest_match_neon +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_neon +# endif +# endif +// ARM - ACLE +# if defined(ARM_ACLE) && defined(__ARM_ACLE) && defined(__ARM_FEATURE_CRC32) +# undef native_crc32 +# define native_crc32 crc32_acle +# endif +#endif + +#endif /* ARM_FUNCTIONS_H_ */ diff --git a/3rdparty/zlib-ng/arch/arm/chunkset_neon.c b/3rdparty/zlib-ng/arch/arm/chunkset_neon.c index f9a444b068..1c49ef5612 100644 --- a/3rdparty/zlib-ng/arch/arm/chunkset_neon.c +++ b/3rdparty/zlib-ng/arch/arm/chunkset_neon.c @@ -4,8 +4,8 @@ #ifdef ARM_NEON #include "neon_intrins.h" -#include "../../zbuild.h" -#include "../generic/chunk_permute_table.h" +#include "zbuild.h" +#include "arch/generic/chunk_permute_table.h" typedef uint8x16_t chunk_t; diff --git a/3rdparty/zlib-ng/arch/arm/compare256_neon.c b/3rdparty/zlib-ng/arch/arm/compare256_neon.c index 7daeba411e..87d14c89c0 100644 --- a/3rdparty/zlib-ng/arch/arm/compare256_neon.c +++ b/3rdparty/zlib-ng/arch/arm/compare256_neon.c @@ -3,8 +3,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" - +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" #if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) diff --git a/3rdparty/zlib-ng/arch/arm/crc32_acle.c b/3rdparty/zlib-ng/arch/arm/crc32_acle.c index ac7d6ff66b..116bcab1c2 100644 --- a/3rdparty/zlib-ng/arch/arm/crc32_acle.c +++ b/3rdparty/zlib-ng/arch/arm/crc32_acle.c @@ -7,7 +7,7 @@ #ifdef ARM_ACLE #include "acle_intrins.h" -#include "../../zbuild.h" +#include "zbuild.h" Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) { Z_REGISTER uint32_t c; diff --git a/3rdparty/zlib-ng/arch/arm/insert_string_acle.c 
b/3rdparty/zlib-ng/arch/arm/insert_string_acle.c deleted file mode 100644 index aa8385c712..0000000000 --- a/3rdparty/zlib-ng/arch/arm/insert_string_acle.c +++ /dev/null @@ -1,24 +0,0 @@ -/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions - * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - * - */ - -#ifdef ARM_ACLE -#include "acle_intrins.h" -#include "../../zbuild.h" -#include "../../deflate.h" - -#define HASH_CALC(s, h, val) \ - h = __crc32w(0, val) - -#define HASH_CALC_VAR h -#define HASH_CALC_VAR_INIT uint32_t h = 0 - -#define UPDATE_HASH Z_TARGET_CRC update_hash_acle -#define INSERT_STRING Z_TARGET_CRC insert_string_acle -#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle - -#include "../../insert_string_tpl.h" -#endif diff --git a/3rdparty/zlib-ng/arch/arm/neon_intrins.h b/3rdparty/zlib-ng/arch/arm/neon_intrins.h index 51df77dbe6..a9e99ec88a 100644 --- a/3rdparty/zlib-ng/arch/arm/neon_intrins.h +++ b/3rdparty/zlib-ng/arch/arm/neon_intrins.h @@ -25,6 +25,13 @@ out.val[3] = vqsubq_u16(a.val[3], b); \ } while (0) +# if defined(__clang__) && defined(__arm__) && defined(__ANDROID__) +/* Clang for 32-bit Android has too strict alignment requirement (:256) for x4 NEON intrinsics */ +# undef ARM_NEON_HASLD4 +# undef vld1q_u16_x4 +# undef vld1q_u8_x4 +# undef vst1q_u16_x4 +# endif # ifndef ARM_NEON_HASLD4 diff --git a/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c b/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c index 0a2eeccf92..07f71b59eb 100644 --- a/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c +++ b/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c @@ -5,8 +5,8 @@ #if defined(ARM_SIMD) #include "acle_intrins.h" -#include "../../zbuild.h" -#include "../../deflate.h" +#include "zbuild.h" +#include "deflate.h" /* SIMD version of hash_chain rebase */ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { diff --git a/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c b/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c index a96ca11799..a601e6099a 100644 --- a/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c +++ b/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c @@ -10,8 +10,8 @@ #ifdef ARM_NEON #include "neon_intrins.h" -#include "../../zbuild.h" -#include "../../deflate.h" +#include "zbuild.h" +#include "deflate.h" /* SIMD version of hash_chain rebase */ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { diff --git a/3rdparty/zlib-ng/arch/generic/Makefile.in b/3rdparty/zlib-ng/arch/generic/Makefile.in index c717026f86..32c8242d02 100644 --- a/3rdparty/zlib-ng/arch/generic/Makefile.in +++ b/3rdparty/zlib-ng/arch/generic/Makefile.in @@ -1,5 +1,6 @@ -# Makefile for zlib +# Makefile for zlib-ng # Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# Copyright (C) 2024 Hans Kristian Rosbach # For conditions of distribution and use, see copyright notice in zlib.h CC= @@ -11,12 +12,62 @@ SRCDIR=. SRCTOP=../.. 
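This generic-arch Makefile now builds the portable scalar fallbacks (adler32_c, crc32_braid_c, compare256_c, slide_hash_c, ...) as their own objects; the first of them, adler32_c.c, reappears verbatim further down. For reference, a minimal self-contained sketch of that scalar Adler-32 with its deferred-modulo trick — the function name and test program here are illustrative, not zlib-ng code:

/* NMAX = 5552 is the largest n such that 255*n*(n+1)/2 + (n+1)*(65521-1)
 * still fits in 32 bits, so both running sums can absorb n bytes between
 * "% 65521" reductions without overflowing. */
#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define ADLER_BASE 65521u   /* largest prime below 2^16 */
#define ADLER_NMAX 5552u

static uint32_t adler32_sketch(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;          /* running sum of bytes (seeded with 1) */
    uint32_t s2 = (adler >> 16) & 0xffff;  /* running sum of the s1 values */
    while (len > 0) {
        size_t n = len < ADLER_NMAX ? len : ADLER_NMAX;
        len -= n;
        while (n--) {
            s1 += *buf++;
            s2 += s1;
        }
        s1 %= ADLER_BASE;                  /* one reduction per NMAX-byte block */
        s2 %= ADLER_BASE;
    }
    return (s2 << 16) | s1;
}

int main(void) {
    /* well-known test vector: Adler-32("Wikipedia") == 0x11E60398 */
    const uint8_t msg[] = "Wikipedia";
    assert(adler32_sketch(1, msg, sizeof(msg) - 1) == 0x11E60398);
    return 0;
}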
TOPDIR=$(SRCTOP) -all: +all: \ + adler32_c.o adler32_c.lo \ + adler32_fold_c.o adler32_fold_c.lo \ + chunkset_c.o chunkset_c.lo \ + compare256_c.o compare256_c.lo \ + crc32_braid_c.o crc32_braid_c.lo \ + crc32_fold_c.o crc32_fold_c.lo \ + slide_hash_c.o slide_hash_c.lo + + +adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c + +adler32_c.lo: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c + +adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c + +adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c + +chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c + +chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c + +compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c + +compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c + +crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c + +crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c + +crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c + +crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c + +slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c + +slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c mostlyclean: clean clean: - rm -f *.o *.lo *~ \ + rm -f *.o *.lo *~ rm -rf objs rm -f *.gcda *.gcno *.gcov diff --git a/3rdparty/zlib-ng/arch/generic/adler32_c.c b/3rdparty/zlib-ng/arch/generic/adler32_c.c new file mode 100644 index 0000000000..64258c89b4 --- /dev/null +++ b/3rdparty/zlib-ng/arch/generic/adler32_c.c @@ -0,0 +1,54 @@ +/* adler32.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011, 2016 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "functable.h" +#include "adler32_p.h" + +/* ========================================================================= */ +Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { + uint32_t sum2; + unsigned n; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return 
adler32_len_1(adler, buf, sum2); + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (UNLIKELY(buf == NULL)) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_len_16(adler, buf, len, sum2); + + /* do length NMAX blocks -- requires just one modulo operation */ + while (len >= NMAX) { + len -= NMAX; +#ifdef UNROLL_MORE + n = NMAX / 16; /* NMAX is divisible by 16 */ +#else + n = NMAX / 8; /* NMAX is divisible by 8 */ +#endif + do { +#ifdef UNROLL_MORE + DO16(adler, sum2, buf); /* 16 sums unrolled */ + buf += 16; +#else + DO8(adler, sum2, buf, 0); /* 8 sums unrolled */ + buf += 8; +#endif + } while (--n); + adler %= BASE; + sum2 %= BASE; + } + + /* do remaining bytes (less than NMAX, still just one modulo) */ + return adler32_len_64(adler, buf, len, sum2); +} diff --git a/3rdparty/zlib-ng/adler32_fold.c b/3rdparty/zlib-ng/arch/generic/adler32_fold_c.c similarity index 83% rename from 3rdparty/zlib-ng/adler32_fold.c rename to 3rdparty/zlib-ng/arch/generic/adler32_fold_c.c index e2f6f9ac7d..397dd10400 100644 --- a/3rdparty/zlib-ng/adler32_fold.c +++ b/3rdparty/zlib-ng/arch/generic/adler32_fold_c.c @@ -5,12 +5,11 @@ #include "zbuild.h" #include "functable.h" -#include "adler32_fold.h" #include Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - adler = functable.adler32(adler, src, len); + adler = FUNCTABLE_CALL(adler32)(adler, src, len); memcpy(dst, src, len); return adler; } diff --git a/3rdparty/zlib-ng/chunkset.c b/3rdparty/zlib-ng/arch/generic/chunkset_c.c similarity index 100% rename from 3rdparty/zlib-ng/chunkset.c rename to 3rdparty/zlib-ng/arch/generic/chunkset_c.c diff --git a/3rdparty/zlib-ng/compare256.c b/3rdparty/zlib-ng/arch/generic/compare256_c.c similarity index 99% rename from 3rdparty/zlib-ng/compare256.c rename to 3rdparty/zlib-ng/arch/generic/compare256_c.c index 82551cdd57..0c12cb3a4e 100644 --- a/3rdparty/zlib-ng/compare256.c +++ b/3rdparty/zlib-ng/arch/generic/compare256_c.c @@ -5,6 +5,7 @@ #include "zbuild.h" #include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" /* ALIGNED, byte comparison */ diff --git a/3rdparty/zlib-ng/crc32_braid.c b/3rdparty/zlib-ng/arch/generic/crc32_braid_c.c similarity index 79% rename from 3rdparty/zlib-ng/crc32_braid.c rename to 3rdparty/zlib-ng/arch/generic/crc32_braid_c.c index 96754b53df..7d8028f6d7 100644 --- a/3rdparty/zlib-ng/crc32_braid.c +++ b/3rdparty/zlib-ng/arch/generic/crc32_braid_c.c @@ -8,43 +8,9 @@ */ #include "zbuild.h" -#include "zutil.h" -#include "functable.h" #include "crc32_braid_p.h" #include "crc32_braid_tbl.h" -/* ========================================================================= */ - -const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) { - return (const uint32_t *)crc_table; -} - -#ifdef ZLIB_COMPAT -unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) { - if (buf == NULL) return 0; - - return (unsigned long)functable.crc32((uint32_t)crc, buf, len); -} -#else -uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) { - if (buf == NULL) return 0; - - return functable.crc32(crc, buf, len); -} -#endif - -#ifdef ZLIB_COMPAT -unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) { - return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len); -} -#else -uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, 
uint32_t len) { - return PREFIX(crc32_z)(crc, buf, len); -} -#endif - -/* ========================================================================= */ - /* A CRC of a message is computed on N braids of words in the message, where each word consists of W bytes (4 or 8). If N is 3, for example, then three @@ -66,24 +32,6 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t level. Your mileage may vary. */ -/* ========================================================================= */ - -#if BYTE_ORDER == LITTLE_ENDIAN -# define ZSWAPWORD(word) (word) -# define BRAID_TABLE crc_braid_table -#elif BYTE_ORDER == BIG_ENDIAN -# if W == 8 -# define ZSWAPWORD(word) ZSWAP64(word) -# elif W == 4 -# define ZSWAPWORD(word) ZSWAP32(word) -# endif -# define BRAID_TABLE crc_braid_big_table -#else -# error "No endian defined" -#endif -#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8) -#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 - /* ========================================================================= */ #ifdef W /* @@ -112,7 +60,7 @@ static z_word_t crc_word(z_word_t data) { /* ========================================================================= */ Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) { - Z_REGISTER uint32_t c; + uint32_t c; /* Pre-condition the CRC */ c = (~crc) & 0xffffffff; diff --git a/3rdparty/zlib-ng/crc32_fold.c b/3rdparty/zlib-ng/arch/generic/crc32_fold_c.c similarity index 86% rename from 3rdparty/zlib-ng/crc32_fold.c rename to 3rdparty/zlib-ng/arch/generic/crc32_fold_c.c index 5b3c7c459f..43930e97c6 100644 --- a/3rdparty/zlib-ng/crc32_fold.c +++ b/3rdparty/zlib-ng/arch/generic/crc32_fold_c.c @@ -3,11 +3,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ #include "zbuild.h" +#include "zutil.h" #include "functable.h" - -#include "crc32_fold.h" - -#include +#include "crc32.h" Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) { crc->value = CRC32_INITIAL_VALUE; @@ -15,7 +13,7 @@ Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) { } Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { - crc->value = functable.crc32(crc->value, src, len); + crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len); memcpy(dst, src, len); } @@ -25,7 +23,7 @@ Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, ui * same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The * init_crc is an unused argument in this context */ Z_UNUSED(init_crc); - crc->value = functable.crc32(crc->value, src, len); + crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len); } Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) { diff --git a/3rdparty/zlib-ng/arch/generic/generic_functions.h b/3rdparty/zlib-ng/arch/generic/generic_functions.h new file mode 100644 index 0000000000..997dd4d01e --- /dev/null +++ b/3rdparty/zlib-ng/arch/generic/generic_functions.h @@ -0,0 +1,106 @@ +/* generic_functions.h -- generic C implementations for arch-specific functions. 
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef GENERIC_FUNCTIONS_H_
+#define GENERIC_FUNCTIONS_H_
+
+#include "zendian.h"
+
+Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
+Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
+
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+
+
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
+typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
+typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
+
+uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
+
+uint32_t chunksize_c(void);
+uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
+
+uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
+
+uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
+# ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
+# endif
+# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+    uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
+# endif
+#endif
+
+typedef void (*slide_hash_func)(deflate_state *s);
+
+void slide_hash_c(deflate_state *s);
+
+uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
+# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+    uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
+# ifdef HAVE_BUILTIN_CTZ
+    uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
+# endif
+# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+    uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
+# endif
+# endif
+
+uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
+# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+    uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
+# ifdef UNALIGNED64_OK
+    uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
+# endif
+# endif
+
+
+// Select generic implementation for longest_match, longest_match_slow and compare256 functions.
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN +# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) +# define longest_match_generic longest_match_unaligned_64 +# define longest_match_slow_generic longest_match_slow_unaligned_64 +# define compare256_generic compare256_unaligned_64 +# elif defined(HAVE_BUILTIN_CTZ) +# define longest_match_generic longest_match_unaligned_32 +# define longest_match_slow_generic longest_match_slow_unaligned_32 +# define compare256_generic compare256_unaligned_32 +# else +# define longest_match_generic longest_match_unaligned_16 +# define longest_match_slow_generic longest_match_slow_unaligned_16 +# define compare256_generic compare256_unaligned_16 +# endif +#else +# define longest_match_generic longest_match_c +# define longest_match_slow_generic longest_match_slow_c +# define compare256_generic compare256_c +#endif + + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// Generic code +# define native_adler32 adler32_c +# define native_adler32_fold_copy adler32_fold_copy_c +# define native_chunkmemset_safe chunkmemset_safe_c +# define native_chunksize chunksize_c +# define native_crc32 PREFIX(crc32_braid) +# define native_crc32_fold crc32_fold_c +# define native_crc32_fold_copy crc32_fold_copy_c +# define native_crc32_fold_final crc32_fold_final_c +# define native_crc32_fold_reset crc32_fold_reset_c +# define native_inflate_fast inflate_fast_c +# define native_slide_hash slide_hash_c +# define native_longest_match longest_match_generic +# define native_longest_match_slow longest_match_slow_generic +# define native_compare256 compare256_generic +#endif + +#endif diff --git a/3rdparty/zlib-ng/slide_hash.c b/3rdparty/zlib-ng/arch/generic/slide_hash_c.c similarity index 96% rename from 3rdparty/zlib-ng/slide_hash.c rename to 3rdparty/zlib-ng/arch/generic/slide_hash_c.c index b9fbbdb69f..8345b9e36b 100644 --- a/3rdparty/zlib-ng/slide_hash.c +++ b/3rdparty/zlib-ng/arch/generic/slide_hash_c.c @@ -1,6 +1,6 @@ /* slide_hash.c -- slide hash table C implementation * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git a/3rdparty/zlib-ng/arch/power/chunkset_power8.c b/3rdparty/zlib-ng/arch/power/chunkset_power8.c index 7cbb8029b3..aef1973273 100644 --- a/3rdparty/zlib-ng/arch/power/chunkset_power8.c +++ b/3rdparty/zlib-ng/arch/power/chunkset_power8.c @@ -4,7 +4,7 @@ #ifdef POWER8_VSX #include -#include "../../zbuild.h" +#include "zbuild.h" typedef vector unsigned char chunk_t; diff --git a/3rdparty/zlib-ng/arch/power/compare256_power9.c b/3rdparty/zlib-ng/arch/power/compare256_power9.c index 9b0ddaf800..c8be498e4f 100644 --- a/3rdparty/zlib-ng/arch/power/compare256_power9.c +++ b/3rdparty/zlib-ng/arch/power/compare256_power9.c @@ -5,8 +5,10 @@ #ifdef POWER9 #include -#include "../../zbuild.h" -#include "../../zendian.h" +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" +#include "zendian.h" /* Older versions of GCC misimplemented semantics for these bit counting builtins. 
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */ diff --git a/3rdparty/zlib-ng/arch/power/power_features.c b/3rdparty/zlib-ng/arch/power/power_features.c index f73503734b..4939d1c18f 100644 --- a/3rdparty/zlib-ng/arch/power/power_features.c +++ b/3rdparty/zlib-ng/arch/power/power_features.c @@ -1,16 +1,19 @@ /* power_features.c - POWER feature check * Copyright (C) 2020 Matheus Castanho , IBM - * Copyright (C) 2021-2022 Mika T. Lindqvist + * Copyright (C) 2021-2024 Mika T. Lindqvist * For conditions of distribution and use, see copyright notice in zlib.h */ #ifdef HAVE_SYS_AUXV_H # include #endif +#ifdef POWER_NEED_AUXVEC_H +# include +#endif #ifdef __FreeBSD__ # include #endif -#include "../../zbuild.h" +#include "zbuild.h" #include "power_features.h" void Z_INTERNAL power_check_features(struct power_cpu_features *features) { diff --git a/3rdparty/zlib-ng/arch/power/power_features.h b/3rdparty/zlib-ng/arch/power/power_features.h index 9252364cc4..1ff51de5dd 100644 --- a/3rdparty/zlib-ng/arch/power/power_features.h +++ b/3rdparty/zlib-ng/arch/power/power_features.h @@ -4,8 +4,8 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef POWER_H_ -#define POWER_H_ +#ifndef POWER_FEATURES_H_ +#define POWER_FEATURES_H_ struct power_cpu_features { int has_altivec; @@ -15,4 +15,4 @@ struct power_cpu_features { void Z_INTERNAL power_check_features(struct power_cpu_features *features); -#endif /* POWER_H_ */ +#endif /* POWER_FEATURES_H_ */ diff --git a/3rdparty/zlib-ng/arch/power/power_functions.h b/3rdparty/zlib-ng/arch/power/power_functions.h new file mode 100644 index 0000000000..cb6b7650ec --- /dev/null +++ b/3rdparty/zlib-ng/arch/power/power_functions.h @@ -0,0 +1,67 @@ +/* power_functions.h -- POWER implementations for arch-specific functions. + * Copyright (C) 2020 Matheus Castanho , IBM + * Copyright (C) 2021 Mika T. 
Lindqvist + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef POWER_FUNCTIONS_H_ +#define POWER_FUNCTIONS_H_ + +#ifdef PPC_VMX +uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len); +void slide_hash_vmx(deflate_state *s); +#endif + +#ifdef POWER8_VSX +uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t chunksize_power8(void); +uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left); +uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len); +void slide_hash_power8(deflate_state *s); +void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef POWER9 +uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1); +uint32_t longest_match_power9(deflate_state *const s, Pos cur_match); +uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match); +#endif + + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// Power - VMX +# if defined(PPC_VMX) && defined(__ALTIVEC__) +# undef native_adler32 +# define native_adler32 adler32_vmx +# undef native_slide_hash +# define native_slide_hash slide_hash_vmx +# endif +// Power8 - VSX +# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__) +# undef native_adler32 +# define native_adler32 adler32_power8 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_power8 +# undef native_chunksize +# define native_chunksize chunksize_power8 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_power8 +# undef native_slide_hash +# define native_slide_hash slide_hash_power8 +# endif +# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__) +# undef native_crc32 +# define native_crc32 crc32_power8 +# endif +// Power9 +# if defined(POWER9) && defined(_ARCH_PWR9) +# undef native_compare256 +# define native_compare256 compare256_power9 +# undef native_longest_match +# define native_longest_match longest_match_power9 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_power9 +# endif +#endif + +#endif /* POWER_FUNCTIONS_H_ */ diff --git a/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c b/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c index da46f37e73..d0f9aaa567 100644 --- a/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c +++ b/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c @@ -9,8 +9,8 @@ #include #include -#include "../../zbuild.h" -#include "../../adler32_p.h" +#include "zbuild.h" +#include "adler32_p.h" static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) { /* split Adler-32 into component sums */ diff --git a/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c b/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c index 0fd6082c44..3d6c3e3aa5 100644 --- a/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c +++ b/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c @@ -6,7 +6,9 @@ #ifdef RISCV_RVV -#include "../../zbuild.h" +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" #include diff --git a/3rdparty/zlib-ng/arch/riscv/riscv_features.c b/3rdparty/zlib-ng/arch/riscv/riscv_features.c index b066f427e0..1e3f45e0a7 100644 --- a/3rdparty/zlib-ng/arch/riscv/riscv_features.c +++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.c @@ -1,10 +1,13 @@ #include #include #include -#include #include -#include "../../zbuild.h" +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# include +#endif + +#include "zbuild.h" #include "riscv_features.h" 
#define ISA_V_HWCAP (1 << ('v' - 'a')) @@ -33,7 +36,11 @@ void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *fea } void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) { +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) unsigned long hw_cap = getauxval(AT_HWCAP); +#else + unsigned long hw_cap = 0; +#endif features->has_rvv = hw_cap & ISA_V_HWCAP; } diff --git a/3rdparty/zlib-ng/arch/riscv/riscv_features.h b/3rdparty/zlib-ng/arch/riscv/riscv_features.h index c76e967c36..b1593acc25 100644 --- a/3rdparty/zlib-ng/arch/riscv/riscv_features.h +++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.h @@ -6,8 +6,8 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef RISCV_H_ -#define RISCV_H_ +#ifndef RISCV_FEATURES_H_ +#define RISCV_FEATURES_H_ struct riscv_cpu_features { int has_rvv; @@ -15,4 +15,4 @@ struct riscv_cpu_features { void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features); -#endif /* RISCV_H_ */ +#endif /* RISCV_FEATURES_H_ */ diff --git a/3rdparty/zlib-ng/arch/riscv/riscv_functions.h b/3rdparty/zlib-ng/arch/riscv/riscv_functions.h new file mode 100644 index 0000000000..015b2fbd75 --- /dev/null +++ b/3rdparty/zlib-ng/arch/riscv/riscv_functions.h @@ -0,0 +1,49 @@ +/* riscv_functions.h -- RISCV implementations for arch-specific functions. + * + * Copyright (C) 2023 SiFive, Inc. All rights reserved. + * Contributed by Alex Chiang + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef RISCV_FUNCTIONS_H_ +#define RISCV_FUNCTIONS_H_ + +#ifdef RISCV_RVV +uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t chunksize_rvv(void); +uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left); +uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1); + +uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match); +uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match); +void slide_hash_rvv(deflate_state *s); +void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// RISCV - RVV +# if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__) +# undef native_adler32 +# define native_adler32 adler32_rvv +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_rvv +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_rvv +# undef native_chunksize +# define native_chunksize chunksize_rvv +# undef native_compare256 +# define native_compare256 compare256_rvv +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_rvv +# undef native_longest_match +# define native_longest_match longest_match_rvv +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_rvv +# undef native_slide_hash +# define native_slide_hash slide_hash_rvv +# endif +#endif + +#endif /* RISCV_FUNCTIONS_H_ */ diff --git a/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c b/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c index 1164e89ba2..ac28bbd9f2 100644 --- a/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c +++ b/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c @@ -8,18 +8,16 @@ #include -#include "../../zbuild.h" -#include "../../deflate.h" +#include "zbuild.h" +#include "deflate.h" static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { 
size_t vl; while (entries > 0) { vl = __riscv_vsetvl_e16m4(entries); vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl); - vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl); - vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl); - v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl); - __riscv_vse16_v_u16m4(table, v_tab, vl); + vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl); + __riscv_vse16_v_u16m4(table, v_diff, vl); table += vl, entries -= vl; } } diff --git a/3rdparty/zlib-ng/arch/s390/Makefile.in b/3rdparty/zlib-ng/arch/s390/Makefile.in new file mode 100644 index 0000000000..e994157df2 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/Makefile.in @@ -0,0 +1,48 @@ +# Makefile for zlib-ng +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= +VGFMAFLAG= +NOLTOFLAG= + +SRCDIR=. +SRCTOP=../.. +TOPDIR=$(SRCTOP) + +s390_features.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c + +s390_features.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c + +dfltcc_deflate.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c + +dfltcc_deflate.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c + +dfltcc_inflate.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c + +dfltcc_inflate.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c + +crc32-vx.o: + $(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c + +crc32-vx.lo: + $(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/3rdparty/zlib-ng/arch/s390/README.md b/3rdparty/zlib-ng/arch/s390/README.md new file mode 100644 index 0000000000..7b383cc998 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/README.md @@ -0,0 +1,277 @@ +# Introduction + +This directory contains SystemZ deflate hardware acceleration support. +It can be enabled using the following build commands: + + $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate + $ make + +or + + $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 . + $ make + +When built like this, zlib-ng would compress using hardware on level 1, +and using software on all other levels. Decompression will always happen +in hardware. In order to enable hardware compression for levels 1-6 +(i.e. to make it used by default) one could add +`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng. + +SystemZ deflate hardware acceleration is available on [IBM z15]( +https://www.ibm.com/products/z15) and newer machines under the name [ +"Integrated Accelerator for zEnterprise Data Compression"]( +https://www.ibm.com/support/z-content-solutions/compression/). The +programming interface to it is a machine instruction called DEFLATE +CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles +of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both +the code and the rest of this document refer to this feature simply as +"DFLTCC". + +# Performance + +Performance figures are published [here]( +https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine +). The compression speed-up can be as high as 110x and the decompression +speed-up can be as high as 15x. 
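
To make the level-mask convention above concrete, here is a minimal illustrative sketch (not part of the patch): each bit of `DFLTCC_LEVEL_MASK` selects one compression level, matching the `(level_mask & (1 << level))` test in `dfltcc_can_deflate_with_params()` later in this patch. The default mask `0x2` covers level 1 only, which is why hardware compression is used only on level 1 unless `0x7e` (bits 1-6) is set.

```
/* Illustrative sketch only: how a DFLTCC_LEVEL_MASK value maps to
 * compression levels. Mirrors the (level_mask & (1 << level)) check
 * in dfltcc_can_deflate_with_params(). */
#include <stdio.h>

int main(void) {
    unsigned level_mask = 0x7e; /* bits 1..6 set: hardware for levels 1-6 */
    for (int level = 0; level <= 9; level++)
        printf("level %d -> %s\n", level,
               (level_mask & (1u << level)) ? "DFLTCC" : "software");
    return 0;
}
```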
+
+# Limitations
+
+Two DFLTCC compression calls with identical inputs are not guaranteed to
+produce identical outputs. Therefore, care should be taken when using
+hardware compression if reproducible results are desired. In particular,
+the zlib-ng-specific `zng_deflateSetParams` call allows setting the
+`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
+particular stream.
+
+DFLTCC does not support every zlib-ng feature, in particular:
+
+* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
+* `inflateMark()`
+* `inflatePrime()`
+* `inflateSyncPoint()`
+
+When used, these functions will either switch to software or, in case
+this is not possible, fail gracefully.
+
+# Code structure
+
+All SystemZ-specific code lives in the `arch/s390` directory and is
+integrated with the rest of zlib-ng using hook macros.
+
+## Hook macros
+
+DFLTCC takes as arguments a parameter block, an input buffer, an output
+buffer, and a window. Parameter blocks are stored alongside zlib states;
+buffers are forwarded from the caller; and the window, which must be
+4k-aligned and is always 64k in size, is managed using the
+`PAD_WINDOW()`, `WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW`,
+`DEFLATE_ADJUST_WINDOW_SIZE()` and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.
+
+Software and hardware window formats do not match; therefore,
+`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
+and `inflateGetDictionary()` need special handling, which is triggered using
+the `DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
+`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
+
+`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
+parameter block using the `DEFLATE_RESET_KEEP_HOOK()` and
+`INFLATE_RESET_KEEP_HOOK()` macros.
+
+The `INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
+`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
+calls fail gracefully.
+
+`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
+software compression mid-stream using `deflateParams()`. Switching
+normally entails flushing the current block, which might not be possible
+in low-memory situations. `deflateParams()` uses the `DEFLATE_DONE()`
+hook in order to detect and gracefully handle such situations.
+
+The algorithm implemented in hardware has a different compression ratio
+than the one implemented in software. The `DEFLATE_BOUND_ADJUST_COMPLEN()`
+and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
+return the correct results for the hardware implementation.
+
+Actual compression and decompression are handled by the `DEFLATE_HOOK()`
+and `INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages
+the window on its own, calling `updatewindow()` is suppressed using the
+`INFLATE_NEED_UPDATEWINDOW()` macro.
+
+In addition to compression, DFLTCC computes CRC-32 and Adler-32
+checksums; therefore, whenever it is used, software checksumming is
+suppressed using the `DEFLATE_NEED_CHECKSUM()` and
+`INFLATE_NEED_CHECKSUM()` macros.
+
+While software always produces reproducible compression results, this
+is not the case for DFLTCC. Therefore, zlib-ng users are given the
+ability to specify whether or not reproducible compression results
+are required. While it is always possible to specify this setting
+before the compression begins, it is not always possible to do so in
+the middle of a deflate stream; the exact conditions for that are
+determined by the `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
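
As a usage sketch for the reproducibility setting above (not part of the patch; the `zng_deflate_param_value` layout and the `Z_DEFLATE_REPRODUCIBLE` constant are assumed from zlib-ng's public header and should be verified there), a caller could disable DFLTCC for a single stream like this:

```
/* Sketch, assuming the zlib-ng public API: request reproducible output
 * for one stream, which in turn disables DFLTCC for that stream. */
#include "zlib-ng.h"

static int make_reproducible(zng_stream *strm) {
    int on = 1;
    zng_deflate_param_value p;
    p.param = Z_DEFLATE_REPRODUCIBLE; /* assumed parameter id */
    p.buf = &on;                      /* parameter value buffer */
    p.size = sizeof(on);
    return zng_deflateSetParams(strm, &p, 1);
}
```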
+
+## SystemZ-specific code
+
+When zlib-ng is built with DFLTCC, the hooks described above are
+converted to calls to functions, which are implemented in the
+`arch/s390/dfltcc_*` files. The functions can be grouped into three
+broad categories:
+
+* Base DFLTCC support, e.g. `dfltcc()`, which wraps the machine instruction.
+* Translating between software and hardware data formats, e.g.
+  `dfltcc_deflate_set_dictionary()`.
+* Translating between software and hardware state machines, e.g.
+  `dfltcc_deflate()` and `dfltcc_inflate()`.
+
+The functions from the first two categories are fairly simple; however,
+various quirks in both the software and hardware state machines make the
+functions from the third category quite complicated.
+
+### `dfltcc_deflate()` function
+
+This function is called by `deflate()` and has the following
+responsibilities:
+
+* Checking whether DFLTCC can be used with the current stream. If this
+  is not the case, it returns `0`, making `deflate()` use some other
+  function in order to compress in software. Otherwise it returns `1`.
+* Block management and Huffman table generation. DFLTCC ends blocks only
+  when explicitly instructed to do so by the software. Furthermore,
+  whether to use fixed or dynamic Huffman tables must also be determined
+  by the software. Since looking at the data in order to gather
+  statistics would negate the performance benefits, the following
+  approach is used: the first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are
+  placed into a fixed block, and each subsequent `DFLTCC_BLOCK_SIZE`
+  bytes are placed into dynamic blocks.
+* Writing EOBS. The Block Closing Control bit in the parameter block
+  instructs DFLTCC to write the EOBS; however, certain conditions need
+  to be met: the input data length must be non-zero or the Continuation
+  Flag must be set. Put simply, DFLTCC will silently refuse to write the
+  EOBS if that is the only thing it is asked to do. Since the code has
+  to be able to emit the EOBS in software anyway, Block Closing Control
+  is never used, which avoids tricky corner cases. Whether to write the
+  EOBS is instead controlled by the `soft_bcc` variable.
+* Triggering block post-processing. Depending on the flush mode,
+  `deflate()` must perform various additional actions when a block or a
+  stream ends. `dfltcc_deflate()` informs `deflate()` about this using
+  the `block_state *result` parameter.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `wrap` and Check Value Type, or
+  `bi_valid` and Sub-Byte Boundary. Certain fields cannot be translated
+  and must persist untouched in the parameter block between calls, for
+  example, the Continuation Flag or the Continuation-State Buffer.
+* Handling flush modes and low-memory situations. These aspects are
+  quite intertwined and pervasive. The general idea here is that when
+  the Continuation Flag is set, the code must not do anything in
+  software, whether explicitly (e.g. by calling `send_eobs()`) or
+  implicitly (by returning to `deflate()` with certain return and
+  `*result` values).
+* Ending streams. When a new block is started and the flush mode is
+  `Z_FINISH`, the Block Header Final parameter block bit is used to mark
+  this block as final. However, sometimes an empty final block is
+  needed, and, unfortunately, just like with the EOBS, DFLTCC will
+  silently refuse to write it. The general idea of the DFLTCC
+  implementation is to rely as much as possible on the existing code, so
+  here the code pretends that it does not support DFLTCC, which makes
+  `deflate()` call a software compression function, which writes an
+  empty final block. Whether this is required is controlled by the
+  `need_empty_block` variable.
+* Error handling. This is simply a matter of converting the
+  Operation-Ending-Supplemental Code to a string. Errors can only happen
+  due to things like memory corruption, and therefore they don't affect
+  the `deflate()` return code.
+
+### `dfltcc_inflate()` function
+
+This function is called by `inflate()` from the `TYPEDO` state (that is,
+when all the metadata is parsed and the stream is positioned at the type
+bits of the deflate block header) and is responsible for the following:
+
+* Falling back to software when the flush mode is `Z_BLOCK` or
+  `Z_TREES`. Unfortunately, there is no way to ask DFLTCC to stop
+  decompressing on a block or tree boundary.
+* `inflate()` decompression loop management. This is controlled using
+  the return value, which can be either `DFLTCC_INFLATE_BREAK` or
+  `DFLTCC_INFLATE_CONTINUE`.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `whave` and History Length, or `wnext`
+  and History Offset.
+* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
+  and is controlled by the `last` state field.
+* Error handling. As with deflate, error handling comprises converting
+  the Operation-Ending-Supplemental Code to a string. Unlike deflate,
+  errors may happen due to bad input; therefore, they are propagated to
+  `inflate()` by setting the `mode` field to `MEM` or `BAD`.
+
+# Testing
+
+Given the complexity of the DFLTCC machine instruction, it is not clear
+whether QEMU TCG will ever support it. At the time of writing, one has
+to have access to an IBM z15+ VM or LPAR in order to test DFLTCC
+support. Since DFLTCC is a non-privileged instruction, neither a special
+VM/LPAR configuration nor root access is required.
+
+zlib-ng CI uses an IBM-provided z15 self-hosted builder for DFLTCC
+testing. There is no official IBM Z GitHub Actions runner, so we build
+one inspired by `anup-kodlekere/gaplib`. Future updates to
+actions-runner might need an updated patch. The .NET version-number
+patch has been split into a separate file to avoid having to change the
+patch constantly.
+
+## Configuring the builder.
+
+### Install prerequisites.
+
+```
+sudo dnf install podman
+```
+
+### Add actions-runner service.
+
+```
+sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
+sudo systemctl daemon-reload
+```
+
+### Create a config file; needs a GitHub personal access token.
+
+```
+# Create file /etc/actions-runner
+repo=/
+access_token=
+```
+
+The access token should have the repo scope; consult
+https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
+for details.
+
+### Autostart actions-runner.
+
+```
+$ sudo systemctl enable --now actions-runner
+```
+
+## Rebuilding the container
+
+In order to update the `gaplib-actions-runner` podman container, e.g. to
+get the latest OS security fixes, follow these steps:
+```
+# Stop actions-runner service
+sudo systemctl stop actions-runner
+
+# Delete old container
+sudo podman container rm gaplib-actions-runner
+
+# Delete old image
+sudo podman image rm localhost/zlib-ng/actions-runner
+
+# Build image
+sudo podman build --squash -f Dockerfile.zlib-ng --tag zlib-ng/actions-runner --build-arg .
+ +# Build container +sudo podman create --name=gaplib-actions-runner --env-file=/etc/actions-runner --init --interactive --volume=actions-runner-temp:/home/actions-runner zlib-ng/actions-runner + +# Start actions-runner service +sudo systemctl start actions-runner +``` diff --git a/3rdparty/zlib-ng/arch/s390/crc32-vx.c b/3rdparty/zlib-ng/arch/s390/crc32-vx.c new file mode 100644 index 0000000000..b3dcbf7030 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/crc32-vx.c @@ -0,0 +1,222 @@ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32 checksums. + * + * This CRC-32 implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). + * + * This code was originally written by Hendrik Brueckner + * for use in the Linux kernel and has been + * relicensed under the zlib license. + */ + +#include "zbuild.h" +#include "arch_functions.h" + +#include + +typedef unsigned char uv16qi __attribute__((vector_size(16))); +typedef unsigned int uv4si __attribute__((vector_size(16))); +typedef unsigned long long uv2di __attribute__((vector_size(16))); + +static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) { + /* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 + * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 + * R3 = [(x128+32 mod P'(x) << 32)]' << 1 + * R4 = [(x128-32 mod P'(x) << 32)]' << 1 + * R5 = [(x64 mod P'(x) << 32)]' << 1 + * R6 = [(x32 mod P'(x) << 32)]' << 1 + * + * The bitreflected Barret reduction constant, u', is defined as + * the bit reversal of floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + */ + const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */ + const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */ + const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */ + const uv2di r5 = {0, 0x163CD6124}; /* R5 */ + const uv2di ru_poly = {0, 0x1F7011641}; /* u' */ + const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */ + + /* + * Load the initial CRC value. + * + * The CRC value is loaded into the rightmost word of the + * vector register and is later XORed with the LSB portion + * of the loaded input data. 
+ */ + uv2di v0 = {0, 0}; + v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3); + + /* Load a 64-byte data chunk and XOR with CRC */ + uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be); + uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be); + uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be); + uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be); + + v1 ^= v0; + buf += 64; + len -= 64; + + while (len >= 64) { + /* Load the next 64-byte data chunk */ + uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be); + uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be); + uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be); + uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be); + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate result + * is then folded (accumulated) with the next data chunk in PART1 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. + */ + v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1); + v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2); + v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3); + v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4); + + buf += 64; + len -= 64; + } + + /* + * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 + * and R4 and accumulating the next 128-bit chunk until a single 128-bit + * value remains. + */ + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2); + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3); + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4); + + while (len >= 16) { + /* Load next data chunk */ + v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be); + + /* Fold next data chunk */ + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2); + + buf += 16; + len -= 16; + } + + /* + * Set up a vector register for byte shifts. The shift value must + * be loaded in bits 1-4 in byte element 7 of a vector register. + * Shift by 8 bytes: 0x40 + * Shift by 4 bytes: 0x20 + */ + uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + v9 = vec_insert((unsigned char)0x40, v9, 7); + + /* + * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes + * to move R4 into the rightmost doubleword and set the leftmost + * doubleword to 0x1. + */ + v0 = vec_srb(r4r3, (uv2di)v9); + v0[0] = 1; + + /* + * Compute GF(2) product of V1 and V0. The rightmost doubleword + * of V1 is multiplied with R4. The leftmost doubleword of V1 is + * multiplied by 0x1 and is then XORed with rightmost product. + * Implicitly, the intermediate leftmost product becomes padded + */ + v1 = (uv2di)vec_gfmsum_128(v0, v1); + + /* + * Now do the final 32-bit fold by multiplying the rightmost word + * in V1 with R5 and XOR the result with the remaining bits in V1. + * + * To achieve this by a single VGFMAG, right shift V1 by a word + * and store the result in V2 which is then accumulated. Use the + * vector unpack instruction to load the rightmost half of the + * doubleword into the rightmost doubleword element of V1; the other + * half is loaded in the leftmost doubleword. + * The vector register with CONST_R5 contains the R5 constant in the + * rightmost doubleword and the leftmost doubleword is zero to ignore + * the leftmost product of V1. 
+ */ + v9 = vec_insert((unsigned char)0x20, v9, 7); + v2 = vec_srb(v1, (uv2di)v9); + v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */ + v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2); + + /* + * Apply a Barret reduction to compute the final 32-bit CRC value. + * + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: The leftmost doubleword of vector register containing + * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product + * is zero and does not contribute to the final result. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + v2 = vec_unpackl((uv4si)v1); + v2 = (uv2di)vec_gfmsum_128(ru_poly, v2); + + /* + * Compute the GF(2) product of the CRC polynomial with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is stored in word element 2 of V2. + */ + v2 = vec_unpackl((uv4si)v2); + v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1); + + return ((uv4si)v2)[2]; +} + +#define VX_MIN_LEN 64 +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) { + size_t prealign, aligned, remaining; + + if (len < VX_MIN_LEN + VX_ALIGN_MASK) + return PREFIX(crc32_braid)(crc, buf, len); + + if ((uintptr_t)buf & VX_ALIGN_MASK) { + prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK); + len -= prealign; + crc = PREFIX(crc32_braid)(crc, buf, prealign); + buf += prealign; + } + aligned = len & ~VX_ALIGN_MASK; + remaining = len & VX_ALIGN_MASK; + + crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff; + + if (remaining) + crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining); + + return crc; +} diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_common.h b/3rdparty/zlib-ng/arch/s390/dfltcc_common.h new file mode 100644 index 0000000000..a6527ab5df --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_common.h @@ -0,0 +1,119 @@ +#ifndef DFLTCC_COMMON_H +#define DFLTCC_COMMON_H + +#include "zutil.h" + +/* + Parameter Block for Query Available Functions. + */ +struct dfltcc_qaf_param { + char fns[16]; + char reserved1[8]; + char fmts[2]; + char reserved2[6]; +} ALIGNED_(8); + +/* + Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand. 
+ */ +struct dfltcc_param_v0 { + uint16_t pbvn; /* Parameter-Block-Version Number */ + uint8_t mvn; /* Model-Version Number */ + uint8_t ribm; /* Reserved for IBM use */ + uint32_t reserved32 : 31; + uint32_t cf : 1; /* Continuation Flag */ + uint8_t reserved64[8]; + uint32_t nt : 1; /* New Task */ + uint32_t reserved129 : 1; + uint32_t cvt : 1; /* Check Value Type */ + uint32_t reserved131 : 1; + uint32_t htt : 1; /* Huffman-Table Type */ + uint32_t bcf : 1; /* Block-Continuation Flag */ + uint32_t bcc : 1; /* Block Closing Control */ + uint32_t bhf : 1; /* Block Header Final */ + uint32_t reserved136 : 1; + uint32_t reserved137 : 1; + uint32_t dhtgc : 1; /* DHT Generation Control */ + uint32_t reserved139 : 5; + uint32_t reserved144 : 5; + uint32_t sbb : 3; /* Sub-Byte Boundary */ + uint8_t oesc; /* Operation-Ending-Supplemental Code */ + uint32_t reserved160 : 12; + uint32_t ifs : 4; /* Incomplete-Function Status */ + uint16_t ifl; /* Incomplete-Function Length */ + uint8_t reserved192[8]; + uint8_t reserved256[8]; + uint8_t reserved320[4]; + uint16_t hl; /* History Length */ + uint32_t reserved368 : 1; + uint16_t ho : 15; /* History Offset */ + uint32_t cv; /* Check Value */ + uint32_t eobs : 15; /* End-of-block Symbol */ + uint32_t reserved431: 1; + uint8_t eobl : 4; /* End-of-block Length */ + uint32_t reserved436 : 12; + uint32_t reserved448 : 4; + uint16_t cdhtl : 12; /* Compressed-Dynamic-Huffman Table + Length */ + uint8_t reserved464[6]; + uint8_t cdht[288]; /* Compressed-Dynamic-Huffman Table */ + uint8_t reserved[24]; + uint8_t ribm2[8]; /* Reserved for IBM use */ + uint8_t csb[1152]; /* Continuation-State Buffer */ +} ALIGNED_(8); + +/* + Extension of inflate_state and deflate_state. + */ +struct dfltcc_state { + struct dfltcc_param_v0 param; /* Parameter block. */ + struct dfltcc_qaf_param af; /* Available functions. */ + char msg[64]; /* Buffer for strm->msg */ +}; + +typedef struct { + struct dfltcc_state common; + uint16_t level_mask; /* Levels on which to use DFLTCC */ + uint32_t block_size; /* New block each X bytes */ + size_t block_threshold; /* New block after total_in > X */ + uint32_t dht_threshold; /* New block only if avail_in >= X */ +} arch_deflate_state; + +typedef struct { + struct dfltcc_state common; +} arch_inflate_state; + +/* + History buffer size. + */ +#define HB_BITS 15 +#define HB_SIZE (1 << HB_BITS) + +/* + Sizes of deflate block parts. + */ +#define DFLTCC_BLOCK_HEADER_BITS 3 +#define DFLTCC_HLITS_COUNT_BITS 5 +#define DFLTCC_HDISTS_COUNT_BITS 5 +#define DFLTCC_HCLENS_COUNT_BITS 4 +#define DFLTCC_MAX_HCLENS 19 +#define DFLTCC_HCLEN_BITS 3 +#define DFLTCC_MAX_HLITS 286 +#define DFLTCC_MAX_HDISTS 30 +#define DFLTCC_MAX_HLIT_HDIST_BITS 7 +#define DFLTCC_MAX_SYMBOL_BITS 16 +#define DFLTCC_MAX_EOBS_BITS 15 +#define DFLTCC_MAX_PADDING_BITS 7 + +#define DEFLATE_BOUND_COMPLEN(source_len) \ + ((DFLTCC_BLOCK_HEADER_BITS + \ + DFLTCC_HLITS_COUNT_BITS + \ + DFLTCC_HDISTS_COUNT_BITS + \ + DFLTCC_HCLENS_COUNT_BITS + \ + DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \ + (DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \ + (source_len) * DFLTCC_MAX_SYMBOL_BITS + \ + DFLTCC_MAX_EOBS_BITS + \ + DFLTCC_MAX_PADDING_BITS) >> 3) + +#endif diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c new file mode 100644 index 0000000000..90b4b96e9c --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c @@ -0,0 +1,383 @@ +/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. 
*/ + +/* + Use the following commands to build zlib-ng with DFLTCC compression support: + + $ ./configure --with-dfltcc-deflate + or + + $ cmake -DWITH_DFLTCC_DEFLATE=1 . + + and then + + $ make +*/ + +#include "zbuild.h" +#include "deflate.h" +#include "trees_emit.h" +#include "dfltcc_deflate.h" +#include "dfltcc_detail.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + arch_deflate_state *dfltcc_state = &state->arch; + + dfltcc_reset_state(&dfltcc_state->common); + + /* Initialize tuning parameters */ + dfltcc_state->level_mask = DFLTCC_LEVEL_MASK; + dfltcc_state->block_size = DFLTCC_BLOCK_SIZE; + dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE; + dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE; +} + +static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy, + int reproducible) { + deflate_state *state = (deflate_state *)strm->state; + arch_deflate_state *dfltcc_state = &state->arch; + + /* Unsupported compression settings */ + if ((dfltcc_state->level_mask & (1 << level)) == 0) + return 0; + if (window_bits != HB_BITS) + return 0; + if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY) + return 0; + if (reproducible) + return 0; + + /* Unsupported hardware */ + if (!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_GDHT) || + !is_bit_set(dfltcc_state->common.af.fns, DFLTCC_CMPR) || + !is_bit_set(dfltcc_state->common.af.fmts, DFLTCC_FMT0)) + return 0; + + return 1; +} + +int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + + return dfltcc_can_deflate_with_params(strm, state->level, state->w_bits, state->strategy, state->reproducible); +} + +static inline void dfltcc_gdht(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + size_t avail_in = strm->avail_in; + + dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL); +} + +static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + size_t avail_in = strm->avail_in; + size_t avail_out = strm->avail_out; + dfltcc_cc cc; + + cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR, + param, &strm->next_out, &avail_out, + &strm->next_in, &avail_in, state->window); + strm->total_in += (strm->avail_in - avail_in); + strm->total_out += (strm->avail_out - avail_out); + strm->avail_in = avail_in; + strm->avail_out = avail_out; + return cc; +} + +static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) { + deflate_state *state = (deflate_state *)strm->state; + + send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid); + PREFIX(flush_pending)(strm); + if (state->pending != 0) { + /* The remaining data is located in pending_out[0:pending]. If someone + * calls put_byte() - this might happen in deflate() - the byte will be + * placed into pending_buf[pending], which is incorrect. Move the + * remaining data to the beginning of pending_buf so that put_byte() is + * usable again. 
+ */ + memmove(state->pending_buf, state->pending_out, state->pending); + state->pending_out = state->pending_buf; + } +#ifdef ZLIB_DEBUG + state->compressed_len += param->eobl; +#endif +} + +int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) { + deflate_state *state = (deflate_state *)strm->state; + arch_deflate_state *dfltcc_state = &state->arch; + struct dfltcc_param_v0 *param = &dfltcc_state->common.param; + uInt masked_avail_in; + dfltcc_cc cc; + int need_empty_block; + int soft_bcc; + int no_flush; + + if (!PREFIX(dfltcc_can_deflate)(strm)) { + /* Clear history. */ + if (flush == Z_FULL_FLUSH) + param->hl = 0; + return 0; + } + +again: + masked_avail_in = 0; + soft_bcc = 0; + no_flush = flush == Z_NO_FLUSH; + + /* No input data. Return, except when Continuation Flag is set, which means + * that DFLTCC has buffered some output in the parameter block and needs to + * be called again in order to flush it. + */ + if (strm->avail_in == 0 && !param->cf) { + /* A block is still open, and the hardware does not support closing + * blocks without adding data. Thus, close it manually. + */ + if (!no_flush && param->bcf) { + send_eobs(strm, param); + param->bcf = 0; + } + /* Let one of deflate_* functions write a trailing empty block. */ + if (flush == Z_FINISH) + return 0; + /* Clear history. */ + if (flush == Z_FULL_FLUSH) + param->hl = 0; + /* Trigger block post-processing if necessary. */ + *result = no_flush ? need_more : block_done; + return 1; + } + + /* There is an open non-BFINAL block, we are not going to close it just + * yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see + * more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new + * DHT in order to adapt to a possibly changed input data distribution. + */ + if (param->bcf && no_flush && + strm->total_in > dfltcc_state->block_threshold && + strm->avail_in >= dfltcc_state->dht_threshold) { + if (param->cf) { + /* We need to flush the DFLTCC buffer before writing the + * End-of-block Symbol. Mask the input data and proceed as usual. + */ + masked_avail_in += strm->avail_in; + strm->avail_in = 0; + no_flush = 0; + } else { + /* DFLTCC buffer is empty, so we can manually write the + * End-of-block Symbol right away. + */ + send_eobs(strm, param); + param->bcf = 0; + dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size; + } + } + + /* No space for compressed data. If we proceed, dfltcc_cmpr() will return + * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still + * set BCF=1, which is wrong. Avoid complications and return early. + */ + if (strm->avail_out == 0) { + *result = need_more; + return 1; + } + + /* The caller gave us too much data. Pass only one block worth of + * uncompressed data to DFLTCC and mask the rest, so that on the next + * iteration we start a new block. + */ + if (no_flush && strm->avail_in > dfltcc_state->block_size) { + masked_avail_in += (strm->avail_in - dfltcc_state->block_size); + strm->avail_in = dfltcc_state->block_size; + } + + /* When we have an open non-BFINAL deflate block and caller indicates that + * the stream is ending, we need to close an open deflate block and open a + * BFINAL one. + */ + need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf; + + /* Translate stream to parameter block */ + param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32; + if (!no_flush) + /* We need to close a block. 
Always do this in software - when there is + * no input data, the hardware will not honor BCC. */ + soft_bcc = 1; + if (flush == Z_FINISH && !param->bcf) + /* We are about to open a BFINAL block, set Block Header Final bit + * until the stream ends. + */ + param->bhf = 1; + /* DFLTCC-CMPR will write to next_out, so make sure that buffers with + * higher precedence are empty. + */ + Assert(state->pending == 0, "There must be no pending bytes"); + Assert(state->bi_valid < 8, "There must be less than 8 pending bits"); + param->sbb = (unsigned int)state->bi_valid; + if (param->sbb > 0) + *strm->next_out = (unsigned char)state->bi_buf; + /* Honor history and check value */ + param->nt = 0; + if (state->wrap == 1) + param->cv = strm->adler; + else if (state->wrap == 2) + param->cv = ZSWAP32(state->crc_fold.value); + + /* When opening a block, choose a Huffman-Table Type */ + if (!param->bcf) { + if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0)) + param->htt = HTT_FIXED; + else { + param->htt = HTT_DYNAMIC; + dfltcc_gdht(strm); + } + } + + /* Deflate */ + do { + cc = dfltcc_cmpr(strm); + if (strm->avail_in < 4096 && masked_avail_in > 0) + /* We are about to call DFLTCC with a small input buffer, which is + * inefficient. Since there is masked data, there will be at least + * one more DFLTCC call, so skip the current one and make the next + * one handle more data. + */ + break; + } while (cc == DFLTCC_CC_AGAIN); + + /* Translate parameter block to stream */ + strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc); + state->bi_valid = param->sbb; + if (state->bi_valid == 0) + state->bi_buf = 0; /* Avoid accessing next_out */ + else + state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1); + if (state->wrap == 1) + strm->adler = param->cv; + else if (state->wrap == 2) + state->crc_fold.value = ZSWAP32(param->cv); + + /* Unmask the input data */ + strm->avail_in += masked_avail_in; + masked_avail_in = 0; + + /* If we encounter an error, it means there is a bug in DFLTCC call */ + Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG"); + + /* Update Block-Continuation Flag. It will be used to check whether to call + * GDHT the next time. + */ + if (cc == DFLTCC_CC_OK) { + if (soft_bcc) { + send_eobs(strm, param); + param->bcf = 0; + dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size; + } else + param->bcf = 1; + if (flush == Z_FINISH) { + if (need_empty_block) + /* Make the current deflate() call also close the stream */ + return 0; + else { + bi_windup(state); + *result = finish_done; + } + } else { + if (flush == Z_FULL_FLUSH) + param->hl = 0; /* Clear history */ + *result = flush == Z_NO_FLUSH ? need_more : block_done; + } + } else { + param->bcf = 1; + *result = need_more; + } + if (strm->avail_in != 0 && strm->avail_out != 0) + goto again; /* deflate() must use all input or all output */ + return 1; +} + +/* + Switching between hardware and software compression. + + DFLTCC does not support all zlib settings, e.g. generation of non-compressed + blocks or alternative window sizes. When such settings are applied on the + fly with deflateParams, we need to convert between hardware and software + window formats. 
+*/ +static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + return strm->total_in > 0 || param->nt == 0 || param->hl > 0; +} + +int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) { + deflate_state *state = (deflate_state *)strm->state; + int could_deflate = PREFIX(dfltcc_can_deflate)(strm); + int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible); + + if (can_deflate == could_deflate) + /* We continue to work in the same mode - no changes needed */ + return Z_OK; + + if (!dfltcc_was_deflate_used(strm)) + /* DFLTCC was not used yet - no changes needed */ + return Z_OK; + + /* For now, do not convert between window formats - simply get rid of the old data instead */ + *flush = Z_FULL_FLUSH; + return Z_OK; +} + +int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + /* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might + * close the block without resetting the compression state. Detect this + * situation and return that deflation is not done. + */ + if (flush == Z_FULL_FLUSH && strm->avail_out == 0) + return 0; + + /* Return that deflation is not done if DFLTCC is used and either it + * buffered some data (Continuation Flag is set), or has not written EOBS + * yet (Block-Continuation Flag is set). + */ + return !PREFIX(dfltcc_can_deflate)(strm) || (!param->cf && !param->bcf); +} + +int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) { + deflate_state *state = (deflate_state *)strm->state; + + return reproducible != state->reproducible && !dfltcc_was_deflate_used(strm); +} + +/* + Preloading history. 
+*/ +int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + append_history(param, state->window, dictionary, dict_length); + state->strstart = 1; /* Add FDICT to zlib header */ + state->block_start = state->strstart; /* Make deflate_stored happy */ + return Z_OK; +} + +int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + if (dictionary) + get_history(param, state->window, dictionary); + if (dict_length) + *dict_length = param->hl; + return Z_OK; +} diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h new file mode 100644 index 0000000000..35e2fd3f62 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h @@ -0,0 +1,58 @@ +#ifndef DFLTCC_DEFLATE_H +#define DFLTCC_DEFLATE_H + +#include "deflate.h" +#include "dfltcc_common.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp)); +int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result); +int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush); +int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush); +int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible); +int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length); +int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length); + +#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \ + do { \ + if (PREFIX(dfltcc_can_deflate)((strm))) \ + return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \ + } while (0) + +#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \ + do { \ + if (PREFIX(dfltcc_can_deflate)((strm))) \ + return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \ + } while (0) + +#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state) + +#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \ + do { \ + int err; \ +\ + err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \ + if (err == Z_STREAM_ERROR) \ + return err; \ + } while (0) + +#define DEFLATE_DONE PREFIX(dfltcc_deflate_done) + +#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \ + do { \ + if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \ + (complen) = DEFLATE_BOUND_COMPLEN(source_len); \ + } while (0) + +#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm))) + +#define DEFLATE_HOOK PREFIX(dfltcc_deflate) + +#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm))) + +#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible) + +#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE) + +#endif diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h b/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h new file mode 100644 index 0000000000..ae6001ba38 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_detail.h @@ -0,0 +1,275 @@ +#include "zbuild.h" 
+#include + +#ifdef HAVE_SYS_SDT_H +#include +#endif + +/* + Tuning parameters. + */ +#ifndef DFLTCC_LEVEL_MASK +#define DFLTCC_LEVEL_MASK 0x2 +#endif +#ifndef DFLTCC_BLOCK_SIZE +#define DFLTCC_BLOCK_SIZE 1048576 +#endif +#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE +#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096 +#endif +#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE +#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096 +#endif +#ifndef DFLTCC_RIBM +#define DFLTCC_RIBM 0 +#endif + +#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1] + +#define DFLTCC_SIZEOF_QAF 32 +static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf); + +static inline int is_bit_set(const char *bits, int n) { + return bits[n / 8] & (1 << (7 - (n % 8))); +} + +static inline void clear_bit(char *bits, int n) { + bits[n / 8] &= ~(1 << (7 - (n % 8))); +} + +#define DFLTCC_FACILITY 151 + +static inline int is_dfltcc_enabled(void) { + uint64_t facilities[(DFLTCC_FACILITY / 64) + 1]; + Z_REGISTER uint8_t r0 __asm__("r0"); + + memset(facilities, 0, sizeof(facilities)); + r0 = sizeof(facilities) / sizeof(facilities[0]) - 1; + /* STFLE is supported since z9-109 and only in z/Architecture mode. When + * compiling with -m31, gcc defaults to ESA mode, however, since the kernel + * is 64-bit, it's always z/Architecture mode at runtime. + */ + __asm__ volatile( +#ifndef __clang__ + ".machinemode push\n" + ".machinemode zarch\n" +#endif + "stfle %[facilities]\n" +#ifndef __clang__ + ".machinemode pop\n" +#endif + : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc"); + return is_bit_set((const char *)facilities, DFLTCC_FACILITY); +} + +#define DFLTCC_FMT0 0 + +#define CVT_CRC32 0 +#define CVT_ADLER32 1 +#define HTT_FIXED 0 +#define HTT_DYNAMIC 1 + +#define DFLTCC_SIZEOF_GDHT_V0 384 +#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536 +static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0); +static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0); + +static inline z_const char *oesc_msg(char *buf, int oesc) { + if (oesc == 0x00) + return NULL; /* Successful completion */ + else { + sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc); + return buf; + } +} + +/* + C wrapper for the DEFLATE CONVERSION CALL instruction. + */ +typedef enum { + DFLTCC_CC_OK = 0, + DFLTCC_CC_OP1_TOO_SHORT = 1, + DFLTCC_CC_OP2_TOO_SHORT = 2, + DFLTCC_CC_OP2_CORRUPT = 2, + DFLTCC_CC_AGAIN = 3, +} dfltcc_cc; + +#define DFLTCC_QAF 0 +#define DFLTCC_GDHT 1 +#define DFLTCC_CMPR 2 +#define DFLTCC_XPND 4 +#define HBT_CIRCULAR (1 << 7) +#define DFLTCC_FN_MASK ((1 << 7) - 1) + +/* Return lengths of high (starting at param->ho) and low (starting at 0) fragments of the circular history buffer. */ +static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) { + *hl_high = MIN(param->hl, HB_SIZE - param->ho); + *hl_low = param->hl - *hl_high; +} + +/* Notify instrumentation about an upcoming read/write access to the circular history buffer. */ +static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) { + size_t hl_high, hl_low; + + get_history_lengths(param, &hl_high, &hl_low); + instrument_read_write(hist + param->ho, hl_high); + instrument_read_write(hist, hl_low); +} + +/* Notify MSan about a completed write to the circular history buffer. 
*/ +static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) { + size_t hl_high, hl_low; + + get_history_lengths(param, &hl_high, &hl_low); + __msan_unpoison(hist + param->ho, hl_high); + __msan_unpoison(hist, hl_low); +} + +static inline dfltcc_cc dfltcc(int fn, void *param, + unsigned char **op1, size_t *len1, + z_const unsigned char **op2, size_t *len2, void *hist) { + unsigned char *t2 = op1 ? *op1 : NULL; + unsigned char *orig_t2 = t2; + size_t t3 = len1 ? *len1 : 0; + z_const unsigned char *t4 = op2 ? *op2 : NULL; + size_t t5 = len2 ? *len2 : 0; + Z_REGISTER int r0 __asm__("r0"); + Z_REGISTER void *r1 __asm__("r1"); + Z_REGISTER unsigned char *r2 __asm__("r2"); + Z_REGISTER size_t r3 __asm__("r3"); + Z_REGISTER z_const unsigned char *r4 __asm__("r4"); + Z_REGISTER size_t r5 __asm__("r5"); + int cc; + + /* Insert pre-instrumentation for DFLTCC. */ + switch (fn & DFLTCC_FN_MASK) { + case DFLTCC_QAF: + instrument_write(param, DFLTCC_SIZEOF_QAF); + break; + case DFLTCC_GDHT: + instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0); + instrument_read(t4, t5); + break; + case DFLTCC_CMPR: + case DFLTCC_XPND: + instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0); + instrument_read(t4, t5); + instrument_write(t2, t3); + instrument_read_write_hist(param, hist); + break; + } + + r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5; + __asm__ volatile( +#ifdef HAVE_SYS_SDT_H + STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5)) +#endif + ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n" +#ifdef HAVE_SYS_SDT_H + STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5)) +#endif + "ipm %[cc]\n" + : [r2] "+r" (r2) + , [r3] "+r" (r3) + , [r4] "+r" (r4) + , [r5] "+r" (r5) + , [cc] "=r" (cc) + : [r0] "r" (r0) + , [r1] "r" (r1) + , [hist] "r" (hist) +#ifdef HAVE_SYS_SDT_H + , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist) +#endif + : "cc", "memory"); + t2 = r2; t3 = r3; t4 = r4; t5 = r5; + + /* Insert post-instrumentation for DFLTCC. */ + switch (fn & DFLTCC_FN_MASK) { + case DFLTCC_QAF: + __msan_unpoison(param, DFLTCC_SIZEOF_QAF); + break; + case DFLTCC_GDHT: + __msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0); + break; + case DFLTCC_CMPR: + __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0); + __msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 
0 : 1)); + msan_unpoison_hist(param, hist); + break; + case DFLTCC_XPND: + __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0); + __msan_unpoison(orig_t2, t2 - orig_t2); + msan_unpoison_hist(param, hist); + break; + } + + if (op1) + *op1 = t2; + if (len1) + *len1 = t3; + if (op2) + *op2 = t4; + if (len2) + *len2 = t5; + return (cc >> 28) & 3; +} + +#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1)) + +static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) { + /* Initialize available functions */ + if (is_dfltcc_enabled()) { + dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL); + memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af)); + } else + memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af)); + + /* Initialize parameter block */ + memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param)); + dfltcc_state->param.nt = 1; + dfltcc_state->param.ribm = DFLTCC_RIBM; +} + +static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) { + memcpy(dst, src, ALIGN_UP(size, 8) + extension_size); +} + +static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history, + const unsigned char *buf, uInt count) { + size_t offset; + size_t n; + + /* Do not use more than 32K */ + if (count > HB_SIZE) { + buf += count - HB_SIZE; + count = HB_SIZE; + } + offset = (param->ho + param->hl) % HB_SIZE; + if (offset + count <= HB_SIZE) + /* Circular history buffer does not wrap - copy one chunk */ + memcpy(history + offset, buf, count); + else { + /* Circular history buffer wraps - copy two chunks */ + n = HB_SIZE - offset; + memcpy(history + offset, buf, n); + memcpy(history, buf + n, count - n); + } + n = param->hl + count; + if (n <= HB_SIZE) + /* All history fits into buffer - no need to discard anything */ + param->hl = n; + else { + /* History does not fit into buffer - discard extra bytes */ + param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE; + param->hl = HB_SIZE; + } +} + +static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history, + unsigned char *buf) { + size_t hl_high, hl_low; + + get_history_lengths(param, &hl_high, &hl_low); + memcpy(buf, history + param->ho, hl_high); + memcpy(buf + hl_high, history, hl_low); +} diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c new file mode 100644 index 0000000000..cc3cb39781 --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c @@ -0,0 +1,191 @@ +/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */ + +/* + Use the following commands to build zlib-ng with DFLTCC decompression support: + + $ ./configure --with-dfltcc-inflate + or + + $ cmake -DWITH_DFLTCC_INFLATE=1 . 
+ + and then + + $ make +*/ + +#include "zbuild.h" +#include "zutil.h" +#include "inftrees.h" +#include "inflate.h" +#include "dfltcc_inflate.h" +#include "dfltcc_detail.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + + dfltcc_reset_state(&state->arch.common); +} + +int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = &state->arch.common; + + /* Unsupported hardware */ + return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0); +} + +static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + size_t avail_in = strm->avail_in; + size_t avail_out = strm->avail_out; + dfltcc_cc cc; + + cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR, + param, &strm->next_out, &avail_out, + &strm->next_in, &avail_in, state->window); + strm->avail_in = avail_in; + strm->avail_out = avail_out; + return cc; +} + +dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = &state->arch.common; + struct dfltcc_param_v0 *param = &dfltcc_state->param; + dfltcc_cc cc; + + if (flush == Z_BLOCK || flush == Z_TREES) { + /* DFLTCC does not support stopping on block boundaries */ + if (PREFIX(dfltcc_inflate_disable)(strm)) { + *ret = Z_STREAM_ERROR; + return DFLTCC_INFLATE_BREAK; + } else + return DFLTCC_INFLATE_SOFTWARE; + } + + if (state->last) { + if (state->bits != 0) { + strm->next_in++; + strm->avail_in--; + state->bits = 0; + } + state->mode = CHECK; + return DFLTCC_INFLATE_CONTINUE; + } + + if (strm->avail_in == 0 && !param->cf) + return DFLTCC_INFLATE_BREAK; + + /* if window not in use yet, initialize */ + if (state->wsize == 0) + state->wsize = 1U << state->wbits; + + /* Translate stream to parameter block */ + param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32; + param->sbb = state->bits; + if (param->hl) + param->nt = 0; /* Honor history for the first block */ + if (state->wrap & 4) + param->cv = state->flags ? ZSWAP32(state->check) : state->check; + + /* Inflate */ + do { + cc = dfltcc_xpnd(strm); + } while (cc == DFLTCC_CC_AGAIN); + + /* Translate parameter block to stream */ + strm->msg = oesc_msg(dfltcc_state->msg, param->oesc); + state->last = cc == DFLTCC_CC_OK; + state->bits = param->sbb; + if (state->wrap & 4) + strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv; + if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) { + /* Report an error if stream is corrupted */ + state->mode = BAD; + return DFLTCC_INFLATE_CONTINUE; + } + state->mode = TYPEDO; + /* Break if operands are exhausted, otherwise continue looping */ + return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ? + DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE; +} + +int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + + return !state->arch.common.param.nt; +} + +/* + Rotates a circular buffer. 
+ The implementation is based on https://cplusplus.com/reference/algorithm/rotate/ + */ +static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) { + unsigned char *p = pivot; + unsigned char tmp; + + while (p != start) { + tmp = *start; + *start = *p; + *p = tmp; + + start++; + p++; + + if (p == end) + p = pivot; + else if (start == pivot) + pivot = p; + } +} + +int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = &state->arch.common; + struct dfltcc_param_v0 *param = &dfltcc_state->param; + + if (!PREFIX(dfltcc_can_inflate)(strm)) + return 0; + if (PREFIX(dfltcc_was_inflate_used)(strm)) + /* DFLTCC has already decompressed some data. Since there is not + * enough information to resume decompression in software, the call + * must fail. + */ + return 1; + /* DFLTCC was not used yet - decompress in software */ + memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af)); + /* Convert the window from the hardware to the software format */ + rotate(state->window, state->window + param->ho, state->window + HB_SIZE); + state->whave = state->wnext = MIN(param->hl, state->wsize); + return 0; +} + +/* + Preloading history. +*/ +int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + /* if window not in use yet, initialize */ + if (state->wsize == 0) + state->wsize = 1U << state->wbits; + + append_history(param, state->window, dictionary, dict_length); + state->havedict = 1; + return Z_OK; +} + +int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm, + unsigned char *dictionary, uInt *dict_length) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + if (dictionary && state->window) + get_history(param, state->window, dictionary); + if (dict_length) + *dict_length = param->hl; + return Z_OK; +} diff --git a/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h new file mode 100644 index 0000000000..3623f8ed7f --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h @@ -0,0 +1,67 @@ +#ifndef DFLTCC_INFLATE_H +#define DFLTCC_INFLATE_H + +#include "dfltcc_common.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm); +typedef enum { + DFLTCC_INFLATE_CONTINUE, + DFLTCC_INFLATE_BREAK, + DFLTCC_INFLATE_SOFTWARE, +} dfltcc_inflate_action; +dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret); +int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length); +int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm, + unsigned char *dictionary, uInt* dict_length); + +#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state) + +#define INFLATE_PRIME_HOOK(strm, bits, value) \ + do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0) + +#define INFLATE_TYPEDO_HOOK(strm, flush) \ + if (PREFIX(dfltcc_can_inflate)((strm))) { \ 
+        dfltcc_inflate_action action; \
+\
+        RESTORE(); \
+        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
+        LOAD(); \
+        if (action == DFLTCC_INFLATE_CONTINUE) \
+            break; \
+        else if (action == DFLTCC_INFLATE_BREAK) \
+            goto inf_leave; \
+    }
+
+#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+#define INFLATE_MARK_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
+    } while (0)
+
+#define INFLATE_SYNC_POINT_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
+    } while (0)
+
+#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/s390/s390_features.c b/3rdparty/zlib-ng/arch/s390/s390_features.c
new file mode 100644
index 0000000000..629025d5bb
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/s390/s390_features.c
@@ -0,0 +1,14 @@
+#include "zbuild.h"
+#include "s390_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+#  include <sys/auxv.h>
+#endif
+
+#ifndef HWCAP_S390_VXRS
+#define HWCAP_S390_VXRS HWCAP_S390_VX
+#endif
+
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
+    features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
+}
diff --git a/3rdparty/zlib-ng/arch/s390/s390_features.h b/3rdparty/zlib-ng/arch/s390/s390_features.h
new file mode 100644
index 0000000000..fb2ac14b26
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/s390/s390_features.h
@@ -0,0 +1,14 @@
+/* s390_features.h -- check for s390 features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FEATURES_H_
+#define S390_FEATURES_H_
+
+struct s390_cpu_features {
+    int has_vx;
+};
+
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/s390/s390_functions.h b/3rdparty/zlib-ng/arch/s390/s390_functions.h
new file mode 100644
index 0000000000..e9c67978f0
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/s390/s390_functions.h
@@ -0,0 +1,20 @@
+/* s390_functions.h -- s390 implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FUNCTIONS_H_
+#define S390_FUNCTIONS_H_
+
+#ifdef S390_CRC32_VX
+uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
+#endif
+
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+# if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
+# undef native_crc32
+# define native_crc32 crc32_s390_vx
+# endif
+#endif
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
new file mode 100644
index 0000000000..cf5c3e7271
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
@@ -0,0 +1,47 @@
+# Self-Hosted IBM Z Github Actions Runner.
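Before the runner container definition continues below, note that the s390_features pair above is the entire runtime-detection surface for the vector CRC-32: a single HWCAP bit. A sketch of how a dispatcher might consume it (the selector and fallback names here are illustrative, not zlib-ng's actual dispatch table):

    /* Sketch: runtime selection keyed off s390_check_features().
     * crc32_portable stands in for the generic braid implementation;
     * select_crc32() is illustrative, not zlib-ng's real dispatcher. */
    #include <stddef.h>
    #include <stdint.h>
    #include "zbuild.h"
    #include "s390_features.h"

    uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);  /* built when S390_CRC32_VX */
    uint32_t crc32_portable(uint32_t crc, const uint8_t *buf, size_t len); /* hypothetical fallback */

    typedef uint32_t (*crc32_func)(uint32_t, const uint8_t *, size_t);

    static crc32_func select_crc32(void) {
        struct s390_cpu_features features;
        /* Sets has_vx from getauxval(AT_HWCAP) & HWCAP_S390_VXRS. */
        s390_check_features(&features);
        return features.has_vx ? crc32_s390_vx : crc32_portable;
    }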
+ +FROM almalinux:9 + +RUN dnf update -y -q && \ + dnf install -y -q --enablerepo=crb wget git which sudo jq \ + cmake make automake autoconf m4 libtool ninja-build python3-pip \ + gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \ + glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel + +RUN dnf install -y -q dotnet-sdk-6.0 && \ + echo "Using SDK - `dotnet --version`" + +COPY runner-s390x.patch /tmp/runner.patch +COPY runner-global.json /tmp/global.json + +RUN cd /tmp && \ + git clone -q https://github.com/actions/runner && \ + cd runner && \ + git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -b build && \ + git apply /tmp/runner.patch && \ + cp -f /tmp/global.json src/global.json + + +RUN cd /tmp/runner/src && \ + ./dev.sh layout && \ + ./dev.sh package && \ + rm -rf /root/.dotnet /root/.nuget + +RUN useradd -c "Action Runner" -m actions-runner && \ + usermod -L actions-runner + +RUN tar -xf /tmp/runner/_package/*.tar.gz -C /home/actions-runner && \ + chown -R actions-runner:actions-runner /home/actions-runner + +#VOLUME /home/actions-runner + +RUN rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \ + dnf clean all + +USER actions-runner + +# Scripts. +COPY fs/ / +WORKDIR /home/actions-runner +ENTRYPOINT ["/usr/bin/entrypoint"] +CMD ["/usr/bin/actions-runner"] diff --git a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service new file mode 100644 index 0000000000..b6c20b65ec --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service @@ -0,0 +1,18 @@ +[Unit] +Description=Podman container: Gaplib Github Actions Runner +Wants=network-online.target +After=network-online.target +StartLimitIntervalSec=1 +RequiresMountsFor=/run/user/1001/containers + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Restart=always +TimeoutStopSec=61 +ExecStart=/usr/bin/podman start gaplib-actions-runner +ExecStop=/usr/bin/podman stop -t 1 gaplib-actions-runner +ExecStopPost=/usr/bin/podman stop -t 1 gaplib-actions-runner +Type=forking + +[Install] +WantedBy=default.target diff --git a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json new file mode 100644 index 0000000000..e7028fe0dd --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json @@ -0,0 +1,5 @@ +{ + "sdk": { + "version": "6.0.421" + } +} diff --git a/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch new file mode 100644 index 0000000000..8260f3ccdd --- /dev/null +++ b/3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch @@ -0,0 +1,243 @@ +diff --git a/src/Directory.Build.props b/src/Directory.Build.props +index 9db5fac..f02e235 100644 +--- a/src/Directory.Build.props ++++ b/src/Directory.Build.props +@@ -44,6 +44,9 @@ + + $(DefineConstants);ARM64 + ++ ++ $(DefineConstants);S390X ++ + + + +diff --git a/src/Misc/externals.sh b/src/Misc/externals.sh +index 383221e..1555f67 100755 +--- a/src/Misc/externals.sh ++++ b/src/Misc/externals.sh +@@ -189,3 +189,8 @@ if [[ "$PACKAGERUNTIME" == "linux-arm" ]]; then + acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-armv7l.tar.gz" node16 fix_nested_dir + acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-armv7l.tar.gz" node20 fix_nested_dir + fi ++ ++if 
[[ "$PACKAGERUNTIME" == "linux-s390x" ]]; then ++ acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-s390x.tar.gz" node16 fix_nested_dir ++ acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-s390x.tar.gz" node20 fix_nested_dir ++fi +diff --git a/src/Misc/layoutroot/config.sh b/src/Misc/layoutroot/config.sh +index 14cc6ba..9b5b8e6 100755 +--- a/src/Misc/layoutroot/config.sh ++++ b/src/Misc/layoutroot/config.sh +@@ -20,25 +20,29 @@ then + + message="Execute sudo ./bin/installdependencies.sh to install any missing Dotnet Core 6.0 dependencies." + +- ldd ./bin/libcoreclr.so | grep 'not found' +- if [ $? -eq 0 ]; then +- echo "Dependencies is missing for Dotnet Core 6.0" +- echo $message +- exit 1 +- fi ++ ARCH=`uname -m` ++ if [ "${ARCH}" != "s390x" -a "${ARCH}" != "ppc64le" ] ++ then ++ ldd ./bin/libcoreclr.so | grep 'not found' ++ if [ $? -eq 0 ]; then ++ echo "Dependencies is missing for Dotnet Core 6.0" ++ echo $message ++ exit 1 ++ fi + +- ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found' +- if [ $? -eq 0 ]; then +- echo "Dependencies is missing for Dotnet Core 6.0" +- echo $message +- exit 1 +- fi ++ ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found' ++ if [ $? -eq 0 ]; then ++ echo "Dependencies is missing for Dotnet Core 6.0" ++ echo $message ++ exit 1 ++ fi + +- ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found' +- if [ $? -eq 0 ]; then +- echo "Dependencies is missing for Dotnet Core 6.0" +- echo $message +- exit 1 ++ ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found' ++ if [ $? -eq 0 ]; then ++ echo "Dependencies is missing for Dotnet Core 6.0" ++ echo $message ++ exit 1 ++ fi + fi + + if ! [ -x "$(command -v ldconfig)" ]; then +diff --git a/src/Runner.Common/Constants.cs b/src/Runner.Common/Constants.cs +index 177e3c9..9545981 100644 +--- a/src/Runner.Common/Constants.cs ++++ b/src/Runner.Common/Constants.cs +@@ -58,7 +58,8 @@ namespace GitHub.Runner.Common + X86, + X64, + Arm, +- Arm64 ++ Arm64, ++ S390x + } + + public static class Runner +@@ -81,6 +82,8 @@ namespace GitHub.Runner.Common + public static readonly Architecture PlatformArchitecture = Architecture.Arm; + #elif ARM64 + public static readonly Architecture PlatformArchitecture = Architecture.Arm64; ++#elif S390X ++ public static readonly Architecture PlatformArchitecture = Architecture.S390x; + #else + public static readonly Architecture PlatformArchitecture = Architecture.X64; + #endif +diff --git a/src/Runner.Common/Util/VarUtil.cs b/src/Runner.Common/Util/VarUtil.cs +index 97273a1..2a34430 100644 +--- a/src/Runner.Common/Util/VarUtil.cs ++++ b/src/Runner.Common/Util/VarUtil.cs +@@ -53,6 +53,8 @@ namespace GitHub.Runner.Common.Util + return "ARM"; + case Constants.Architecture.Arm64: + return "ARM64"; ++ case Constants.Architecture.S390x: ++ return "S390X"; + default: + throw new NotSupportedException(); // Should never reach here. 
+ } +diff --git a/src/Test/L0/ConstantGenerationL0.cs b/src/Test/L0/ConstantGenerationL0.cs +index 2042485..a9d8b46 100644 +--- a/src/Test/L0/ConstantGenerationL0.cs ++++ b/src/Test/L0/ConstantGenerationL0.cs +@@ -20,6 +20,7 @@ namespace GitHub.Runner.Common.Tests + "linux-x64", + "linux-arm", + "linux-arm64", ++ "linux-s390x", + "osx-x64", + "osx-arm64" + }; +diff --git a/src/Test/L0/Listener/SelfUpdaterL0.cs b/src/Test/L0/Listener/SelfUpdaterL0.cs +index 26ba65e..6791df3 100644 +--- a/src/Test/L0/Listener/SelfUpdaterL0.cs ++++ b/src/Test/L0/Listener/SelfUpdaterL0.cs +@@ -1,4 +1,4 @@ +-#if !(OS_WINDOWS && ARM64) ++#if !(OS_WINDOWS && ARM64) && !S390X + using System; + using System.Collections.Generic; + using System.IO; +@@ -16,6 +16,7 @@ using Xunit; + + namespace GitHub.Runner.Common.Tests.Listener + { ++#if !S390X // Self-update is not currently supported on S390X + public sealed class SelfUpdaterL0 + { + private Mock _runnerServer; +@@ -291,5 +292,6 @@ namespace GitHub.Runner.Common.Tests.Listener + } + } + } ++#endif + } + #endif +diff --git a/src/Test/L0/Listener/SelfUpdaterV2L0.cs b/src/Test/L0/Listener/SelfUpdaterV2L0.cs +index 5115a6b..dd8d198 100644 +--- a/src/Test/L0/Listener/SelfUpdaterV2L0.cs ++++ b/src/Test/L0/Listener/SelfUpdaterV2L0.cs +@@ -1,4 +1,4 @@ +-#if !(OS_WINDOWS && ARM64) ++#if !(OS_WINDOWS && ARM64) && !S390X + using System; + using System.Collections.Generic; + using System.IO; +diff --git a/src/Test/L0/Worker/StepHostL0.cs b/src/Test/L0/Worker/StepHostL0.cs +index f6b5889..26f8e21 100644 +--- a/src/Test/L0/Worker/StepHostL0.cs ++++ b/src/Test/L0/Worker/StepHostL0.cs +@@ -31,7 +31,7 @@ namespace GitHub.Runner.Common.Tests.Worker + return hc; + } + +-#if OS_LINUX ++#if OS_LINUX && !S390X + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Worker")] +diff --git a/src/dev.sh b/src/dev.sh +index fa637d1..8c66f37 100755 +--- a/src/dev.sh ++++ b/src/dev.sh +@@ -54,6 +54,7 @@ elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then + case $CPU_NAME in + armv7l) RUNTIME_ID="linux-arm";; + aarch64) RUNTIME_ID="linux-arm64";; ++ s390x) RUNTIME_ID="linux-s390x";; + esac + fi + elif [[ "$CURRENT_PLATFORM" == 'darwin' ]]; then +@@ -80,7 +81,7 @@ if [[ "$CURRENT_PLATFORM" == 'windows' ]]; then + exit 1 + fi + elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then +- if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') ]]; then ++ if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') && ("$RUNTIME_ID" != 'linux-s390x') ]]; then + echo "Failed: Can't build $RUNTIME_ID package $CURRENT_PLATFORM" >&2 + exit 1 + fi +@@ -199,7 +200,8 @@ function package () + popd > /dev/null + } + +-if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet") ]]; then ++if [[ "${RUNTIME_ID}" != "linux-s390x" && ((! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet")) ]]; then ++ + + # Download dotnet SDK to ../_dotnetsdk directory + heading "Ensure Dotnet SDK" +@@ -224,8 +226,10 @@ if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! 
-e "${DOTNETSDK_INSTALLDIR}/.${DOTN + echo "${DOTNETSDK_VERSION}" > "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}" + fi + +-echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%" +-export PATH=${DOTNETSDK_INSTALLDIR}:$PATH ++if [[ -d "${DOTNETSDK_INSTALLDIR}" ]]; then ++ echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%" ++ export PATH=${DOTNETSDK_INSTALLDIR}:$PATH ++fi + + heading "Dotnet SDK Version" + dotnet --version +diff --git a/src/dir.proj b/src/dir.proj +index 056a312..8370922 100644 +--- a/src/dir.proj ++++ b/src/dir.proj +@@ -41,8 +41,18 @@ + + + +- +- ++ ++ ++ RuntimeIdentifier=$(PackageRuntime) ++ ++ SelfContained=false;CopyLocalRuntimeTargetAssets=false ++ ++ ++ ++ + + + diff --git a/3rdparty/zlib-ng/arch/x86/Makefile.in b/3rdparty/zlib-ng/arch/x86/Makefile.in index 7c052469b2..c13cd179c0 100644 --- a/3rdparty/zlib-ng/arch/x86/Makefile.in +++ b/3rdparty/zlib-ng/arch/x86/Makefile.in @@ -35,7 +35,6 @@ all: \ chunkset_ssse3.o chunkset_ssse3.lo \ compare256_avx2.o compare256_avx2.lo \ compare256_sse2.o compare256_sse2.lo \ - insert_string_sse42.o insert_string_sse42.lo \ crc32_pclmulqdq.o crc32_pclmulqdq.lo \ crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \ slide_hash_avx2.o slide_hash_avx2.lo \ @@ -77,12 +76,6 @@ compare256_sse2.o: compare256_sse2.lo: $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c -insert_string_sse42.o: - $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c - -insert_string_sse42.lo: - $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c - crc32_pclmulqdq.o: $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c @@ -90,10 +83,10 @@ crc32_pclmulqdq.lo: $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c crc32_vpclmulqdq.o: - $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c + $(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c crc32_vpclmulqdq.lo: - $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c + $(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c slide_hash_avx2.o: $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx2.c b/3rdparty/zlib-ng/arch/x86/adler32_avx2.c index e3ac6705ce..38e7f068e3 100644 --- a/3rdparty/zlib-ng/arch/x86/adler32_avx2.c +++ b/3rdparty/zlib-ng/arch/x86/adler32_avx2.c @@ -9,24 +9,15 @@ #ifdef X86_AVX2 -#include "../../zbuild.h" +#include "zbuild.h" #include -#include "../../adler32_fold.h" -#include "../../adler32_p.h" +#include "adler32_p.h" #include "adler32_avx2_p.h" #include "x86_intrins.h" -#ifdef X86_SSE42 extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len); -#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d) -#define sub32(a, b, c) adler32_ssse3(a, b, c) -#else -#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1) -#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1) -#endif - static inline uint32_t 
adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
     if (src == NULL) return 1L;
     if (len == 0) return adler;
@@ -44,9 +35,9 @@ rem_peel:
         }
     } else if (len < 32) {
         if (COPY) {
-            return copy_sub32(adler, dst, src, len);
+            return adler32_fold_copy_sse42(adler, dst, src, len);
         } else {
-            return sub32(adler, src, len);
+            return adler32_ssse3(adler, src, len);
         }
     }
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx512.c b/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
index aa6cc17018..626c4807f8 100644
--- a/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
@@ -8,10 +8,9 @@
 
 #ifdef X86_AVX512
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../adler32_fold.h"
-#include "../../cpu_features.h"
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
 #include <immintrin.h>
 #include "x86_intrins.h"
 #include "adler32_avx512_p.h"
@@ -33,13 +32,7 @@ rem_peel:
         _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
     }
 
-#ifdef X86_AVX2
     return adler32_avx2(adler, src, len);
-#elif defined(X86_SSSE3)
-    return adler32_ssse3(adler, src, len);
-#else
-    return adler32_len_16(adler0, src, len, adler1);
-#endif
 }
 
     __m512i vbuf, vs1_0, vs3;
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c b/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
index 771f7ebe04..4c5cfc1cad 100644
--- a/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
@@ -9,11 +9,10 @@
 
 #ifdef X86_AVX512VNNI
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../cpu_features.h"
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
 #include <immintrin.h>
-#include "../../adler32_fold.h"
 #include "x86_intrins.h"
 #include "adler32_avx512_p.h"
 #include "adler32_avx2_p.h"
@@ -28,20 +27,10 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size
 
 rem_peel:
     if (len < 32)
-#if defined(X86_SSSE3)
         return adler32_ssse3(adler, src, len);
-#else
-        return adler32_len_16(adler0, src, len, adler1);
-#endif
 
     if (len < 64)
-#ifdef X86_AVX2
         return adler32_avx2(adler, src, len);
-#elif defined(X86_SSE3)
-        return adler32_ssse3(adler, src, len);
-#else
-        return adler32_len_16(adler0, src, len, adler1);
-#endif
 
     const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
@@ -135,11 +124,7 @@ rem_peel_copy:
     __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
     _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
 
-#if defined(X86_SSSE3)
     return adler32_ssse3(adler, src, len);
-#else
-    return adler32_len_16(adler0, src, len, adler1);
-#endif
 }
 
     const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_sse42.c b/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
index 257a360982..df0739d165 100644
--- a/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
+++ b/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
@@ -6,9 +6,8 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../adler32_fold.h"
+#include "zbuild.h"
+#include "adler32_p.h"
 #include "adler32_ssse3_p.h"
 #include <immintrin.h>
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c b/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
index ae819d632e..15e2f78ba3 100644
--- a/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
+++
b/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c @@ -6,8 +6,8 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" -#include "../../adler32_p.h" +#include "zbuild.h" +#include "adler32_p.h" #include "adler32_ssse3_p.h" #ifdef X86_SSSE3 diff --git a/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c b/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c index c06d1b37bd..722ecd3d51 100644 --- a/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c +++ b/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c @@ -4,10 +4,7 @@ #include "zbuild.h" -/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize - * code size by sharing the chunkcopy functions, which will certainly compile - * to identical machine code */ -#if defined(X86_SSSE3) && defined(X86_SSE2) +#if defined(X86_SSSE3) #include #include "../generic/chunk_permute_table.h" @@ -19,8 +16,6 @@ typedef __m128i chunk_t; #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 #define HAVE_CHUNK_MAG -#define HAVE_CHUNKCOPY -#define HAVE_CHUNKUNROLL static const lut_rem_pair perm_idx_lut[13] = { {0, 1}, /* 3 */ @@ -83,14 +78,11 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t return ret_vec; } -extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); -extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len); - #define CHUNKSIZE chunksize_ssse3 #define CHUNKMEMSET chunkmemset_ssse3 #define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3 -#define CHUNKCOPY chunkcopy_sse2 -#define CHUNKUNROLL chunkunroll_sse2 +#define CHUNKCOPY chunkcopy_ssse3 +#define CHUNKUNROLL chunkunroll_ssse3 #include "chunkset_tpl.h" diff --git a/3rdparty/zlib-ng/arch/x86/compare256_avx2.c b/3rdparty/zlib-ng/arch/x86/compare256_avx2.c index 1318a0e333..d2c835e4ee 100644 --- a/3rdparty/zlib-ng/arch/x86/compare256_avx2.c +++ b/3rdparty/zlib-ng/arch/x86/compare256_avx2.c @@ -3,8 +3,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" - +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) diff --git a/3rdparty/zlib-ng/arch/x86/compare256_sse2.c b/3rdparty/zlib-ng/arch/x86/compare256_sse2.c index aad4bd240d..216bb3a705 100644 --- a/3rdparty/zlib-ng/arch/x86/compare256_sse2.c +++ b/3rdparty/zlib-ng/arch/x86/compare256_sse2.c @@ -3,8 +3,9 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" - +#include "zbuild.h" +#include "zutil_p.h" +#include "deflate.h" #include "fallback_builtins.h" #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) diff --git a/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h b/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h index 3e79928317..1ffe201dda 100644 --- a/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h +++ b/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h @@ -26,27 +26,26 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3; __m128i xmm_crc_part = _mm_setzero_si128(); -#ifdef COPY char ALIGNED_(16) partial_buf[16] = { 0 }; -#else +#ifndef COPY __m128i xmm_initial = _mm_cvtsi32_si128(init_crc); int32_t first = init_crc != 0; - /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31 - * bytes of input is needed for the aligning load that occurs. 
If there's an initial CRC, to - * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which - * by definition can be up to 15 bytes + one full vector load. */ - assert(len >= 31 || first == 0); + /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed + * for the aligning load that occurs. If there's an initial CRC, to carry it forward through + * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be + * up to 15 bytes + one full vector load. */ + assert(len >= 16 || first == 0); #endif crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); if (len < 16) { -#ifdef COPY if (len == 0) return; memcpy(partial_buf, src, len); xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf); +#ifdef COPY memcpy(dst, partial_buf, len); #endif goto partial; @@ -63,9 +62,23 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint if (algn_diff < 4 && init_crc != 0) { xmm_t0 = xmm_crc_part; - xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1); - fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + if (len >= 32) { + xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1); + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + } else { + memcpy(partial_buf, src + 16, len - 16); + xmm_crc_part = _mm_load_si128((__m128i*)partial_buf); + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + src += 16; + len -= 16; +#ifdef COPY + dst -= algn_diff; +#endif + goto partial; + } + src += 16; len -= 16; } diff --git a/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h b/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h index 05d3b15257..3a4f6af5af 100644 --- a/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h +++ b/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h @@ -17,7 +17,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zbuild.h" +#include "zbuild.h" #include #include @@ -26,8 +26,9 @@ # include #endif -#include "../../crc32_fold.h" -#include "../../crc32_braid_p.h" +#include "crc32.h" +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" #include "x86_intrins.h" #include @@ -350,11 +351,22 @@ Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) { return crc->value; } +static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) { + uint32_t c = (~crc) & 0xffffffff; + + while (len) { + len--; + DO1; + } + + return c ^ 0xffffffff; +} + Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) { - /* For lens < 64, crc32_braid method is faster. The CRC32 instruction for - * these short lengths might also prove to be effective */ - if (len < 64) - return PREFIX(crc32_braid)(crc32, buf, len); + /* For lens smaller than ~12, crc32_small method is faster. 
+ * But there are also minimum requirements for the pclmul functions due to alignment */
+    if (len < 16)
+        return crc32_small(crc32, buf, len);
 
     crc32_fold ALIGNED_(16) crc_state;
     CRC32_FOLD_RESET(&crc_state);
diff --git a/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c b/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
index ec641b4326..cad35b14ee 100644
--- a/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
+++ b/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
@@ -3,7 +3,7 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+#ifdef X86_VPCLMULQDQ_CRC
 #define X86_VPCLMULQDQ
 
 #define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy
diff --git a/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c b/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
deleted file mode 100644
index ae092a7e47..0000000000
--- a/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
- *
- * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- *
- */
-
-#ifdef X86_SSE42
-#include "../../zbuild.h"
-#include <nmmintrin.h>
-#include "../../deflate.h"
-
-#define HASH_CALC(s, h, val)\
-    h = _mm_crc32_u32(h, val)
-
-#define HASH_CALC_VAR h
-#define HASH_CALC_VAR_INIT uint32_t h = 0
-
-#define UPDATE_HASH update_hash_sse42
-#define INSERT_STRING insert_string_sse42
-#define QUICK_INSERT_STRING quick_insert_string_sse42
-
-#include "../../insert_string_tpl.h"
-#endif
diff --git a/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c b/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
index 94fe10c7bf..f49ad3331b 100644
--- a/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
+++ b/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
@@ -9,8 +9,8 @@
  *
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"
 
 #include <immintrin.h>
diff --git a/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c b/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
index 5daac4a739..cfdf7bee49 100644
--- a/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
+++ b/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
@@ -8,8 +8,8 @@
  *
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
-#include "../../zbuild.h"
-#include "../../deflate.h"
+#include "zbuild.h"
+#include "deflate.h"
 
 #include <immintrin.h>
 #include <assert.h>
diff --git a/3rdparty/zlib-ng/arch/x86/x86_features.c b/3rdparty/zlib-ng/arch/x86/x86_features.c
index 8d11564c24..58cb4df341 100644
--- a/3rdparty/zlib-ng/arch/x86/x86_features.c
+++ b/3rdparty/zlib-ng/arch/x86/x86_features.c
@@ -7,7 +7,7 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#include "../../zbuild.h"
+#include "zbuild.h"
 #include "x86_features.h"
 
 #ifdef _MSC_VER
@@ -15,6 +15,13 @@
 #else
 // Newer versions of GCC and clang come with cpuid.h
 #  include <cpuid.h>
+#  ifdef X86_HAVE_XSAVE_INTRIN
+#    if __GNUC__ == 8
+#      include <xsaveintrin.h>
+#    else
+#      include <immintrin.h>
+#    endif
+#  endif
 #endif
 
 #include <string.h>
@@ -29,6 +36,7 @@ static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx,
     *ecx = registers[2];
     *edx = registers[3];
 #else
+    *eax = *ebx = *ecx = *edx = 0;
     __cpuid(info, *eax, *ebx, *ecx, *edx);
 #endif
 }
@@ -43,12 +51,13 @@ static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx,
     *ecx = registers[2];
     *edx = registers[3];
 #else
+    *eax = *ebx = *ecx = *edx = 0;
     __cpuid_count(info, subinfo, *eax, *ebx,
*ecx, *edx); #endif } static inline uint64_t xgetbv(unsigned int xcr) { -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN) return _xgetbv(xcr); #else uint32_t eax, edx; @@ -90,7 +99,16 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) { // check AVX512 bits if the OS supports saving ZMM registers if (features->has_os_save_zmm) { - features->has_avx512 = ebx & 0x00010000; + features->has_avx512f = ebx & 0x00010000; + if (features->has_avx512f) { + // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable + // AVX512(DQ,BW,VL). + features->has_avx512dq = ebx & 0x00020000; + features->has_avx512bw = ebx & 0x40000000; + features->has_avx512vl = ebx & 0x80000000; + } + features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \ + && features->has_avx512vl; features->has_avx512vnni = ecx & 0x800; } } diff --git a/3rdparty/zlib-ng/arch/x86/x86_features.h b/3rdparty/zlib-ng/arch/x86/x86_features.h index 4a36bde835..6daa5e3828 100644 --- a/3rdparty/zlib-ng/arch/x86/x86_features.h +++ b/3rdparty/zlib-ng/arch/x86/x86_features.h @@ -1,14 +1,18 @@ /* x86_features.h -- check for CPU features -* Copyright (C) 2013 Intel Corporation Jim Kukunas -* For conditions of distribution and use, see copyright notice in zlib.h -*/ + * Copyright (C) 2013 Intel Corporation Jim Kukunas + * For conditions of distribution and use, see copyright notice in zlib.h + */ #ifndef X86_FEATURES_H_ #define X86_FEATURES_H_ struct x86_cpu_features { int has_avx2; - int has_avx512; + int has_avx512f; + int has_avx512dq; + int has_avx512bw; + int has_avx512vl; + int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled. int has_avx512vnni; int has_sse2; int has_ssse3; @@ -21,4 +25,4 @@ struct x86_cpu_features { void Z_INTERNAL x86_check_features(struct x86_cpu_features *features); -#endif /* CPU_H_ */ +#endif /* X86_FEATURES_H_ */ diff --git a/3rdparty/zlib-ng/arch/x86/x86_functions.h b/3rdparty/zlib-ng/arch/x86/x86_functions.h new file mode 100644 index 0000000000..5aa9b31747 --- /dev/null +++ b/3rdparty/zlib-ng/arch/x86/x86_functions.h @@ -0,0 +1,172 @@ +/* x86_functions.h -- x86 implementations for arch-specific functions. 
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef X86_FUNCTIONS_H_ +#define X86_FUNCTIONS_H_ + +#ifdef X86_SSE2 +uint32_t chunksize_sse2(void); +uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); + +# ifdef HAVE_BUILTIN_CTZ + uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); + uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match); + uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match); + void slide_hash_sse2(deflate_state *s); +# endif + void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start); +#endif + +#ifdef X86_SSSE3 +uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len); +uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left); +void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef X86_SSE42 +uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef X86_AVX2 +uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t chunksize_avx2(void); +uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left); + +# ifdef HAVE_BUILTIN_CTZ + uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); + uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match); + uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match); + void slide_hash_avx2(deflate_state *s); +# endif + void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start); +#endif +#ifdef X86_AVX512 +uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif +#ifdef X86_AVX512VNNI +uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef X86_PCLMULQDQ_CRC +uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc); +void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); +uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc); +uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); +#endif +#ifdef X86_VPCLMULQDQ_CRC +uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc); +void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); +uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc); +uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); +#endif + + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// X86 - SSE2 +# if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2) +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_sse2 +# undef native_chunksize +# define native_chunksize chunksize_sse2 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_sse2 +# undef native_slide_hash +# define native_slide_hash slide_hash_sse2 +# ifdef HAVE_BUILTIN_CTZ +# undef native_compare256 +# define 
native_compare256 compare256_sse2 +# undef native_longest_match +# define native_longest_match longest_match_sse2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_sse2 +# endif +#endif +// X86 - SSSE3 +# if defined(X86_SSSE3) && defined(__SSSE3__) +# undef native_adler32 +# define native_adler32 adler32_ssse3 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_ssse3 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_ssse3 +# endif +// X86 - SSE4.2 +# if defined(X86_SSE42) && defined(__SSE4_2__) +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_sse42 +# endif + +// X86 - PCLMUL +#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__) +# undef native_crc32 +# define native_crc32 crc32_pclmulqdq +# undef native_crc32_fold +# define native_crc32_fold crc32_fold_pclmulqdq +# undef native_crc32_fold_copy +# define native_crc32_fold_copy crc32_fold_pclmulqdq_copy +# undef native_crc32_fold_final +# define native_crc32_fold_final crc32_fold_pclmulqdq_final +# undef native_crc32_fold_reset +# define native_crc32_fold_reset crc32_fold_pclmulqdq_reset +#endif +// X86 - AVX +# if defined(X86_AVX2) && defined(__AVX2__) +# undef native_adler32 +# define native_adler32 adler32_avx2 +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_avx2 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_avx2 +# undef native_chunksize +# define native_chunksize chunksize_avx2 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_avx2 +# undef native_slide_hash +# define native_slide_hash slide_hash_avx2 +# ifdef HAVE_BUILTIN_CTZ +# undef native_compare256 +# define native_compare256 compare256_avx2 +# undef native_longest_match +# define native_longest_match longest_match_avx2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_avx2 +# endif +# endif + +// X86 - AVX512 (F,DQ,BW,Vl) +# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) +# undef native_adler32 +# define native_adler32 adler32_avx512 +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_avx512 +// X86 - AVX512 (VNNI) +# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__) +# undef native_adler32 +# define native_adler32 adler32_avx512_vnni +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_avx512_vnni +# endif +// X86 - VPCLMULQDQ +# if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__) +# undef native_crc32 +# define native_crc32 crc32_vpclmulqdq +# undef native_crc32_fold +# define native_crc32_fold crc32_fold_vpclmulqdq +# undef native_crc32_fold_copy +# define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy +# undef native_crc32_fold_final +# define native_crc32_fold_final crc32_fold_vpclmulqdq_final +# undef native_crc32_fold_reset +# define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset +# endif +# endif +#endif + +#endif /* X86_FUNCTIONS_H_ */ diff --git a/3rdparty/zlib-ng/arch/x86/x86_intrins.h b/3rdparty/zlib-ng/arch/x86/x86_intrins.h index 52e1085d66..0e596d18a1 100644 --- a/3rdparty/zlib-ng/arch/x86/x86_intrins.h +++ b/3rdparty/zlib-ng/arch/x86/x86_intrins.h @@ -7,7 +7,7 @@ #ifdef __AVX2__ #include -#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \ +#if (!defined(__clang__) && !defined(__NVCOMPILER) && 
defined(__GNUC__) && __GNUC__ < 10) \ || (defined(__apple_build_version__) && __apple_build_version__ < 9020039) static inline __m256i _mm256_zextsi128_si256(__m128i a) { __m128i r; @@ -29,7 +29,7 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) { /* GCC <9 is missing some AVX512 intrinsics. */ #ifdef __AVX512F__ -#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9) +#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9) #include #define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \ diff --git a/3rdparty/zlib-ng/arch_functions.h b/3rdparty/zlib-ng/arch_functions.h new file mode 100644 index 0000000000..9a7f8d9379 --- /dev/null +++ b/3rdparty/zlib-ng/arch_functions.h @@ -0,0 +1,29 @@ +/* arch_functions.h -- Arch-specific function prototypes. + * Copyright (C) 2017 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CPU_FUNCTIONS_H_ +#define CPU_FUNCTIONS_H_ + +#include "zbuild.h" +#include "zutil.h" +#include "crc32.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#include "arch/generic/generic_functions.h" + +#if defined(X86_FEATURES) +# include "arch/x86/x86_functions.h" +#elif defined(ARM_FEATURES) +# include "arch/arm/arm_functions.h" +#elif defined(PPC_FEATURES) || defined(POWER_FEATURES) +# include "arch/power/power_functions.h" +#elif defined(S390_FEATURES) +# include "arch/s390/s390_functions.h" +#elif defined(RISCV_FEATURES) +# include "arch/riscv/riscv_functions.h" +#endif + +#endif diff --git a/3rdparty/zlib-ng/chunkset_tpl.h b/3rdparty/zlib-ng/chunkset_tpl.h index f909a12557..f5cc5c0450 100644 --- a/3rdparty/zlib-ng/chunkset_tpl.h +++ b/3rdparty/zlib-ng/chunkset_tpl.h @@ -5,7 +5,7 @@ #include "zbuild.h" #include -#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2) +#if CHUNK_SIZE == 32 && defined(X86_SSSE3) extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len); #endif @@ -25,7 +25,7 @@ Z_INTERNAL uint32_t CHUNKSIZE(void) { without iteration, which will hopefully make the branch prediction more reliable. */ #ifndef HAVE_CHUNKCOPY -Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { +static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { Assert(len > 0, "chunkcopy should never have a length 0"); chunk_t chunk; int32_t align = ((len - 1) % sizeof(chunk_t)) + 1; @@ -54,7 +54,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { least 258 bytes of output space available (258 being the maximum length output from a single token; see inflate_fast()'s assumptions below). 
*/ #ifndef HAVE_CHUNKUNROLL -Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { +static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { unsigned char const *from = out - *dist; chunk_t chunk; while (*dist < *len && *dist < sizeof(chunk_t)) { @@ -98,7 +98,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ Assert(dist > 0, "chunkmemset cannot have a distance 0"); /* Only AVX2 */ -#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2) +#if CHUNK_SIZE == 32 && defined(X86_SSSE3) if (len <= 16) { return chunkmemset_ssse3(out, dist, len); } diff --git a/3rdparty/zlib-ng/cmake/detect-arch.c b/3rdparty/zlib-ng/cmake/detect-arch.c new file mode 100644 index 0000000000..92590182c2 --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-arch.c @@ -0,0 +1,115 @@ +// archdetect.c -- Detect compiler architecture and raise preprocessor error +// containing a simple arch identifier. +// Copyright (C) 2019 Hans Kristian Rosbach +// Licensed under the Zlib license, see LICENSE.md for details + +// x86_64 +#if defined(__x86_64__) || defined(_M_X64) + #error archfound x86_64 + +// x86 +#elif defined(__i386) || defined(_M_IX86) + #error archfound i686 + +// ARM +#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC) + #error archfound aarch64 +#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM) + #if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__) + #error archfound armv8 + #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) + #error archfound armv7 + #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__) + #error archfound armv6 + #elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) + #error archfound armv5 + #elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__) + #error archfound armv4 + #elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__) + #error archfound armv3 + #elif defined(__ARM_ARCH_2__) + #error archfound armv2 + #endif + +// PowerPC +#elif defined(__powerpc__) || defined(_ppc__) || defined(__PPC__) + #if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__) + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #error archfound powerpc64le + #else + #error archfound powerpc64 + #endif + #else + #error archfound powerpc + #endif + +// --------------- Less common architectures alphabetically below --------------- + +// ALPHA +#elif defined(__alpha__) || defined(__alpha) + #error archfound alpha + +// Blackfin +#elif defined(__BFIN__) + #error archfound blackfin + +// Itanium +#elif defined(__ia64) || defined(_M_IA64) + #error archfound ia64 + +// MIPS +#elif defined(__mips__) || defined(__mips) + #error archfound mips + +// Motorola 68000-series +#elif defined(__m68k__) + #error archfound m68k + +// SuperH +#elif defined(__sh__) + #error archfound sh + +// SPARC +#elif defined(__sparc__) || defined(__sparc) + #if defined(__sparcv9) || defined(__sparc_v9__) + #error archfound sparc9 + #elif defined(__sparcv8) || defined(__sparc_v8__) + #error archfound sparc8 + #endif + +// SystemZ +#elif defined(__370__) + #error archfound s370 +#elif defined(__s390__) + #error 
archfound s390 +#elif defined(__s390x) || defined(__zarch__) + #error archfound s390x + +// PARISC +#elif defined(__hppa__) + #error archfound parisc + +// RS-6000 +#elif defined(__THW_RS6000) + #error archfound rs6000 + +// RISC-V +#elif defined(__riscv) + #if __riscv_xlen == 64 + #error archfound riscv64 + #elif __riscv_xlen == 32 + #error archfound riscv32 + #endif + +// LOONGARCH +#elif defined(__loongarch_lp64) + #error archfound loongarch64 + +// Emscripten (WebAssembly) +#elif defined(__EMSCRIPTEN__) + #error archfound wasm32 + +// return 'unrecognized' if we do not know what architecture this is +#else + #error archfound unrecognized +#endif diff --git a/3rdparty/zlib-ng/cmake/detect-arch.cmake b/3rdparty/zlib-ng/cmake/detect-arch.cmake new file mode 100644 index 0000000000..dfdc6013ce --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-arch.cmake @@ -0,0 +1,104 @@ +# detect-arch.cmake -- Detect compiler architecture and set ARCH and BASEARCH +# Copyright (C) 2019 Hans Kristian Rosbach +# Licensed under the Zlib license, see LICENSE.md for details +set(ARCHDETECT_FOUND TRUE) + +if(CMAKE_OSX_ARCHITECTURES) + # If multiple architectures are requested (universal build), pick only the first + list(GET CMAKE_OSX_ARCHITECTURES 0 ARCH) +elseif(MSVC) + if("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "X86") + set(ARCH "i686") + elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "x64") + set(ARCH "x86_64") + elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARMV7") + set(ARCH "arm") + elseif ("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64EC") + set(ARCH "aarch64") + endif() +elseif(EMSCRIPTEN) + set(ARCH "wasm32") +elseif(CMAKE_CROSSCOMPILING) + set(ARCH ${CMAKE_C_COMPILER_TARGET}) +else() + # Let preprocessor parse archdetect.c and raise an error containing the arch identifier + enable_language(C) + try_run( + run_result_unused + compile_result_unused + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_LIST_DIR}/detect-arch.c + COMPILE_OUTPUT_VARIABLE RAWOUTPUT + CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} + ) + + # Find basearch tag, and extract the arch word into BASEARCH variable + string(REGEX REPLACE ".*archfound ([a-zA-Z0-9_]+).*" "\\1" ARCH "${RAWOUTPUT}") + if(NOT ARCH) + set(ARCH unknown) + endif() +endif() + +# Make sure we have ARCH set +if(NOT ARCH OR ARCH STREQUAL "unknown") + set(ARCH ${CMAKE_SYSTEM_PROCESSOR}) + message(STATUS "Arch not recognized, falling back to cmake arch: '${ARCH}'") +else() + message(STATUS "Arch detected: '${ARCH}'") +endif() + +# Base arch detection +if("${ARCH}" MATCHES "(x86_64|AMD64|i[3-6]86)") + set(BASEARCH "x86") + set(BASEARCH_X86_FOUND TRUE) +elseif("${ARCH}" MATCHES "(arm(v[0-9])?|aarch64|cortex)") + set(BASEARCH "arm") + set(BASEARCH_ARM_FOUND TRUE) +elseif("${ARCH}" MATCHES "ppc(64(le)?)?|powerpc(64(le)?)?") + set(BASEARCH "ppc") + set(BASEARCH_PPC_FOUND TRUE) +elseif("${ARCH}" MATCHES "alpha") + set(BASEARCH "alpha") + set(BASEARCH_ALPHA_FOUND TRUE) +elseif("${ARCH}" MATCHES "blackfin") + set(BASEARCH "blackfin") + set(BASEARCH_BLACKFIN_FOUND TRUE) +elseif("${ARCH}" MATCHES "ia64") + set(BASEARCH "ia64") + set(BASEARCH_IA64_FOUND TRUE) +elseif("${ARCH}" MATCHES "mips") + set(BASEARCH "mips") + set(BASEARCH_MIPS_FOUND TRUE) +elseif("${ARCH}" MATCHES "m68k") + set(BASEARCH "m68k") + set(BASEARCH_M68K_FOUND TRUE) +elseif("${ARCH}" MATCHES "sh") + set(BASEARCH "sh") + set(BASEARCH_SH_FOUND TRUE) +elseif("${ARCH}" MATCHES "sparc[89]?") + set(BASEARCH "sparc") + 
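The detect-arch.c program above is never executed: the try_run in detect-arch.cmake only compiles it, and the deliberate `#error archfound <id>` is scraped out of the compiler output with a regex. A runnable C analogue of the same preprocessor ladder, printing the identifier instead of failing compilation (abbreviated to a few architectures, purely illustrative):

```c
#include <stdio.h>

/* Same #if ladder as detect-arch.c, but resolving to a string that is
 * printed at run time instead of raised via a compile-time #error. */
#if defined(__x86_64__) || defined(_M_X64)
#  define ARCHFOUND "x86_64"
#elif defined(__i386) || defined(_M_IX86)
#  define ARCHFOUND "i686"
#elif defined(__aarch64__) || defined(_M_ARM64)
#  define ARCHFOUND "aarch64"
#elif defined(__riscv) && __riscv_xlen == 64
#  define ARCHFOUND "riscv64"
#else
#  define ARCHFOUND "unrecognized"
#endif

int main(void) {
    /* detect-arch.cmake extracts the token from compiler output with:
     *   string(REGEX REPLACE ".*archfound ([a-zA-Z0-9_]+).*" "\\1" ARCH "${RAWOUTPUT}") */
    printf("archfound %s\n", ARCHFOUND);
    return 0;
}
```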
set(BASEARCH_SPARC_FOUND TRUE) +elseif("${ARCH}" MATCHES "s3[679]0x?") + set(BASEARCH "s360") + set(BASEARCH_S360_FOUND TRUE) +elseif("${ARCH}" MATCHES "parisc") + set(BASEARCH "parisc") + set(BASEARCH_PARISC_FOUND TRUE) +elseif("${ARCH}" MATCHES "rs6000") + set(BASEARCH "rs6000") + set(BASEARCH_RS6000_FOUND TRUE) +elseif("${ARCH}" MATCHES "riscv(32|64)") + set(BASEARCH "riscv") + set(BASEARCH_RISCV_FOUND TRUE) +elseif("${ARCH}" MATCHES "loongarch64") + set(BASEARCH "loongarch") + set(BASEARCH_LOONGARCH_FOUND TRUE) +elseif("${ARCH}" MATCHES "wasm32") + set(BASEARCH "wasm32") + set(BASEARCH_WASM32_FOUND TRUE) +else() + set(BASEARCH "x86") + set(BASEARCH_X86_FOUND TRUE) + message(STATUS "Basearch '${ARCH}' not recognized, defaulting to 'x86'.") +endif() +message(STATUS "Basearch of '${ARCH}' has been detected as: '${BASEARCH}'") diff --git a/3rdparty/zlib-ng/cmake/detect-coverage.cmake b/3rdparty/zlib-ng/cmake/detect-coverage.cmake new file mode 100644 index 0000000000..8e67a085cd --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-coverage.cmake @@ -0,0 +1,46 @@ +# detect-coverage.cmake -- Detect supported compiler coverage flags +# Licensed under the Zlib license, see LICENSE.md for details + +macro(add_code_coverage) + # Check for -coverage flag support for Clang/GCC + if(CMAKE_VERSION VERSION_LESS 3.14) + set(CMAKE_REQUIRED_LIBRARIES -lgcov) + else() + set(CMAKE_REQUIRED_LINK_OPTIONS -coverage) + endif() + check_c_compiler_flag(-coverage HAVE_COVERAGE) + set(CMAKE_REQUIRED_LIBRARIES) + set(CMAKE_REQUIRED_LINK_OPTIONS) + + if(HAVE_COVERAGE) + add_compile_options(-coverage) + add_link_options(-coverage) + message(STATUS "Code coverage enabled using: -coverage") + else() + # Some versions of GCC don't support -coverage shorthand + if(CMAKE_VERSION VERSION_LESS 3.14) + set(CMAKE_REQUIRED_LIBRARIES -lgcov) + else() + set(CMAKE_REQUIRED_LINK_OPTIONS -lgcov -fprofile-arcs) + endif() + check_c_compiler_flag("-ftest-coverage -fprofile-arcs -fprofile-values" HAVE_TEST_COVERAGE) + set(CMAKE_REQUIRED_LIBRARIES) + set(CMAKE_REQUIRED_LINK_OPTIONS) + + if(HAVE_TEST_COVERAGE) + add_compile_options(-ftest-coverage -fprofile-arcs -fprofile-values) + add_link_options(-lgcov -fprofile-arcs) + message(STATUS "Code coverage enabled using: -ftest-coverage") + else() + message(WARNING "Compiler does not support code coverage") + set(WITH_CODE_COVERAGE OFF) + endif() + endif() + + # Set optimization level to zero for code coverage builds + if (WITH_CODE_COVERAGE) + # Use CMake compiler flag variables due to add_compile_options failure on Windows GCC + set(CMAKE_C_FLAGS "-O0 ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "-O0 ${CMAKE_CXX_FLAGS}") + endif() +endmacro() diff --git a/3rdparty/zlib-ng/cmake/detect-install-dirs.cmake b/3rdparty/zlib-ng/cmake/detect-install-dirs.cmake new file mode 100644 index 0000000000..a7c774f474 --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-install-dirs.cmake @@ -0,0 +1,43 @@ +# detect-install-dirs.cmake -- Detect install directory parameters +# Copyright (C) 2021 Hans Kristian Rosbach +# Licensed under the Zlib license, see LICENSE.md for details + +# Determine installation directory for executables +if (DEFINED BIN_INSTALL_DIR) + set(BIN_INSTALL_DIR "${BIN_INSTALL_DIR}" CACHE PATH "Installation directory for executables (Deprecated)" FORCE) + set(CMAKE_INSTALL_BINDIR "${BIN_INSTALL_DIR}") +elseif (DEFINED INSTALL_BIN_DIR) + set(CMAKE_INSTALL_BINDIR "${INSTALL_BIN_DIR}") +endif() + +# Determine installation directory for libraries +if (DEFINED LIB_INSTALL_DIR) + set(LIB_INSTALL_DIR 
"${LIB_INSTALL_DIR}" CACHE PATH "Installation directory for libraries (Deprecated)" FORCE) + set(CMAKE_INSTALL_LIBDIR "${LIB_INSTALL_DIR}") +elseif (DEFINED INSTALL_LIB_DIR) + set(CMAKE_INSTALL_LIBDIR "${INSTALL_LIB_DIR}") +endif() + +# Determine installation directory for include files +if (DEFINED INC_INSTALL_DIR) + set(INC_INSTALL_DIR "${INC_INSTALL_DIR}" CACHE PATH "Installation directory for headers (Deprecated)" FORCE) + set(CMAKE_INSTALL_INCLUDEDIR "${INC_INSTALL_DIR}") +elseif (DEFINED INSTALL_INC_DIR) + set(CMAKE_INSTALL_INCLUDEDIR "${INSTALL_INC_DIR}") +endif() + +# Define GNU standard installation directories +include(GNUInstallDirs) + +# Determine installation directory for pkgconfig files +if (DEFINED PKGCONFIG_INSTALL_DIR) + set(PKGCONFIG_INSTALL_DIR "${PKGCONFIG_INSTALL_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE) +elseif (DEFINED INSTALL_PKGCONFIG_DIR) + set(PKGCONFIG_INSTALL_DIR "${INSTALL_PKGCONFIG_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE) +elseif (DEFINED CMAKE_INSTALL_PKGCONFIGDIR) + set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE) +elseif (DEFINED CMAKE_INSTALL_FULL_PKGCONFIGDIR) + set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_FULL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE) +else() + set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/pkgconfig" CACHE PATH "Installation directory for pkgconfig (.pc) files") +endif() diff --git a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake index 74ac3910b8..78e46e14bb 100644 --- a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake +++ b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake @@ -2,40 +2,39 @@ # Licensed under the Zlib license, see LICENSE.md for details macro(check_acle_compiler_flag) - if(MSVC) - # Both ARM and ARM64-targeting msvc support intrinsics, but - # ARM msvc is missing some intrinsics introduced with ARMv8, e.g. 
crc32 - if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64") - set(HAVE_ACLE_FLAG TRUE) - endif() - else() + if(NOT NATIVEFLAG) if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + check_c_compiler_flag("-march=armv8-a+crc" HAVE_MARCH_ARMV8_CRC) + if(HAVE_MARCH_ARMV8_CRC) set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support") + else() + check_c_compiler_flag("-march=armv8-a+crc+simd" HAVE_MARCH_ARMV8_CRC_SIMD) + if(HAVE_MARCH_ARMV8_CRC_SIMD) + set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support") + endif() endif() endif() - # Check whether compiler supports ACLE flag - set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") - check_c_source_compiles( - "int main() { return 0; }" - HAVE_ACLE_FLAG FAIL_REGEX "not supported") - if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG) - set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE) - # Check whether compiler supports ACLE flag - set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}") - check_c_source_compiles( - "int main() { return 0; }" - HAVE_ACLE_FLAG2 FAIL_REGEX "not supported") - set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE) - unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable - endif() - set(CMAKE_REQUIRED_FLAGS) endif() + # Check whether compiler supports ARMv8 CRC intrinsics + set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") + check_c_source_compiles( + "#if defined(_MSC_VER) + #include <intrin.h> + #else + #include <arm_acle.h> + #endif + unsigned int f(unsigned int a, unsigned int b) { + return __crc32w(a, b); + } + int main(void) { return 0; }" + HAVE_ACLE_FLAG + ) + set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_armv6_compiler_flag) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6) if(HAVE_MARCH_ARMV6) set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support") @@ -67,21 +66,21 @@ macro(check_armv6_compiler_flag) return __uqsub16(a, b); #endif } - int main(void) { return 0; }" + int main(void) { return f(1,2); }" HAVE_ARMV6_INTRIN ) set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_avx512_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl") - else() - set(AVX512FLAG "/arch:AVX512") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl") + else() + set(AVX512FLAG "/arch:AVX512") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal # instruction scheduling unless you specify a reasonable -mtune= target set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl") @@ -94,9 +93,9 @@ macro(check_avx512_intrinsics) endif() unset(HAVE_CASCADE_LAKE) endif() + elseif(MSVC) + set(AVX512FLAG "/arch:AVX512") endif() - elseif(MSVC) - set(AVX512FLAG "/arch:AVX512") endif() # Check whether compiler supports AVX512 intrinsics set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG}
${ZNOLTOFLAG}") @@ -109,26 +108,17 @@ macro(check_avx512_intrinsics) int main(void) { return 0; }" HAVE_AVX512_INTRIN ) - - # Evidently both GCC and clang were late to implementing these - check_c_source_compiles( - "#include <immintrin.h> - __mmask16 f(__mmask16 x) { return _knot_mask16(x); } - int main(void) { return 0; }" - HAVE_MASK_INTRIN - ) - set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_avx512vnni_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") - else() - set(AVX512VNNIFLAG "/arch:AVX512") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM") + set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni") + else() + set(AVX512VNNIFLAG "/arch:AVX512") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni") if(NOT MSVC) check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE) @@ -139,11 +129,10 @@ macro(check_avx512vnni_intrinsics) endif() unset(HAVE_CASCADE_LAKE) endif() + elseif(MSVC) + set(AVX512VNNIFLAG "/arch:AVX512") endif() - elseif(MSVC) - set(AVX512VNNIFLAG "/arch:AVX512") endif() - # Check whether compiler supports AVX512vnni intrinsics set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") check_c_source_compiles( @@ -159,18 +148,18 @@ endmacro() macro(check_avx2_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(AVX2FLAG "-mavx2") + else() + set(AVX2FLAG "/arch:AVX2") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(AVX2FLAG "-mavx2") - else() + elseif(MSVC) set(AVX2FLAG "/arch:AVX2") endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) - set(AVX2FLAG "-mavx2") - endif() - elseif(MSVC) - set(AVX2FLAG "/arch:AVX2") endif() # Check whether compiler supports AVX2 intrinsics set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") @@ -187,8 +176,8 @@ endmacro() macro(check_neon_compiler_flag) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") if("${ARCH}" MATCHES "aarch64") set(NEONFLAG "-march=armv8-a+simd") else() @@ -206,12 +195,52 @@ macro(check_neon_compiler_flag) #endif int main() { return 0; }" NEON_AVAILABLE FAIL_REGEX "not supported") + # Check whether compiler native flag is enough for NEON support + # Some GCC versions don't enable FPU (vector unit) when using -march=native + if(NEON_AVAILABLE AND NATIVEFLAG AND (NOT "${ARCH}" MATCHES "aarch64")) + check_c_source_compiles( + "#include <arm_neon.h> + uint8x16_t f(uint8x16_t x, uint8x16_t y) { + return vaddq_u8(x, y); + } + int main(int argc, char* argv[]) { + uint8x16_t a = vdupq_n_u8(argc); + uint8x16_t b = vdupq_n_u8(argc); + uint8x16_t result = f(a, b); + return result[0]; + }" + ARM_NEON_SUPPORT_NATIVE + ) + if(NOT ARM_NEON_SUPPORT_NATIVE) + set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG} -mfpu=neon ${ZNOLTOFLAG}") +
check_c_source_compiles( + "#include <arm_neon.h> + uint8x16_t f(uint8x16_t x, uint8x16_t y) { + return vaddq_u8(x, y); + } + int main(int argc, char* argv[]) { + uint8x16_t a = vdupq_n_u8(argc); + uint8x16_t b = vdupq_n_u8(argc); + uint8x16_t result = f(a, b); + return result[0]; + }" + ARM_NEON_SUPPORT_NATIVE_MFPU + ) + if(ARM_NEON_SUPPORT_NATIVE_MFPU) + set(NEONFLAG "-mfpu=neon") + else() + # Remove local NEON_AVAILABLE variable and overwrite the cache + unset(NEON_AVAILABLE) + set(NEON_AVAILABLE "" CACHE INTERNAL "NEON support available" FORCE) + endif() + endif() + endif() set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_neon_ld4_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") if("${ARCH}" MATCHES "aarch64") set(NEONFLAG "-march=armv8-a+simd") else() @@ -234,8 +263,8 @@ endmacro() macro(check_pclmulqdq_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM") set(PCLMULFLAG "-mpclmul") endif() endif() @@ -257,8 +286,8 @@ endmacro() macro(check_vpclmulqdq_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM") set(VPCLMULFLAG "-mvpclmulqdq -mavx512f") endif() endif() @@ -341,8 +370,8 @@ macro(check_ppc_intrinsics) endmacro() macro(check_power8_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(POWER8FLAG "-mcpu=power8") endif() endif() @@ -364,12 +393,27 @@ macro(check_power8_intrinsics) }" HAVE_POWER8_INTRIN ) + if(NOT HAVE_POWER8_INTRIN AND HAVE_LINUX_AUXVEC_H) + check_c_source_compiles( + "#include <sys/auxv.h> + #include <linux/auxvec.h> + int main() { + return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07); + }" + HAVE_POWER8_INTRIN2 + ) + if(HAVE_POWER8_INTRIN2) + set(POWER8_NEED_AUXVEC_H 1) + set(HAVE_POWER8_INTRIN ${HAVE_POWER8_INTRIN2} CACHE INTERNAL "Have POWER8 intrinsics" FORCE) + unset(HAVE_POWER8_INTRIN2 CACHE) + endif() + endif() set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_rvv_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(RISCVFLAG "-march=rv64gcv") endif() endif() @@ -399,8 +443,8 @@ macro(check_s390_intrinsics) endmacro() macro(check_power9_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(POWER9FLAG "-mcpu=power9") endif() endif() @@ -422,22 +466,37 @@ macro(check_power9_intrinsics) }" HAVE_POWER9_INTRIN ) + if(NOT HAVE_POWER9_INTRIN AND HAVE_LINUX_AUXVEC_H) + check_c_source_compiles( + "#include <sys/auxv.h> + #include <linux/auxvec.h> + int main() { + return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_00); + }" + HAVE_POWER9_INTRIN2 + ) + if(HAVE_POWER9_INTRIN2) + set(POWER9_NEED_AUXVEC_H 1) + set(HAVE_POWER9_INTRIN
${HAVE_POWER9_INTRIN2} CACHE INTERNAL "Have POWER9 intrinsics" FORCE) + unset(HAVE_POWER9_INTRIN2 CACHE) + endif() + endif() set(CMAKE_REQUIRED_FLAGS) endmacro() macro(check_sse2_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(SSE2FLAG "-msse2") - else() - set(SSE2FLAG "/arch:SSE2") - endif() - elseif(MSVC) - if(NOT "${ARCH}" MATCHES "x86_64") - set(SSE2FLAG "/arch:SSE2") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(SSE2FLAG "-msse2") + else() + set(SSE2FLAG "/arch:SSE2") + endif() + elseif(MSVC) + if(NOT "${ARCH}" MATCHES "x86_64") + set(SSE2FLAG "/arch:SSE2") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(SSE2FLAG "-msse2") endif() endif() @@ -453,14 +512,14 @@ endmacro() macro(check_ssse3_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(SSSE3FLAG "-mssse3") - else() - set(SSSE3FLAG "/arch:SSSE3") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(SSSE3FLAG "-mssse3") + else() + set(SSSE3FLAG "/arch:SSSE3") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(SSSE3FLAG "-mssse3") endif() endif() @@ -478,14 +537,14 @@ endmacro() macro(check_sse42_intrinsics) - if(CMAKE_C_COMPILER_ID MATCHES "Intel") - if(CMAKE_HOST_UNIX OR APPLE) - set(SSE42FLAG "-msse4.2") - else() - set(SSE42FLAG "/arch:SSE4.2") - endif() - elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - if(NOT NATIVEFLAG) + if(NOT NATIVEFLAG) + if(CMAKE_C_COMPILER_ID MATCHES "Intel") + if(CMAKE_HOST_UNIX OR APPLE) + set(SSE42FLAG "-msse4.2") + else() + set(SSE42FLAG "/arch:SSE4.2") + endif() + elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(SSE42FLAG "-msse4.2") endif() endif() @@ -526,15 +585,17 @@ macro(check_vgfma_intrinsics) endmacro() macro(check_xsave_intrinsics) - if(NOT NATIVEFLAG AND NOT MSVC) + if(NOT NATIVEFLAG AND NOT MSVC AND NOT CMAKE_C_COMPILER_ID MATCHES "Intel") set(XSAVEFLAG "-mxsave") endif() set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}") check_c_source_compiles( "#ifdef _MSC_VER # include <intrin.h> + #elif __GNUC__ == 8 && __GNUC_MINOR__ > 1 + # include <xsaveintrin.h> #else - # include <x86gprintrin.h> + # include <immintrin.h> #endif unsigned int f(unsigned int a) { return (int) _xgetbv(a); } int main(void) { return 0; }" diff --git a/3rdparty/zlib-ng/cmake/detect-sanitizer.cmake b/3rdparty/zlib-ng/cmake/detect-sanitizer.cmake new file mode 100644 index 0000000000..f9521ec2f5 --- /dev/null +++ b/3rdparty/zlib-ng/cmake/detect-sanitizer.cmake @@ -0,0 +1,166 @@ +# detect-sanitizer.cmake -- Detect supported compiler sanitizer flags +# Licensed under the Zlib license, see LICENSE.md for details + +macro(add_common_sanitizer_flags) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") + add_compile_options(-g3) + endif() + check_c_compiler_flag(-fno-omit-frame-pointer HAVE_NO_OMIT_FRAME_POINTER) + if(HAVE_NO_OMIT_FRAME_POINTER) + add_compile_options(-fno-omit-frame-pointer) + add_link_options(-fno-omit-frame-pointer) + endif() + check_c_compiler_flag(-fno-optimize-sibling-calls
HAVE_NO_OPTIMIZE_SIBLING_CALLS) + if(HAVE_NO_OPTIMIZE_SIBLING_CALLS) + add_compile_options(-fno-optimize-sibling-calls) + add_link_options(-fno-optimize-sibling-calls) + endif() +endmacro() + +macro(check_sanitizer_support known_checks supported_checks) + set(available_checks "") + + # Build list of supported sanitizer flags by incrementally trying compilation with + # known sanitizer checks + + foreach(check ${known_checks}) + if(available_checks STREQUAL "") + set(compile_checks "${check}") + else() + set(compile_checks "${available_checks},${check}") + endif() + + set(CMAKE_REQUIRED_FLAGS -fsanitize=${compile_checks}) + + check_c_source_compiles("int main() { return 0; }" HAVE_SANITIZER_${check} + FAIL_REGEX "not supported|unrecognized command|unknown option") + + set(CMAKE_REQUIRED_FLAGS) + + if(HAVE_SANITIZER_${check}) + set(available_checks ${compile_checks}) + endif() + endforeach() + + set(${supported_checks} ${available_checks}) +endmacro() + +macro(add_address_sanitizer) + set(known_checks + address + pointer-compare + pointer-subtract + ) + + check_sanitizer_support("${known_checks}" supported_checks) + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Address sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + add_common_sanitizer_flags() + else() + message(STATUS "Address sanitizer is not supported") + endif() + + if(CMAKE_CROSSCOMPILING_EMULATOR) + # Only check for leak sanitizer if not cross-compiling due to qemu crash + message(WARNING "Leak sanitizer is not supported when cross compiling") + else() + # Leak sanitizer requires address sanitizer + check_sanitizer_support("leak" supported_checks) + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Leak sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + add_common_sanitizer_flags() + else() + message(STATUS "Leak sanitizer is not supported") + endif() + endif() +endmacro() + +macro(add_memory_sanitizer) + check_sanitizer_support("memory" supported_checks) + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Memory sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + add_common_sanitizer_flags() + + check_c_compiler_flag(-fsanitize-memory-track-origins HAVE_MEMORY_TRACK_ORIGINS) + if(HAVE_MEMORY_TRACK_ORIGINS) + add_compile_options(-fsanitize-memory-track-origins) + add_link_options(-fsanitize-memory-track-origins) + endif() + else() + message(STATUS "Memory sanitizer is not supported") + endif() +endmacro() + +macro(add_thread_sanitizer) + check_sanitizer_support("thread" supported_checks) + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Thread sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + add_common_sanitizer_flags() + else() + message(STATUS "Thread sanitizer is not supported") + endif() +endmacro() + +macro(add_undefined_sanitizer) + set(known_checks + array-bounds + bool + bounds + builtin + enum + float-cast-overflow + float-divide-by-zero + function + integer-divide-by-zero + local-bounds + null + nonnull-attribute + pointer-overflow + return + returns-nonnull-attribute + shift + shift-base + shift-exponent + signed-integer-overflow + undefined + unsigned-integer-overflow + unsigned-shift-base 
+ vla-bound + vptr + ) + + # Only check for alignment sanitizer flag if unaligned access is not supported + if(NOT WITH_UNALIGNED) + list(APPEND known_checks alignment) + endif() + # Object size sanitizer has no effect at -O0 and produces compiler warning if enabled + if(NOT CMAKE_C_FLAGS MATCHES "-O0") + list(APPEND known_checks object-size) + endif() + + check_sanitizer_support("${known_checks}" supported_checks) + + if(NOT ${supported_checks} STREQUAL "") + message(STATUS "Undefined behavior sanitizer is enabled: ${supported_checks}") + add_compile_options(-fsanitize=${supported_checks}) + add_link_options(-fsanitize=${supported_checks}) + + # Group sanitizer flag -fsanitize=undefined will automatically add alignment, even if + # it is not in our sanitize flag list, so we need to explicitly disable alignment sanitizing. + if(WITH_UNALIGNED) + add_compile_options(-fno-sanitize=alignment) + endif() + + add_common_sanitizer_flags() + else() + message(STATUS "Undefined behavior sanitizer is not supported") + endif() +endmacro() diff --git a/3rdparty/zlib-ng/cpu_features.h b/3rdparty/zlib-ng/cpu_features.h index 00fa6c747c..8708724bc0 100644 --- a/3rdparty/zlib-ng/cpu_features.h +++ b/3rdparty/zlib-ng/cpu_features.h @@ -6,12 +6,10 @@ #ifndef CPU_FEATURES_H_ #define CPU_FEATURES_H_ -#include "adler32_fold.h" -#include "crc32_fold.h" +#ifndef DISABLE_RUNTIME_CPU_DETECTION #if defined(X86_FEATURES) # include "arch/x86/x86_features.h" -# include "fallback_builtins.h" #elif defined(ARM_FEATURES) # include "arch/arm/arm_features.h" #elif defined(PPC_FEATURES) || defined(POWER_FEATURES) @@ -38,266 +36,8 @@ struct cpu_features { #endif }; -extern void cpu_check_features(struct cpu_features *features); +void cpu_check_features(struct cpu_features *features); -/* adler32 */ -typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len); - -extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len); -#ifdef ARM_NEON -extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef PPC_VMX -extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef RISCV_RVV -extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef X86_SSSE3 -extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef X86_AVX2 -extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef X86_AVX512 -extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef X86_AVX512VNNI -extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len); -#endif -#ifdef POWER8_VSX -extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len); -#endif - -/* adler32 folding */ -#ifdef RISCV_RVV -extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif -#ifdef X86_SSE42 -extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif -#ifdef X86_AVX2 -extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif -#ifdef X86_AVX512 -extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif -#ifdef X86_AVX512VNNI -extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -#endif - -/* CRC32 folding */ -#ifdef X86_PCLMULQDQ_CRC -extern 
uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc); -extern void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); -extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); -extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc); -extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); -#endif -#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC) -extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc); -extern void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); -extern void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); -extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc); -extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); -#endif - -/* memory chunking */ -extern uint32_t chunksize_c(void); -extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#ifdef X86_SSE2 -extern uint32_t chunksize_sse2(void); -extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef X86_SSSE3 -extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef X86_AVX2 -extern uint32_t chunksize_avx2(void); -extern uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef ARM_NEON -extern uint32_t chunksize_neon(void); -extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef POWER8_VSX -extern uint32_t chunksize_power8(void); -extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef RISCV_RVV -extern uint32_t chunksize_rvv(void); -extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif - -#ifdef ZLIB_COMPAT -typedef struct z_stream_s z_stream; -#else -typedef struct zng_stream_s zng_stream; -#endif - -/* inflate fast loop */ -extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start); -#ifdef X86_SSE2 -extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef X86_SSSE3 -extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef X86_AVX2 -extern void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef ARM_NEON -extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef POWER8_VSX -extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); -#endif -#ifdef RISCV_RVV -extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start); -#endif - -/* CRC32 */ -typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len); - -extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len); -#ifdef ARM_ACLE -extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len); -#elif defined(POWER8_VSX) -extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len); -#elif defined(S390_CRC32_VX) -extern uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len); -#endif - -/* compare256 */ -typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1); - -extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1); -#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN -extern uint32_t 
compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1); -#ifdef HAVE_BUILTIN_CTZ -extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1); -#endif -#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1); -#endif -#endif -#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); -#endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); -#endif -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); -#endif -#ifdef POWER9 -extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1); -#endif -#ifdef RISCV_RVV -extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1); -#endif - -#ifdef DEFLATE_H_ -/* insert_string */ -extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count); -#ifdef X86_SSE42 -extern void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count); -#elif defined(ARM_ACLE) -extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count); -#endif - -/* longest_match */ -extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match); -#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN -extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match); -#ifdef HAVE_BUILTIN_CTZ -extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match); -#endif -#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match); -#endif -#endif -#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match); -#endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match); -#endif -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match); -#endif -#ifdef POWER9 -extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match); -#endif -#ifdef RISCV_RVV -extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match); -#endif - -/* longest_match_slow */ -extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match); -#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN -extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match); -extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match); -#ifdef UNALIGNED64_OK -extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match); -#endif -#endif -#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match); -#endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match); -#endif -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) -extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match); -#endif -#ifdef POWER9 -extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match); -#endif -#ifdef RISCV_RVV -extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match); 
-#endif - -/* quick_insert_string */ -extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str); -#ifdef X86_SSE42 -extern Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str); -#elif defined(ARM_ACLE) -extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str); -#endif - -/* slide_hash */ -typedef void (*slide_hash_func)(deflate_state *s); - -#ifdef X86_SSE2 -extern void slide_hash_sse2(deflate_state *s); -#endif -#if defined(ARM_SIMD) -extern void slide_hash_armv6(deflate_state *s); -#endif -#if defined(ARM_NEON) -extern void slide_hash_neon(deflate_state *s); -#endif -#if defined(PPC_VMX) -extern void slide_hash_vmx(deflate_state *s); -#endif -#if defined(POWER8_VSX) -extern void slide_hash_power8(deflate_state *s); -#endif -#if defined(RISCV_RVV) -extern void slide_hash_rvv(deflate_state *s); -#endif -#ifdef X86_AVX2 -extern void slide_hash_avx2(deflate_state *s); -#endif - -/* update_hash */ -extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val); -#ifdef X86_SSE42 -extern uint32_t update_hash_sse42(deflate_state *const s, uint32_t h, uint32_t val); -#elif defined(ARM_ACLE) -extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val); -#endif #endif #endif diff --git a/3rdparty/zlib-ng/crc32.c b/3rdparty/zlib-ng/crc32.c new file mode 100644 index 0000000000..54f6ecd420 --- /dev/null +++ b/3rdparty/zlib-ng/crc32.c @@ -0,0 +1,42 @@ +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2022 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * This interleaved implementation of a CRC makes use of pipelined multiple + * arithmetic-logic units, commonly found in modern CPU cores. It is due to + * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. 
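The per-variant extern declarations removed above now live in the per-arch headers included from arch_functions.h, and call sites (such as the new crc32.c whose header comment appears just above) go through FUNCTABLE_CALL instead of reading function pointers from functable directly. A minimal sketch of that kind of lazily populated dispatch table follows; the names and the single-variant setup are hypothetical, not zlib-ng's actual functable, which consults cpu_check_features() and may install SIMD variants:

```c
#include <stdint.h>
#include <stddef.h>

typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len);

struct functable_s { crc32_func crc32; };

static uint32_t crc32_generic(uint32_t crc, const uint8_t *buf, size_t len);
static uint32_t crc32_stub(uint32_t crc, const uint8_t *buf, size_t len);

/* Every slot starts as a stub; the first call swaps in the chosen variant. */
static struct functable_s ft = { crc32_stub };

#define FUNCTABLE_CALL(name) ft.name

/* Bitwise CRC-32 update (pre/post conditioning left to the caller). */
static uint32_t crc32_generic(uint32_t crc, const uint8_t *buf, size_t len) {
    while (len--) {
        crc ^= *buf++;
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0xedb88320u & (0u - (crc & 1u)));
    }
    return crc;
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t *buf, size_t len) {
    /* A real implementation would pick the best variant for this CPU here;
     * this sketch only has the generic one. */
    ft.crc32 = crc32_generic;
    return ft.crc32(crc, buf, len);
}
```

A caller then writes FUNCTABLE_CALL(crc32)(crc, buf, len), matching the style used throughout the updated sources.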
+ */ + +#include "zbuild.h" +#include "functable.h" +#include "crc32_braid_tbl.h" + +/* ========================================================================= */ + +const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) { + return (const uint32_t *)crc_table; +} + +#ifdef ZLIB_COMPAT +unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) { + if (buf == NULL) return 0; + + return (unsigned long)FUNCTABLE_CALL(crc32)((uint32_t)crc, buf, len); +} +#else +uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) { + if (buf == NULL) return 0; + + return FUNCTABLE_CALL(crc32)(crc, buf, len); +} +#endif + +#ifdef ZLIB_COMPAT +unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) { + return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len); +} +#else +uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) { + return PREFIX(crc32_z)(crc, buf, len); +} +#endif diff --git a/3rdparty/zlib-ng/crc32.h b/3rdparty/zlib-ng/crc32.h new file mode 100644 index 0000000000..8c3d7a8a3e --- /dev/null +++ b/3rdparty/zlib-ng/crc32.h @@ -0,0 +1,16 @@ +/* crc32.h -- crc32 folding interface + * Copyright (C) 2021 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#ifndef CRC32_H_ +#define CRC32_H_ + +#define CRC32_FOLD_BUFFER_SIZE (16 * 4) +/* sizeof(__m128i) * (4 folds) */ + +typedef struct crc32_fold_s { + uint8_t fold[CRC32_FOLD_BUFFER_SIZE]; + uint32_t value; +} crc32_fold; + +#endif diff --git a/3rdparty/zlib-ng/crc32_braid_comb.c b/3rdparty/zlib-ng/crc32_braid_comb.c index 75fb474258..f253ae10a2 100644 --- a/3rdparty/zlib-ng/crc32_braid_comb.c +++ b/3rdparty/zlib-ng/crc32_braid_comb.c @@ -7,7 +7,6 @@ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. */ -#include "zbuild.h" #include "zutil.h" #include "crc32_braid_p.h" #include "crc32_braid_tbl.h" diff --git a/3rdparty/zlib-ng/crc32_braid_p.h b/3rdparty/zlib-ng/crc32_braid_p.h index 1d8a07068a..003bf91920 100644 --- a/3rdparty/zlib-ng/crc32_braid_p.h +++ b/3rdparty/zlib-ng/crc32_braid_p.h @@ -1,7 +1,6 @@ #ifndef CRC32_BRAID_P_H_ #define CRC32_BRAID_P_H_ -#include "zbuild.h" #include "zendian.h" /* Define N */ @@ -25,7 +24,7 @@ # endif #else # ifndef W -# if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) +# if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__) # define W 8 # else # define W 4 @@ -42,9 +41,24 @@ # endif #endif +#if BYTE_ORDER == LITTLE_ENDIAN +# define ZSWAPWORD(word) (word) +# define BRAID_TABLE crc_braid_table +#elif BYTE_ORDER == BIG_ENDIAN +# if W == 8 +# define ZSWAPWORD(word) ZSWAP64(word) +# elif W == 4 +# define ZSWAPWORD(word) ZSWAP32(word) +# endif +# define BRAID_TABLE crc_braid_big_table +#else +# error "No endian defined" +#endif + +#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + /* CRC polynomial. 
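The DO1/DO8 macros added to crc32_braid_p.h above are the classic table-driven byte-at-a-time CRC step, unrolled eight-fold. A self-contained equivalent with a locally generated table (in the library the table is crc_table from crc32_braid_tbl.h); make_crc_table() must run once before the first crc32_bytewise() call:

```c
#include <stdint.h>
#include <stddef.h>

static uint32_t crc_table[256];

/* Build the lookup table for the reflected polynomial 0xedb88320. */
static void make_crc_table(void) {
    for (uint32_t n = 0; n < 256; n++) {
        uint32_t c = n;
        for (int k = 0; k < 8; k++)
            c = (c & 1) ? 0xedb88320u ^ (c >> 1) : c >> 1;
        crc_table[n] = c;
    }
}

#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1

uint32_t crc32_bytewise(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t c = crc ^ 0xffffffffu;          /* pre-conditioning */
    while (len >= 8) { DO8; len -= 8; }
    while (len--) { DO1; }
    return c ^ 0xffffffffu;                  /* post-conditioning */
}
```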
*/ #define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */ -extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len); - #endif /* CRC32_BRAID_P_H_ */ diff --git a/3rdparty/zlib-ng/crc32_fold.h b/3rdparty/zlib-ng/crc32_fold.h deleted file mode 100644 index 0d2ff66967..0000000000 --- a/3rdparty/zlib-ng/crc32_fold.h +++ /dev/null @@ -1,21 +0,0 @@ -/* crc32_fold.h -- crc32 folding interface - * Copyright (C) 2021 Nathan Moinvaziri - * For conditions of distribution and use, see copyright notice in zlib.h - */ -#ifndef CRC32_FOLD_H_ -#define CRC32_FOLD_H_ - -#define CRC32_FOLD_BUFFER_SIZE (16 * 4) -/* sizeof(__m128i) * (4 folds) */ - -typedef struct crc32_fold_s { - uint8_t fold[CRC32_FOLD_BUFFER_SIZE]; - uint32_t value; -} crc32_fold; - -Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc); -Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); -Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); -Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc); - -#endif diff --git a/3rdparty/zlib-ng/deflate.c b/3rdparty/zlib-ng/deflate.c index 2a0a20e5d2..66b5506a52 100644 --- a/3rdparty/zlib-ng/deflate.c +++ b/3rdparty/zlib-ng/deflate.c @@ -1,5 +1,5 @@ /* deflate.c -- compress data using the deflation algorithm - * Copyright (C) 1995-2023 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -58,7 +58,7 @@ # undef deflateInit2 #endif -const char PREFIX(deflate_copyright)[] = " deflate 1.3.0 Copyright 1995-2023 Jean-loup Gailly and Mark Adler "; +const char PREFIX(deflate_copyright)[] = " deflate 1.3.1 Copyright 1995-2024 Jean-loup Gailly and Mark Adler "; /* If you use the zlib library in a product, an acknowledgment is welcome in the documentation of your product. If for some reason you cannot @@ -71,14 +71,16 @@ const char PREFIX(deflate_copyright)[] = " deflate 1.3.0 Copyright 1995-2023 Jea */ #ifdef S390_DFLTCC_DEFLATE # include "arch/s390/dfltcc_deflate.h" +/* DFLTCC instructions require window to be page-aligned */ +# define PAD_WINDOW PAD_4096 +# define WINDOW_PAD_SIZE 4096 +# define HINT_ALIGNED_WINDOW HINT_ALIGNED_4096 #else -/* Memory management for the deflate state. Useful for allocating arch-specific extension blocks. */ -# define ZALLOC_DEFLATE_STATE(strm) ((deflate_state *)ZALLOC(strm, 1, sizeof(deflate_state))) -# define ZFREE_STATE(strm, addr) ZFREE(strm, addr) -# define ZCOPY_DEFLATE_STATE(dst, src) memcpy(dst, src, sizeof(deflate_state)) -/* Memory management for the window. Useful for allocation the aligned window. */ -# define ZALLOC_WINDOW(strm, items, size) ZALLOC(strm, items, size) -# define TRY_FREE_WINDOW(strm, addr) TRY_FREE(strm, addr) +# define PAD_WINDOW PAD_64 +# define WINDOW_PAD_SIZE 64 +# define HINT_ALIGNED_WINDOW HINT_ALIGNED_64 +/* Adjust the window size for the arch-specific deflate code. */ +# define DEFLATE_ADJUST_WINDOW_SIZE(n) (n) /* Invoked at the beginning of deflateSetDictionary(). Useful for checking arch-specific window data. */ # define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0) /* Invoked at the beginning of deflateGetDictionary(). Useful for adjusting arch-specific window data. 
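PAD_WINDOW, WINDOW_PAD_SIZE and HINT_ALIGNED_WINDOW above select page (4096-byte) alignment for the DFLTCC build, since its instructions require a page-aligned window, and cache-line (64-byte) alignment otherwise. The macro definitions themselves sit outside this hunk; they presumably reduce to the usual power-of-two round-up arithmetic, roughly:

```c
#include <stdint.h>

/* Assumed shape of the padding helpers (power-of-two alignments only). */
#define PAD_TO(x, align) (((uintptr_t)(x) + ((align) - 1)) & ~(uintptr_t)((align) - 1))
#define PADSZ(x, align)  (PAD_TO(x, align) - (uintptr_t)(x))   /* bytes of padding inserted */

#define PAD_16(x)   PAD_TO(x, 16)
#define PAD_64(x)   PAD_TO(x, 64)
#define PAD_4096(x) PAD_TO(x, 4096)

/* e.g. PAD_64(100) == 128 and PADSZ(100, 64) == 28 */
```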
*/ @@ -120,10 +122,6 @@ static void lm_set_level (deflate_state *s, int level); static void lm_init (deflate_state *s); Z_INTERNAL unsigned read_buf (PREFIX3(stream) *strm, unsigned char *buf, unsigned size); -extern uint32_t update_hash_roll (deflate_state *const s, uint32_t h, uint32_t val); -extern void insert_string_roll (deflate_state *const s, uint32_t str, uint32_t count); -extern Pos quick_insert_string_roll(deflate_state *const s, uint32_t str); - /* =========================================================================== * Local data */ @@ -185,17 +183,111 @@ static const config configuration_table[10] = { memset((unsigned char *)s->head, 0, HASH_SIZE * sizeof(*s->head)); \ } while (0) -/* ========================================================================= */ -/* This function is hidden in ZLIB_COMPAT builds. */ + +#ifdef DEF_ALLOC_DEBUG +# include <stdio.h> +# define LOGSZ(name,size) fprintf(stderr, "%s is %d bytes\n", name, size) +# define LOGSZP(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %d, padded %d\n", name, size, loc, pad) +# define LOGSZPL(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %ld, padded %d\n", name, size, loc, pad) +#else +# define LOGSZ(name,size) +# define LOGSZP(name,size,loc,pad) +# define LOGSZPL(name,size,loc,pad) +#endif + +/* =========================================================================== + * Allocate a big buffer and divide it up into the various buffers deflate needs. + * Handles alignment of allocated buffer and alignment of individual buffers. + */ +Z_INTERNAL deflate_allocs* alloc_deflate(PREFIX3(stream) *strm, int windowBits, int lit_bufsize) { + int curr_size = 0; + + /* Define sizes */ + int window_size = DEFLATE_ADJUST_WINDOW_SIZE((1 << windowBits) * 2); + int prev_size = (1 << windowBits) * sizeof(Pos); + int head_size = HASH_SIZE * sizeof(Pos); + int pending_size = lit_bufsize * LIT_BUFS; + int state_size = sizeof(deflate_state); + int alloc_size = sizeof(deflate_allocs); + + /* Calculate relative buffer positions and paddings */ + LOGSZP("window", window_size, PAD_WINDOW(curr_size), PADSZ(curr_size,WINDOW_PAD_SIZE)); + int window_pos = PAD_WINDOW(curr_size); + curr_size = window_pos + window_size; + + LOGSZP("prev", prev_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int prev_pos = PAD_64(curr_size); + curr_size = prev_pos + prev_size; + + LOGSZP("head", head_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int head_pos = PAD_64(curr_size); + curr_size = head_pos + head_size; + + LOGSZP("pending", pending_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int pending_pos = PAD_64(curr_size); + curr_size = pending_pos + pending_size; + + LOGSZP("state", state_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int state_pos = PAD_64(curr_size); + curr_size = state_pos + state_size; + + LOGSZP("alloc", alloc_size, PAD_16(curr_size), PADSZ(curr_size,16)); + int alloc_pos = PAD_16(curr_size); + curr_size = alloc_pos + alloc_size; + + /* Add 64-1 or 4096-1 to allow window alignment, and round size of buffer up to multiple of 64 */ + int total_size = PAD_64(curr_size + (WINDOW_PAD_SIZE - 1)); + + /* Allocate buffer, align to 64-byte cacheline, and zerofill the resulting buffer */ + char *original_buf = strm->zalloc(strm->opaque, 1, total_size); + if (original_buf == NULL) + return NULL; + + char *buff = (char *)HINT_ALIGNED_WINDOW((char *)PAD_WINDOW(original_buf)); + LOGSZPL("Buffer alloc", total_size, PADSZ((uintptr_t)original_buf,WINDOW_PAD_SIZE), PADSZ(curr_size,WINDOW_PAD_SIZE)); + + /* Initialize alloc_bufs
*/ + deflate_allocs *alloc_bufs = (struct deflate_allocs_s *)(buff + alloc_pos); + alloc_bufs->buf_start = (char *)original_buf; + alloc_bufs->zfree = strm->zfree; + + /* Assign buffers */ + alloc_bufs->window = (unsigned char *)HINT_ALIGNED_WINDOW(buff + window_pos); + alloc_bufs->prev = (Pos *)HINT_ALIGNED_64(buff + prev_pos); + alloc_bufs->head = (Pos *)HINT_ALIGNED_64(buff + head_pos); + alloc_bufs->pending_buf = (unsigned char *)HINT_ALIGNED_64(buff + pending_pos); + alloc_bufs->state = (deflate_state *)HINT_ALIGNED_16(buff + state_pos); + + memset((char *)alloc_bufs->prev, 0, prev_size); + + return alloc_bufs; +} + +/* =========================================================================== + * Free all allocated deflate buffers + */ +static inline void free_deflate(PREFIX3(stream) *strm) { + deflate_state *state = (deflate_state *)strm->state; + + if (state->alloc_bufs != NULL) { + deflate_allocs *alloc_bufs = state->alloc_bufs; + alloc_bufs->zfree(strm->opaque, alloc_bufs->buf_start); + strm->state = NULL; + } +} + +/* =========================================================================== + * Initialize deflate state and buffers. + * This function is hidden in ZLIB_COMPAT builds. + */ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level, int32_t method, int32_t windowBits, int32_t memLevel, int32_t strategy) { /* Todo: ignore strm->next_in if we use it as window */ - uint32_t window_padding = 0; deflate_state *s; int wrap = 1; - /* Force initialization functable, because deflate captures function pointers from functable. */ - functable.force_init(); + /* Initialize functable */ + FUNCTABLE_INIT; if (strm == NULL) return Z_STREAM_ERROR; @@ -230,9 +322,19 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level if (windowBits == 8) windowBits = 9; /* until 256-byte window bug fixed */ - s = ZALLOC_DEFLATE_STATE(strm); - if (s == NULL) + /* Allocate buffers */ + int lit_bufsize = 1 << (memLevel + 6); + deflate_allocs *alloc_bufs = alloc_deflate(strm, windowBits, lit_bufsize); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + + s = alloc_bufs->state; + s->alloc_bufs = alloc_bufs; + s->window = alloc_bufs->window; + s->prev = alloc_bufs->prev; + s->head = alloc_bufs->head; + s->pending_buf = alloc_bufs->pending_buf; + strm->state = (struct internal_state *)s; s->strm = strm; s->status = INIT_STATE; /* to pass state test in deflateReset() */ @@ -243,18 +345,9 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level s->w_size = 1 << s->w_bits; s->w_mask = s->w_size - 1; -#ifdef X86_PCLMULQDQ_CRC - window_padding = 8; -#endif - - s->window = (unsigned char *) ZALLOC_WINDOW(strm, s->w_size + window_padding, 2*sizeof(unsigned char)); - s->prev = (Pos *) ZALLOC(strm, s->w_size, sizeof(Pos)); - memset(s->prev, 0, s->w_size * sizeof(Pos)); - s->head = (Pos *) ZALLOC(strm, HASH_SIZE, sizeof(Pos)); - s->high_water = 0; /* nothing written to s->window yet */ - s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ + s->lit_bufsize = lit_bufsize; /* 16K elements by default */ /* We overlay pending_buf and sym_buf. This works since the average size * for length/distance pairs over any compressed block is assured to be 31 @@ -295,7 +388,6 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level * symbols from which it is being constructed. 
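alloc_deflate above collapses what used to be several separate zalloc calls into one arena: offsets are laid out first with the padding macros, a single block is allocated, the base is aligned once, and each sub-buffer is carved out at its precomputed offset; free_deflate then releases everything through one zfree. A compressed sketch of the carve-out pattern, using plain malloc, two buffers and hypothetical names:

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define PAD_TO(x, a) (((uintptr_t)(x) + ((a) - 1)) & ~(uintptr_t)((a) - 1))

typedef struct {
    void *buf_start;        /* what we eventually free() */
    unsigned char *window;  /* 64-byte aligned */
    uint16_t *prev;         /* 64-byte aligned */
} arena_t;

static int arena_alloc(arena_t *a, size_t window_size, size_t prev_count) {
    /* Lay out offsets relative to an imaginary base of 0. */
    size_t window_pos = PAD_TO(0, 64);
    size_t prev_pos   = PAD_TO(window_pos + window_size, 64);
    /* Extra 63 bytes of slack so the base itself can be aligned. */
    size_t total      = prev_pos + prev_count * sizeof(uint16_t) + 63;

    char *raw = malloc(total);
    if (raw == NULL)
        return -1;

    char *base = (char *)PAD_TO(raw, 64);   /* align the whole block once */
    a->buf_start = raw;
    a->window = (unsigned char *)(base + window_pos);
    a->prev   = (uint16_t *)(base + prev_pos);
    memset(a->prev, 0, prev_count * sizeof(uint16_t));
    return 0;
}
```

Freeing mirrors free_deflate: a single free(a->buf_start) releases every sub-buffer at once.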
*/ - s->pending_buf = (unsigned char *) ZALLOC(strm, s->lit_bufsize, 4); s->pending_buf_size = s->lit_bufsize * 4; if (s->window == NULL || s->prev == NULL || s->head == NULL || s->pending_buf == NULL) { @@ -304,8 +396,15 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level PREFIX(deflateEnd)(strm); return Z_MEM_ERROR; } + +#ifdef LIT_MEM + s->d_buf = (uint16_t *)(s->pending_buf + (s->lit_bufsize << 1)); + s->l_buf = s->pending_buf + (s->lit_bufsize << 2); + s->sym_end = s->lit_bufsize - 1; +#else s->sym_buf = s->pending_buf + s->lit_bufsize; s->sym_end = (s->lit_bufsize - 1) * 3; +#endif /* We avoid equality with lit_bufsize*3 because of wraparound at 64K * on 16 bit machines and because stored blocks are restricted to * 64K-1 bytes. @@ -348,7 +447,7 @@ static int deflateStateCheck(PREFIX3(stream) *strm) { if (strm == NULL || strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0) return 1; s = strm->state; - if (s == NULL || s->strm != strm || (s->status < INIT_STATE || s->status > MAX_STATE)) + if (s == NULL || s->alloc_bufs == NULL || s->strm != strm || (s->status < INIT_STATE || s->status > MAX_STATE)) return 1; return 0; } @@ -370,7 +469,7 @@ int32_t Z_EXPORT PREFIX(deflateSetDictionary)(PREFIX3(stream) *strm, const uint8 /* when using zlib wrappers, compute Adler-32 for provided dictionary */ if (wrap == 1) - strm->adler = functable.adler32(strm->adler, dictionary, dictLength); + strm->adler = FUNCTABLE_CALL(adler32)(strm->adler, dictionary, dictLength); DEFLATE_SET_DICTIONARY_HOOK(strm, dictionary, dictLength); /* hook for IBM Z DFLTCC */ s->wrap = 0; /* avoid computing Adler-32 in read_buf */ @@ -457,7 +556,7 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) { #ifdef GZIP if (s->wrap == 2) { - strm->adler = functable.crc32_fold_reset(&s->crc_fold); + strm->adler = FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold); } else #endif strm->adler = ADLER32_INITIAL_VALUE; @@ -506,9 +605,17 @@ int32_t Z_EXPORT PREFIX(deflatePrime)(PREFIX3(stream) *strm, int32_t bits, int32 if (deflateStateCheck(strm)) return Z_STREAM_ERROR; s = strm->state; + +#ifdef LIT_MEM + if (bits < 0 || bits > BIT_BUF_SIZE || + (unsigned char *)s->d_buf < s->pending_out + ((BIT_BUF_SIZE + 7) >> 3)) + return Z_BUF_ERROR; +#else if (bits < 0 || bits > BIT_BUF_SIZE || bits > (int32_t)(sizeof(value) << 3) || s->sym_buf < s->pending_out + ((BIT_BUF_SIZE + 7) >> 3)) return Z_BUF_ERROR; +#endif + do { put = BIT_BUF_SIZE - s->bi_valid; put = MIN(put, bits); @@ -555,7 +662,7 @@ int32_t Z_EXPORT PREFIX(deflateParams)(PREFIX3(stream) *strm, int32_t level, int if (s->level != level) { if (s->level == 0 && s->matches != 0) { if (s->matches == 1) { - functable.slide_hash(s); + FUNCTABLE_CALL(slide_hash)(s); } else { CLEAR_HASH(s); } @@ -794,7 +901,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { #ifdef GZIP if (s->status == GZIP_STATE) { /* gzip header */ - functable.crc32_fold_reset(&s->crc_fold); + FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold); put_byte(s, 31); put_byte(s, 139); put_byte(s, 8); @@ -911,7 +1018,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { } } put_short(s, (uint16_t)strm->adler); - functable.crc32_fold_reset(&s->crc_fold); + FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold); } s->status = BUSY_STATE; @@ -982,7 +1089,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { /* Write the trailer */ #ifdef GZIP if (s->wrap == 2) { - strm->adler = functable.crc32_fold_final(&s->crc_fold); 
+ strm->adler = FUNCTABLE_CALL(crc32_fold_final)(&s->crc_fold); put_uint32(s, strm->adler); put_uint32(s, (uint32_t)strm->total_in); @@ -1007,21 +1114,13 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { /* ========================================================================= */ int32_t Z_EXPORT PREFIX(deflateEnd)(PREFIX3(stream) *strm) { - int32_t status; - if (deflateStateCheck(strm)) return Z_STREAM_ERROR; - status = strm->state->status; + int32_t status = strm->state->status; - /* Deallocate in reverse order of allocations: */ - TRY_FREE(strm, strm->state->pending_buf); - TRY_FREE(strm, strm->state->head); - TRY_FREE(strm, strm->state->prev); - TRY_FREE_WINDOW(strm, strm->state->window); - - ZFREE_STATE(strm, strm->state); - strm->state = NULL; + /* Free allocated buffers */ + free_deflate(strm); return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; } @@ -1032,7 +1131,6 @@ int32_t Z_EXPORT PREFIX(deflateEnd)(PREFIX3(stream) *strm) { int32_t Z_EXPORT PREFIX(deflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *source) { deflate_state *ds; deflate_state *ss; - uint32_t window_padding = 0; if (deflateStateCheck(source) || dest == NULL) return Z_STREAM_ERROR; @@ -1041,34 +1139,39 @@ int32_t Z_EXPORT PREFIX(deflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *sou memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream))); - ds = ZALLOC_DEFLATE_STATE(dest); - if (ds == NULL) + deflate_allocs *alloc_bufs = alloc_deflate(dest, ss->w_bits, ss->lit_bufsize); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + + ds = alloc_bufs->state; + dest->state = (struct internal_state *) ds; - ZCOPY_DEFLATE_STATE(ds, ss); + memcpy(ds, ss, sizeof(deflate_state)); ds->strm = dest; -#ifdef X86_PCLMULQDQ_CRC - window_padding = 8; -#endif - - ds->window = (unsigned char *) ZALLOC_WINDOW(dest, ds->w_size + window_padding, 2*sizeof(unsigned char)); - ds->prev = (Pos *) ZALLOC(dest, ds->w_size, sizeof(Pos)); - ds->head = (Pos *) ZALLOC(dest, HASH_SIZE, sizeof(Pos)); - ds->pending_buf = (unsigned char *) ZALLOC(dest, ds->lit_bufsize, 4); + ds->alloc_bufs = alloc_bufs; + ds->window = alloc_bufs->window; + ds->prev = alloc_bufs->prev; + ds->head = alloc_bufs->head; + ds->pending_buf = alloc_bufs->pending_buf; if (ds->window == NULL || ds->prev == NULL || ds->head == NULL || ds->pending_buf == NULL) { PREFIX(deflateEnd)(dest); return Z_MEM_ERROR; } - memcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(unsigned char)); + memcpy(ds->window, ss->window, DEFLATE_ADJUST_WINDOW_SIZE(ds->w_size * 2 * sizeof(unsigned char))); memcpy((void *)ds->prev, (void *)ss->prev, ds->w_size * sizeof(Pos)); memcpy((void *)ds->head, (void *)ss->head, HASH_SIZE * sizeof(Pos)); - memcpy(ds->pending_buf, ss->pending_buf, ds->pending_buf_size); + memcpy(ds->pending_buf, ss->pending_buf, ds->lit_bufsize * LIT_BUFS); ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); +#ifdef LIT_MEM + ds->d_buf = (uint16_t *)(ds->pending_buf + (ds->lit_bufsize << 1)); + ds->l_buf = ds->pending_buf + (ds->lit_bufsize << 2); +#else ds->sym_buf = ds->pending_buf + ds->lit_bufsize; +#endif ds->l_desc.dyn_tree = ds->dyn_ltree; ds->d_desc.dyn_tree = ds->dyn_dtree; @@ -1095,10 +1198,10 @@ Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf, memcpy(buf, strm->next_in, len); #ifdef GZIP } else if (strm->state->wrap == 2) { - functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len); + FUNCTABLE_CALL(crc32_fold_copy)(&strm->state->crc_fold, buf, strm->next_in, len); #endif } else 
if (strm->state->wrap == 1) { - strm->adler = functable.adler32_fold_copy(strm->adler, buf, strm->next_in, len); + strm->adler = FUNCTABLE_CALL(adler32_fold_copy)(strm->adler, buf, strm->next_in, len); } else { memcpy(buf, strm->next_in, len); } @@ -1125,9 +1228,9 @@ static void lm_set_level(deflate_state *s, int level) { s->insert_string = &insert_string_roll; s->quick_insert_string = &quick_insert_string_roll; } else { - s->update_hash = functable.update_hash; - s->insert_string = functable.insert_string; - s->quick_insert_string = functable.quick_insert_string; + s->update_hash = update_hash; + s->insert_string = insert_string; + s->quick_insert_string = quick_insert_string; } s->level = level; @@ -1191,7 +1294,7 @@ void Z_INTERNAL PREFIX(fill_window)(deflate_state *s) { s->block_start -= (int)wsize; if (s->insert > s->strstart) s->insert = s->strstart; - functable.slide_hash(s); + FUNCTABLE_CALL(slide_hash)(s); more += wsize; } if (s->strm->avail_in == 0) @@ -1217,7 +1320,7 @@ void Z_INTERNAL PREFIX(fill_window)(deflate_state *s) { if (s->lookahead + s->insert >= STD_MIN_MATCH) { unsigned int str = s->strstart - s->insert; if (UNLIKELY(s->max_chain_length > 1024)) { - s->ins_h = s->update_hash(s, s->window[str], s->window[str+1]); + s->ins_h = s->update_hash(s->window[str], s->window[str+1]); } else if (str >= 1) { s->quick_insert_string(s, str + 2 - STD_MIN_MATCH); } diff --git a/3rdparty/zlib-ng/deflate.h b/3rdparty/zlib-ng/deflate.h index 8001b47c99..e122ae1aad 100644 --- a/3rdparty/zlib-ng/deflate.h +++ b/3rdparty/zlib-ng/deflate.h @@ -12,8 +12,12 @@ #include "zutil.h" #include "zendian.h" -#include "adler32_fold.h" -#include "crc32_fold.h" +#include "crc32.h" + +#ifdef S390_DFLTCC_DEFLATE +# include "arch/s390/dfltcc_common.h" +# define HAVE_ARCH_DEFLATE_STATE +#endif /* define NO_GZIP when compiling if you want to disable gzip header and trailer creation by deflate(). NO_GZIP would be used to avoid linking in @@ -23,6 +27,12 @@ # define GZIP #endif +/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at + the cost of a larger memory footprint */ +#ifndef NO_LIT_MEM +# define LIT_MEM +#endif + /* =========================================================================== * Internal compression state. 
*/ @@ -108,11 +118,30 @@ typedef uint16_t Pos; /* Type definitions for hash callbacks */ typedef struct internal_state deflate_state; -typedef uint32_t (* update_hash_cb) (deflate_state *const s, uint32_t h, uint32_t val); +typedef uint32_t (* update_hash_cb) (uint32_t h, uint32_t val); typedef void (* insert_string_cb) (deflate_state *const s, uint32_t str, uint32_t count); typedef Pos (* quick_insert_string_cb)(deflate_state *const s, uint32_t str); -struct internal_state { +uint32_t update_hash (uint32_t h, uint32_t val); +void insert_string (deflate_state *const s, uint32_t str, uint32_t count); +Pos quick_insert_string (deflate_state *const s, uint32_t str); + +uint32_t update_hash_roll (uint32_t h, uint32_t val); +void insert_string_roll (deflate_state *const s, uint32_t str, uint32_t count); +Pos quick_insert_string_roll(deflate_state *const s, uint32_t str); + +/* Struct for memory allocation handling */ +typedef struct deflate_allocs_s { + char *buf_start; + free_func zfree; + deflate_state *state; + unsigned char *window; + unsigned char *pending_buf; + Pos *prev; + Pos *head; +} deflate_allocs; + +struct ALIGNED_(64) internal_state { PREFIX3(stream) *strm; /* pointer back to this zlib stream */ unsigned char *pending_buf; /* output still pending */ unsigned char *pending_out; /* next pending byte to output to the stream */ @@ -260,8 +289,16 @@ struct internal_state { * - I can't count above 4 */ +#ifdef LIT_MEM +# define LIT_BUFS 5 + uint16_t *d_buf; /* buffer for distances */ + unsigned char *l_buf; /* buffer for literals/lengths */ +#else +# define LIT_BUFS 4 unsigned char *sym_buf; /* buffer for distances and literals/lengths */ - unsigned int sym_next; /* running index in sym_buf */ +#endif + + unsigned int sym_next; /* running index in symbol buffer */ unsigned int sym_end; /* symbol table full when sym_next reaches this */ unsigned long opt_len; /* bit length of current block with optimal trees */ @@ -273,8 +310,11 @@ struct internal_state { unsigned long compressed_len; /* total bit length of compressed file mod 2^32 */ unsigned long bits_sent; /* bit length of compressed data sent mod 2^32 */ - /* Reserved for future use and alignment purposes */ - char *reserved_p; + deflate_allocs *alloc_bufs; + +#ifdef HAVE_ARCH_DEFLATE_STATE + arch_deflate_state arch; /* architecture-specific extensions */ +#endif uint64_t bi_buf; /* Output buffer. bits are inserted starting at the bottom (least significant bits). 
*/ @@ -284,7 +324,7 @@ struct internal_state { /* Reserved for future use and alignment purposes */ int32_t reserved[11]; -} ALIGNED_(8); +}; typedef enum { need_more, /* block not completed, need more input or more output */ diff --git a/3rdparty/zlib-ng/deflate_fast.c b/3rdparty/zlib-ng/deflate_fast.c index 3184aa718c..2d0444cd73 100644 --- a/3rdparty/zlib-ng/deflate_fast.c +++ b/3rdparty/zlib-ng/deflate_fast.c @@ -1,6 +1,6 @@ /* deflate_fast.c -- compress data using the fast strategy of deflation algorithm * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -41,7 +41,7 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) { * dictionary, and set hash_head to the head of the hash chain: */ if (s->lookahead >= WANT_MIN_MATCH) { - hash_head = functable.quick_insert_string(s, s->strstart); + hash_head = quick_insert_string(s, s->strstart); dist = (int64_t)s->strstart - hash_head; /* Find the longest match, discarding those <= prev_length. @@ -52,7 +52,7 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) { * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ - match_len = functable.longest_match(s, hash_head); + match_len = FUNCTABLE_CALL(longest_match)(s, hash_head); /* longest_match() sets match_start */ } } @@ -71,11 +71,11 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) { match_len--; /* string at strstart already in table */ s->strstart++; - functable.insert_string(s, s->strstart, match_len); + insert_string(s, s->strstart, match_len); s->strstart += match_len; } else { s->strstart += match_len; - functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH); + quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH); /* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not * matter since it will be recomputed at next deflate call. 
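The LIT_MEM changes above trade a little memory for speed: the packed sym_buf, which spends three bytes per symbol (two distance bytes plus one literal/length byte), is replaced by two parallel arrays carved out of pending_buf, a uint16_t distance array at byte offset 2*lit_bufsize and a one-byte literal/length array at 4*lit_bufsize, which is why LIT_BUFS grows from 4 to 5. Below is a minimal standalone sketch of that carving, assuming lit_bufsize is a power of two (so the uint16_t array stays aligned); mini_state, mini_init and mini_tally are illustrative names, not zlib-ng API.

#include <stdint.h>
#include <stdlib.h>

#define LIT_BUFS 5                /* pending_buf bytes per lit_bufsize slot with LIT_MEM */

typedef struct {
    unsigned char *pending_buf;   /* one block: pending output + d_buf + l_buf */
    uint16_t *d_buf;              /* 16-bit distance per symbol, 0 for a literal */
    unsigned char *l_buf;         /* literal byte, or match length - STD_MIN_MATCH */
    unsigned int sym_next;        /* one shared running index for both arrays */
    unsigned int sym_end;
    unsigned int lit_bufsize;
} mini_state;

static int mini_init(mini_state *s, unsigned int lit_bufsize) {
    s->lit_bufsize = lit_bufsize;
    s->pending_buf = (unsigned char *)malloc((size_t)lit_bufsize * LIT_BUFS);
    if (s->pending_buf == NULL)
        return -1;
    /* Same carving as the deflateCopy() hunk earlier in this patch. */
    s->d_buf = (uint16_t *)(s->pending_buf + ((size_t)lit_bufsize << 1));
    s->l_buf = s->pending_buf + ((size_t)lit_bufsize << 2);
    s->sym_next = 0;
    s->sym_end = lit_bufsize - 1; /* LIT_MEM-style limit: one slot per symbol */
    return 0;
}

/* One symbol now costs one store per array instead of three packed byte stores. */
static int mini_tally(mini_state *s, uint16_t dist, unsigned char lc) {
    s->d_buf[s->sym_next] = dist;
    s->l_buf[s->sym_next++] = lc;
    return s->sym_next == s->sym_end; /* nonzero: block is full, flush it */
}

The zng_tr_tally_lit/zng_tr_tally_dist hunks in deflate_p.h later in this patch follow exactly this shape.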
diff --git a/3rdparty/zlib-ng/deflate_huff.c b/3rdparty/zlib-ng/deflate_huff.c index b197e24d7c..d5a234b114 100644 --- a/3rdparty/zlib-ng/deflate_huff.c +++ b/3rdparty/zlib-ng/deflate_huff.c @@ -1,6 +1,6 @@ /* deflate_huff.c -- compress data using huffman encoding only strategy * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git a/3rdparty/zlib-ng/deflate_medium.c b/3rdparty/zlib-ng/deflate_medium.c index 47796e3221..2aeebe2026 100644 --- a/3rdparty/zlib-ng/deflate_medium.c +++ b/3rdparty/zlib-ng/deflate_medium.c @@ -45,16 +45,18 @@ static void insert_match(deflate_state *s, struct match match) { if (UNLIKELY(s->lookahead <= (unsigned int)(match.match_length + WANT_MIN_MATCH))) return; + /* string at strstart already in table */ + match.strstart++; + match.match_length--; + /* matches that are not long enough we need to emit as literals */ - if (LIKELY(match.match_length < WANT_MIN_MATCH)) { - match.strstart++; - match.match_length--; + if (LIKELY(match.match_length < WANT_MIN_MATCH - 1)) { if (UNLIKELY(match.match_length > 0)) { if (match.strstart >= match.orgstart) { if (match.strstart + match.match_length - 1 >= match.orgstart) { - functable.insert_string(s, match.strstart, match.match_length); + insert_string(s, match.strstart, match.match_length); } else { - functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1); + insert_string(s, match.strstart, match.orgstart - match.strstart + 1); } match.strstart += match.match_length; match.match_length = 0; @@ -63,35 +65,18 @@ static void insert_match(deflate_state *s, struct match match) { return; } - /* Insert new strings in the hash table only if the match length - * is not too large. This saves time but degrades compression. - */ - if (match.match_length <= 16 * s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) { - match.match_length--; /* string at strstart already in table */ - match.strstart++; - - if (LIKELY(match.strstart >= match.orgstart)) { - if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) { - functable.insert_string(s, match.strstart, match.match_length); - } else { - functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1); - } - } else if (match.orgstart < match.strstart + match.match_length) { - functable.insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart); + /* Insert into hash table. */ + if (LIKELY(match.strstart >= match.orgstart)) { + if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) { + insert_string(s, match.strstart, match.match_length); + } else { + insert_string(s, match.strstart, match.orgstart - match.strstart + 1); } - match.strstart += match.match_length; - match.match_length = 0; - } else { - match.strstart += match.match_length; - match.match_length = 0; - - if (match.strstart >= (STD_MIN_MATCH - 2)) - functable.quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH); - - /* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not - * matter since it will be recomputed at next deflate call. 
- */ + } else if (match.orgstart < match.strstart + match.match_length) { + insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart); } + match.strstart += match.match_length; + match.match_length = 0; } static void fizzle_matches(deflate_state *s, struct match *current, struct match *next) { @@ -199,7 +184,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { } else { hash_head = 0; if (s->lookahead >= WANT_MIN_MATCH) { - hash_head = functable.quick_insert_string(s, s->strstart); + hash_head = quick_insert_string(s, s->strstart); } current_match.strstart = (uint16_t)s->strstart; @@ -215,7 +200,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ - current_match.match_length = (uint16_t)functable.longest_match(s, hash_head); + current_match.match_length = (uint16_t)FUNCTABLE_CALL(longest_match)(s, hash_head); current_match.match_start = (uint16_t)s->match_start; if (UNLIKELY(current_match.match_length < WANT_MIN_MATCH)) current_match.match_length = 1; @@ -235,7 +220,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { /* now, look ahead one */ if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) { s->strstart = current_match.strstart + current_match.match_length; - hash_head = functable.quick_insert_string(s, s->strstart); + hash_head = quick_insert_string(s, s->strstart); next_match.strstart = (uint16_t)s->strstart; next_match.orgstart = next_match.strstart; @@ -250,7 +235,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). 
*/ - next_match.match_length = (uint16_t)functable.longest_match(s, hash_head); + next_match.match_length = (uint16_t)FUNCTABLE_CALL(longest_match)(s, hash_head); next_match.match_start = (uint16_t)s->match_start; if (UNLIKELY(next_match.match_start >= next_match.strstart)) { /* this can happen due to some restarts */ diff --git a/3rdparty/zlib-ng/deflate_p.h b/3rdparty/zlib-ng/deflate_p.h index dd2021a0f5..7c74ebf5ad 100644 --- a/3rdparty/zlib-ng/deflate_p.h +++ b/3rdparty/zlib-ng/deflate_p.h @@ -1,7 +1,7 @@ /* deflate_p.h -- Private inline functions and macros shared with more than * one deflate method * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * */ @@ -60,27 +60,37 @@ extern const unsigned char Z_INTERNAL zng_dist_code[]; static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) { /* c is the unmatched char */ +#ifdef LIT_MEM + s->d_buf[s->sym_next] = 0; + s->l_buf[s->sym_next++] = c; +#else s->sym_buf[s->sym_next++] = 0; s->sym_buf[s->sym_next++] = 0; s->sym_buf[s->sym_next++] = c; +#endif s->dyn_ltree[c].Freq++; Tracevv((stderr, "%c", c)); Assert(c <= (STD_MAX_MATCH-STD_MIN_MATCH), "zng_tr_tally: bad literal"); return (s->sym_next == s->sym_end); } -static inline int zng_tr_tally_dist(deflate_state *s, uint32_t dist, uint32_t len) { +static inline int zng_tr_tally_dist(deflate_state* s, uint32_t dist, uint32_t len) { /* dist: distance of matched string */ /* len: match length-STD_MIN_MATCH */ +#ifdef LIT_MEM + s->d_buf[s->sym_next] = dist; + s->l_buf[s->sym_next++] = len; +#else s->sym_buf[s->sym_next++] = (uint8_t)(dist); s->sym_buf[s->sym_next++] = (uint8_t)(dist >> 8); s->sym_buf[s->sym_next++] = (uint8_t)len; +#endif s->matches++; dist--; Assert(dist < MAX_DIST(s) && (uint16_t)d_code(dist) < (uint16_t)D_CODES, "zng_tr_tally: bad match"); - s->dyn_ltree[zng_length_code[len]+LITERALS+1].Freq++; + s->dyn_ltree[zng_length_code[len] + LITERALS + 1].Freq++; s->dyn_dtree[d_code(dist)].Freq++; return (s->sym_next == s->sym_end); } diff --git a/3rdparty/zlib-ng/deflate_quick.c b/3rdparty/zlib-ng/deflate_quick.c index df5a17b9e6..5a1937b679 100644 --- a/3rdparty/zlib-ng/deflate_quick.c +++ b/3rdparty/zlib-ng/deflate_quick.c @@ -86,7 +86,7 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) { } if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) { - hash_head = functable.quick_insert_string(s, s->strstart); + hash_head = quick_insert_string(s, s->strstart); dist = (int64_t)s->strstart - hash_head; if (dist <= MAX_DIST(s) && dist > 0) { @@ -94,7 +94,7 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) { const uint8_t *match_start = s->window + hash_head; if (zng_memcmp_2(str_start, match_start) == 0) { - match_len = functable.compare256(str_start+2, match_start+2) + 2; + match_len = FUNCTABLE_CALL(compare256)(str_start+2, match_start+2) + 2; if (match_len >= WANT_MIN_MATCH) { if (UNLIKELY(match_len > s->lookahead)) diff --git a/3rdparty/zlib-ng/deflate_rle.c b/3rdparty/zlib-ng/deflate_rle.c index cd08509946..ee442141be 100644 --- a/3rdparty/zlib-ng/deflate_rle.c +++ b/3rdparty/zlib-ng/deflate_rle.c @@ -1,6 +1,6 @@ /* deflate_rle.c -- compress data using RLE strategy of deflation algorithm * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git 
a/3rdparty/zlib-ng/deflate_slow.c b/3rdparty/zlib-ng/deflate_slow.c index 9f1c913467..de70cc1bba 100644 --- a/3rdparty/zlib-ng/deflate_slow.c +++ b/3rdparty/zlib-ng/deflate_slow.c @@ -1,6 +1,6 @@ /* deflate_slow.c -- compress data using the slow strategy of deflation algorithm * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -19,12 +19,12 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) { int bflush; /* set if current block must be flushed */ int64_t dist; uint32_t match_len; - match_func *longest_match; + match_func longest_match; if (s->max_chain_length <= 1024) - longest_match = &functable.longest_match; + longest_match = FUNCTABLE_FPTR(longest_match); else - longest_match = &functable.longest_match_slow; + longest_match = FUNCTABLE_FPTR(longest_match_slow); /* Process the input block. */ for (;;) { @@ -61,7 +61,7 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) { * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ - match_len = (*longest_match)(s, hash_head); + match_len = longest_match(s, hash_head); /* longest_match() sets match_start */ if (match_len <= 5 && (s->strategy == Z_FILTERED)) { @@ -129,7 +129,7 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) { } Assert(flush != Z_NO_FLUSH, "no flush?"); if (UNLIKELY(s->match_available)) { - (void) zng_tr_tally_lit(s, s->window[s->strstart-1]); + Z_UNUSED(zng_tr_tally_lit(s, s->window[s->strstart-1])); s->match_available = 0; } s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1); diff --git a/3rdparty/zlib-ng/deflate_stored.c b/3rdparty/zlib-ng/deflate_stored.c index 6160896b3f..9e5acfbf96 100644 --- a/3rdparty/zlib-ng/deflate_stored.c +++ b/3rdparty/zlib-ng/deflate_stored.c @@ -1,6 +1,6 @@ /* deflate_stored.c -- store data without compression using deflation algorithm * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -22,7 +22,7 @@ * * deflate_stored() is written to minimize the number of times an input byte is * copied. It is most efficient with large input and output buffers, which - * maximizes the opportunites to have a single copy from next_in to next_out. + * maximizes the opportunities to have a single copy from next_in to next_out. */ Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush) { /* Smallest worthy block size when not flushing or finishing. By default diff --git a/3rdparty/zlib-ng/fallback_builtins.h b/3rdparty/zlib-ng/fallback_builtins.h index 79072a1028..8303508fa1 100644 --- a/3rdparty/zlib-ng/fallback_builtins.h +++ b/3rdparty/zlib-ng/fallback_builtins.h @@ -5,9 +5,6 @@ #if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) #include <intrin.h> -#ifdef X86_FEATURES -# include "arch/x86/x86_features.h" -#endif /* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0. * Because of that assumption trailing_zero is not initialized and the return value is not checked.
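deflate_slow() above selects its matcher through FUNCTABLE_FPTR instead of taking the address of a functable member, and the deflate_fast/medium/quick hunks call quick_insert_string() and insert_string() directly now that hashing is no longer dispatched at runtime. The functable diff that follows keeps the lazy-init pattern only for the remaining entries: each slot starts out pointing at a stub that runs init_functable() once and forwards, while DISABLE_RUNTIME_CPU_DETECTION builds bypass the table via the native_* macros. A condensed single-entry sketch of that pattern, with hypothetical names (ft, init_ft, checksum) and a plain-C fallback standing in for the arch-specific variants:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*adler32_fn)(uint32_t adler, const unsigned char *buf, size_t len);

static uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len);

/* The table initially routes every call through the stub. */
static struct { adler32_fn adler32; } ft = { adler32_stub };

/* Simplified plain-C fallback (the real one defers the modulo). */
static uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t a = adler & 0xffff, b = (adler >> 16) & 0xffff;
    while (len--) {
        a = (a + *buf++) % 65521;
        b = (b + a) % 65521;
    }
    return (b << 16) | a;
}

/* One-time setup: CPU feature detection would pick SSE/NEON/etc. here. */
static void init_ft(void) {
    ft.adler32 = adler32_c;
}

/* First call lands here, fills the table, then forwards to the winner. */
static uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    init_ft();
    return ft.adler32(adler, buf, len);
}

/* Every later call dispatches straight to the chosen variant; ft.adler32 is
 * what FUNCTABLE_CALL(adler32) expands to when runtime detection is on. */
static uint32_t checksum(const unsigned char *buf, size_t len) {
    return ft.adler32(1, buf, len);
}

The real init_functable() additionally issues FUNCTABLE_BARRIER() after the assignments so weakly ordered CPUs never observe a half-written table; the sketch omits that.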
diff --git a/3rdparty/zlib-ng/functable.c b/3rdparty/zlib-ng/functable.c index 37c4aeef7d..495d11edd2 100644 --- a/3rdparty/zlib-ng/functable.c +++ b/3rdparty/zlib-ng/functable.c @@ -2,14 +2,12 @@ * Copyright (C) 2017 Hans Kristian Rosbach * For conditions of distribution and use, see copyright notice in zlib.h */ +#ifndef DISABLE_RUNTIME_CPU_DETECTION #include "zbuild.h" -#include "zendian.h" -#include "crc32_braid_p.h" -#include "deflate.h" -#include "deflate_p.h" #include "functable.h" #include "cpu_features.h" +#include "arch_functions.h" #if defined(_MSC_VER) # include <intrin.h> @@ -61,31 +59,10 @@ static void init_functable(void) { ft.crc32_fold_final = &crc32_fold_final_c; ft.crc32_fold_reset = &crc32_fold_reset_c; ft.inflate_fast = &inflate_fast_c; - ft.insert_string = &insert_string_c; - ft.quick_insert_string = &quick_insert_string_c; ft.slide_hash = &slide_hash_c; - ft.update_hash = &update_hash_c; - -#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN -# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) - ft.longest_match = &longest_match_unaligned_64; - ft.longest_match_slow = &longest_match_slow_unaligned_64; - ft.compare256 = &compare256_unaligned_64; -# elif defined(HAVE_BUILTIN_CTZ) - ft.longest_match = &longest_match_unaligned_32; - ft.longest_match_slow = &longest_match_slow_unaligned_32; - ft.compare256 = &compare256_unaligned_32; -# else - ft.longest_match = &longest_match_unaligned_16; - ft.longest_match_slow = &longest_match_slow_unaligned_16; - ft.compare256 = &compare256_unaligned_16; -# endif -#else - ft.longest_match = &longest_match_c; - ft.longest_match_slow = &longest_match_slow_c; - ft.compare256 = &compare256_c; -#endif- + ft.longest_match = &longest_match_generic; + ft.longest_match_slow = &longest_match_slow_generic; + ft.compare256 = &compare256_generic; // Select arch-optimized functions @@ -110,19 +87,14 @@ static void init_functable(void) { #ifdef X86_SSSE3 if (cf.x86.has_ssse3) { ft.adler32 = &adler32_ssse3; -# ifdef X86_SSE2 ft.chunkmemset_safe = &chunkmemset_safe_ssse3; ft.inflate_fast = &inflate_fast_ssse3; -# endif } #endif // X86 - SSE4.2 #ifdef X86_SSE42 if (cf.x86.has_sse42) { ft.adler32_fold_copy = &adler32_fold_copy_sse42; - ft.insert_string = &insert_string_sse42; - ft.quick_insert_string = &quick_insert_string_sse42; - ft.update_hash = &update_hash_sse42; } #endif // X86 - PCLMUL @@ -151,8 +123,9 @@ static void init_functable(void) { # endif } #endif + // X86 - AVX512 (F,DQ,BW,Vl) #ifdef X86_AVX512 - if (cf.x86.has_avx512) { + if (cf.x86.has_avx512_common) { ft.adler32 = &adler32_avx512; ft.adler32_fold_copy = &adler32_fold_copy_avx512; } @@ -164,8 +137,8 @@ static void init_functable(void) { } #endif // X86 - VPCLMULQDQ -#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC) - if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) { +#ifdef X86_VPCLMULQDQ_CRC + if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) { ft.crc32 = &crc32_vpclmulqdq; ft.crc32_fold = &crc32_fold_vpclmulqdq; ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy; @@ -206,9 +179,6 @@ static void init_functable(void) { #ifdef ARM_ACLE if (cf.arm.has_crc32) { ft.crc32 = &crc32_acle; - ft.insert_string = &insert_string_acle; - ft.quick_insert_string = &quick_insert_string_acle; - ft.update_hash = &update_hash_acle; } #endif @@ -279,12 +249,9 @@ static void init_functable(void) { FUNCTABLE_ASSIGN(ft, crc32_fold_final); FUNCTABLE_ASSIGN(ft, crc32_fold_reset); FUNCTABLE_ASSIGN(ft, inflate_fast); - FUNCTABLE_ASSIGN(ft,
insert_string); FUNCTABLE_ASSIGN(ft, longest_match); FUNCTABLE_ASSIGN(ft, longest_match_slow); - FUNCTABLE_ASSIGN(ft, quick_insert_string); FUNCTABLE_ASSIGN(ft, slide_hash); - FUNCTABLE_ASSIGN(ft, update_hash); // Memory barrier for weak memory order CPUs FUNCTABLE_BARRIER(); @@ -350,11 +317,6 @@ static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) { functable.inflate_fast(strm, start); } -static void insert_string_stub(deflate_state* const s, uint32_t str, uint32_t count) { - init_functable(); - functable.insert_string(s, str, count); -} - static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) { init_functable(); return functable.longest_match(s, cur_match); @@ -365,21 +327,11 @@ static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) { return functable.longest_match_slow(s, cur_match); } -static Pos quick_insert_string_stub(deflate_state* const s, const uint32_t str) { - init_functable(); - return functable.quick_insert_string(s, str); -} - static void slide_hash_stub(deflate_state* s) { init_functable(); functable.slide_hash(s); } -static uint32_t update_hash_stub(deflate_state* const s, uint32_t h, uint32_t val) { - init_functable(); - return functable.update_hash(s, h, val); -} - /* functable init */ Z_INTERNAL struct functable_s functable = { force_init_stub, @@ -394,10 +346,9 @@ Z_INTERNAL struct functable_s functable = { crc32_fold_final_stub, crc32_fold_reset_stub, inflate_fast_stub, - insert_string_stub, longest_match_stub, longest_match_slow_stub, - quick_insert_string_stub, slide_hash_stub, - update_hash_stub }; + +#endif diff --git a/3rdparty/zlib-ng/functable.h b/3rdparty/zlib-ng/functable.h index 9f78188e10..173a030c66 100644 --- a/3rdparty/zlib-ng/functable.h +++ b/3rdparty/zlib-ng/functable.h @@ -7,14 +7,21 @@ #define FUNCTABLE_H_ #include "deflate.h" -#include "crc32_fold.h" -#include "adler32_fold.h" +#include "crc32.h" + +#ifdef DISABLE_RUNTIME_CPU_DETECTION + +# include "arch_functions.h" + +/* When compiling with native instructions it is not necessary to use functable. + * Instead we use native_ macro indicating the best available variant of arch-specific + * functions for the current platform. + */ +# define FUNCTABLE_INIT ((void)0) +# define FUNCTABLE_CALL(name) native_ ## name +# define FUNCTABLE_FPTR(name) &native_ ## name -#ifdef ZLIB_COMPAT -typedef struct z_stream_s z_stream; #else -typedef struct zng_stream_s zng_stream; -#endif struct functable_s { void (* force_init) (void); @@ -29,14 +36,20 @@ struct functable_s { uint32_t (* crc32_fold_final) (struct crc32_fold_s *crc); uint32_t (* crc32_fold_reset) (struct crc32_fold_s *crc); void (* inflate_fast) (PREFIX3(stream) *strm, uint32_t start); - void (* insert_string) (deflate_state *const s, uint32_t str, uint32_t count); uint32_t (* longest_match) (deflate_state *const s, Pos cur_match); uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match); - Pos (* quick_insert_string)(deflate_state *const s, uint32_t str); void (* slide_hash) (deflate_state *s); - uint32_t (* update_hash) (deflate_state *const s, uint32_t h, uint32_t val); }; Z_INTERNAL extern struct functable_s functable; + +/* Explicitly indicate functions are conditionally dispatched. 
+ */ +# define FUNCTABLE_INIT functable.force_init() +# define FUNCTABLE_CALL(name) functable.name +# define FUNCTABLE_FPTR(name) functable.name + +#endif + #endif diff --git a/3rdparty/zlib-ng/gzguts.h b/3rdparty/zlib-ng/gzguts.h index a663844b69..14f2391152 100644 --- a/3rdparty/zlib-ng/gzguts.h +++ b/3rdparty/zlib-ng/gzguts.h @@ -1,7 +1,7 @@ #ifndef GZGUTS_H_ #define GZGUTS_H_ /* gzguts.h -- zlib internal header definitions for gz* operations - * Copyright (C) 2004-2019 Mark Adler + * Copyright (C) 2004-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -135,7 +135,9 @@ typedef gz_state *gz_statep; /* shared functions */ void Z_INTERNAL gz_error(gz_state *, int, const char *); - +#ifdef ZLIB_COMPAT +unsigned Z_INTERNAL gz_intmax(void); +#endif /* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t value -- needed when comparing unsigned to z_off64_t, which is signed (possible z_off64_t types off_t, off64_t, and long are all signed) */ diff --git a/3rdparty/zlib-ng/gzlib.c b/3rdparty/zlib-ng/gzlib.c index e613837efb..b8a506b6a5 100644 --- a/3rdparty/zlib-ng/gzlib.c +++ b/3rdparty/zlib-ng/gzlib.c @@ -1,5 +1,5 @@ /* gzlib.c -- zlib functions common to reading and writing gzip files - * Copyright (C) 2004-2019 Mark Adler + * Copyright (C) 2004-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -523,3 +523,9 @@ void Z_INTERNAL gz_error(gz_state *state, int err, const char *msg) { } (void)snprintf(state->msg, strlen(state->path) + strlen(msg) + 3, "%s%s%s", state->path, ": ", msg); } + +#ifdef ZLIB_COMPAT +unsigned Z_INTERNAL gz_intmax(void) { + return INT_MAX; +} +#endif diff --git a/3rdparty/zlib-ng/infback.c b/3rdparty/zlib-ng/infback.c index 9f5042b4d3..307d05ca3c 100644 --- a/3rdparty/zlib-ng/infback.c +++ b/3rdparty/zlib-ng/infback.c @@ -43,10 +43,15 @@ int32_t ZNG_CONDEXPORT PREFIX(inflateBackInit)(PREFIX3(stream) *strm, int32_t wi } if (strm->zfree == NULL) strm->zfree = PREFIX(zcfree); - state = ZALLOC_INFLATE_STATE(strm); - if (state == NULL) + + inflate_allocs *alloc_bufs = alloc_inflate(strm); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + + state = alloc_bufs->state; + state->alloc_bufs = alloc_bufs; Tracev((stderr, "inflate: allocated\n")); + strm->state = (struct internal_state *)state; state->dmax = 32768U; state->wbits = (unsigned int)windowBits; @@ -55,7 +60,7 @@ int32_t ZNG_CONDEXPORT PREFIX(inflateBackInit)(PREFIX3(stream) *strm, int32_t wi state->wnext = 0; state->whave = 0; state->sane = 1; - state->chunksize = functable.chunksize(); + state->chunksize = FUNCTABLE_CALL(chunksize)(); return Z_OK; } @@ -357,7 +362,7 @@ int32_t Z_EXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in RESTORE(); if (state->whave < state->wsize) state->whave = state->wsize - left; - functable.inflate_fast(strm, state->wsize); + FUNCTABLE_CALL(inflate_fast)(strm, state->wsize); LOAD(); break; } @@ -504,8 +509,10 @@ int32_t Z_EXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in int32_t Z_EXPORT PREFIX(inflateBackEnd)(PREFIX3(stream) *strm) { if (strm == NULL || strm->state == NULL || strm->zfree == NULL) return Z_STREAM_ERROR; - ZFREE_STATE(strm, strm->state); - strm->state = NULL; + + /* Free allocated buffers */ + free_inflate(strm); + Tracev((stderr, "inflate: end\n")); return Z_OK; } diff --git a/3rdparty/zlib-ng/inflate.c b/3rdparty/zlib-ng/inflate.c index fe55c498e3..956f37db7d 100644 --- a/3rdparty/zlib-ng/inflate.c +++ 
b/3rdparty/zlib-ng/inflate.c @@ -19,7 +19,7 @@ /* function prototypes */ static int inflateStateCheck(PREFIX3(stream) *strm); -static int updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum); +static void updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum); static uint32_t syncsearch(uint32_t *have, const unsigned char *buf, uint32_t len); static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst, @@ -28,11 +28,11 @@ static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst, struct inflate_state *state = (struct inflate_state*)strm->state; #ifdef GUNZIP if (state->flags) { - functable.crc32_fold_copy(&state->crc_fold, dst, src, copy); + FUNCTABLE_CALL(crc32_fold_copy)(&state->crc_fold, dst, src, copy); } else #endif { - strm->adler = state->check = functable.adler32_fold_copy(state->check, dst, src, copy); + strm->adler = state->check = FUNCTABLE_CALL(adler32_fold_copy)(state->check, dst, src, copy); } } @@ -40,11 +40,11 @@ static inline void inf_chksum(PREFIX3(stream) *strm, const uint8_t *src, uint32_ struct inflate_state *state = (struct inflate_state*)strm->state; #ifdef GUNZIP if (state->flags) { - functable.crc32_fold(&state->crc_fold, src, len, 0); + FUNCTABLE_CALL(crc32_fold)(&state->crc_fold, src, len, 0); } else #endif { - strm->adler = state->check = functable.adler32(state->check, src, len); + strm->adler = state->check = FUNCTABLE_CALL(adler32)(state->check, src, len); } } @@ -53,7 +53,7 @@ static int inflateStateCheck(PREFIX3(stream) *strm) { if (strm == NULL || strm->zalloc == NULL || strm->zfree == NULL) return 1; state = (struct inflate_state *)strm->state; - if (state == NULL || state->strm != strm || state->mode < HEAD || state->mode > SYNC) + if (state == NULL || state->alloc_bufs == NULL || state->strm != strm || state->mode < HEAD || state->mode > SYNC) return 1; return 0; } @@ -120,13 +120,9 @@ int32_t Z_EXPORT PREFIX(inflateReset2)(PREFIX3(stream) *strm, int32_t windowBits #endif } - /* set number of window bits, free window if different */ + /* set number of window bits */ if (windowBits && (windowBits < MIN_WBITS || windowBits > MAX_WBITS)) return Z_STREAM_ERROR; - if (state->window != NULL && state->wbits != (unsigned)windowBits) { - ZFREE_WINDOW(strm, state->window); - state->window = NULL; - } /* update state and reset the rest of it */ state->wrap = wrap; @@ -134,13 +130,94 @@ int32_t Z_EXPORT PREFIX(inflateReset2)(PREFIX3(stream) *strm, int32_t windowBits return PREFIX(inflateReset)(strm); } -/* This function is hidden in ZLIB_COMPAT builds. */ +#ifdef INF_ALLOC_DEBUG +# include <stdio.h> +# define LOGSZ(name,size) fprintf(stderr, "%s is %d bytes\n", name, size) +# define LOGSZP(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %d, padded %d\n", name, size, loc, pad) +# define LOGSZPL(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %ld, padded %d\n", name, size, loc, pad) +#else +# define LOGSZ(name,size) +# define LOGSZP(name,size,loc,pad) +# define LOGSZPL(name,size,loc,pad) +#endif + +/* =========================================================================== + * Allocate a big buffer and divide it up into the various buffers inflate needs. + * Handles alignment of allocated buffer and alignment of individual buffers.
+ */ +Z_INTERNAL inflate_allocs* alloc_inflate(PREFIX3(stream) *strm) { + int curr_size = 0; + + /* Define sizes */ + int window_size = INFLATE_ADJUST_WINDOW_SIZE((1 << MAX_WBITS) + 64); /* 64B padding for chunksize */ + int state_size = sizeof(inflate_state); + int alloc_size = sizeof(inflate_allocs); + + /* Calculate relative buffer positions and paddings */ + LOGSZP("window", window_size, PAD_WINDOW(curr_size), PADSZ(curr_size,WINDOW_PAD_SIZE)); + int window_pos = PAD_WINDOW(curr_size); + curr_size = window_pos + window_size; + + LOGSZP("state", state_size, PAD_64(curr_size), PADSZ(curr_size,64)); + int state_pos = PAD_64(curr_size); + curr_size = state_pos + state_size; + + LOGSZP("alloc", alloc_size, PAD_16(curr_size), PADSZ(curr_size,16)); + int alloc_pos = PAD_16(curr_size); + curr_size = alloc_pos + alloc_size; + + /* Add 64-1 or 4096-1 to allow window alignment, and round size of buffer up to multiple of 64 */ + int total_size = PAD_64(curr_size + (WINDOW_PAD_SIZE - 1)); + + /* Allocate buffer, align to 64-byte cacheline, and zerofill the resulting buffer */ + char *original_buf = strm->zalloc(strm->opaque, 1, total_size); + if (original_buf == NULL) + return NULL; + + char *buff = (char *)HINT_ALIGNED_WINDOW((char *)PAD_WINDOW(original_buf)); + LOGSZPL("Buffer alloc", total_size, PADSZ((uintptr_t)original_buf,WINDOW_PAD_SIZE), PADSZ(curr_size,WINDOW_PAD_SIZE)); + + /* Initialize alloc_bufs */ + inflate_allocs *alloc_bufs = (struct inflate_allocs_s *)(buff + alloc_pos); + alloc_bufs->buf_start = (char *)original_buf; + alloc_bufs->zfree = strm->zfree; + + alloc_bufs->window = (unsigned char *)HINT_ALIGNED_WINDOW((buff + window_pos)); + alloc_bufs->state = (inflate_state *)HINT_ALIGNED_64((buff + state_pos)); + +#ifdef Z_MEMORY_SANITIZER + /* This is _not_ to subvert the memory sanitizer but to instead unpoison some + data we willingly and purposefully load uninitialized into vector registers + in order to safely read the last < chunksize bytes of the window. */ + __msan_unpoison(alloc_bufs->window + window_size, 64); +#endif + + return alloc_bufs; +} + +/* =========================================================================== + * Free all allocated inflate buffers + */ +Z_INTERNAL void free_inflate(PREFIX3(stream) *strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + + if (state->alloc_bufs != NULL) { + inflate_allocs *alloc_bufs = state->alloc_bufs; + alloc_bufs->zfree(strm->opaque, alloc_bufs->buf_start); + strm->state = NULL; + } +} + +/* =========================================================================== + * Initialize inflate state and buffers. + * This function is hidden in ZLIB_COMPAT builds. + */ int32_t ZNG_CONDEXPORT PREFIX(inflateInit2)(PREFIX3(stream) *strm, int32_t windowBits) { int32_t ret; struct inflate_state *state; - /* Initialize functable earlier.
*/ - functable.force_init(); + /* Initialize functable */ + FUNCTABLE_INIT; if (strm == NULL) return Z_STREAM_ERROR; @@ -151,19 +228,23 @@ int32_t ZNG_CONDEXPORT PREFIX(inflateInit2)(PREFIX3(stream) *strm, int32_t windo } if (strm->zfree == NULL) strm->zfree = PREFIX(zcfree); - state = ZALLOC_INFLATE_STATE(strm); - if (state == NULL) + + inflate_allocs *alloc_bufs = alloc_inflate(strm); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + + state = alloc_bufs->state; + state->window = alloc_bufs->window; + state->alloc_bufs = alloc_bufs; Tracev((stderr, "inflate: allocated\n")); + strm->state = (struct internal_state *)state; state->strm = strm; - state->window = NULL; state->mode = HEAD; /* to pass state test in inflateReset2() */ - state->chunksize = functable.chunksize(); + state->chunksize = FUNCTABLE_CALL(chunksize)(); ret = PREFIX(inflateReset2)(strm, windowBits); if (ret != Z_OK) { - ZFREE_STATE(strm, state); - strm->state = NULL; + free_inflate(strm); } return ret; } @@ -222,31 +303,6 @@ void Z_INTERNAL PREFIX(fixedtables)(struct inflate_state *state) { state->distbits = 5; } -int Z_INTERNAL PREFIX(inflate_ensure_window)(struct inflate_state *state) { - /* if it hasn't been done already, allocate space for the window */ - if (state->window == NULL) { - unsigned wsize = 1U << state->wbits; - state->window = (unsigned char *)ZALLOC_WINDOW(state->strm, wsize + state->chunksize, sizeof(unsigned char)); - if (state->window == NULL) - return Z_MEM_ERROR; -#ifdef Z_MEMORY_SANITIZER - /* This is _not_ to subvert the memory sanitizer but to instead unpoison some - data we willingly and purposefully load uninitialized into vector registers - in order to safely read the last < chunksize bytes of the window. */ - __msan_unpoison(state->window + wsize, state->chunksize); -#endif - } - - /* if window not in use yet, initialize */ - if (state->wsize == 0) { - state->wsize = 1U << state->wbits; - state->wnext = 0; - state->whave = 0; - } - - return Z_OK; -} - /* Update the window with the last wsize (normally 32K) bytes written before returning. If window does not exist yet, create it. This is only called @@ -261,20 +317,20 @@ int Z_INTERNAL PREFIX(inflate_ensure_window)(struct inflate_state *state) { output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ -static int32_t updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum) { +static void updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum) { struct inflate_state *state; uint32_t dist; state = (struct inflate_state *)strm->state; - if (PREFIX(inflate_ensure_window)(state)) return 1; + /* if window not in use yet, initialize */ + if (state->wsize == 0) + state->wsize = 1U << state->wbits; /* len state->wsize or less output bytes into the circular window */ if (len >= state->wsize) { /* Only do this if the caller specifies to checksum bytes AND the platform requires - * it (s/390 being the primary exception to this. Also, for now, do the adler checksums - * if not a gzip based header.
The inline adler checksums will come in the near future, - * possibly the next commit */ + * it (s/390 being the primary exception to this) */ if (INFLATE_NEED_CHECKSUM(strm) && cksum) { /* We have to split the checksum over non-copied and copied bytes */ if (len > state->wsize) @@ -314,7 +370,6 @@ static int32_t updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t state->whave += dist; } } - return 0; } /* @@ -636,7 +691,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { } /* compute crc32 checksum if not in raw mode */ if ((state->wrap & 4) && state->flags) - strm->adler = state->check = functable.crc32_fold_reset(&state->crc_fold); + strm->adler = state->check = FUNCTABLE_CALL(crc32_fold_reset)(&state->crc_fold); state->mode = TYPE; break; #endif @@ -867,7 +922,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { /* use inflate_fast() if we have enough input and output */ if (have >= INFLATE_FAST_MIN_HAVE && left >= INFLATE_FAST_MIN_LEFT) { RESTORE(); - functable.inflate_fast(strm, out); + FUNCTABLE_CALL(inflate_fast)(strm, out); LOAD(); if (state->mode == TYPE) state->back = -1; @@ -1026,7 +1081,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { } else { copy = MIN(state->length, left); - put = functable.chunkmemset_safe(put, state->offset, copy, left); + put = FUNCTABLE_CALL(chunkmemset_safe)(put, state->offset, copy, left); } left -= copy; state->length -= copy; @@ -1056,7 +1111,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { } #ifdef GUNZIP if (state->flags) - strm->adler = state->check = functable.crc32_fold_final(&state->crc_fold); + strm->adler = state->check = FUNCTABLE_CALL(crc32_fold_final)(&state->crc_fold); #endif } out = left; @@ -1098,9 +1153,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { ret = Z_DATA_ERROR; goto inf_leave; - case MEM: - return Z_MEM_ERROR; - case SYNC: default: /* can't happen, but makes compilers happy */ @@ -1111,7 +1163,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call updatewindow() to create and/or update the window state. - Note: a memory error from inflate() is non-recoverable. 
*/ inf_leave: RESTORE(); @@ -1120,10 +1171,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { (state->wsize || (out != strm->avail_out && state->mode < BAD && (state->mode < CHECK || flush != Z_FINISH)))) { /* update sliding window with respective checksum if not in "raw" mode */ - if (updatewindow(strm, strm->next_out, check_bytes, state->wrap & 4)) { - state->mode = MEM; - return Z_MEM_ERROR; - } + updatewindow(strm, strm->next_out, check_bytes, state->wrap & 4); } in -= strm->avail_in; out -= strm->avail_out; @@ -1144,14 +1192,12 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { } int32_t Z_EXPORT PREFIX(inflateEnd)(PREFIX3(stream) *strm) { - struct inflate_state *state; if (inflateStateCheck(strm)) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (state->window != NULL) - ZFREE_WINDOW(strm, state->window); - ZFREE_STATE(strm, strm->state); - strm->state = NULL; + + /* Free allocated buffers */ + free_inflate(strm); + Tracev((stderr, "inflate: end\n")); return Z_OK; } @@ -1179,7 +1225,6 @@ int32_t Z_EXPORT PREFIX(inflateGetDictionary)(PREFIX3(stream) *strm, uint8_t *di int32_t Z_EXPORT PREFIX(inflateSetDictionary)(PREFIX3(stream) *strm, const uint8_t *dictionary, uint32_t dictLength) { struct inflate_state *state; unsigned long dictid; - int32_t ret; /* check state */ if (inflateStateCheck(strm)) @@ -1190,7 +1235,7 @@ int32_t Z_EXPORT PREFIX(inflateSetDictionary)(PREFIX3(stream) *strm, const uint8 /* check for correct dictionary identifier */ if (state->mode == DICT) { - dictid = functable.adler32(ADLER32_INITIAL_VALUE, dictionary, dictLength); + dictid = FUNCTABLE_CALL(adler32)(ADLER32_INITIAL_VALUE, dictionary, dictLength); if (dictid != state->check) return Z_DATA_ERROR; } @@ -1199,11 +1244,8 @@ int32_t Z_EXPORT PREFIX(inflateSetDictionary)(PREFIX3(stream) *strm, const uint8 /* copy dictionary to window using updatewindow(), which will amend the existing dictionary if appropriate */ - ret = updatewindow(strm, dictionary + dictLength, dictLength, 0); - if (ret) { - state->mode = MEM; - return Z_MEM_ERROR; - } + updatewindow(strm, dictionary + dictLength, dictLength, 0); + state->havedict = 1; Tracev((stderr, "inflate: dictionary set\n")); return Z_OK; @@ -1271,7 +1313,7 @@ int32_t Z_EXPORT PREFIX(inflateSync)(PREFIX3(stream) *strm) { /* if first time, start search in bit buffer */ if (state->mode != SYNC) { state->mode = SYNC; - state->hold <<= state->bits & 7; + state->hold >>= state->bits & 7; state->bits -= state->bits & 7; len = 0; while (state->bits >= 8) { @@ -1334,30 +1376,28 @@ int32_t Z_EXPORT PREFIX(inflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *sou return Z_STREAM_ERROR; state = (struct inflate_state *)source->state; + /* copy stream */ + memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream))); + /* allocate space */ - copy = ZALLOC_INFLATE_STATE(source); - if (copy == NULL) + inflate_allocs *alloc_bufs = alloc_inflate(dest); + if (alloc_bufs == NULL) return Z_MEM_ERROR; + copy = alloc_bufs->state; /* copy state */ - memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream))); - ZCOPY_INFLATE_STATE(copy, state); + memcpy(copy, state, sizeof(struct inflate_state)); copy->strm = dest; if (state->lencode >= state->codes && state->lencode <= state->codes + ENOUGH - 1) { copy->lencode = copy->codes + (state->lencode - state->codes); copy->distcode = copy->codes + (state->distcode - state->codes); } copy->next = copy->codes + (state->next - state->codes); + copy->window = 
alloc_bufs->window; + copy->alloc_bufs = alloc_bufs; /* window */ - copy->window = NULL; - if (state->window != NULL) { - if (PREFIX(inflate_ensure_window)(copy)) { - ZFREE_STATE(source, copy); - return Z_MEM_ERROR; - } - ZCOPY_WINDOW(copy->window, state->window, (size_t)state->wsize); - } + memcpy(copy->window, state->window, INFLATE_ADJUST_WINDOW_SIZE((size_t)state->wsize)); dest->state = (struct internal_state *)copy; return Z_OK; diff --git a/3rdparty/zlib-ng/inflate.h b/3rdparty/zlib-ng/inflate.h index 39cdf5d683..536da7d1f8 100644 --- a/3rdparty/zlib-ng/inflate.h +++ b/3rdparty/zlib-ng/inflate.h @@ -11,8 +11,12 @@ #ifndef INFLATE_H_ #define INFLATE_H_ -#include "adler32_fold.h" -#include "crc32_fold.h" +#include "crc32.h" + +#ifdef S390_DFLTCC_INFLATE +# include "arch/s390/dfltcc_common.h" +# define HAVE_ARCH_INFLATE_STATE +#endif /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate(). NO_GZIP would be used to avoid linking in the crc code when it is not needed. @@ -53,14 +57,13 @@ typedef enum { LENGTH, /* i: waiting for 32-bit length (gzip) */ DONE, /* finished check, done -- remain here until reset */ BAD, /* got a data error -- remain here until reset */ - MEM, /* got an inflate() memory error -- remain here until reset */ SYNC /* looking for synchronization bytes to restart inflate() */ } inflate_mode; /* State transitions between above modes - - (most modes can go to BAD or MEM on error -- not shown for clarity) + (most modes can go to BAD on error -- not shown for clarity) Process header: HEAD -> (gzip) or (zlib) or (raw) @@ -81,10 +84,19 @@ typedef enum { Process trailer: CHECK -> LENGTH -> DONE */ +typedef struct inflate_state inflate_state; + +/* Struct for memory allocation handling */ +typedef struct inflate_allocs_s { + char *buf_start; + free_func zfree; + inflate_state *state; + unsigned char *window; +} inflate_allocs; /* State maintained between inflate() calls -- approximately 7K bytes, not including the allocated sliding window, which is up to 32K bytes. */ -struct inflate_state { +struct ALIGNED_(64) inflate_state { PREFIX3(stream) *strm; /* pointer back to this zlib stream */ inflate_mode mode; /* current inflate mode */ int last; /* true if processing last block */ @@ -132,9 +144,14 @@ struct inflate_state { int back; /* bits back of last unprocessed length/lit */ unsigned was; /* initial length of match */ uint32_t chunksize; /* size of memory copying chunk */ + inflate_allocs *alloc_bufs; /* struct for handling memory allocations */ +#ifdef HAVE_ARCH_INFLATE_STATE + arch_inflate_state arch; /* architecture-specific extensions */ +#endif }; -int Z_INTERNAL PREFIX(inflate_ensure_window)(struct inflate_state *state); void Z_INTERNAL PREFIX(fixedtables)(struct inflate_state *state); +Z_INTERNAL inflate_allocs* alloc_inflate(PREFIX3(stream) *strm); +Z_INTERNAL void free_inflate(PREFIX3(stream) *strm); #endif /* INFLATE_H_ */ diff --git a/3rdparty/zlib-ng/inflate_p.h b/3rdparty/zlib-ng/inflate_p.h index eff73876da..c324b0486a 100644 --- a/3rdparty/zlib-ng/inflate_p.h +++ b/3rdparty/zlib-ng/inflate_p.h @@ -10,15 +10,16 @@ /* Architecture-specific hooks. */ #ifdef S390_DFLTCC_INFLATE # include "arch/s390/dfltcc_inflate.h" +/* DFLTCC instructions require window to be page-aligned */ +# define PAD_WINDOW PAD_4096 +# define WINDOW_PAD_SIZE 4096 +# define HINT_ALIGNED_WINDOW HINT_ALIGNED_4096 #else -/* Memory management for the inflate state. Useful for allocating arch-specific extension blocks. 
*/ -# define ZALLOC_INFLATE_STATE(strm) ((struct inflate_state *)ZALLOC(strm, 1, sizeof(struct inflate_state))) -# define ZFREE_STATE(strm, addr) ZFREE(strm, addr) -# define ZCOPY_INFLATE_STATE(dst, src) memcpy(dst, src, sizeof(struct inflate_state)) -/* Memory management for the window. Useful for allocation the aligned window. */ -# define ZALLOC_WINDOW(strm, items, size) ZALLOC(strm, items, size) -# define ZCOPY_WINDOW(dest, src, n) memcpy(dest, src, n) -# define ZFREE_WINDOW(strm, addr) ZFREE(strm, addr) +# define PAD_WINDOW PAD_64 +# define WINDOW_PAD_SIZE 64 +# define HINT_ALIGNED_WINDOW HINT_ALIGNED_64 +/* Adjust the window size for the arch-specific inflate code. */ +# define INFLATE_ADJUST_WINDOW_SIZE(n) (n) /* Invoked at the end of inflateResetKeep(). Useful for initializing arch-specific extension blocks. */ # define INFLATE_RESET_KEEP_HOOK(strm) do {} while (0) /* Invoked at the beginning of inflatePrime(). Useful for updating arch-specific buffers. */ @@ -46,9 +47,9 @@ /* check function to use adler32() for zlib or crc32() for gzip */ #ifdef GUNZIP # define UPDATE(check, buf, len) \ - (state->flags ? PREFIX(crc32)(check, buf, len) : functable.adler32(check, buf, len)) + (state->flags ? PREFIX(crc32)(check, buf, len) : FUNCTABLE_CALL(adler32)(check, buf, len)) #else -# define UPDATE(check, buf, len) functable.adler32(check, buf, len) +# define UPDATE(check, buf, len) FUNCTABLE_CALL(adler32)(check, buf, len) #endif /* check macros for header crc */ diff --git a/3rdparty/zlib-ng/inftrees.c b/3rdparty/zlib-ng/inftrees.c index 423f7b461d..5234fe7ae0 100644 --- a/3rdparty/zlib-ng/inftrees.c +++ b/3rdparty/zlib-ng/inftrees.c @@ -1,5 +1,5 @@ /* inftrees.c -- generate Huffman trees for efficient decoding - * Copyright (C) 1995-2023 Mark Adler + * Copyright (C) 1995-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -7,7 +7,7 @@ #include "zutil.h" #include "inftrees.h" -const char PREFIX(inflate_copyright)[] = " inflate 1.3.0 Copyright 1995-2023 Mark Adler "; +const char PREFIX(inflate_copyright)[] = " inflate 1.3.1 Copyright 1995-2024 Mark Adler "; /* If you use the zlib library in a product, an acknowledgment is welcome in the documentation of your product. 
If for some reason you cannot @@ -54,7 +54,7 @@ int Z_INTERNAL zng_inflate_table(codetype type, uint16_t *lens, unsigned codes, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; static const uint16_t lext[31] = { /* Length codes 257..285 extra */ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, - 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 77, 202}; + 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 203, 77}; static const uint16_t dbase[32] = { /* Distance codes 0..29 base */ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, diff --git a/3rdparty/zlib-ng/insert_string.c b/3rdparty/zlib-ng/insert_string.c index cfe39837f8..11a5b97ffe 100644 --- a/3rdparty/zlib-ng/insert_string.c +++ b/3rdparty/zlib-ng/insert_string.c @@ -1,6 +1,6 @@ /* insert_string.c -- insert_string integer hash variant * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * */ @@ -10,12 +10,12 @@ #define HASH_SLIDE 16 -#define HASH_CALC(s, h, val) h = ((val * 2654435761U) >> HASH_SLIDE); +#define HASH_CALC(h, val) h = ((val * 2654435761U) >> HASH_SLIDE); #define HASH_CALC_VAR h #define HASH_CALC_VAR_INIT uint32_t h = 0 -#define UPDATE_HASH update_hash_c -#define INSERT_STRING insert_string_c -#define QUICK_INSERT_STRING quick_insert_string_c +#define UPDATE_HASH update_hash +#define INSERT_STRING insert_string +#define QUICK_INSERT_STRING quick_insert_string #include "insert_string_tpl.h" diff --git a/3rdparty/zlib-ng/insert_string_roll.c b/3rdparty/zlib-ng/insert_string_roll.c index dfea347bcc..8693f96f59 100644 --- a/3rdparty/zlib-ng/insert_string_roll.c +++ b/3rdparty/zlib-ng/insert_string_roll.c @@ -1,6 +1,6 @@ /* insert_string_roll.c -- insert_string rolling hash variant * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * */ @@ -10,7 +10,7 @@ #define HASH_SLIDE 5 -#define HASH_CALC(s, h, val) h = ((h << HASH_SLIDE) ^ ((uint8_t)val)) +#define HASH_CALC(h, val) h = ((h << HASH_SLIDE) ^ ((uint8_t)val)) #define HASH_CALC_VAR s->ins_h #define HASH_CALC_VAR_INIT #define HASH_CALC_READ val = strstart[0] diff --git a/3rdparty/zlib-ng/insert_string_tpl.h b/3rdparty/zlib-ng/insert_string_tpl.h index c84617730a..281c013463 100644 --- a/3rdparty/zlib-ng/insert_string_tpl.h +++ b/3rdparty/zlib-ng/insert_string_tpl.h @@ -1,10 +1,10 @@ #ifndef INSERT_STRING_H_ #define INSERT_STRING_H_ -/* insert_string.h -- Private insert_string functions shared with more than - * one insert string implementation +/* insert_string_tpl.h -- Private insert_string functions shared with more than + * one insert string implementation * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * * Copyright (C) 2013 Intel Corporation. All rights reserved. * Authors: @@ -47,9 +47,8 @@ * input characters, so that a running hash key can be computed from the * previous key instead of complete recalculation each time. 
*/ -Z_INTERNAL uint32_t UPDATE_HASH(deflate_state *const s, uint32_t h, uint32_t val) { - (void)s; - HASH_CALC(s, h, val); +Z_INTERNAL uint32_t UPDATE_HASH(uint32_t h, uint32_t val) { + HASH_CALC(h, val); return h & HASH_CALC_MASK; } @@ -65,7 +64,7 @@ Z_INTERNAL Pos QUICK_INSERT_STRING(deflate_state *const s, uint32_t str) { HASH_CALC_VAR_INIT; HASH_CALC_READ; - HASH_CALC(s, HASH_CALC_VAR, val); + HASH_CALC(HASH_CALC_VAR, val); HASH_CALC_VAR &= HASH_CALC_MASK; hm = HASH_CALC_VAR; @@ -94,7 +93,7 @@ Z_INTERNAL void INSERT_STRING(deflate_state *const s, uint32_t str, uint32_t cou HASH_CALC_VAR_INIT; HASH_CALC_READ; - HASH_CALC(s, HASH_CALC_VAR, val); + HASH_CALC(HASH_CALC_VAR, val); HASH_CALC_VAR &= HASH_CALC_MASK; hm = HASH_CALC_VAR; diff --git a/3rdparty/zlib-ng/match_tpl.h b/3rdparty/zlib-ng/match_tpl.h index d076798520..9c258242cd 100644 --- a/3rdparty/zlib-ng/match_tpl.h +++ b/3rdparty/zlib-ng/match_tpl.h @@ -1,6 +1,6 @@ /* match_tpl.h -- find longest match template for compare256 variants * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Portions copyright (C) 2014-2021 Konstantin Nosov @@ -8,11 +8,6 @@ * https://github.com/gildor2/fast_zlib */ -#include "zbuild.h" -#include "zutil_p.h" -#include "deflate.h" -#include "functable.h" - #ifndef MATCH_TPL_H #define MATCH_TPL_H @@ -107,11 +102,11 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { * to cur_match). We cannot use s->prev[strstart+1,...] immediately, because * these strings are not yet inserted into the hash table. */ - hash = s->update_hash(s, 0, scan[1]); - hash = s->update_hash(s, hash, scan[2]); + hash = s->update_hash(0, scan[1]); + hash = s->update_hash(hash, scan[2]); for (i = 3; i <= best_len; i++) { - hash = s->update_hash(s, hash, scan[i]); + hash = s->update_hash(hash, scan[i]); /* If we're starting with best_len >= 3, we can use offset search. 
*/ pos = s->head[hash]; @@ -241,9 +236,9 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { */ scan_endstr = scan + len - (STD_MIN_MATCH+1); - hash = s->update_hash(s, 0, scan_endstr[0]); - hash = s->update_hash(s, hash, scan_endstr[1]); - hash = s->update_hash(s, hash, scan_endstr[2]); + hash = s->update_hash(0, scan_endstr[0]); + hash = s->update_hash(hash, scan_endstr[1]); + hash = s->update_hash(hash, scan_endstr[2]); pos = s->head[hash]; if (pos < cur_match) { diff --git a/3rdparty/zlib-ng/patches/zlib-ng-2.2.1-detect-intrinsics.patch b/3rdparty/zlib-ng/patches/zlib-ng-2.2.1-detect-intrinsics.patch new file mode 100644 index 0000000000..237770d204 --- /dev/null +++ b/3rdparty/zlib-ng/patches/zlib-ng-2.2.1-detect-intrinsics.patch @@ -0,0 +1,13 @@ +diff --git a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake +index 14f82fc..78e46e1 100644 +--- a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake ++++ b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake +@@ -66,7 +66,7 @@ macro(check_armv6_compiler_flag) + return __uqsub16(a, b); + #endif + } +- int main(void) { return 0; }" ++ int main(void) { return f(1,2); }" + HAVE_ARMV6_INTRIN + ) + set(CMAKE_REQUIRED_FLAGS) diff --git a/3rdparty/zlib-ng/patches/zlib-ng-2.2.1.patch b/3rdparty/zlib-ng/patches/zlib-ng-2.2.1.patch new file mode 100644 index 0000000000..fb3699dcff --- /dev/null +++ b/3rdparty/zlib-ng/patches/zlib-ng-2.2.1.patch @@ -0,0 +1,148 @@ +--- ./CMakeLists.txt 2024-09-11 12:28:30.597680661 +0300 ++++ ../../../zlib-ng/CMakeLists.txt 2024-09-11 12:29:10.013644583 +0300 +@@ -74,10 +74,10 @@ + # Options parsing + # + option(WITH_GZFILEOP "Compile with support for gzFile related functions" ON) +-option(ZLIB_COMPAT "Compile with zlib compatible API" ON) +-option(ZLIB_ENABLE_TESTS "Build test binaries" OFF) +-option(ZLIBNG_ENABLE_TESTS "Test zlib-ng specific API" OFF) +-option(WITH_GTEST "Build gtest_zlib" OFF) ++option(ZLIB_COMPAT "Compile with zlib compatible API" OFF) ++option(ZLIB_ENABLE_TESTS "Build test binaries" ON) ++option(ZLIBNG_ENABLE_TESTS "Test zlib-ng specific API" ON) ++option(WITH_GTEST "Build gtest_zlib" ON) + option(WITH_FUZZERS "Build test/fuzz" OFF) + option(WITH_BENCHMARKS "Build test/benchmarks" OFF) + option(WITH_BENCHMARK_APPS "Build application benchmarks" OFF) +@@ -128,11 +128,6 @@ + + option(INSTALL_UTILS "Copy minigzip and minideflate during install" OFF) + +-set(ZLIB_BUILD_SHARED_LIBS OFF) +-set(SKIP_INSTALL_ALL ON) +-ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes -Wmissing-declarations -Wundef -Wstrict-prototypes -Wtype-limits) +-ocv_warnings_disable(CMAKE_C_FLAGS /wd4819 /wd4244 /wd4334) +- + mark_as_advanced(FORCE + ZLIB_SYMBOL_PREFIX + WITH_REDUCED_MEM +@@ -1147,22 +1142,21 @@ + list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS}) + endif() + +-if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) ++if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS) + set(ZLIB_DLL_SRCS win32/zlib${SUFFIX}1.rc) + endif() + +-if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS) ++if(NOT DEFINED BUILD_SHARED_LIBS) + add_library(zlib SHARED ${ZLIB_ALL_SRCS} ${ZLIB_DLL_SRCS}) + add_library(zlibstatic STATIC ${ZLIB_ALL_SRCS}) + + set(ZLIB_INSTALL_LIBRARIES zlib zlibstatic) + else() ++ add_library(zlib ${ZLIB_ALL_SRCS}) + +- if(ZLIB_BUILD_SHARED_LIBS) +- add_library(zlib SHARED ${ZLIB_ALL_SRCS} ${ZLIB_DLL_SRCS}) ++ if(BUILD_SHARED_LIBS) + target_sources(zlib PRIVATE ${ZLIB_DLL_SRCS}) + else() +- add_library(zlib STATIC 
${ZLIB_ALL_SRCS}) + add_library(zlibstatic ALIAS zlib) + endif() + +@@ -1195,17 +1189,17 @@ + + if(WIN32) + # Shared library +- if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) ++ if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS) + set_target_properties(zlib PROPERTIES OUTPUT_NAME zlib${SUFFIX}) + endif() + # Static library +- if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS) ++ if(NOT DEFINED BUILD_SHARED_LIBS) + if(MSVC) + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME zlibstatic${SUFFIX}) + else() + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME z${SUFFIX}) + endif() +- elseif(NOT ZLIB_BUILD_SHARED_LIBS) ++ elseif(NOT BUILD_SHARED_LIBS) + if(MSVC) + set_target_properties(zlib PROPERTIES OUTPUT_NAME zlibstatic${SUFFIX}) + else() +@@ -1217,7 +1211,7 @@ + set_target_properties(${ZLIB_INSTALL_LIBRARIES} PROPERTIES OUTPUT_NAME z${SUFFIX}) + endif() + +-if(NOT DEFINED ZLIB_BUILD_SHARED_LIBS OR ZLIB_BUILD_SHARED_LIBS) ++if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS) + set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) + + if(ZLIB_COMPAT) +@@ -1277,6 +1271,8 @@ + if(WITH_GZFILEOP) + set(PKG_CONFIG_CFLAGS "-DWITH_GZFILEOP") + endif() ++configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib.pc.cmakein ++ ${ZLIB_PC} @ONLY) + configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h.cmakein + ${CMAKE_CURRENT_BINARY_DIR}/zconf${SUFFIX}.h @ONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib${SUFFIX}.h.in +@@ -1326,6 +1322,17 @@ + set(PACKAGE_CONFIGNAME zlib-ng) + set(PACKAGE_VERSION ${ZLIBNG_HEADER_VERSION}) + endif() ++ configure_package_config_file(${PACKAGE_CONFIGNAME}-config.cmake.in ++ ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_CONFIGNAME}-config.cmake ++ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${EXPORT_NAME} ++ PATH_VARS INCLUDE_INSTALL_DIR LIB_INSTALL_DIR) ++ write_basic_package_version_file( ++ ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_CONFIGNAME}-config-version.cmake ++ VERSION ${PACKAGE_VERSION} ++ COMPATIBILITY AnyNewerVersion) ++ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_CONFIGNAME}-config.cmake ++ ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_CONFIGNAME}-config-version.cmake ++ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${EXPORT_NAME}) + endif() + + #============================================================================ +@@ -1335,7 +1342,7 @@ + if(ZLIB_ENABLE_TESTS) + enable_testing() + +- if(ZLIB_BUILD_SHARED_LIBS) ++ if(BUILD_SHARED_LIBS) + if(ZLIBNG_ENABLE_TESTS) + message(STATUS "Disabling zlib-ng tests because shared libraries are enabled") + set(ZLIBNG_ENABLE_TESTS OFF) +@@ -1399,12 +1406,19 @@ + + FEATURE_SUMMARY(WHAT ALL INCLUDE_QUIET_PACKAGES) + +-if(ENABLE_SOLUTION_FOLDERS) +- set_target_properties(${ZLIB_INSTALL_LIBRARIES} PROPERTIES FOLDER "3rdparty") +-endif() ++#============================================================================ ++# CPack ++#============================================================================ ++set(CPACK_GENERATOR "TGZ") ++set(CPACK_SOURCE_GENERATOR "TGZ") ++set(CPACK_SOURCE_IGNORE_FILES .git/ _CPack_Packages/ "${PROJECT_BINARY_DIR}/") ++ ++set(CPACK_PACKAGE_NAME "zlib${SUFFIX}") ++set(CPACK_PACKAGE_VERSION ${ZLIB_FULL_VERSION}) ++set(CPACK_PACKAGE_DIRECTORY "${PROJECT_BINARY_DIR}/package") + +-if(NOT BUILD_SHARED_LIBS) +- ocv_install_target(${ZLIB_INSTALL_LIBRARIES} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) ++if("${PROJECT_BINARY_DIR}" STREQUAL "${PROJECT_SOURCE_DIR}") ++ message(WARNING "Building to source folder is not recommended. 
Cpack will be unable to generate source package.") + endif() + +-ocv_install_3rdparty_licenses(${ZLIB_INSTALL_LIBRARIES} LICENSE.md) ++include(CPack) diff --git a/3rdparty/zlib-ng/trees.c b/3rdparty/zlib-ng/trees.c index 5bb88389ba..9f2f49137f 100644 --- a/3rdparty/zlib-ng/trees.c +++ b/3rdparty/zlib-ng/trees.c @@ -1,5 +1,5 @@ /* trees.c -- output deflated data using Huffman coding - * Copyright (C) 1995-2021 Jean-loup Gailly + * Copyright (C) 1995-2024 Jean-loup Gailly * detect_data_type() function provided freely by Cosmin Truta, 2006 * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -75,7 +75,6 @@ static int build_bl_tree (deflate_state *s); static void send_all_trees (deflate_state *s, int lcodes, int dcodes, int blcodes); static void compress_block (deflate_state *s, const ct_data *ltree, const ct_data *dtree); static int detect_data_type (deflate_state *s); -static void bi_flush (deflate_state *s); /* =========================================================================== * Initialize the tree data structures for a new zlib stream. @@ -609,13 +608,6 @@ void Z_INTERNAL zng_tr_stored_block(deflate_state *s, char *buf, uint32_t stored } } -/* =========================================================================== - * Flush the bits in the bit buffer to pending output (leaves at most 7 bits) - */ -void Z_INTERNAL zng_tr_flush_bits(deflate_state *s) { - bi_flush(s); -} - /* =========================================================================== * Send one empty static block to give enough lookahead for inflate. * This takes 10 bits, of which 7 may remain in the bit buffer. @@ -623,7 +615,7 @@ void Z_INTERNAL zng_tr_flush_bits(deflate_state *s) { void Z_INTERNAL zng_tr_align(deflate_state *s) { zng_tr_emit_tree(s, STATIC_TREES, 0); zng_tr_emit_end_block(s, static_ltree, 0); - bi_flush(s); + zng_tr_flush_bits(s); } /* =========================================================================== @@ -718,21 +710,30 @@ static void compress_block(deflate_state *s, const ct_data *ltree, const ct_data /* dtree: distance tree */ unsigned dist; /* distance of matched string */ int lc; /* match length or unmatched char (if dist == 0) */ - unsigned sx = 0; /* running index in sym_buf */ + unsigned sx = 0; /* running index in symbol buffers */ if (s->sym_next != 0) { do { +#ifdef LIT_MEM + dist = s->d_buf[sx]; + lc = s->l_buf[sx++]; +#else dist = s->sym_buf[sx++] & 0xff; dist += (unsigned)(s->sym_buf[sx++] & 0xff) << 8; lc = s->sym_buf[sx++]; +#endif if (dist == 0) { zng_emit_lit(s, ltree, lc); } else { zng_emit_dist(s, ltree, dtree, lc, dist); } /* literal or match pair ? */ - /* Check that the overlay between pending_buf and sym_buf is ok: */ + /* Check for no overlay of pending_buf on needed symbols */ +#ifdef LIT_MEM + Assert(s->pending < 2 * (s->lit_bufsize + sx), "pending_buf overflow"); +#else Assert(s->pending < s->lit_bufsize + sx, "pending_buf overflow"); +#endif } while (sx < s->sym_next); } @@ -781,27 +782,26 @@ static int detect_data_type(deflate_state *s) { /* =========================================================================== * Flush the bit buffer, keeping at most 7 bits in it. 
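For context on the `LIT_MEM` switch toggled in the compress_block hunk above: the classic layout packs each (distance, length/literal) pair into three bytes of `sym_buf`, while `LIT_MEM` keeps two parallel arrays indexed by one symbol counter. A hedged sketch of the two decodings (illustrative struct, not the real `deflate_state`):

```c
/* Sketch of the two symbol layouts toggled by LIT_MEM; field names follow
 * the hunk, the surrounding struct is illustrative only. */
#include <stdint.h>

typedef struct {
    uint16_t *d_buf;   /* LIT_MEM: one distance per symbol */
    uint8_t  *l_buf;   /* LIT_MEM: one length/literal byte per symbol */
    uint8_t  *sym_buf; /* classic: three packed bytes per symbol */
} sym_bufs;

/* Classic layout: dist low byte, dist high byte, then length/literal. */
static void read_sym_packed(const sym_bufs *s, unsigned *sx,
                            unsigned *dist, int *lc) {
    *dist  = s->sym_buf[(*sx)++];
    *dist += (unsigned)s->sym_buf[(*sx)++] << 8;
    *lc    = s->sym_buf[(*sx)++];
}

/* LIT_MEM layout: parallel arrays share the same index. */
static void read_sym_split(const sym_bufs *s, unsigned *sx,
                           unsigned *dist, int *lc) {
    *dist = s->d_buf[*sx];
    *lc   = s->l_buf[(*sx)++];
}
```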
*/ -static void bi_flush(deflate_state *s) { - if (s->bi_valid == 64) { - put_uint64(s, s->bi_buf); - s->bi_buf = 0; - s->bi_valid = 0; - } else { - if (s->bi_valid >= 32) { - put_uint32(s, (uint32_t)s->bi_buf); - s->bi_buf >>= 32; - s->bi_valid -= 32; - } - if (s->bi_valid >= 16) { - put_short(s, (uint16_t)s->bi_buf); - s->bi_buf >>= 16; - s->bi_valid -= 16; - } - if (s->bi_valid >= 8) { - put_byte(s, s->bi_buf); - s->bi_buf >>= 8; - s->bi_valid -= 8; - } +void Z_INTERNAL zng_tr_flush_bits(deflate_state *s) { + if (s->bi_valid >= 48) { + put_uint32(s, (uint32_t)s->bi_buf); + put_short(s, (uint16_t)(s->bi_buf >> 32)); + s->bi_buf >>= 48; + s->bi_valid -= 48; + } else if (s->bi_valid >= 32) { + put_uint32(s, (uint32_t)s->bi_buf); + s->bi_buf >>= 32; + s->bi_valid -= 32; + } + if (s->bi_valid >= 16) { + put_short(s, (uint16_t)s->bi_buf); + s->bi_buf >>= 16; + s->bi_valid -= 16; + } + if (s->bi_valid >= 8) { + put_byte(s, s->bi_buf); + s->bi_buf >>= 8; + s->bi_valid -= 8; } } diff --git a/3rdparty/zlib-ng/win32/Makefile.a64 b/3rdparty/zlib-ng/win32/Makefile.a64 new file mode 100644 index 0000000000..9f8d6fb7fa --- /dev/null +++ b/3rdparty/zlib-ng/win32/Makefile.a64 @@ -0,0 +1,252 @@ +# Makefile for zlib using Microsoft (Visual) C +# zlib is copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler +# +# Usage: +# nmake -f win32/Makefile.a64 (standard build) +# nmake -f win32/Makefile.a64 LOC=-DFOO (nonstandard build) + +# The toplevel directory of the source tree. +# +TOP = . + +# optional build flags +LOC = + +# variables +STATICLIB = zlib.lib +SHAREDLIB = zlib1.dll +IMPLIB = zdll.lib +SYMBOL_PREFIX = + +CC = cl +LD = link +AR = lib +RC = rc +CP = copy /y +INCLUDES = -I$(TOP) -I$(TOP)/arch/arm -I$(TOP)/arch/generic +CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) $(INCLUDES) +WFLAGS = \ + -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ + -D_CRT_SECURE_NO_DEPRECATE \ + -D_CRT_NONSTDC_NO_DEPRECATE \ + -DARM_FEATURES \ + -DARM_NEON_HASLD4 \ + # +LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest +ARFLAGS = -nologo +RCFLAGS = /dARM64 /r +DEFFILE = zlib.def +RCFILE = zlib1.rc +RESFILE = zlib1.res +WITH_GZFILEOP = yes +ZLIB_COMPAT = +SUFFIX = + +OBJS = \ + adler32.obj \ + adler32_c.obj \ + adler32_fold_c.obj \ + arm_features.obj \ + chunkset_c.obj \ + compare256_c.obj \ + compress.obj \ + cpu_features.obj \ + crc32.obj \ + crc32_braid_c.obj \ + crc32_braid_comb.obj \ + crc32_fold_c.obj \ + deflate.obj \ + deflate_fast.obj \ + deflate_huff.obj \ + deflate_medium.obj \ + deflate_quick.obj \ + deflate_rle.obj \ + deflate_slow.obj \ + deflate_stored.obj \ + functable.obj \ + infback.obj \ + inflate.obj \ + inftrees.obj \ + insert_string.obj \ + insert_string_roll.obj \ + slide_hash_c.obj \ + trees.obj \ + uncompr.obj \ + zutil.obj \ + # +!if "$(ZLIB_COMPAT)" != "" +WITH_GZFILEOP = yes +WFLAGS = $(WFLAGS) -DZLIB_COMPAT +DEFFILE = zlibcompat.def +!else +STATICLIB = zlib-ng.lib +SHAREDLIB = zlib-ng1.dll +IMPLIB = zngdll.lib +DEFFILE = zlib-ng.def +RCFILE = zlib-ng1.rc +RESFILE = zlib-ng1.res +SUFFIX = -ng +!endif + +!if "$(WITH_GZFILEOP)" != "" +WFLAGS = $(WFLAGS) -DWITH_GZFILEOP +OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj +!endif + +WFLAGS = $(WFLAGS) \ + -DARM_ACLE \ + -D__ARM_NEON__=1 \ + -DARM_NEON \ + -DARM_NOCHECK_NEON \ + # +OBJS = $(OBJS) crc32_acle.obj adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj + +# targets +all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \ + example.exe minigzip.exe example_d.exe minigzip_d.exe + +!if "$(SYMBOL_PREFIX)" != "" 
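The rewritten `zng_tr_flush_bits` above drains the 64-bit bit buffer in 48/32/16/8-bit steps, leaving at most 7 bits pending. A standalone sketch of the same drain against a plain byte sink, assuming the little-endian byte order that `put_uint32`/`put_short`/`put_byte` emit:

```c
/* Standalone sketch of the bit-buffer drain in zng_tr_flush_bits,
 * writing little-endian into a caller-provided byte sink. */
#include <stdint.h>

typedef struct { uint8_t *out; uint64_t bi_buf; int bi_valid; } bitbuf;

static void put_bytes(bitbuf *b, uint64_t v, int nbytes) {
    for (int i = 0; i < nbytes; i++)
        *b->out++ = (uint8_t)(v >> (8 * i));
}

static void flush_bits_sketch(bitbuf *b) {
    if (b->bi_valid >= 48)      { put_bytes(b, b->bi_buf, 6); b->bi_buf >>= 48; b->bi_valid -= 48; }
    else if (b->bi_valid >= 32) { put_bytes(b, b->bi_buf, 4); b->bi_buf >>= 32; b->bi_valid -= 32; }
    if (b->bi_valid >= 16)      { put_bytes(b, b->bi_buf, 2); b->bi_buf >>= 16; b->bi_valid -= 16; }
    if (b->bi_valid >= 8)       { put_bytes(b, b->bi_buf, 1); b->bi_buf >>= 8;  b->bi_valid -= 8;  }
    /* At most 7 bits remain, matching the comment above the function. */
}
```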
+zlib_name_mangling$(SUFFIX).h: zlib_name_mangling$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib_name_mangling$(SUFFIX).h.in zlib_name_mangling$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" +!else +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling.h.empty + $(CP) $(TOP)\zlib_name_mangling.h.empty zlib_name_mangling$(SUFFIX).h +!endif + +zlib$(SUFFIX).h: zlib$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib$(SUFFIX).h.in zlib$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +gzread.c: gzread.c.in + cscript $(TOP)\win32\replace.vbs $(TOP)\gzread.c.in gzread.c "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +zconf: $(TOP)/zconf$(SUFFIX).h.in $(TOP)/zlib$(SUFFIX).h $(TOP)/zlib_name_mangling$(SUFFIX).h + $(CP) $(TOP)\zconf$(SUFFIX).h.in $(TOP)\zconf$(SUFFIX).h + +$(TOP)/win32/$(DEFFILE): $(TOP)/win32/$(DEFFILE).in + cscript $(TOP)\win32\replace.vbs $(TOP)/win32/$(DEFFILE).in $(TOP)/win32/$(DEFFILE) "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +$(STATICLIB): zconf $(OBJS) + $(AR) $(ARFLAGS) -out:$@ $(OBJS) + +$(IMPLIB): $(SHAREDLIB) + +$(SHAREDLIB): zconf $(TOP)/win32/$(DEFFILE) $(OBJS) $(RESFILE) + $(LD) $(LDFLAGS) -def:$(TOP)/win32/$(DEFFILE) -dll -implib:$(IMPLIB) \ + -out:$@ -base:0x55A4C0000 $(OBJS) $(RESFILE) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;2 + +example.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +example_d.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip_d.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +{$(TOP)}.c.obj: + $(CC) -c $(WFLAGS) $(CFLAGS) $< + +gzlib2.obj: gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzlib2.obj gzlib.c + +gzread2.obj: gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzread2.obj gzread.c + +gzwrite2.obj: gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzwrite2.obj gzwrite.c + +{$(TOP)/arch/arm}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/arch/generic}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/test}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP $< + +$(TOP)/zconf$(SUFFIX).h: zconf + +adler32.obj: $(TOP)/adler32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_c.obj: $(TOP)/arch/generic/adler32_c.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_fold_c.obj: $(TOP)/arch/generic/adler32_fold_c.c $(TOP)/zbuild.h $(TOP)/functable.h +chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h +compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zutil_p.h 
$(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compress.obj: $(TOP)/compress.c $(TOP)/zbuild.h $(TOP)/zutil.h +cpu_features.obj: $(TOP)/cpu_features.c $(TOP)/cpu_features.h $(TOP)/zbuild.h +crc32.obj: $(TOP)/crc32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/crc32_braid_tbl.h +crc32_braid_c.obj: $(TOP)/arch/generic/crc32_braid_c.c $(TOP)/zbuild.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h +crc32_braid_comb.obj: $(TOP)/crc32_braid_comb.c $(TOP)/zutil.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h $(TOP)/crc32_braid_comb_p.h +crc32_fold_c.obj: $(TOP)/arch/generic/crc32_fold_c.c $(TOP)/zbuild.h $(TOP)/crc32.h $(TOP)/functable.h $(TOP)/zutil.h +deflate.obj: $(TOP)/deflate.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_fast.obj: $(TOP)/deflate_fast.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_huff.obj: $(TOP)/deflate_huff.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_medium.obj: $(TOP)/deflate_medium.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zutil_p.h +deflate_rle.obj: $(TOP)/deflate_rle.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/compare256_rle.h +deflate_slow.obj: $(TOP)/deflate_slow.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_stored.obj: $(TOP)/deflate_stored.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +functable.obj: $(TOP)/functable.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/cpu_features.h $(TOP)/arch/arm/arm_features.h $(TOP)/arch_functions.h +gzlib.obj: $(TOP)/gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzread.obj: $(TOP)/gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzwrite.obj: $(TOP)/gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +infback.obj: $(TOP)/infback.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h +inflate.obj: $(TOP)/inflate.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h $(TOP)/inffixed_tbl.h +inftrees.obj: $(TOP)/inftrees.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h +insert_string.obj: $(TOP)/insert_string.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +insert_string_roll.obj: $(TOP)/insert_string_roll.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +slide_hash_c.obj: $(TOP)/arch/generic/slide_hash_c.c $(TOP)/zbuild.h $(TOP)/deflate.h +slide_hash_neon.obj: $(TOP)/arch/arm/slide_hash_neon.c $(TOP)/arch/arm/neon_intrins.h $(TOP)/zbuild.h $(TOP)/deflate.h +trees.obj: $(TOP)/trees.c $(TOP)/trees.h $(TOP)/trees_emit.h $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/trees_tbl.h +uncompr.obj: $(TOP)/uncompr.c $(TOP)/zbuild.h $(TOP)/zutil.h +zutil.obj: $(TOP)/zutil.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/zutil_p.h + +$(RESFILE): $(TOP)/win32/$(RCFILE) + $(RC) $(RCFLAGS) /fo$@ $(TOP)/win32/$(RCFILE) + +# testing +test: example.exe minigzip.exe + example + echo hello world | minigzip | minigzip -d + +testdll: example_d.exe minigzip_d.exe + example_d + echo hello world | minigzip_d | minigzip_d -d + +example.obj: $(TOP)/test/example.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h $(TOP)/deflate.h $(TOP)/test/test_shared_ng.h + +minigzip.obj: $(TOP)/test/minigzip.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h + + +# 
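The `test:` target above round-trips "hello world" through minigzip and back. An in-process analog using the classic zlib API, assuming a `ZLIB_COMPAT` build (the native `-ng` API would use `zng_`-prefixed equivalents):

```c
/* In-process analog of: echo hello world | minigzip | minigzip -d
 * Assumes a ZLIB_COMPAT build exposing the classic zlib.h API. */
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "zlib.h"

int main(void) {
    const char msg[] = "hello world";
    unsigned char comp[128], decomp[128];
    uLongf clen = sizeof(comp), dlen = sizeof(decomp);

    assert(compress2(comp, &clen, (const Bytef *)msg, sizeof(msg), 6) == Z_OK);
    assert(uncompress(decomp, &dlen, comp, clen) == Z_OK);
    assert(dlen == sizeof(msg) && memcmp(decomp, msg, dlen) == 0);
    printf("round trip OK (%lu -> %lu -> %lu bytes)\n",
           (unsigned long)sizeof(msg), (unsigned long)clen, (unsigned long)dlen);
    return 0;
}
```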
cleanup +clean: + -del $(STATICLIB) + -del $(SHAREDLIB) + -del $(IMPLIB) + -del *.obj + -del *.res + -del *.exp + -del *.exe + -del *.pdb + -del *.manifest + +distclean: clean + -del zconf$(SUFFIX).h + -del zlib$(SUFFIX).h + -del zlib_name_mangling$(SUFFIX).h + -del $(TOP)\win32\zlib.def + -del $(TOP)\win32\zlibcompat.def + -del $(TOP)\win32\zlib-ng.def + -del gzread.c diff --git a/3rdparty/zlib-ng/win32/Makefile.arm b/3rdparty/zlib-ng/win32/Makefile.arm new file mode 100644 index 0000000000..cab999dfe0 --- /dev/null +++ b/3rdparty/zlib-ng/win32/Makefile.arm @@ -0,0 +1,272 @@ +# Makefile for zlib using Microsoft (Visual) C +# zlib is copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler +# +# Usage: +# nmake -f win32/Makefile.arm (standard build) +# nmake -f win32/Makefile.arm LOC=-DFOO (nonstandard build) + +# The toplevel directory of the source tree. +# +TOP = . + +# optional build flags +LOC = + +# variables +STATICLIB = zlib.lib +SHAREDLIB = zlib1.dll +IMPLIB = zdll.lib +SYMBOL_PREFIX = + +CC = cl +LD = link +AR = lib +RC = rc +CP = copy /y +INCLUDES = -I$(TOP) -I$(TOP)/arch/arm -I$(TOP)/arch/generic +CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) $(INCLUDES) +WFLAGS = \ + -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \ + -D_CRT_SECURE_NO_DEPRECATE \ + -D_CRT_NONSTDC_NO_DEPRECATE \ + -DARM_FEATURES \ + -DARM_NEON_HASLD4 \ + # +LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest +ARFLAGS = -nologo +RCFLAGS = /dARM /r +DEFFILE = zlib.def +RCFILE = zlib1.rc +RESFILE = zlib1.res +WITH_GZFILEOP = yes +ZLIB_COMPAT = +WITH_ACLE = +WITH_NEON = +WITH_ARMV6 = +WITH_VFPV3 = +NEON_ARCH = /arch:VFPv4 +SUFFIX = + +OBJS = \ + adler32.obj \ + adler32_c.obj \ + adler32_fold_c.obj \ + arm_features.obj \ + chunkset_c.obj \ + compare256_c.obj \ + compress.obj \ + cpu_features.obj \ + crc32.obj \ + crc32_braid_c.obj \ + crc32_braid_comb.obj \ + crc32_fold_c.obj \ + deflate.obj \ + deflate_fast.obj \ + deflate_huff.obj \ + deflate_medium.obj \ + deflate_quick.obj \ + deflate_rle.obj \ + deflate_slow.obj \ + deflate_stored.obj \ + functable.obj \ + infback.obj \ + inflate.obj \ + inftrees.obj \ + insert_string.obj \ + insert_string_roll.obj \ + slide_hash_c.obj \ + trees.obj \ + uncompr.obj \ + zutil.obj \ + # +!if "$(ZLIB_COMPAT)" != "" +WITH_GZFILEOP = yes +WFLAGS = $(WFLAGS) -DZLIB_COMPAT +DEFFILE = zlibcompat.def +!else +STATICLIB = zlib-ng.lib +SHAREDLIB = zlib-ng1.dll +IMPLIB = zngdll.lib +DEFFILE = zlib-ng.def +RCFILE = zlib-ng1.rc +RESFILE = zlib-ng1.res +SUFFIX = -ng +!endif + +!if "$(WITH_GZFILEOP)" != "" +WFLAGS = $(WFLAGS) -DWITH_GZFILEOP +OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj +!endif + +!if "$(WITH_ACLE)" != "" +WFLAGS = $(WFLAGS) -DARM_ACLE +OBJS = $(OBJS) crc32_acle.obj +!endif +!if "$(WITH_VFPV3)" != "" +NEON_ARCH = /arch:VFPv3 +!endif +!if "$(WITH_NEON)" != "" +CFLAGS = $(CFLAGS) $(NEON_ARCH) +WFLAGS = $(WFLAGS) \ + -D__ARM_NEON__=1 \ + -DARM_NEON \ + -DARM_NOCHECK_NEON \ + # +OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj +!endif +!if "$(WITH_ARMV6)" != "" +WFLAGS = $(WFLAGS) \ + -DARM_SIMD \ + -DARM_NOCHECK_SIMD \ + # +OBJS = $(OBJS) slide_hash_armv6.obj +!endif + +# targets +all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \ + example.exe minigzip.exe example_d.exe minigzip_d.exe + +!if "$(SYMBOL_PREFIX)" != "" +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib_name_mangling$(SUFFIX).h.in zlib_name_mangling$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" 
+!else +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling.h.empty + $(CP) $(TOP)\zlib_name_mangling.h.empty zlib_name_mangling$(SUFFIX).h +!endif + +zlib$(SUFFIX).h: zlib$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib$(SUFFIX).h.in zlib$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +gzread.c: gzread.c.in + cscript $(TOP)\win32\replace.vbs $(TOP)\gzread.c.in gzread.c "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +zconf: $(TOP)/zconf$(SUFFIX).h.in $(TOP)/zlib$(SUFFIX).h $(TOP)/zlib_name_mangling$(SUFFIX).h + $(CP) $(TOP)\zconf$(SUFFIX).h.in $(TOP)\zconf$(SUFFIX).h + +$(TOP)/win32/$(DEFFILE): $(TOP)/win32/$(DEFFILE).in + cscript $(TOP)\win32\replace.vbs $(TOP)/win32/$(DEFFILE).in $(TOP)/win32/$(DEFFILE) "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +$(STATICLIB): zconf $(OBJS) + $(AR) $(ARFLAGS) -out:$@ $(OBJS) + +$(IMPLIB): $(SHAREDLIB) + +$(SHAREDLIB): zconf $(TOP)/win32/$(DEFFILE) $(OBJS) $(RESFILE) + $(LD) $(LDFLAGS) -def:$(TOP)/win32/$(DEFFILE) -dll -implib:$(IMPLIB) \ + -out:$@ -base:0x5A4C0000 $(OBJS) $(RESFILE) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;2 + +example.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +example_d.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip_d.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +{$(TOP)}.c.obj: + $(CC) -c $(WFLAGS) $(CFLAGS) $< + +gzlib2.obj: gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzlib2.obj gzlib.c + +gzread2.obj: gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzread2.obj gzread.c + +gzwrite2.obj: gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzwrite2.obj gzwrite.c + +{$(TOP)/arch/arm}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/arch/generic}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/test}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP $< + +$(TOP)/zconf$(SUFFIX).h: zconf + +adler32.obj: $(TOP)/adler32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_c.obj: $(TOP)/arch/generic/adler32_c.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_fold_c.obj: $(TOP)/arch/generic/adler32_fold_c.c $(TOP)/zbuild.h $(TOP)/functable.h +chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h +compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compress.obj: $(TOP)/compress.c $(TOP)/zbuild.h $(TOP)/zutil.h +cpu_features.obj: $(TOP)/cpu_features.c $(TOP)/cpu_features.h $(TOP)/zbuild.h +crc32.obj: 
$(TOP)/crc32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/crc32_braid_tbl.h +crc32_braid_c.obj: $(TOP)/arch/generic/crc32_braid_c.c $(TOP)/zbuild.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h +crc32_braid_comb.obj: $(TOP)/crc32_braid_comb.c $(TOP)/zutil.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h $(TOP)/crc32_braid_comb_p.h +crc32_fold_c.obj: $(TOP)/arch/generic/crc32_fold_c.c $(TOP)/zbuild.h $(TOP)/crc32.h $(TOP)/functable.h $(TOP)/zutil.h +deflate.obj: $(TOP)/deflate.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_fast.obj: $(TOP)/deflate_fast.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_huff.obj: $(TOP)/deflate_huff.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_medium.obj: $(TOP)/deflate_medium.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zutil_p.h +deflate_rle.obj: $(TOP)/deflate_rle.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/compare256_rle.h +deflate_slow.obj: $(TOP)/deflate_slow.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_stored.obj: $(TOP)/deflate_stored.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +functable.obj: $(TOP)/functable.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/cpu_features.h $(TOP)/arch/arm/arm_features.h $(TOP)/arch_functions.h +gzlib.obj: $(TOP)/gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzread.obj: $(TOP)/gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzwrite.obj: $(TOP)/gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +infback.obj: $(TOP)/infback.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h +inflate.obj: $(TOP)/inflate.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h $(TOP)/inffixed_tbl.h +inftrees.obj: $(TOP)/inftrees.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h +insert_string.obj: $(TOP)/insert_string.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +insert_string_roll.obj: $(TOP)/insert_string_roll.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +slide_hash_c.obj: $(TOP)/arch/generic/slide_hash_c.c $(TOP)/zbuild.h $(TOP)/deflate.h +trees.obj: $(TOP)/trees.c $(TOP)/trees.h $(TOP)/trees_emit.h $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/trees_tbl.h +uncompr.obj: $(TOP)/uncompr.c $(TOP)/zbuild.h $(TOP)/zutil.h +zutil.obj: $(TOP)/zutil.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/zutil_p.h + +$(RESFILE): $(TOP)/win32/$(RCFILE) + $(RC) $(RCFLAGS) /fo$@ $(TOP)/win32/$(RCFILE) + +# testing +test: example.exe minigzip.exe + example + echo hello world | minigzip | minigzip -d + +testdll: example_d.exe minigzip_d.exe + example_d + echo hello world | minigzip_d | minigzip_d -d + +example.obj: $(TOP)/test/example.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h $(TOP)/deflate.h $(TOP)/test/test_shared_ng.h + +minigzip.obj: $(TOP)/test/minigzip.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h + + +# cleanup +clean: + -del $(STATICLIB) + -del $(SHAREDLIB) + -del $(IMPLIB) + -del *.obj + -del *.res + -del *.exp + -del *.exe + -del *.pdb + -del *.manifest + +distclean: clean + -del zconf$(SUFFIX).h + -del zlib$(SUFFIX).h + -del zlib_name_mangling$(SUFFIX).h + -del $(TOP)\win32\zlib.def + -del $(TOP)\win32\zlibcompat.def + -del 
$(TOP)\win32\zlib-ng.def + -del gzread.c diff --git a/3rdparty/zlib-ng/win32/Makefile.msc b/3rdparty/zlib-ng/win32/Makefile.msc new file mode 100644 index 0000000000..8392fe46e7 --- /dev/null +++ b/3rdparty/zlib-ng/win32/Makefile.msc @@ -0,0 +1,292 @@ +# Makefile for zlib using Microsoft (Visual) C +# zlib is copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler +# +# Usage: +# nmake -f win32/Makefile.msc (standard build) +# nmake -f win32/Makefile.msc LOC=-DFOO (nonstandard build) + +# The toplevel directory of the source tree. +# +TOP = . + +# optional build flags +LOC = + +# variables +STATICLIB = zlib.lib +SHAREDLIB = zlib1.dll +IMPLIB = zdll.lib +SYMBOL_PREFIX = + +CC = cl +CXX = cl +LD = link +AR = lib +RC = rc +CP = copy /y +INCLUDES = -I$(TOP) -I$(TOP)/arch/x86 -I$(TOP)/arch/generic +CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) $(INCLUDES) +CXXFLAGS = -nologo -EHsc -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) $(INCLUDES) +WFLAGS = \ + -D_CRT_SECURE_NO_DEPRECATE \ + -D_CRT_NONSTDC_NO_DEPRECATE \ + -DX86_FEATURES \ + -DX86_PCLMULQDQ_CRC \ + -DX86_SSE2 \ + -DX86_SSE42 \ + -DX86_SSSE3 \ + -DX86_AVX2 + +LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest +ARFLAGS = -nologo +RCFLAGS = /dWIN32 /r +DEFFILE = zlib.def +RCFILE = zlib1.rc +RESFILE = zlib1.res +WITH_GZFILEOP = yes +ZLIB_COMPAT = +SUFFIX = + +OBJS = \ + adler32.obj \ + adler32_c.obj \ + adler32_avx2.obj \ + adler32_avx512.obj \ + adler32_avx512_vnni.obj \ + adler32_sse42.obj \ + adler32_ssse3.obj \ + adler32_fold_c.obj \ + chunkset_c.obj \ + chunkset_avx2.obj \ + chunkset_sse2.obj \ + chunkset_ssse3.obj \ + compare256_c.obj \ + compare256_avx2.obj \ + compare256_sse2.obj \ + compress.obj \ + cpu_features.obj \ + crc32.obj \ + crc32_braid_c.obj \ + crc32_braid_comb.obj \ + crc32_fold_c.obj \ + crc32_pclmulqdq.obj \ + deflate.obj \ + deflate_fast.obj \ + deflate_huff.obj \ + deflate_medium.obj \ + deflate_quick.obj \ + deflate_rle.obj \ + deflate_slow.obj \ + deflate_stored.obj \ + functable.obj \ + infback.obj \ + inflate.obj \ + inftrees.obj \ + insert_string.obj \ + insert_string_roll.obj \ + slide_hash_c.obj \ + slide_hash_avx2.obj \ + slide_hash_sse2.obj \ + trees.obj \ + uncompr.obj \ + zutil.obj \ + x86_features.obj \ + # +!if "$(ZLIB_COMPAT)" != "" +WITH_GZFILEOP = yes +WFLAGS = $(WFLAGS) -DZLIB_COMPAT +DEFFILE = zlibcompat.def +!else +STATICLIB = zlib-ng.lib +SHAREDLIB = zlib-ng1.dll +IMPLIB = zngdll.lib +DEFFILE = zlib-ng.def +RCFILE = zlib-ng1.rc +RESFILE = zlib-ng1.res +SUFFIX = -ng +!endif + +!if "$(WITH_GZFILEOP)" != "" +WFLAGS = $(WFLAGS) -DWITH_GZFILEOP +OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj +!endif + +# targets +all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \ + example.exe minigzip.exe example_d.exe minigzip_d.exe + +!if "$(SYMBOL_PREFIX)" != "" +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib_name_mangling$(SUFFIX).h.in zlib_name_mangling$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" +!else +zlib_name_mangling$(SUFFIX).h: zlib_name_mangling.h.empty + $(CP) $(TOP)\zlib_name_mangling.h.empty zlib_name_mangling$(SUFFIX).h +!endif + +zlib$(SUFFIX).h: zlib$(SUFFIX).h.in + cscript $(TOP)\win32\replace.vbs $(TOP)\zlib$(SUFFIX).h.in zlib$(SUFFIX).h "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +gzread.c: gzread.c.in + cscript $(TOP)\win32\replace.vbs $(TOP)\gzread.c.in gzread.c "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +zconf: $(TOP)/zconf$(SUFFIX).h.in $(TOP)/zlib$(SUFFIX).h $(TOP)/zlib_name_mangling$(SUFFIX).h + $(CP) 
$(TOP)\zconf$(SUFFIX).h.in $(TOP)\zconf$(SUFFIX).h + +$(TOP)/win32/$(DEFFILE): $(TOP)/win32/$(DEFFILE).in + cscript $(TOP)\win32\replace.vbs $(TOP)/win32/$(DEFFILE).in $(TOP)/win32/$(DEFFILE) "@ZLIB_SYMBOL_PREFIX@" "$(SYMBOL_PREFIX)" + +$(STATICLIB): zconf $(OBJS) + $(AR) $(ARFLAGS) -out:$@ $(OBJS) + +$(IMPLIB): $(SHAREDLIB) + +$(SHAREDLIB): zconf $(TOP)/win32/$(DEFFILE) $(OBJS) $(RESFILE) + $(LD) $(LDFLAGS) -def:$(TOP)/win32/$(DEFFILE) -dll -implib:$(IMPLIB) \ + -out:$@ $(OBJS) $(RESFILE) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;2 + +depcheck.exe: depcheck.obj + $(LD) $(LDFLAGS) depcheck.obj + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +example.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + $(LD) $(LDFLAGS) minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(STATICLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +example_d.exe: example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ example.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +minigzip_d.exe: minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + $(LD) $(LDFLAGS) -out:$@ minigzip.obj gzlib2.obj gzread2.obj gzwrite2.obj $(IMPLIB) + if exist $@.manifest \ + mt -nologo -manifest $@.manifest -outputresource:$@;1 + +{$(TOP)}.c.obj: + $(CC) -c $(WFLAGS) $(CFLAGS) $< + +gzlib2.obj: gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzlib2.obj gzlib.c + +gzread2.obj: gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzread2.obj gzread.c + +gzwrite2.obj: gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h + $(CC) -c $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP -Fogzwrite2.obj gzwrite.c + +{$(TOP)/arch/x86}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/arch/generic}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) $< + +{$(TOP)/test}.c.obj: + $(CC) -c -I$(TOP) $(WFLAGS) $(CFLAGS) -DWITH_GZFILEOP $< + +$(TOP)/zconf$(SUFFIX).h: zconf + +{$(TOP)/win32}.cpp.obj: + $(CXX) -c -I$(TOP) $(WFLAGS) $(CXXFLAGS) $< + +adler32.obj: $(TOP)/adler32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_c.obj: $(TOP)/arch/generic/adler32_c.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/adler32_p.h +adler32_avx2.obj: $(TOP)/arch/x86/adler32_avx2.c $(TOP)/zbuild.h $(TOP)/adler32_p.h $(TOP)/arch/x86/adler32_avx2_p.h $(TOP)/arch/x86/x86_intrins.h +adler32_avx512.obj: $(TOP)/arch/x86/adler32_avx512.c $(TOP)/zbuild.h $(TOP)/arch_functions.h $(TOP)/adler32_p.h $(TOP)/arch/x86/adler32_avx512_p.h $(TOP)/arch/x86/x86_intrins.h +adler32_avx512_vnni.obj: $(TOP)/arch/x86/adler32_avx512_vnni.c $(TOP)/zbuild.h $(TOP)/arch_functions.h $(TOP)/adler32_p.h $(TOP)/arch/x86/adler32_avx512_p.h \ + $(TOP)/arch/x86/adler32_avx2_p.h $(TOP)/arch/x86/x86_intrins.h +adler32_sse42.obj: $(TOP)/arch/x86/adler32_sse42.c $(TOP)/zbuild.h $(TOP)/adler32_p.h \ + $(TOP)/arch/x86/adler32_ssse3_p.h +adler32_ssse3.obj: $(TOP)/arch/x86/adler32_ssse3.c $(TOP)/zbuild.h $(TOP)/adler32_p.h \ + $(TOP)/arch/x86/adler32_ssse3_p.h +adler32_fold_c.obj: 
$(TOP)/arch/generic/adler32_fold_c.c $(TOP)/zbuild.h $(TOP)/functable.h +chunkset_c.obj: $(TOP)/arch/generic/chunkset_c.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h +chunkset_avx2.obj: $(TOP)/arch/x86/chunkset_avx2.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h $(TOP)/arch/generic/chunk_permute_table.h +chunkset_sse2.obj: $(TOP)/arch/x86/chunkset_sse2.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h +chunkset_ssse3.obj: $(TOP)/arch/x86/chunkset_ssse3.c $(TOP)/zbuild.h $(TOP)/chunkset_tpl.h $(TOP)/inffast_tpl.h $(TOP)/arch/generic/chunk_permute_table.h +compare256_c.obj: $(TOP)/arch/generic/compare256_c.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compare256_avx2.obj: $(TOP)/arch/x86/compare256_avx2.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compare256_sse2.obj: $(TOP)/arch/x86/compare256_sse2.c $(TOP)/zbuild.h $(TOP)/zutil_p.h $(TOP)/deflate.h $(TOP)/fallback_builtins.h $(TOP)/match_tpl.h +compress.obj: $(TOP)/compress.c $(TOP)/zbuild.h $(TOP)/zutil.h +cpu_features.obj: $(TOP)/cpu_features.c $(TOP)/cpu_features.h $(TOP)/zbuild.h +crc32.obj: $(TOP)/crc32.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/crc32_braid_tbl.h +crc32_braid_c.obj: $(TOP)/arch/generic/crc32_braid_c.c $(TOP)/zbuild.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h +crc32_braid_comb.obj: $(TOP)/crc32_braid_comb.c $(TOP)/zutil.h $(TOP)/crc32_braid_p.h $(TOP)/crc32_braid_tbl.h $(TOP)/crc32_braid_comb_p.h +crc32_fold_c.obj: $(TOP)/arch/generic/crc32_fold_c.c $(TOP)/zbuild.h $(TOP)/crc32.h $(TOP)/functable.h $(TOP)/zutil.h +crc32_pclmulqdq.obj: $(TOP)/arch/x86/crc32_pclmulqdq.c $(TOP)/arch/x86/crc32_pclmulqdq_tpl.h +deflate.obj: $(TOP)/deflate.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_fast.obj: $(TOP)/deflate_fast.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_huff.obj: $(TOP)/deflate_huff.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_medium.obj: $(TOP)/deflate_medium.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_quick.obj: $(TOP)/deflate_quick.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/trees_emit.h $(TOP)/zutil_p.h +deflate_rle.obj: $(TOP)/deflate_rle.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h $(TOP)/compare256_rle.h +deflate_slow.obj: $(TOP)/deflate_slow.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +deflate_stored.obj: $(TOP)/deflate_stored.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/deflate_p.h $(TOP)/functable.h +functable.obj: $(TOP)/functable.c $(TOP)/zbuild.h $(TOP)/functable.h $(TOP)/cpu_features.h $(TOP)/arch/x86/x86_features.h $(TOP)/arch_functions.h +gzlib.obj: $(TOP)/gzlib.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzread.obj: $(TOP)/gzread.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +gzwrite.obj: $(TOP)/gzwrite.c $(TOP)/zbuild.h $(TOP)/gzguts.h $(TOP)/zutil_p.h +infback.obj: $(TOP)/infback.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h +inflate.obj: $(TOP)/inflate.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h $(TOP)/inflate.h $(TOP)/inflate_p.h $(TOP)/functable.h $(TOP)/inffixed_tbl.h +inftrees.obj: $(TOP)/inftrees.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/inftrees.h +insert_string.obj: $(TOP)/insert_string.c $(TOP)/zbuild.h $(TOP)/deflate.h 
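Several of the dependency lines above funnel through `functable.c` and `cpu_features.h`: zlib-ng resolves arch-specific kernels at runtime through a table of function pointers, selected once from detected CPU features. A hedged sketch of the idea (all names illustrative; the real table wires many more entries and uses zlib-ng's own types):

```c
/* Illustrative runtime-dispatch sketch, not the real functable. */
#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*adler32_fn)(uint32_t adler, const uint8_t *buf, size_t len);

/* Portable fallback; SIMD builds would supply e.g. a NEON/AVX variant. */
static uint32_t adler32_scalar(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t a = adler & 0xffff, b = adler >> 16;
    for (size_t i = 0; i < len; i++) {
        a = (a + buf[i]) % 65521;
        b = (b + a) % 65521;
    }
    return (b << 16) | a;
}

static int cpu_has_simd(void) { return 0; } /* stand-in for cpu_features probes */

static struct { adler32_fn adler32; } functable_sketch;

static void functable_init_sketch(void) {
    /* the real functable also selects crc32, compare256, slide_hash, ... */
    functable_sketch.adler32 = adler32_scalar;
    if (cpu_has_simd()) {
        /* functable_sketch.adler32 = adler32_simd_variant; */
    }
}
```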
$(TOP)/insert_string_tpl.h +insert_string_roll.obj: $(TOP)/insert_string_roll.c $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/insert_string_tpl.h +slide_hash_c.obj: $(TOP)/arch/generic/slide_hash_c.c $(TOP)/zbuild.h $(TOP)/deflate.h +slide_hash_avx2.obj: $(TOP)/arch/x86/slide_hash_avx2.c $(TOP)/zbuild.h $(TOP)/deflate.h +slide_hash_sse2.obj: $(TOP)/arch/x86/slide_hash_sse2.c $(TOP)/zbuild.h $(TOP)/deflate.h +trees.obj: $(TOP)/trees.c $(TOP)/trees.h $(TOP)/trees_emit.h $(TOP)/zbuild.h $(TOP)/deflate.h $(TOP)/trees_tbl.h +uncompr.obj: $(TOP)/uncompr.c $(TOP)/zbuild.h $(TOP)/zutil.h +zutil.obj: $(TOP)/zutil.c $(TOP)/zbuild.h $(TOP)/zutil.h $(TOP)/zutil_p.h + +$(RESFILE): $(TOP)/win32/$(RCFILE) + $(RC) $(RCFLAGS) /fo$@ $(TOP)/win32/$(RCFILE) + +# testing +depcheck: depcheck.exe + depcheck win32\Makefile.msc . + depcheck win32\Makefile.arm . + depcheck win32\Makefile.a64 . + +test: example.exe minigzip.exe depcheck + example + echo hello world | minigzip | minigzip -d + +testdll: example_d.exe minigzip_d.exe + example_d + echo hello world | minigzip_d | minigzip_d -d + +depcheck.obj: $(TOP)/win32/depcheck.cpp + +example.obj: $(TOP)/test/example.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h $(TOP)/deflate.h $(TOP)/test/test_shared_ng.h + +minigzip.obj: $(TOP)/test/minigzip.c $(TOP)/zbuild.h $(TOP)/zlib$(SUFFIX).h + + +# cleanup +clean: + -del $(STATICLIB) + -del $(SHAREDLIB) + -del $(IMPLIB) + -del *.obj + -del *.res + -del *.exp + -del *.exe + -del *.pdb + -del *.manifest + +distclean: clean + -del zconf$(SUFFIX).h + -del zlib$(SUFFIX).h + -del zlib_name_mangling$(SUFFIX).h + -del $(TOP)\win32\zlib.def + -del $(TOP)\win32\zlibcompat.def + -del $(TOP)\win32\zlib-ng.def + -del gzread.c diff --git a/3rdparty/zlib-ng/win32/depcheck.cpp b/3rdparty/zlib-ng/win32/depcheck.cpp new file mode 100644 index 0000000000..f83bdd6852 --- /dev/null +++ b/3rdparty/zlib-ng/win32/depcheck.cpp @@ -0,0 +1,321 @@ +/* depcheck.cpp - Dependency checker for NMake Makefiles + * Copyright (c) 2024 Mika T. 
Lindqvist + */ + +#include <cstdio> +#include <fstream> +#include <sstream> +#include <string> +#include <vector> +#include <algorithm> + +int main(int argc, char* argv[]) { + if (argc != 3) { + printf("Usage: depcheck Makefile <top directory>\n"); + return -1; + } + std::filebuf fb; + if (fb.open (argv[1],std::ios::in)) { + std::istream is(&fb); + std::string makefile = argv[1]; + std::string l, tmp, tmp2; + while (is) { + std::getline(is, l); + while (!l.empty() && l.back() == '\\') { + std::getline(is, tmp); + l.replace(l.length() - 1, 1, tmp); + } + size_t pos = l.find("obj:"); + if (pos != std::string::npos) { + std::string objfile = l.substr(0, pos+3); + printf("File: %s\n", objfile.c_str()); + std::vector<std::string> files; + std::stringstream ss(l.substr(pos+4)); + while(getline(ss, tmp, ' ')){ + if (tmp != "" && tmp != "/") { + files.push_back(tmp); + } + } + for (auto it = files.begin(); it != files.end(); ++it) { + printf("Dependency: %s\n", (*it).c_str()); + } + if (!files.empty()) { + std::filebuf fb2; + std::string src = files[0]; + size_t pos2 = src.find("$(TOP)"); + if (pos2 != std::string::npos) { + src.replace(pos2, 6, argv[2]); + } + printf("Source: %s\n", src.c_str()); + if (fb2.open(src.c_str(),std::ios::in)) { + std::istream is2(&fb2); + std::vector<std::string> includes; + while (is2) { + std::getline(is2, l); + pos = l.find("#"); + if (pos != std::string::npos) { + pos2 = l.find("include"); + size_t pos3 = l.find("\""); + if (pos2 != std::string::npos && pos3 != std::string::npos && pos2 > pos && pos3 > pos2) { + tmp = l.substr(pos3 + 1); + pos2 = tmp.find("\""); + if (pos2 != std::string::npos) { + tmp = tmp.substr(0, pos2); + } + pos2 = tmp.find("../"); + if (pos2 != std::string::npos) { + tmp = tmp.substr(3); + } + printf("Line: %s\n", tmp.c_str()); + int found = 0; + for (size_t i = 1; i < files.size(); i++) { + pos3 = files[i].find("$(SUFFIX)"); + if (pos3 != std::string::npos) { + tmp2 = files[i].substr(0, pos3).append(files[i].substr(pos3 + 9)); + printf("Comparing dependency \"%s\" and \"%s\"\n", tmp2.c_str(), tmp.c_str()); + if (tmp2 == tmp) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/%s\"\n", tmp2.c_str(), tmp.c_str()); + if (tmp2 == std::string("$(TOP)/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + + tmp2 = files[i].substr(0, pos3).append("-ng").append(files[i].substr(pos3 + 9)); + printf("Comparing dependency \"%s\" and \"%s\"\n", tmp2.c_str(), tmp.c_str()); + if (tmp2 == tmp) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/%s\"\n", tmp2.c_str(), tmp.c_str()); + if (tmp2 == std::string("$(TOP)/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + } else { + printf("Comparing dependency \"%s\" and \"%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == tmp) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/arch/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/arch/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; +
includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/arch/generic/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/arch/generic/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/arch/arm/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/arch/arm/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/arch/x86/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/arch/x86/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + printf("Comparing dependency \"%s\" and \"$(TOP)/test/%s\"\n", files[i].c_str(), tmp.c_str()); + if (files[i] == std::string("$(TOP)/test/").append(tmp)) { + printf("Dependency %s OK\n", tmp.c_str()); + found = 1; + includes.push_back(tmp); + break; + } + } + } + // Skip irrelevant dependencies + if (tmp.substr(0, 9) == "arch/s390") found = 1; + if (tmp == "zlib-ng.h" && std::find(includes.begin(), includes.end(), "zlib.h") != includes.end()) found = 1; + if (found == 0) { + printf("%s: Dependency %s missing for %s!\n", makefile.c_str(), tmp.c_str(), objfile.c_str()); + return -1; + } + } + } + } + for (size_t i = 1; i < files.size(); i++) { + int found = 0; + tmp = files[i]; + printf("Dependency: %s\n", tmp.c_str()); + pos2 = tmp.find("$(TOP)"); + if (pos2 != std::string::npos) { + tmp = tmp.substr(7); + } + for (size_t j = 0; j < includes.size(); j++) { + pos2 = tmp.find("$(SUFFIX)"); + if (pos2 != std::string::npos) { + std::string tmp1 = tmp.substr(0, pos2).append(tmp.substr(pos2 + 9)); + printf("[%zd/%zd] Comparing dependency \"%s\" and \"%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == includes[j]) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/generic/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/generic/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/arm/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/arm/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/x86/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/x86/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"test/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("test/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + tmp1 = tmp.substr(0, pos2).append("-ng").append(tmp.substr(pos2 + 9)); + printf("[%zd/%zd] Comparing dependency \"%s\" and \"%s\"\n", j, includes.size(), 
tmp1.c_str(), includes[j].c_str()); + if (tmp1 == includes[j]) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/generic/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/generic/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/arm/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/arm/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/x86/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("arch/x86/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"test/%s\"\n", j, includes.size(), tmp1.c_str(), includes[j].c_str()); + if (tmp1 == std::string("test/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + } else { + printf("[%zd/%zd] Comparing dependency \"%s\" and \"%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == includes[j]) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("arch/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/generic/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("arch/generic/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/arm/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("arch/arm/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"arch/x86/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("arch/x86/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + printf("[%zd/%zd] Comparing dependency \"%s\" and \"test/%s\"\n", j, includes.size(), tmp.c_str(), includes[j].c_str()); + if (tmp == std::string("test/").append(includes[j])) { + printf("Dependency %s OK\n", files[i].c_str()); + found = 1; + break; + } + } + } + // Skip indirect dependencies + if (tmp.find("arm_features.h") != std::string::npos + && std::find(includes.begin(), includes.end(), "cpu_features.h") != includes.end() + && (makefile.find(".arm") != std::string::npos + || makefile.find(".a64") != std::string::npos)) found = 1; + if (tmp.find("x86_features.h") != std::string::npos + && std::find(includes.begin(), includes.end(), "cpu_features.h") != includes.end() + && makefile.find(".msc") != std::string::npos) found = 1; + // + if (tmp.find("generic_functions.h") != 
std::string::npos + && std::find(includes.begin(), includes.end(), "arch_functions.h") != includes.end()) found = 1; + if (tmp.find("arm_functions.h") != std::string::npos + && std::find(includes.begin(), includes.end(), "arch_functions.h") != includes.end() + && (makefile.find(".arm") != std::string::npos + || makefile.find(".a64") != std::string::npos)) found = 1; + if (tmp.find("x86_functions.h") != std::string::npos + && std::find(includes.begin(), includes.end(), "arch_functions.h") != includes.end() + && makefile.find(".msc") != std::string::npos) found = 1; + if (found == 0) { + printf("%s: Dependency %s not needed for %s\n", makefile.c_str(), files[i].c_str(), objfile.c_str()); + return -1; + } + } + fb2.close(); + } + } + } + } + fb.close(); + } + return 0; +} diff --git a/3rdparty/zlib-ng/win32/replace.vbs b/3rdparty/zlib-ng/win32/replace.vbs new file mode 100644 index 0000000000..6779971d07 --- /dev/null +++ b/3rdparty/zlib-ng/win32/replace.vbs @@ -0,0 +1,15 @@ +strInputFileName = Wscript.Arguments(0) +strOutputFileName = Wscript.Arguments(1) +strOldText = Wscript.Arguments(2) +strNewText = Wscript.Arguments(3) + +Set objFSO = CreateObject("Scripting.FileSystemObject") +Set objFile = objFSO.OpenTextFile(strInputFileName, 1) + +strText = objFile.ReadAll +objFile.Close +strNewText = Replace(strText, strOldText, strNewText) + +Set objFile = objFSO.OpenTextFile(strOutputFileName, 2, True) +objFile.Write strNewText +objFile.Close diff --git a/3rdparty/zlib-ng/win32/zlib-ng.def.in b/3rdparty/zlib-ng/win32/zlib-ng.def.in new file mode 100644 index 0000000000..53b2bc21f7 --- /dev/null +++ b/3rdparty/zlib-ng/win32/zlib-ng.def.in @@ -0,0 +1,60 @@ +; zlib-ng data compression library +EXPORTS +; basic functions + @ZLIB_SYMBOL_PREFIX@zlibng_version + @ZLIB_SYMBOL_PREFIX@zng_deflate + @ZLIB_SYMBOL_PREFIX@zng_deflateEnd + @ZLIB_SYMBOL_PREFIX@zng_deflateInit + @ZLIB_SYMBOL_PREFIX@zng_deflateInit2 + @ZLIB_SYMBOL_PREFIX@zng_inflate + @ZLIB_SYMBOL_PREFIX@zng_inflateEnd + @ZLIB_SYMBOL_PREFIX@zng_inflateInit + @ZLIB_SYMBOL_PREFIX@zng_inflateInit2 + @ZLIB_SYMBOL_PREFIX@zng_inflateBackInit +; advanced functions + @ZLIB_SYMBOL_PREFIX@zng_deflateSetDictionary + @ZLIB_SYMBOL_PREFIX@zng_deflateGetDictionary + @ZLIB_SYMBOL_PREFIX@zng_deflateCopy + @ZLIB_SYMBOL_PREFIX@zng_deflateReset + @ZLIB_SYMBOL_PREFIX@zng_deflateParams + @ZLIB_SYMBOL_PREFIX@zng_deflateTune + @ZLIB_SYMBOL_PREFIX@zng_deflateBound + @ZLIB_SYMBOL_PREFIX@zng_deflatePending + @ZLIB_SYMBOL_PREFIX@zng_deflatePrime + @ZLIB_SYMBOL_PREFIX@zng_deflateSetHeader + @ZLIB_SYMBOL_PREFIX@zng_deflateSetParams + @ZLIB_SYMBOL_PREFIX@zng_deflateGetParams + @ZLIB_SYMBOL_PREFIX@zng_inflateSetDictionary + @ZLIB_SYMBOL_PREFIX@zng_inflateGetDictionary + @ZLIB_SYMBOL_PREFIX@zng_inflateSync + @ZLIB_SYMBOL_PREFIX@zng_inflateCopy + @ZLIB_SYMBOL_PREFIX@zng_inflateReset + @ZLIB_SYMBOL_PREFIX@zng_inflateReset2 + @ZLIB_SYMBOL_PREFIX@zng_inflatePrime + @ZLIB_SYMBOL_PREFIX@zng_inflateMark + @ZLIB_SYMBOL_PREFIX@zng_inflateGetHeader + @ZLIB_SYMBOL_PREFIX@zng_inflateBack + @ZLIB_SYMBOL_PREFIX@zng_inflateBackEnd + @ZLIB_SYMBOL_PREFIX@zng_zlibCompileFlags +; utility functions + @ZLIB_SYMBOL_PREFIX@zng_compress + @ZLIB_SYMBOL_PREFIX@zng_compress2 + @ZLIB_SYMBOL_PREFIX@zng_compressBound + @ZLIB_SYMBOL_PREFIX@zng_uncompress + @ZLIB_SYMBOL_PREFIX@zng_uncompress2 +; checksum functions + @ZLIB_SYMBOL_PREFIX@zng_adler32 + @ZLIB_SYMBOL_PREFIX@zng_adler32_z + @ZLIB_SYMBOL_PREFIX@zng_crc32 + @ZLIB_SYMBOL_PREFIX@zng_crc32_z + @ZLIB_SYMBOL_PREFIX@zng_adler32_combine + 
@ZLIB_SYMBOL_PREFIX@zng_crc32_combine +; various hacks, don't look :) + @ZLIB_SYMBOL_PREFIX@zng_zError + @ZLIB_SYMBOL_PREFIX@zng_inflateSyncPoint + @ZLIB_SYMBOL_PREFIX@zng_get_crc_table + @ZLIB_SYMBOL_PREFIX@zng_inflateUndermine + @ZLIB_SYMBOL_PREFIX@zng_inflateValidate + @ZLIB_SYMBOL_PREFIX@zng_inflateCodesUsed + @ZLIB_SYMBOL_PREFIX@zng_inflateResetKeep + @ZLIB_SYMBOL_PREFIX@zng_deflateResetKeep diff --git a/3rdparty/zlib-ng/win32/zlib-ng1.rc b/3rdparty/zlib-ng/win32/zlib-ng1.rc new file mode 100644 index 0000000000..f65cfa254e --- /dev/null +++ b/3rdparty/zlib-ng/win32/zlib-ng1.rc @@ -0,0 +1,36 @@ +#include <winver.h> +#include "zlib-ng.h" + +VS_VERSION_INFO VERSIONINFO + FILEVERSION ZLIBNG_VER_MAJOR,ZLIBNG_VER_MINOR,ZLIBNG_VER_REVISION,0 + PRODUCTVERSION ZLIBNG_VER_MAJOR,ZLIBNG_VER_MINOR,ZLIBNG_VER_REVISION,0 + FILEFLAGSMASK VS_FFI_FILEFLAGSMASK +#ifdef _DEBUG + FILEFLAGS 1 +#else + FILEFLAGS 0 +#endif + FILEOS VOS__WINDOWS32 + FILETYPE VFT_DLL + FILESUBTYPE 0 // not used +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904E4" + //language ID = U.S. English, char set = Windows, Multilingual + BEGIN + VALUE "FileDescription", "zlib data compression library\0" + VALUE "FileVersion", ZLIBNG_VERSION "\0" + VALUE "InternalName", "zlib-ng1.dll\0" + VALUE "LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0" + VALUE "OriginalFilename", "zlib-ng1.dll\0" + VALUE "ProductName", "zlib\0" + VALUE "ProductVersion", ZLIBNG_VERSION "\0" + VALUE "Comments", "For more information visit https://www.zlib.net/\0" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x0409, 1252 + END +END diff --git a/3rdparty/zlib-ng/win32/zlib.def.in b/3rdparty/zlib-ng/win32/zlib.def.in new file mode 100644 index 0000000000..561a42f7f8 --- /dev/null +++ b/3rdparty/zlib-ng/win32/zlib.def.in @@ -0,0 +1,64 @@ +; zlib data compression library +EXPORTS +; basic functions + @ZLIB_SYMBOL_PREFIX@zlibVersion + @ZLIB_SYMBOL_PREFIX@deflate + @ZLIB_SYMBOL_PREFIX@deflateEnd + @ZLIB_SYMBOL_PREFIX@inflate + @ZLIB_SYMBOL_PREFIX@inflateEnd +; advanced functions + @ZLIB_SYMBOL_PREFIX@deflateSetDictionary + @ZLIB_SYMBOL_PREFIX@deflateGetDictionary + @ZLIB_SYMBOL_PREFIX@deflateCopy + @ZLIB_SYMBOL_PREFIX@deflateReset + @ZLIB_SYMBOL_PREFIX@deflateParams + @ZLIB_SYMBOL_PREFIX@deflateTune + @ZLIB_SYMBOL_PREFIX@deflateBound + @ZLIB_SYMBOL_PREFIX@deflatePending + @ZLIB_SYMBOL_PREFIX@deflatePrime + @ZLIB_SYMBOL_PREFIX@deflateSetHeader + @ZLIB_SYMBOL_PREFIX@inflateSetDictionary + @ZLIB_SYMBOL_PREFIX@inflateGetDictionary + @ZLIB_SYMBOL_PREFIX@inflateSync + @ZLIB_SYMBOL_PREFIX@inflateCopy + @ZLIB_SYMBOL_PREFIX@inflateReset + @ZLIB_SYMBOL_PREFIX@inflateReset2 + @ZLIB_SYMBOL_PREFIX@inflatePrime + @ZLIB_SYMBOL_PREFIX@inflateMark + @ZLIB_SYMBOL_PREFIX@inflateGetHeader + @ZLIB_SYMBOL_PREFIX@inflateBack + @ZLIB_SYMBOL_PREFIX@inflateBackEnd + @ZLIB_SYMBOL_PREFIX@zlibCompileFlags +; utility functions + @ZLIB_SYMBOL_PREFIX@compress + @ZLIB_SYMBOL_PREFIX@compress2 + @ZLIB_SYMBOL_PREFIX@compressBound + @ZLIB_SYMBOL_PREFIX@uncompress + @ZLIB_SYMBOL_PREFIX@uncompress2 +; large file functions + @ZLIB_SYMBOL_PREFIX@adler32_combine64 + @ZLIB_SYMBOL_PREFIX@crc32_combine64 + @ZLIB_SYMBOL_PREFIX@crc32_combine_gen64 +; checksum functions + @ZLIB_SYMBOL_PREFIX@adler32 + @ZLIB_SYMBOL_PREFIX@adler32_z + @ZLIB_SYMBOL_PREFIX@crc32 + @ZLIB_SYMBOL_PREFIX@crc32_z + @ZLIB_SYMBOL_PREFIX@adler32_combine + @ZLIB_SYMBOL_PREFIX@crc32_combine + @ZLIB_SYMBOL_PREFIX@crc32_combine_gen + @ZLIB_SYMBOL_PREFIX@crc32_combine_op +; various hacks, don't look :) +
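The `@ZLIB_SYMBOL_PREFIX@` placeholders in these `.def.in` templates are expanded by `win32/replace.vbs`; on the C side the same prefix is applied through the generated `zlib_name_mangling` header. Roughly like this (illustrative macros and prefix, not the shipped header):

```c
/* Illustrative only: how a generated zlib_name_mangling.h could apply
 * SYMBOL_PREFIX=foo_ to the public names listed in the .def files. */
#define ZLIB_SYMBOL_PREFIX foo_
#define Z_MANGLE2(p, n) p##n
#define Z_MANGLE(p, n) Z_MANGLE2(p, n)

#define deflate    Z_MANGLE(ZLIB_SYMBOL_PREFIX, deflate)
#define deflateEnd Z_MANGLE(ZLIB_SYMBOL_PREFIX, deflateEnd)
#define inflate    Z_MANGLE(ZLIB_SYMBOL_PREFIX, inflate)
/* ...one #define per exported symbol... */
```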
+ @ZLIB_SYMBOL_PREFIX@deflateInit_
+ @ZLIB_SYMBOL_PREFIX@deflateInit2_
+ @ZLIB_SYMBOL_PREFIX@inflateInit_
+ @ZLIB_SYMBOL_PREFIX@inflateInit2_
+ @ZLIB_SYMBOL_PREFIX@inflateBackInit_
+ @ZLIB_SYMBOL_PREFIX@zError
+ @ZLIB_SYMBOL_PREFIX@inflateSyncPoint
+ @ZLIB_SYMBOL_PREFIX@get_crc_table
+ @ZLIB_SYMBOL_PREFIX@inflateUndermine
+ @ZLIB_SYMBOL_PREFIX@inflateValidate
+ @ZLIB_SYMBOL_PREFIX@inflateCodesUsed
+ @ZLIB_SYMBOL_PREFIX@inflateResetKeep
+ @ZLIB_SYMBOL_PREFIX@deflateResetKeep
diff --git a/3rdparty/zlib-ng/win32/zlib1.rc b/3rdparty/zlib-ng/win32/zlib1.rc
new file mode 100644
index 0000000000..9bb9c18654
--- /dev/null
+++ b/3rdparty/zlib-ng/win32/zlib1.rc
@@ -0,0 +1,36 @@
+#include <winver.h>
+#include "zlib.h"
+
+VS_VERSION_INFO VERSIONINFO
+ FILEVERSION ZLIB_VER_MAJOR,ZLIB_VER_MINOR,ZLIB_VER_REVISION,0
+ PRODUCTVERSION ZLIB_VER_MAJOR,ZLIB_VER_MINOR,ZLIB_VER_REVISION,0
+ FILEFLAGSMASK VS_FFI_FILEFLAGSMASK
+#ifdef _DEBUG
+ FILEFLAGS 1
+#else
+ FILEFLAGS 0
+#endif
+ FILEOS VOS__WINDOWS32
+ FILETYPE VFT_DLL
+ FILESUBTYPE 0 // not used
+BEGIN
+ BLOCK "StringFileInfo"
+ BEGIN
+ BLOCK "040904E4"
+ //language ID = U.S. English, char set = Windows, Multilingual
+ BEGIN
+ VALUE "FileDescription", "zlib data compression library\0"
+ VALUE "FileVersion", ZLIB_VERSION "\0"
+ VALUE "InternalName", "zlib1.dll\0"
+ VALUE "LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0"
+ VALUE "OriginalFilename", "zlib1.dll\0"
+ VALUE "ProductName", "zlib\0"
+ VALUE "ProductVersion", ZLIB_VERSION "\0"
+ VALUE "Comments", "For more information visit https://www.zlib.net/\0"
+ END
+ END
+ BLOCK "VarFileInfo"
+ BEGIN
+ VALUE "Translation", 0x0409, 1252
+ END
+END
diff --git a/3rdparty/zlib-ng/win32/zlibcompat.def.in b/3rdparty/zlib-ng/win32/zlibcompat.def.in
new file mode 100644
index 0000000000..52a713cf03
--- /dev/null
+++ b/3rdparty/zlib-ng/win32/zlibcompat.def.in
@@ -0,0 +1,97 @@
+; zlib data compression library
+EXPORTS
+; basic functions
+ @ZLIB_SYMBOL_PREFIX@zlibVersion
+ @ZLIB_SYMBOL_PREFIX@deflate
+ @ZLIB_SYMBOL_PREFIX@deflateEnd
+ @ZLIB_SYMBOL_PREFIX@inflate
+ @ZLIB_SYMBOL_PREFIX@inflateEnd
+; advanced functions
+ @ZLIB_SYMBOL_PREFIX@deflateSetDictionary
+ @ZLIB_SYMBOL_PREFIX@deflateGetDictionary
+ @ZLIB_SYMBOL_PREFIX@deflateCopy
+ @ZLIB_SYMBOL_PREFIX@deflateReset
+ @ZLIB_SYMBOL_PREFIX@deflateParams
+ @ZLIB_SYMBOL_PREFIX@deflateTune
+ @ZLIB_SYMBOL_PREFIX@deflateBound
+ @ZLIB_SYMBOL_PREFIX@deflatePending
+ @ZLIB_SYMBOL_PREFIX@deflatePrime
+ @ZLIB_SYMBOL_PREFIX@deflateSetHeader
+ @ZLIB_SYMBOL_PREFIX@inflateSetDictionary
+ @ZLIB_SYMBOL_PREFIX@inflateGetDictionary
+ @ZLIB_SYMBOL_PREFIX@inflateSync
+ @ZLIB_SYMBOL_PREFIX@inflateCopy
+ @ZLIB_SYMBOL_PREFIX@inflateReset
+ @ZLIB_SYMBOL_PREFIX@inflateReset2
+ @ZLIB_SYMBOL_PREFIX@inflatePrime
+ @ZLIB_SYMBOL_PREFIX@inflateMark
+ @ZLIB_SYMBOL_PREFIX@inflateGetHeader
+ @ZLIB_SYMBOL_PREFIX@inflateBack
+ @ZLIB_SYMBOL_PREFIX@inflateBackEnd
+ @ZLIB_SYMBOL_PREFIX@zlibCompileFlags
+; utility functions
+ @ZLIB_SYMBOL_PREFIX@compress
+ @ZLIB_SYMBOL_PREFIX@compress2
+ @ZLIB_SYMBOL_PREFIX@compressBound
+ @ZLIB_SYMBOL_PREFIX@uncompress
+ @ZLIB_SYMBOL_PREFIX@uncompress2
+ @ZLIB_SYMBOL_PREFIX@gzopen
+ @ZLIB_SYMBOL_PREFIX@gzdopen
+ @ZLIB_SYMBOL_PREFIX@gzbuffer
+ @ZLIB_SYMBOL_PREFIX@gzsetparams
+ @ZLIB_SYMBOL_PREFIX@gzread
+ @ZLIB_SYMBOL_PREFIX@gzfread
+ @ZLIB_SYMBOL_PREFIX@gzwrite
+ @ZLIB_SYMBOL_PREFIX@gzfwrite
+ @ZLIB_SYMBOL_PREFIX@gzprintf
+ @ZLIB_SYMBOL_PREFIX@gzvprintf
+ @ZLIB_SYMBOL_PREFIX@gzputs
+ @ZLIB_SYMBOL_PREFIX@gzgets
+ @ZLIB_SYMBOL_PREFIX@gzputc
+ @ZLIB_SYMBOL_PREFIX@gzgetc
+ @ZLIB_SYMBOL_PREFIX@gzungetc
+ @ZLIB_SYMBOL_PREFIX@gzflush
+ @ZLIB_SYMBOL_PREFIX@gzseek
+ @ZLIB_SYMBOL_PREFIX@gzrewind
+ @ZLIB_SYMBOL_PREFIX@gztell
+ @ZLIB_SYMBOL_PREFIX@gzoffset
+ @ZLIB_SYMBOL_PREFIX@gzeof
+ @ZLIB_SYMBOL_PREFIX@gzdirect
+ @ZLIB_SYMBOL_PREFIX@gzclose
+ @ZLIB_SYMBOL_PREFIX@gzclose_r
+ @ZLIB_SYMBOL_PREFIX@gzclose_w
+ @ZLIB_SYMBOL_PREFIX@gzerror
+ @ZLIB_SYMBOL_PREFIX@gzclearerr
+; large file functions
+ @ZLIB_SYMBOL_PREFIX@gzopen64
+ @ZLIB_SYMBOL_PREFIX@gzseek64
+ @ZLIB_SYMBOL_PREFIX@gztell64
+ @ZLIB_SYMBOL_PREFIX@gzoffset64
+ @ZLIB_SYMBOL_PREFIX@adler32_combine64
+ @ZLIB_SYMBOL_PREFIX@crc32_combine64
+ @ZLIB_SYMBOL_PREFIX@crc32_combine_gen64
+; checksum functions
+ @ZLIB_SYMBOL_PREFIX@adler32
+ @ZLIB_SYMBOL_PREFIX@adler32_z
+ @ZLIB_SYMBOL_PREFIX@crc32
+ @ZLIB_SYMBOL_PREFIX@crc32_z
+ @ZLIB_SYMBOL_PREFIX@adler32_combine
+ @ZLIB_SYMBOL_PREFIX@crc32_combine
+ @ZLIB_SYMBOL_PREFIX@crc32_combine_gen
+ @ZLIB_SYMBOL_PREFIX@crc32_combine_op
+; various hacks, don't look :)
+ @ZLIB_SYMBOL_PREFIX@deflateInit_
+ @ZLIB_SYMBOL_PREFIX@deflateInit2_
+ @ZLIB_SYMBOL_PREFIX@inflateInit_
+ @ZLIB_SYMBOL_PREFIX@inflateInit2_
+ @ZLIB_SYMBOL_PREFIX@inflateBackInit_
+ @ZLIB_SYMBOL_PREFIX@gzgetc_
+ @ZLIB_SYMBOL_PREFIX@zError
+ @ZLIB_SYMBOL_PREFIX@inflateSyncPoint
+ @ZLIB_SYMBOL_PREFIX@get_crc_table
+ @ZLIB_SYMBOL_PREFIX@inflateUndermine
+ @ZLIB_SYMBOL_PREFIX@inflateValidate
+ @ZLIB_SYMBOL_PREFIX@inflateCodesUsed
+ @ZLIB_SYMBOL_PREFIX@inflateResetKeep
+ @ZLIB_SYMBOL_PREFIX@deflateResetKeep
+ @ZLIB_SYMBOL_PREFIX@gzopen_w
diff --git a/3rdparty/zlib-ng/zbuild.h b/3rdparty/zlib-ng/zbuild.h
index d550b4c582..9157eef9e3
--- a/3rdparty/zlib-ng/zbuild.h
+++ b/3rdparty/zlib-ng/zbuild.h
@@ -202,6 +202,24 @@
 # define ALIGNED_(x) __declspec(align(x))
 #endif
+#ifdef HAVE_BUILTIN_ASSUME_ALIGNED
+# define HINT_ALIGNED(p,n) __builtin_assume_aligned((void *)(p),(n))
+#else
+# define HINT_ALIGNED(p,n) (p)
+#endif
+#define HINT_ALIGNED_16(p) HINT_ALIGNED((p),16)
+#define HINT_ALIGNED_64(p) HINT_ALIGNED((p),64)
+#define HINT_ALIGNED_4096(p) HINT_ALIGNED((p),4096)
+
+/* PADSZ returns needed bytes to pad bpos to pad size
+ * PAD_NN calculates pad size and adds it to bpos, returning the result.
+ * All take an integer or a pointer as bpos input.
+ */
+#define PADSZ(bpos, pad) (((pad) - ((uintptr_t)(bpos) % (pad))) % (pad))
+#define PAD_16(bpos) ((bpos) + PADSZ((bpos),16))
+#define PAD_64(bpos) ((bpos) + PADSZ((bpos),64))
+#define PAD_4096(bpos) ((bpos) + PADSZ((bpos),4096))
+
 /* Diagnostic functions */
 #ifdef ZLIB_DEBUG
 #  include <stdio.h>
@@ -246,6 +264,31 @@
 # endif
 #endif
+#if defined(__has_feature)
+# if __has_feature(address_sanitizer)
+# define Z_ADDRESS_SANITIZER 1
+# endif
+#elif defined(__SANITIZE_ADDRESS__)
+# define Z_ADDRESS_SANITIZER 1
+#endif
+
+/*
+ * __asan_loadN() and __asan_storeN() calls are inserted by compilers in order to check memory accesses.
+ * They can be called manually too, with the following caveats:
+ *   gcc says: "warning: implicit declaration of function ‘...’"
+ *   g++ says: "error: new declaration ‘...’ ambiguates built-in declaration ‘...’"
+ * Accommodate both.
+ */ +#ifdef Z_ADDRESS_SANITIZER +#ifndef __cplusplus +void __asan_loadN(void *, long); +void __asan_storeN(void *, long); +#endif +#else +# define __asan_loadN(a, size) do { Z_UNUSED(a); Z_UNUSED(size); } while (0) +# define __asan_storeN(a, size) do { Z_UNUSED(a); Z_UNUSED(size); } while (0) +#endif + #if defined(__has_feature) # if __has_feature(memory_sanitizer) # define Z_MEMORY_SANITIZER 1 @@ -254,7 +297,31 @@ #endif #ifndef Z_MEMORY_SANITIZER +# define __msan_check_mem_is_initialized(a, size) do { Z_UNUSED(a); Z_UNUSED(size); } while (0) # define __msan_unpoison(a, size) do { Z_UNUSED(a); Z_UNUSED(size); } while (0) #endif +/* Notify sanitizer runtime about an upcoming read access. */ +#define instrument_read(a, size) do { \ + void *__a = (void *)(a); \ + long __size = size; \ + __asan_loadN(__a, __size); \ + __msan_check_mem_is_initialized(__a, __size); \ +} while (0) + +/* Notify sanitizer runtime about an upcoming write access. */ +#define instrument_write(a, size) do { \ + void *__a = (void *)(a); \ + long __size = size; \ + __asan_storeN(__a, __size); \ +} while (0) + +/* Notify sanitizer runtime about an upcoming read/write access. */ +#define instrument_read_write(a, size) do { \ + void *__a = (void *)(a); \ + long __size = size; \ + __asan_storeN(__a, __size); \ + __msan_check_mem_is_initialized(__a, __size); \ +} while (0) + #endif diff --git a/3rdparty/zlib-ng/zconf-ng.h.in b/3rdparty/zlib-ng/zconf-ng.h.in new file mode 100644 index 0000000000..a1b5311b85 --- /dev/null +++ b/3rdparty/zlib-ng/zconf-ng.h.in @@ -0,0 +1,176 @@ +/* zconf-ng.h -- configuration of the zlib-ng compression library + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ZCONFNG_H +#define ZCONFNG_H + +#include "zlib_name_mangling-ng.h" + +#if !defined(_WIN32) && defined(__WIN32__) +# define _WIN32 +#endif + +/* Clang macro for detecting declspec support + * https://clang.llvm.org/docs/LanguageExtensions.html#has-declspec-attribute + */ +#ifndef __has_declspec_attribute +# define __has_declspec_attribute(x) 0 +#endif + +/* Always define z_const as const */ +#define z_const const + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# define MAX_MEM_LEVEL 9 +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2. + * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files + * created by gzip. (Files created by minigzip can still be extracted by + * gzip.) + */ +#ifndef MIN_WBITS +# define MIN_WBITS 8 /* 256 LZ77 window */ +#endif +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + (1 << (windowBits+2)) + (1 << (memLevel+9)) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). + + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus about 7 kilobytes + for small objects. +*/ + +/* Type declarations */ + +#ifdef ZLIB_INTERNAL +# define Z_INTERNAL ZLIB_INTERNAL +#endif + +/* If building or using zlib as a DLL, define ZLIB_DLL. + * This is not mandatory, but it offers a little performance increase. 
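The zbuild.h hunk above adds two small utility groups: the PADSZ/PAD_NN macros round a buffer position up to an alignment boundary, and the instrument_read/instrument_write/instrument_read_write macros are meant to sit immediately before hand-written wide loads and stores so ASan/MSan can check the whole accessed range even when the access bypasses compiler instrumentation. A minimal, self-contained sketch of the padding arithmetic follows; the macro definitions are copied from the hunk, while the example values are ours, purely illustrative:

```cpp
#include <stdint.h>

// Copied from the zbuild.h hunk above so the sketch stands alone.
#define PADSZ(bpos, pad) (((pad) - ((uintptr_t)(bpos) % (pad))) % (pad))
#define PAD_64(bpos) ((bpos) + PADSZ((bpos),64))

int main() {
    uintptr_t bpos = 100;
    // 100 % 64 == 36, so 28 bytes of padding are needed: PAD_64(100) == 128.
    // A position that is already aligned stays put: PAD_64(128) == 128.
    return (PAD_64(bpos) == 128 && PAD_64((uintptr_t)128) == 128) ? 0 : 1;
}
```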
+ */
+#if defined(ZLIB_DLL) && (defined(_WIN32) || (__has_declspec_attribute(dllexport) && __has_declspec_attribute(dllimport)))
+# ifdef Z_INTERNAL
+# define Z_EXTERN extern __declspec(dllexport)
+# else
+# define Z_EXTERN extern __declspec(dllimport)
+# endif
+#endif
+
+/* If building or using zlib with the WINAPI/WINAPIV calling convention,
+ * define ZLIB_WINAPI.
+ * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
+ */
+#if defined(ZLIB_WINAPI) && defined(_WIN32)
+# include <windows.h>
+ /* No need for _export, use ZLIB.DEF instead. */
+ /* For complete Windows compatibility, use WINAPI, not __stdcall. */
+# define Z_EXPORT WINAPI
+# define Z_EXPORTVA WINAPIV
+#endif
+
+#ifndef Z_EXTERN
+# define Z_EXTERN extern
+#endif
+#ifndef Z_EXPORT
+# define Z_EXPORT
+#endif
+#ifndef Z_EXPORTVA
+# define Z_EXPORTVA
+#endif
+
+/* Conditional exports */
+#define ZNG_CONDEXPORT Z_EXPORT
+
+/* Fallback for something that includes us. */
+typedef unsigned char Byte;
+typedef Byte Bytef;
+
+typedef unsigned int uInt; /* 16 bits or more */
+typedef unsigned long uLong; /* 32 bits or more */
+
+typedef char charf;
+typedef int intf;
+typedef uInt uIntf;
+typedef uLong uLongf;
+
+typedef void const *voidpc;
+typedef void *voidpf;
+typedef void *voidp;
+
+#ifdef HAVE_UNISTD_H /* may be set to #if 1 by configure/cmake/etc */
+# define Z_HAVE_UNISTD_H
+#endif
+
+#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */
+typedef PTRDIFF_TYPE ptrdiff_t;
+#endif
+
+#include <sys/types.h> /* for off_t */
+
+#include <stddef.h> /* for wchar_t and NULL */
+
+/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and
+ * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even
+ * though the former does not conform to the LFS document), but considering
+ * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as
+ * equivalently requesting no 64-bit operations
+ */
+#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1
+# undef _LARGEFILE64_SOURCE
+#endif
+
+#if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE)
+# include <unistd.h> /* for SEEK_*, off_t, and _LFS64_LARGEFILE */
+# ifndef z_off_t
+# define z_off_t off_t
+# endif
+#endif
+
+#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0
+# define Z_LFS64
+#endif
+
+#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64)
+# define Z_LARGE64
+#endif
+
+#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64)
+# define Z_WANT64
+#endif
+
+#if !defined(SEEK_SET) && defined(WITH_GZFILEOP)
+# define SEEK_SET 0 /* Seek from beginning of file. */
+# define SEEK_CUR 1 /* Seek from current position.
*/ +# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ +#endif + +#ifndef z_off_t +# define z_off_t long +#endif + +#if !defined(_WIN32) && defined(Z_LARGE64) +# define z_off64_t off64_t +#else +# if defined(__MSYS__) +# define z_off64_t _off64_t +# elif defined(_WIN32) && !defined(__GNUC__) +# define z_off64_t __int64 +# else +# define z_off64_t z_off_t +# endif +#endif + +#endif /* ZCONFNG_H */ diff --git a/3rdparty/zlib-ng/zconf.h.in b/3rdparty/zlib-ng/zconf.h.in index 7a6e281e84..be8221fd86 100644 --- a/3rdparty/zlib-ng/zconf.h.in +++ b/3rdparty/zlib-ng/zconf.h.in @@ -1,5 +1,5 @@ /* zconf.h -- configuration of the zlib compression library - * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git a/3rdparty/zlib-ng/zlib.h.in b/3rdparty/zlib-ng/zlib.h.in index eabb94afe0..3dceaa3344 100644 --- a/3rdparty/zlib-ng/zlib.h.in +++ b/3rdparty/zlib-ng/zlib.h.in @@ -1,9 +1,9 @@ #ifndef ZLIB_H_ #define ZLIB_H_ /* zlib.h -- interface of the 'zlib-ng' compression library - Forked from and compatible with zlib 1.2.13 + Forked from and compatible with zlib 1.3.1 - Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler + Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages @@ -49,20 +49,20 @@ extern "C" { #endif -#define ZLIBNG_VERSION "2.1.6" -#define ZLIBNG_VERNUM 0x020106F0L /* MMNNRRSM: major minor revision status modified */ +#define ZLIBNG_VERSION "2.2.1" +#define ZLIBNG_VERNUM 0x020201F0L /* MMNNRRSM: major minor revision status modified */ #define ZLIBNG_VER_MAJOR 2 -#define ZLIBNG_VER_MINOR 1 -#define ZLIBNG_VER_REVISION 6 +#define ZLIBNG_VER_MINOR 2 +#define ZLIBNG_VER_REVISION 1 #define ZLIBNG_VER_STATUS F /* 0=devel, 1-E=beta, F=Release (DEPRECATED) */ #define ZLIBNG_VER_STATUSH 0xF /* Hex values: 0=devel, 1-E=beta, F=Release */ #define ZLIBNG_VER_MODIFIED 0 /* non-zero if modified externally from zlib-ng */ -#define ZLIB_VERSION "1.3.0.zlib-ng" -#define ZLIB_VERNUM 0x130f +#define ZLIB_VERSION "1.3.1.zlib-ng" +#define ZLIB_VERNUM 0x131f #define ZLIB_VER_MAJOR 1 #define ZLIB_VER_MINOR 3 -#define ZLIB_VER_REVISION 0 +#define ZLIB_VER_REVISION 1 #define ZLIB_VER_SUBREVISION 15 /* 15=fork (0xf) */ /* @@ -220,7 +220,7 @@ typedef gz_header *gz_headerp; #define Z_DEFLATED 8 /* The deflate compression method (the only one supported in this version) */ -#define Z_NULL NULL /* for compatibility with zlib, was for initializing zalloc, zfree, opaque */ +#define Z_NULL 0 /* for compatibility with zlib, was for initializing zalloc, zfree, opaque */ #define zlib_version zlibVersion() /* for compatibility with versions < 1.0.2 */ @@ -1732,14 +1732,14 @@ Z_EXTERN unsigned long Z_EXPORT crc32_combine(unsigned long crc1, unsigned long seq1 and seq2 with lengths len1 and len2, CRC-32 check values were calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and - len2. + len2. len2 must be non-negative. */ /* Z_EXTERN unsigned long Z_EXPORT crc32_combine_gen(z_off_t len2); Return the operator corresponding to length len2, to be used with - crc32_combine_op(). + crc32_combine_op(). len2 must be non-negative. 
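The MMNNRRSM layout documented next to ZLIBNG_VERNUM in the zlib.h.in hunk above can be checked with a few shifts. This decode is written for this note only; the field extraction is ours and is not an API offered by zlib-ng:

```cpp
#include <cstdio>

int main() {
    const unsigned long vernum = 0x020201F0UL;        // ZLIBNG_VERNUM for 2.2.1
    const unsigned major    = (vernum >> 24) & 0xFF;  // MM: 2
    const unsigned minor    = (vernum >> 16) & 0xFF;  // NN: 2
    const unsigned revision = (vernum >> 8) & 0xFF;   // RR: 1
    const unsigned status   = (vernum >> 4) & 0xF;    // S: 0xF = release
    const unsigned modified = vernum & 0xF;           // M: 0 = unmodified
    std::printf("%u.%u.%u status=%X modified=%u\n",   // prints "2.2.1 status=F modified=0"
                major, minor, revision, status, modified);
    return 0;
}
```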
*/ Z_EXTERN unsigned long Z_EXPORT crc32_combine_op(unsigned long crc1, unsigned long crc2, diff --git a/3rdparty/zlib-ng/zlib.map b/3rdparty/zlib-ng/zlib.map new file mode 100644 index 0000000000..293e803729 --- /dev/null +++ b/3rdparty/zlib-ng/zlib.map @@ -0,0 +1,98 @@ +ZLIB_1.2.0 { + global: + compressBound; + deflateBound; + inflateBack; + inflateBackEnd; + inflateBackInit_; + inflateCopy; + local: + deflate_copyright; + inflate_copyright; + zcalloc; + zcfree; + z_errmsg; + gz_error; + gz_intmax; + _*; +}; + +ZLIB_1.2.0.2 { + gzclearerr; + gzungetc; + zlibCompileFlags; +} ZLIB_1.2.0; + +ZLIB_1.2.0.8 { + deflatePrime; +} ZLIB_1.2.0.2; + +ZLIB_1.2.2 { + adler32_combine; + crc32_combine; + deflateSetHeader; + inflateGetHeader; +} ZLIB_1.2.0.8; + +ZLIB_1.2.2.3 { + deflateTune; + gzdirect; +} ZLIB_1.2.2; + +ZLIB_1.2.2.4 { + inflatePrime; +} ZLIB_1.2.2.3; + +ZLIB_1.2.3.3 { + adler32_combine64; + crc32_combine64; + gzopen64; + gzseek64; + gztell64; + inflateUndermine; +} ZLIB_1.2.2.4; + +ZLIB_1.2.3.4 { + inflateReset2; + inflateMark; +} ZLIB_1.2.3.3; + +ZLIB_1.2.3.5 { + gzbuffer; + gzoffset; + gzoffset64; + gzclose_r; + gzclose_w; +} ZLIB_1.2.3.4; + +ZLIB_1.2.5.1 { + deflatePending; +} ZLIB_1.2.3.5; + +ZLIB_1.2.5.2 { + deflateResetKeep; + gzgetc_; + inflateResetKeep; +} ZLIB_1.2.5.1; + +ZLIB_1.2.7.1 { + inflateGetDictionary; + gzvprintf; +} ZLIB_1.2.5.2; + +ZLIB_1.2.9 { + inflateCodesUsed; + inflateValidate; + uncompress2; + gzfread; + gzfwrite; + deflateGetDictionary; + adler32_z; + crc32_z; +} ZLIB_1.2.7.1; + +ZLIB_1.2.12 { + crc32_combine_gen; + crc32_combine_gen64; + crc32_combine_op; +} ZLIB_1.2.9; diff --git a/3rdparty/zlib-ng/zutil.c b/3rdparty/zlib-ng/zutil.c index 270a28c742..39fbceb4a0 100644 --- a/3rdparty/zlib-ng/zutil.c +++ b/3rdparty/zlib-ng/zutil.c @@ -21,7 +21,7 @@ z_const char * const PREFIX(z_errmsg)[10] = { }; const char PREFIX3(vstring)[] = - " zlib-ng 2.1.6"; + " zlib-ng 2.2.1"; #ifdef ZLIB_COMPAT const char * Z_EXPORT zlibVersion(void) { @@ -109,51 +109,3 @@ void Z_INTERNAL PREFIX(zcfree)(void *opaque, void *ptr) { Z_UNUSED(opaque); zng_free(ptr); } - -/* Since we support custom memory allocators, some which might not align memory as we expect, - * we have to ask for extra memory and return an aligned pointer. 
*/ -void Z_INTERNAL *PREFIX3(alloc_aligned)(zng_calloc_func zalloc, void *opaque, unsigned items, unsigned size, unsigned align) { - uintptr_t return_ptr, original_ptr; - uint32_t alloc_size, align_diff; - void *ptr; - - /* If no custom calloc function used then call zlib-ng's aligned calloc */ - if (zalloc == PREFIX(zcalloc)) - return PREFIX(zcalloc)(opaque, items, size); - - /* Allocate enough memory for proper alignment and to store the original memory pointer */ - alloc_size = sizeof(void *) + (items * size) + align; - ptr = zalloc(opaque, 1, alloc_size); - if (!ptr) - return NULL; - - /* Calculate return pointer address with space enough to store original pointer */ - align_diff = align - ((uintptr_t)ptr % align); - return_ptr = (uintptr_t)ptr + align_diff; - if (align_diff < sizeof(void *)) - return_ptr += align; - - /* Store the original pointer for free() */ - original_ptr = return_ptr - sizeof(void *); - memcpy((void *)original_ptr, &ptr, sizeof(void *)); - - /* Return properly aligned pointer in allocation */ - return (void *)return_ptr; -} - -void Z_INTERNAL PREFIX3(free_aligned)(zng_cfree_func zfree, void *opaque, void *ptr) { - /* If no custom cfree function used then call zlib-ng's aligned cfree */ - if (zfree == PREFIX(zcfree)) { - PREFIX(zcfree)(opaque, ptr); - return; - } - if (!ptr) - return; - - /* Calculate offset to original memory allocation pointer */ - void *original_ptr = (void *)((uintptr_t)ptr - sizeof(void *)); - void *free_ptr = *(void **)original_ptr; - - /* Free original memory allocation */ - zfree(opaque, free_ptr); -} diff --git a/3rdparty/zlib-ng/zutil.h b/3rdparty/zlib-ng/zutil.h index 663616b44d..a6284502d0 100644 --- a/3rdparty/zlib-ng/zutil.h +++ b/3rdparty/zlib-ng/zutil.h @@ -1,7 +1,7 @@ #ifndef ZUTIL_H_ #define ZUTIL_H_ /* zutil.h -- internal interface and configuration of the compression library - * Copyright (C) 1995-2022 Jean-loup Gailly, Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -24,7 +24,7 @@ typedef unsigned long ulg; extern z_const char * const PREFIX(z_errmsg)[10]; /* indexed by 2-zlib_error */ /* (size given to avoid silly warnings with Visual C++) */ -#define ERR_MSG(err) PREFIX(z_errmsg)[Z_NEED_DICT-(err)] +#define ERR_MSG(err) PREFIX(z_errmsg)[(err) < -6 || (err) > 2 ? 
9 : 2 - (err)] #define ERR_RETURN(strm, err) return (strm->msg = ERR_MSG(err), (err)) /* To be used only when the state is known to be valid */ @@ -103,7 +103,7 @@ extern z_const char * const PREFIX(z_errmsg)[10]; /* indexed by 2-zlib_error */ # define OS_CODE 6 #endif -#if defined(MACOS) || defined(TARGET_OS_MAC) +#if defined(MACOS) # define OS_CODE 7 #endif @@ -137,12 +137,4 @@ void Z_INTERNAL PREFIX(zcfree)(void *opaque, void *ptr); typedef void *zng_calloc_func(void *opaque, unsigned items, unsigned size); typedef void zng_cfree_func(void *opaque, void *ptr); -void Z_INTERNAL *PREFIX3(alloc_aligned)(zng_calloc_func zalloc, void *opaque, unsigned items, unsigned size, unsigned align); -void Z_INTERNAL PREFIX3(free_aligned)(zng_cfree_func zfree, void *opaque, void *ptr); - -#define ZALLOC(strm, items, size) PREFIX3(alloc_aligned)((strm)->zalloc, (strm)->opaque, (items), (size), 64) -#define ZFREE(strm, addr) PREFIX3(free_aligned)((strm)->zfree, (strm)->opaque, (void *)(addr)) - -#define TRY_FREE(s, p) {if (p) ZFREE(s, p);} - #endif /* ZUTIL_H_ */ diff --git a/3rdparty/zlib-ng/zutil_p.h b/3rdparty/zlib-ng/zutil_p.h index caec91d50d..97799f0ce3 100644 --- a/3rdparty/zlib-ng/zutil_p.h +++ b/3rdparty/zlib-ng/zutil_p.h @@ -16,15 +16,19 @@ /* Function to allocate 16 or 64-byte aligned memory */ static inline void *zng_alloc(size_t size) { -#ifdef HAVE_POSIX_MEMALIGN +#ifdef HAVE_ALIGNED_ALLOC + /* Size must be a multiple of alignment */ + size = (size + (64 - 1)) & ~(64 - 1); + return (void *)aligned_alloc(64, size); /* Defined in C11 */ +#elif defined(HAVE_POSIX_MEMALIGN) void *ptr; return posix_memalign(&ptr, 64, size) ? NULL : ptr; #elif defined(_WIN32) return (void *)_aligned_malloc(size, 64); #elif defined(__APPLE__) - return (void *)malloc(size); /* MacOS always aligns to 16 bytes */ -#elif defined(HAVE_ALIGNED_ALLOC) - return (void *)aligned_alloc(64, size); + /* Fallback for when posix_memalign and aligned_alloc are not available. + * On macOS, it always aligns to 16 bytes. 
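The size round-up added to zng_alloc above exists because C11 aligned_alloc requires the requested size to be a multiple of the alignment (the requirement was only relaxed in C17). A short sketch of the same rounding in isolation, with example values chosen by us:

```cpp
#include <cstddef>

int main() {
    // zng_alloc now rounds the request up to a multiple of 64 before
    // calling aligned_alloc(64, size).
    std::size_t size = 100;
    std::size_t rounded = (size + (64 - 1)) & ~(std::size_t)(64 - 1);
    return (rounded == 128) ? 0 : 1;  // 100 -> 128; a multiple of 64 is unchanged
}
```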
*/ + return (void *)malloc(size); #else return (void *)memalign(64, size); #endif diff --git a/CMakeLists.txt b/CMakeLists.txt index 9320c90dac..3bc9cbe038 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,7 +145,7 @@ if(NOT OPENCV_SKIP_CMAKE_SYSTEM_FILE) endif() if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) # https://cmake.org/cmake/help/latest/variable/CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT.html - if(NOT CMAKE_TOOLCHAIN_FILE) + if(NOT CMAKE_CROSSCOMPILING) if(WIN32) set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory" FORCE) else() @@ -508,10 +508,6 @@ OCV_OPTION(OPENCV_ENABLE_MEMORY_SANITIZER "Better support for memory/address san OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CV_GCC ) OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) ) OCV_OPTION(ENABLE_FAST_MATH "Enable compiler options for fast math optimizations on FP computations (not recommended)" OFF) -if(NOT IOS AND (NOT ANDROID OR OPENCV_ANDROID_USE_LEGACY_FLAGS) AND CMAKE_CROSSCOMPILING) # Use CPU_BASELINE instead -OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS OR XROS) ) -OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS OR XROS) ) -endif() OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors" OFF ) OCV_OPTION(ANDROID_EXAMPLES_WITH_LIBS "Build binaries of Android examples with native libraries" OFF IF ANDROID ) diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 1d8e98315e..865bfd28a4 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -104,7 +104,7 @@ ocv_optimization_process_obsolete_option(ENABLE_AVX2 AVX2 ON) ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON) ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF) -ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF) +ocv_optimization_process_obsolete_option(ENABLE_NEON NEON ON) ocv_optimization_process_obsolete_option(ENABLE_VSX VSX ON) @@ -170,7 +170,29 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ") set(CPU_BASELINE_DETECT ON) endif() +# For platforms which don't allow enabling of extra instruction sets with separate compiler options. +# E.g. GCC/Clang for RISC-V/AArch64 use suffixes for -march option. So we should avoid using existing +# CPU features mechanisms and rely on cmake-toolchain files or flags provided via command-line. +macro(ocv_default_baseline_detect_and_check_dispatch) + set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}") + if(NOT CPU_BASELINE MATCHES "^(DETECT|NATIVE|)$") + message(WARNING "CPU_BASELINE is set to '${CPU_BASELINE}', but '${CMAKE_SYSTEM_PROCESSOR}' " + "platform is designed to work with DETECT|NATIVE|, " + "otherwise target CPU architecture may be changed unexpectedly. " + "Please check your resulting compiler flags in the CMake output.") + endif() + foreach(opt ${CPU_DISPATCH}) + if(NOT DEFINED CPU_${opt}_FLAGS_ON) + message(WARNING "${opt} is in the CPU_DISPATCH list, but 'CPU_${opt}_FLAGS_ON' is not set. 
" + "Please provide feature-specific compiler options explicitly.") + endif() + endforeach() +endmacro() + +#=================================================================================================== + if(X86 OR X86_64) + ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL") ocv_update(CPU_AVX512_COMMON_GROUP "AVX_512F;AVX_512CD") @@ -347,7 +369,6 @@ elseif(ARM OR AARCH64) ocv_update(CPU_FP16_IMPLIES "NEON") else() ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16") - ocv_update(CPU_NEON_FLAGS_ON "") ocv_update(CPU_FP16_IMPLIES "NEON") ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON") ocv_update(CPU_NEON_FP16_IMPLIES "NEON") @@ -361,15 +382,19 @@ elseif(ARM OR AARCH64) ocv_update(CPU_NEON_FP16_FLAGS_ON "-march=armv8.2-a+fp16") ocv_update(CPU_NEON_BF16_FLAGS_ON "-march=armv8.2-a+bf16") endif() - set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}") set(CPU_DISPATCH "NEON_FP16;NEON_BF16;NEON_DOTPROD" CACHE STRING "${HELP_CPU_DISPATCH}") + ocv_default_baseline_detect_and_check_dispatch() endif() + elseif(MIPS) + ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp") ocv_update(CPU_KNOWN_OPTIMIZATIONS "MSA") ocv_update(CPU_MSA_FLAGS_ON "-mmsa") set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}") + elseif(PPC64LE) + ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3") ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp") ocv_update(CPU_VSX3_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx3.cpp") @@ -390,9 +415,6 @@ elseif(PPC64LE) set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}") elseif(RISCV) - if(NOT DEFINED PLATFORM_STR) - set(PLATFORM_STR "rv64gc") - endif() ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV;FP16;RVV_ZVFH") ocv_update(CPU_RVV_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv.cpp") @@ -403,13 +425,11 @@ elseif(RISCV) ocv_update(CPU_RVV_FLAGS_ON "-march=rv64gc_v") ocv_update(CPU_FP16_FLAGS_ON "-march=rv64gc_v_zvfhmin") ocv_update(CPU_RVV_ZVFH_FLAGS_ON "-march=rv64gc_v_zvfhmin_zvfh") - ocv_update(CPU_RVV_FLAGS_CONFLICT "-march=[^ ]*") - - set(CPU_DISPATCH "FP16;RVV_ZVFH" CACHE STRING "${HELP_CPU_DISPATCH}") - set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}") + ocv_default_baseline_detect_and_check_dispatch() elseif(LOONGARCH64) + ocv_update(CPU_LSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lsx.cpp") ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp") ocv_update(CPU_KNOWN_OPTIMIZATIONS "LSX;LASX") @@ -451,7 +471,7 @@ macro(ocv_check_compiler_optimization OPT) set(_varname "") if(CPU_${OPT}_TEST_FILE) set(__available 0) - if(__is_from_baseline OR CPU_BASELINE_DETECT) + if(NOT __is_disabled AND (__is_from_baseline OR CPU_BASELINE_DETECT)) set(_varname "HAVE_CPU_${OPT}_SUPPORT") ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" "${_varname}" "${CPU_${OPT}_TEST_FILE}") if(${_varname}) @@ -489,23 +509,6 @@ macro(ocv_check_compiler_optimization OPT) endif() endmacro() -macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRING COMMON_OPTION) - unset(_POSTFIX) - # Check each feature option - foreach(OPT IN LISTS ${FEATURE_NAME_LIST}) - string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND) - if(NOT ${OPT_FOUND} EQUAL -1) - string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}") - string(APPEND _POSTFIX "${TRAILING_PART}") - string(REPLACE " 
${CPU_${OPT}_FLAGS_ON}" "" ${FLAG_STRING} ${${FLAG_STRING}}) - endif() - endforeach() - # If more than one option found, merge them - if(NOT "x${_POSTFIX}" STREQUAL "x") - set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${_POSTFIX}") - endif() -endmacro() - foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "") if("${CPU_${OPT}_FLAGS_ON}" STREQUAL "disabled") @@ -588,7 +591,7 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) if(CPU_${OPT}_SUPPORTED) if(";${CPU_DISPATCH};" MATCHES ";${OPT};" AND NOT __is_from_baseline) list(APPEND CPU_DISPATCH_FINAL ${OPT}) - elseif(__is_from_baseline) + elseif(__is_from_baseline AND NOT __is_disabled) if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};") list(APPEND CPU_BASELINE_FINAL ${OPT}) endif() @@ -599,15 +602,6 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) endif() endforeach() -if(AARCH64) - if(NOT MSVC) - # Define the list of NEON options to check - set(NEON_OPTIONS_LIST NEON_DOTPROD NEON_FP16 NEON_BF16) - set(BASE_ARCHITECTURE "-march=armv8.2-a") - ocv_cpu_aarch64_baseline_merge_feature_options(NEON_OPTIONS_LIST CPU_BASELINE_FLAGS ${BASE_ARCHITECTURE}) - endif() -endif() - foreach(OPT ${CPU_BASELINE_REQUIRE}) if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};") message(SEND_ERROR "Required baseline optimization is not supported: ${OPT} (CPU_BASELINE_REQUIRE=${CPU_BASELINE_REQUIRE})") diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index f23bb13dc5..f0d6378bd7 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -77,6 +77,17 @@ macro(add_env_definitions option) add_definitions("-D${option}=\"${value}\"") endmacro() +# Use same flags for native AArch64 and RISC-V compilation as for cross-compile (Linux) +if(NOT CMAKE_CROSSCOMPILING AND NOT CMAKE_TOOLCHAIN_FILE AND COMMAND ocv_set_platform_flags) + unset(platform_flags) + ocv_set_platform_flags(platform_flags) + # externally-provided flags should have higher priority - prepend our flags + if(platform_flags) + set(CMAKE_CXX_FLAGS "${platform_flags} ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "${platform_flags} ${CMAKE_C_FLAGS}") + endif() +endif() + if(NOT MSVC) # OpenCV fails some tests when 'char' is 'unsigned' by default add_extra_compiler_option(-fsigned-char) diff --git a/cmake/OpenCVMinDepVersions.cmake b/cmake/OpenCVMinDepVersions.cmake index e13bc154d3..0794ec8c71 100644 --- a/cmake/OpenCVMinDepVersions.cmake +++ b/cmake/OpenCVMinDepVersions.cmake @@ -1,5 +1,5 @@ if(NOT DEFINED MIN_VER_CMAKE) - set(MIN_VER_CMAKE 3.5.1) + set(MIN_VER_CMAKE 3.7) endif() set(MIN_VER_CUDA 6.5) set(MIN_VER_CUDNN 7.5) diff --git a/cmake/platforms/OpenCV-Linux.cmake b/cmake/platforms/OpenCV-Linux.cmake index 1bb8bf6d7f..5f015dfb79 100644 --- a/cmake/platforms/OpenCV-Linux.cmake +++ b/cmake/platforms/OpenCV-Linux.cmake @@ -1 +1,9 @@ -# empty +if((CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + AND NOT CMAKE_CROSSCOMPILING + AND NOT CMAKE_TOOLCHAIN_FILE) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") # Maybe use AARCH64 variable? 
+ include(${CMAKE_CURRENT_LIST_DIR}/../../platforms/linux/flags-aarch64.cmake) + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") + include(${CMAKE_CURRENT_LIST_DIR}/../../platforms/linux/flags-riscv64.cmake) + endif() +endif() diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index b58e5b2e50..e00ef365ed 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -811,10 +811,17 @@ The function cv::minMaxLoc finds the minimum and maximum element values and thei extremums are searched across the whole array or, if mask is not an empty array, in the specified array region. -The function do not work with multi-channel arrays. If you need to find minimum or maximum -elements across all the channels, use Mat::reshape first to reinterpret the array as -single-channel. Or you may extract the particular channel using either extractImageCOI, or -mixChannels, or split. +In C++, if the input is multi-channel, you should omit the minLoc, maxLoc, and mask arguments +(i.e. leave them as NULL, NULL, and noArray() respectively). These arguments are not +supported for multi-channel input arrays. If working with multi-channel input and you +need the minLoc, maxLoc, or mask arguments, then use Mat::reshape first to reinterpret +the array as single-channel. Alternatively, you can extract the particular channel using either +extractImageCOI, mixChannels, or split. + +In Python, multi-channel input is not supported at all due to a limitation in the +binding generation process (there is no way to set minLoc and maxLoc to NULL). A +workaround is to operate on each channel individually or to use NumPy to achieve the same +functionality. @note CV_16F/CV_16BF/CV_Bool/CV_64U/CV_64S/CV_32U are not supported for src. @param src input single-channel array. @param minVal pointer to the returned minimum value; NULL is used if not required. diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 67aba0bf27..6e033bf67c 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -775,317 +775,15 @@ namespace CV__SIMD_NAMESPACE { /** @brief SIMD processing state cleanup call */ inline void vx_cleanup() { VXPREFIX(_cleanup)(); } -#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP)) +#if !CV_SIMD_SCALABLE // Compatibility layer - +#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP)) template struct VTraits { static inline int vlanes() { return T::nlanes; } enum { nlanes = T::nlanes, max_nlanes = T::nlanes }; using lane_type = typename T::lane_type; }; - #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ - inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a + b; \ - } \ - inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a - b; \ - } \ - template \ - inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... 
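To make the revised minMaxLoc documentation above concrete, here is a short C++ usage sketch matching the described behavior; the matrix contents are arbitrary, and this is an illustration rather than a test from this patch:

```cpp
#include <opencv2/core.hpp>

int main() {
    cv::Mat img(4, 4, CV_8UC3, cv::Scalar(1, 2, 3));
    double minV, maxV;

    // Multi-channel input: leave minLoc/maxLoc/mask unset (C++ only).
    cv::minMaxLoc(img, &minV, &maxV);

    // If locations are needed, reinterpret the data as single-channel first.
    cv::Point minLoc, maxLoc;
    cv::minMaxLoc(img.reshape(1), &minV, &maxV, &minLoc, &maxLoc);
    return 0;
}
```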
vf) { \ - return v_add(f1 + f2, vf...); \ - } - #define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \ - inline _Tpvec v_shr(const _Tpvec& a, int n) \ - { \ - return a >> n; \ - } \ - inline _Tpvec v_shl(const _Tpvec& a, int n) \ - { \ - return a << n; \ - } - - OPENCV_HAL_WRAP_SHIFT_OP(v_uint16) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint32) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint64) - OPENCV_HAL_WRAP_SHIFT_OP(v_int16) - OPENCV_HAL_WRAP_SHIFT_OP(v_int32) - OPENCV_HAL_WRAP_SHIFT_OP(v_int64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - // when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2) - OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - // when we use CV_SIMD256 with 512 bit SIMD (e.g. 
AVX512) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4) - OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16) - OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8) - OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4) - #endif - #endif - - #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \ - inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a & b; \ - } \ - inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a | b; \ - } \ - inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a ^ b; \ - } - - #define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \ - inline _Tpvec v_not(const _Tpvec& a) \ - { \ - return ~a; \ - } - - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32) - OPENCV_HAL_WRAP_NOT_OP(v_uint8) - OPENCV_HAL_WRAP_NOT_OP(v_uint16) - OPENCV_HAL_WRAP_NOT_OP(v_uint32) - OPENCV_HAL_WRAP_NOT_OP(v_uint64) - OPENCV_HAL_WRAP_NOT_OP(v_int8) - OPENCV_HAL_WRAP_NOT_OP(v_int16) - OPENCV_HAL_WRAP_NOT_OP(v_int32) - OPENCV_HAL_WRAP_NOT_OP(v_int64) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4) - OPENCV_HAL_WRAP_NOT_OP(v_uint8x16) - OPENCV_HAL_WRAP_NOT_OP(v_uint16x8) - OPENCV_HAL_WRAP_NOT_OP(v_uint32x4) - OPENCV_HAL_WRAP_NOT_OP(v_uint64x2) - OPENCV_HAL_WRAP_NOT_OP(v_int8x16) - OPENCV_HAL_WRAP_NOT_OP(v_int16x8) - OPENCV_HAL_WRAP_NOT_OP(v_int32x4) - OPENCV_HAL_WRAP_NOT_OP(v_int64x2) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4) - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8) - OPENCV_HAL_WRAP_NOT_OP(v_uint8x32) - OPENCV_HAL_WRAP_NOT_OP(v_uint16x16) - OPENCV_HAL_WRAP_NOT_OP(v_uint32x8) - OPENCV_HAL_WRAP_NOT_OP(v_uint64x4) - OPENCV_HAL_WRAP_NOT_OP(v_int8x32) - OPENCV_HAL_WRAP_NOT_OP(v_int16x16) - OPENCV_HAL_WRAP_NOT_OP(v_int32x8) - OPENCV_HAL_WRAP_NOT_OP(v_int64x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4) - #endif - #endif - - #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ - inline _Tpvec v_mul(const _Tpvec& a, const 
_Tpvec& b) \ - { \ - return a * b; \ - } \ - template \ - inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ - return v_mul(f1 * f2, vf...); \ - } - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4) - #endif - #endif - - #define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \ - inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a / b; \ - } - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4) - #endif - #endif - - #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \ - inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a op b; \ - } - #define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \ - inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a == b; \ - } \ - inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ - { \ - return a != b; \ - } - - #define OPENCV_HAL_WRAP_CMP(_Tpvec) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \ - OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=) - - OPENCV_HAL_WRAP_CMP(v_uint8) - OPENCV_HAL_WRAP_CMP(v_uint16) - OPENCV_HAL_WRAP_CMP(v_uint32) - OPENCV_HAL_WRAP_EQ_OP(v_uint64) - OPENCV_HAL_WRAP_CMP(v_int8) - OPENCV_HAL_WRAP_CMP(v_int16) - OPENCV_HAL_WRAP_CMP(v_int32) - OPENCV_HAL_WRAP_EQ_OP(v_int64) - OPENCV_HAL_WRAP_CMP(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_CMP(v_float64) - #endif - #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 - OPENCV_HAL_WRAP_CMP(v_uint8x16) - OPENCV_HAL_WRAP_CMP(v_uint16x8) - OPENCV_HAL_WRAP_CMP(v_uint32x4) - OPENCV_HAL_WRAP_EQ_OP(v_uint64x2) - OPENCV_HAL_WRAP_CMP(v_int8x16) - OPENCV_HAL_WRAP_CMP(v_int16x8) - OPENCV_HAL_WRAP_CMP(v_int32x4) - OPENCV_HAL_WRAP_EQ_OP(v_int64x2) - OPENCV_HAL_WRAP_CMP(v_float32x4) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_CMP(v_float64x2) - #endif - #endif - #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 - OPENCV_HAL_WRAP_CMP(v_uint8x32) - OPENCV_HAL_WRAP_CMP(v_uint16x16) - OPENCV_HAL_WRAP_CMP(v_uint32x8) - OPENCV_HAL_WRAP_EQ_OP(v_uint64x4) - 
OPENCV_HAL_WRAP_CMP(v_int8x32) - OPENCV_HAL_WRAP_CMP(v_int16x16) - OPENCV_HAL_WRAP_CMP(v_int32x8) - OPENCV_HAL_WRAP_EQ_OP(v_int64x4) - OPENCV_HAL_WRAP_CMP(v_float32x8) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_CMP(v_float64x4) - #endif - #endif - OPENCV_HAL_WRAP_CMP_OP(v_int64, lt, <) \ - OPENCV_HAL_WRAP_CMP_OP(v_int64, gt, >) \ - - //////////// get0 //////////// #define OPENCV_HAL_WRAP_GRT0(_Tpvec) \ inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \ @@ -1133,6 +831,102 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_GRT0(v_float64x4) #endif #endif +#endif + + #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ + template \ + inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \ + return v_add(v_add(f1, f2), f3, vf...); \ + } + + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) + #endif + #if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float16) + #endif // (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + // when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + // when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4) + #endif + #endif + + #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ + template \ + inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... 
vf) { \ + return v_mul(v_mul(f1, f2), f3, vf...); \ + } + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) + #endif + #if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float16) + #endif // (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4) + #endif + #endif #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ @@ -1149,6 +943,9 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_EXTRACT(v_uint64) OPENCV_HAL_WRAP_EXTRACT(v_int64) OPENCV_HAL_WRAP_EXTRACT(v_float32) + #if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + OPENCV_HAL_WRAP_EXTRACT(v_float16) + #endif #if CV_SIMD_64F OPENCV_HAL_WRAP_EXTRACT(v_float64) #endif @@ -1190,6 +987,9 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BROADCAST(v_uint32) OPENCV_HAL_WRAP_BROADCAST(v_int32) OPENCV_HAL_WRAP_BROADCAST(v_float32) + #if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16) + OPENCV_HAL_WRAP_BROADCAST(v_float16) + #endif #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 OPENCV_HAL_WRAP_BROADCAST(v_uint32x4) OPENCV_HAL_WRAP_BROADCAST(v_int32x4) @@ -1203,83 +1003,6 @@ namespace CV__SIMD_NAMESPACE { #endif //!CV_SIMD_SCALABLE -#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP) -// Compatibility layer for the backend that cleaned up. - #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ - template \ - inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ - return v_add(v_add(f1, f2), vf...); \ - } - - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) - #if CV_SIMD_FP16 - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float16) - #endif - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) - #endif - - #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ - template \ - inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... 
vf) { \ - return v_mul(v_mul(f1, f2), vf...); \ - } - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) - OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) - #if CV_SIMD_FP16 - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float16) - #endif - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) - #endif - - #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ - inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ - { \ - return v_extract_n::nlanes-1>(v); \ - } - - OPENCV_HAL_WRAP_EXTRACT(v_uint8) - OPENCV_HAL_WRAP_EXTRACT(v_int8) - OPENCV_HAL_WRAP_EXTRACT(v_uint16) - OPENCV_HAL_WRAP_EXTRACT(v_int16) - OPENCV_HAL_WRAP_EXTRACT(v_uint32) - OPENCV_HAL_WRAP_EXTRACT(v_int32) - OPENCV_HAL_WRAP_EXTRACT(v_uint64) - OPENCV_HAL_WRAP_EXTRACT(v_int64) - #if CV_SIMD_FP16 - OPENCV_HAL_WRAP_EXTRACT(v_float16) - #endif - OPENCV_HAL_WRAP_EXTRACT(v_float32) - #if CV_SIMD_64F - OPENCV_HAL_WRAP_EXTRACT(v_float64) - #endif - - #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \ - inline _Tpvec v_broadcast_highest(const _Tpvec& v) \ - { \ - return v_broadcast_element::nlanes-1>(v); \ - } - - OPENCV_HAL_WRAP_BROADCAST(v_uint32) - OPENCV_HAL_WRAP_BROADCAST(v_int32) - OPENCV_HAL_WRAP_BROADCAST(v_float32) - -#endif //CV_NEON - //! @cond IGNORED // backward compatibility diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index b208479839..3a8505a297 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -673,53 +673,51 @@ OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4) /** Arithmetics **/ #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, b.val); return a; } + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint8x32, _mm256_adds_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint8x32, _mm256_subs_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int8x32, _mm256_adds_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int8x32, _mm256_subs_epi8) 
+OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint16x16, _mm256_adds_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint16x16, _mm256_subs_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int16x16, _mm256_adds_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int16x16, _mm256_subs_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_uint32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_int32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint64x4, _mm256_sub_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int64x4, _mm256_sub_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float32x8, _mm256_add_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float32x8, _mm256_sub_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float32x8, _mm256_mul_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float32x8, _mm256_div_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float64x4, _mm256_add_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float64x4, _mm256_sub_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float64x4, _mm256_mul_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float64x4, _mm256_div_pd) // saturating multiply 8-bit, 16-bit -inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) +inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b) { v_uint16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) +inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b) { v_int16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) +inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b) { __m256i pl = _mm256_mullo_epi16(a.val, b.val); __m256i ph = _mm256_mulhi_epu16(a.val, b.val); @@ -727,7 +725,7 @@ inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) __m256i p1 = _mm256_unpackhi_epi16(pl, ph); return v_uint16x16(_v256_packs_epu32(p0, p1)); } -inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) +inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b) { __m256i pl = _mm256_mullo_epi16(a.val, b.val); __m256i ph = _mm256_mulhi_epi16(a.val, b.val); @@ -735,14 +733,6 @@ inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) __m256i p1 = _mm256_unpackhi_epi16(pl, ph); return v_int16x16(_mm256_packs_epi32(p0, p1)); } -inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b) -{ a = a * b; return a; } -inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b) -{ a = a * b; return a; } -inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b) -{ a = a * b; return a; } -inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b) -{ a = a 
* b; return a; } /** Non-saturating arithmetics **/ #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ @@ -833,13 +823,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { return _Tpsvec(srai(a.val, imm)); } \ template<int imm> \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -867,11 +857,11 @@ OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx /** Bitwise logic **/ -#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ +#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_and, _Tpvec, _mm256_and_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_or, _Tpvec, _mm256_or_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_xor, _Tpvec, _mm256_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); } OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1)) @@ -900,29 +890,29 @@ OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd) /** Comparison **/ -#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } \ - inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ - { return b > a; } \ - inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a < b); } \ - inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ - { return b >= a; } +#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } \ + inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ + { return v_gt(b, a); } \ + inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_lt(a, b)); } \ + inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ + { return v_ge(b, a); } #define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \ - inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ - inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m256i smask = _mm256_set1_##suffix(sbit); \ return _Tpuvec(_mm256_cmpgt_##suffix( \ _mm256_xor_si256(a.val, smask), \ _mm256_xor_si256(b.val, smask))); \ } \ - inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ - inline
_Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec) @@ -932,30 +922,30 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768) OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \ - inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } -inline v_int64x4 operator > (const v_int64x4& a, const v_int64x4& b) +inline v_int64x4 v_gt(const v_int64x4& a, const v_int64x4& b) { return v_int64x4(_mm256_cmpgt_epi64(a.val, b.val)); } -inline v_int64x4 operator < (const v_int64x4& a, const v_int64x4& b) +inline v_int64x4 v_lt(const v_int64x4& a, const v_int64x4& b) { return v_int64x4(_mm256_cmpgt_epi64(b.val, a.val)); } OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4) OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4) #define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); } #define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix) + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, suffix) OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd) @@ -1221,9 +1211,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a) { return v_reduce_sum(v_reinterpret_as_s32(a)); } inline int v_reduce_sum(const v_int16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline unsigned v_reduce_sum(const v_uint16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline float v_reduce_sum(const v_float32x8& a) { @@ -1278,27 +1268,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) { v_uint32x8 l, h; - v_expand(v_add_wrap(a - b, b - a), l, h); - return v_reduce_sum(l + h); + v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) { v_uint32x8 l, h; v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); - return 
v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) { - return v_reduce_sum(v_max(a, b) - v_min(a, b)); + return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); } inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) { - v_int32x8 m = a < b; - return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m)); + v_int32x8 m = v_lt(a, b); + return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m))); } inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) { - return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); + return v_reduce_sum(v_and(v_sub(a, b), v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))))); } /** Popcount **/ @@ -1313,15 +1303,15 @@ inline v_uint8x32 v_popcount(const v_uint8x32& a) inline v_uint16x16 v_popcount(const v_uint16x16& a) { v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v256_setall_u16(0x00ff)); } inline v_uint32x8 v_popcount(const v_uint32x8& a) { v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v256_setall_u32(0x000000ff)); } inline v_uint64x4 v_popcount(const v_uint64x4& a) { @@ -1413,9 +1403,9 @@ OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16) inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b*b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd) @@ -1424,7 +1414,7 @@ OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd) inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) @@ -1434,16 +1424,16 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x inline v_float32x8 v_invsqrt(const v_float32x8& x) { - v_float32x8 half = x * v256_setall_f32(0.5); + v_float32x8 half = v_mul(x, v256_setall_f32(0.5)); v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val)); // todo: _mm256_fnmsub_ps - t *= v256_setall_f32(1.5) - ((t * t) * half); + t = v_mul(t, v_sub(v256_setall_f32(1.5), v_mul(v_mul(t, t), half))); return t; } inline v_float64x4 v_invsqrt(const v_float64x4& x) { - return v256_setall_f64(1.) 
/ v_sqrt(x); + return v_div(v256_setall_f64(1.), v_sqrt(x)); } /** Absolute values **/ @@ -1456,23 +1446,23 @@ OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16) OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32) inline v_float32x8 v_abs(const v_float32x8& x) -{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); } +{ return v_and(x, v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); } inline v_float64x4 v_abs(const v_float64x4& x) -{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); } +{ return v_and(x, v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1)))); } /** Absolute difference **/ inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) { v_int8x32 d = v_sub_wrap(a, b); - v_int8x32 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x32 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) @@ -1480,26 +1470,26 @@ inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) { - v_int32x8 d = a - b; - v_int32x8 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x8 d = v_sub(a, b); + v_int32x8 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) { - v_int8x32 d = a - b; - v_int8x32 m = a < b; - return (d ^ m) - m; + v_int8x32 d = v_sub(a, b); + v_int8x32 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ////////// Conversions ///////// @@ -1794,7 +1784,7 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec) inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) @@ -1804,7 +1794,7 @@ inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) return v_int64x4(_mm256_add_epi64(even, odd)); } inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) @@ -1821,7 +1811,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) return v_uint32x8(_mm256_add_epi32(prod0, prod1)); 
} inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) { @@ -1836,7 +1826,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) return v_int32x8(_mm256_add_epi32(prod0, prod1)); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) @@ -1860,7 +1850,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) )); } inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) { @@ -1876,13 +1866,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) )); } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1928,7 +1918,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& return v_uint64x4(_mm256_add_epi64(p15_, p9d_)); } inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) { @@ -1939,7 +1929,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) return v_int64x4(_mm256_add_epi64(lo, hi)); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) @@ -1958,7 +1948,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3); - return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); } inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, @@ -2063,43 +2053,43 @@ v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
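// Illustrative arithmetic, not from the original source: with n = 4, a lane value of 40
// becomes (40 + 8) >> 4 = 3, i.e. 40/16 = 2.5 rounded half up, where a truncating shift
// alone would give 2.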
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), - v_reinterpret_as_s16((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s16(v_shr(v_add(b, delta), n))); } template<int n> inline void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) { v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n))); } template<int n> inline v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(schar* ptr, const v_int16x16& a) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 32 @@ -2132,43 +2122,43 @@ v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) { // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers. v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), - v_reinterpret_as_s32((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s32(v_shr(v_add(b, delta), n))); } template<int n> inline void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) { v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n))); } template<int n> inline v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(short* ptr, const v_int32x8& a) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 64 @@ -2197,28 +2187,28 @@ template<int n> inline v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) { v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(unsigned* ptr, const
v_uint64x4& a) { v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) { v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(int* ptr, const v_int64x4& a) { v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // pack boolean diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index e59b8d92eb..64dab6b3ae 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -663,58 +663,56 @@ inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b) } #define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, b.val); return a; } + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int64x8, _mm512_sub_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int64x8, _mm512_mullo_epi64) /** Saturating arithmetics **/ -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32,
_mm512_subs_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint8x64, _mm512_adds_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint8x64, _mm512_subs_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int8x64, _mm512_adds_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int8x64, _mm512_subs_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint16x32, _mm512_adds_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint16x32, _mm512_subs_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int16x32, _mm512_adds_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int16x32, _mm512_subs_epi16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float32x16, _mm512_add_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float32x16, _mm512_sub_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float32x16, _mm512_mul_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float32x16, _mm512_div_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float64x8, _mm512_add_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float64x8, _mm512_sub_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float64x8, _mm512_mul_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float64x8, _mm512_div_pd) // saturating multiply -inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b) +inline v_uint8x64 v_mul(const v_uint8x64& a, const v_uint8x64& b) { v_uint16x32 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b) +inline v_int8x64 v_mul(const v_int8x64& a, const v_int8x64& b) { v_int16x32 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) +inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b) { __m512i pl = _mm512_mullo_epi16(a.val, b.val); __m512i ph = _mm512_mulhi_epu16(a.val, b.val); @@ -724,7 +722,7 @@ inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) const __m512i m = _mm512_set1_epi32(65535); return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m))); } -inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) +inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b) { __m512i pl = _mm512_mullo_epi16(a.val, b.val); __m512i ph = _mm512_mulhi_epi16(a.val, b.val); @@ -733,15 +731,6 @@ inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) return v_int16x32(_mm512_packs_epi32(p0, p1)); } -inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b) -{ a = a * b; return a; } -inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b) -{ a = a * b; return a; } -inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b) -{ a = a * b; return a; } -inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b) -{ a = a * b; return a; } - inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } @@ -802,13 +791,13 @@ inline 
void v_mul_expand(const v_int32x16& a, const v_int32x16& b, /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \ template<int imm> \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -830,10 +819,10 @@ OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_and, _Tpvec, _mm512_and_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_or, _Tpvec, _mm512_or_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_xor, _Tpvec, _mm512_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); } OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1)) @@ -865,16 +854,16 @@ OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd) /** Comparison **/ #define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); } #define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_eq, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ne, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_lt, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_gt, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_le, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ge, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1) @@ -886,16 +875,16 @@ OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1) #define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ -
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); } #define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1) @@ -1250,9 +1239,9 @@ OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16) OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16) inline int v_reduce_sum(const v_int16x32& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline uint v_reduce_sum(const v_uint16x32& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } #define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \ inline sctype v_reduce_##func(const _Tpvec& a) \ @@ -1306,17 +1295,17 @@ inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b) return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); } inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b) -{ return v_reduce_sum(v_add_wrap(a - b, b - a)); } +{ return v_reduce_sum(v_add_wrap(v_sub(a, b), v_sub(b, a))); } inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b) { return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); } inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b) -{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); } +{ return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); } inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b) -{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); } +{ return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b)))); } inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b) -{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); } +{ return v_reduce_sum(v_and(v_sub(a, b), v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))))); } inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b) -{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); } +{ return v_reduce_sum(v_and(v_sub(a, b), 
v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff))))); } /** Popcount **/ inline v_uint8x64 v_popcount(const v_int8x64& a) @@ -1351,8 +1340,8 @@ inline v_uint16x32 v_popcount(const v_int16x32& a) _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero)))); #else v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v512_setall_u16(0x00ff)); #endif } inline v_uint32x16 v_popcount(const v_int32x16& a) @@ -1361,9 +1350,9 @@ inline v_uint32x16 v_popcount(const v_int32x16& a) return v_uint32x16(_mm512_popcnt_epi32(a.val)); #else v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v512_setall_u32(0x000000ff)); #endif } inline v_uint64x8 v_popcount(const v_int64x8& a) @@ -1403,9 +1392,9 @@ inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinte inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b * b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps) OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd) @@ -1413,7 +1402,7 @@ OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps) OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd) inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) -{ return a * b + c; } +{ return v_add(v_mul(a, b), c); } inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) { return v_fma(a, b, c); } @@ -1422,9 +1411,9 @@ inline v_float32x16 v_invsqrt(const v_float32x16& x) #if CV_AVX_512ER return v_float32x16(_mm512_rsqrt28_ps(x.val)); #else - v_float32x16 half = x * v512_setall_f32(0.5); + v_float32x16 half = v_mul(x, v512_setall_f32(0.5)); v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val)); - t *= v512_setall_f32(1.5) - ((t * t) * half); + t = v_mul(t, v_sub(v512_setall_f32(1.5), v_mul(v_mul(t, t), half))); return t; #endif } @@ -1434,7 +1423,7 @@ inline v_float64x8 v_invsqrt(const v_float64x8& x) #if CV_AVX_512ER return v_float64x8(_mm512_rsqrt28_pd(x.val)); #else - return v512_setall_f64(1.) 
/ v_sqrt(x); + return v_div(v512_setall_f64(1.), v_sqrt(x)); // v_float64x8 half = x * v512_setall_f64(0.5); // v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val)); // t *= v512_setall_f64(1.5) - ((t * t) * half); @@ -1482,17 +1471,17 @@ inline v_float64x8 v_abs(const v_float64x8& x) /** Absolute difference **/ inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b) { v_int8x64 d = v_sub_wrap(a, b); - v_int8x64 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x64 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) @@ -1500,26 +1489,26 @@ inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b) { - v_int32x16 d = a - b; - v_int32x16 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x16 d = v_sub(a, b); + v_int32x16 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b) { - v_int8x64 d = a - b; - v_int8x64 m = a < b; - return (d ^ m) - m; + v_int8x64 d = v_sub(a, b); + v_int8x64 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ////////// Conversions ///////// @@ -1818,7 +1807,7 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec) inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) { return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) @@ -1828,7 +1817,7 @@ inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) return v_int64x8(_mm512_add_epi64(even, odd)); } inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) @@ -1844,7 +1833,7 @@ inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) return v_uint32x16(_mm512_add_epi32(prod0, prod1)); } inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) { @@ -1859,7 +1848,7 @@ inline v_int32x16 v_dotprod_expand(const 
v_int8x64& a, const v_int8x64& b) return v_int32x16(_mm512_add_epi32(prod0, prod1)); } inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) @@ -1883,7 +1872,7 @@ inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) )); } inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) { @@ -1893,13 +1882,13 @@ inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) return v_int64x8(_mm512_add_epi64(even, odd)); } inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1944,7 +1933,7 @@ inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& return v_uint64x8(_mm512_add_epi64(p15_, p9d_)); } inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) { return v_dotprod_expand(a, b); } @@ -1955,7 +1944,7 @@ inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b) { return v_dotprod_expand(a, b); } inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } #define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ @@ -1969,7 +1958,7 @@ inline v_float32x16 v_matmul(const v_float32x16& v, v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3); - return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); } inline v_float32x16 v_matmuladd(const v_float32x16& v, @@ -2070,43 +2059,43 @@ v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); - return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), - v_reinterpret_as_s16((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s16(v_shr(v_add(b, delta), n))); } template<int n> inline void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a) { v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); - v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n))); } template<int n> inline v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(schar* ptr, const v_int16x32& a) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 32 @@ -2139,43 +2128,43 @@ template<int n> inline v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b) { v_uint32x16 delta = v512_setall_u32(1 << (n-1)); - return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), - v_reinterpret_as_s32((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s32(v_shr(v_add(b, delta), n))); } template<int n> inline void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a) { v_uint32x16 delta = v512_setall_u32(1 << (n-1)); - v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n))); } template<int n> inline v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(short* ptr, const v_int32x16& a) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 64 @@ -2196,28 +2185,28 @@ template<int n> inline v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b) { v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a) { v_uint64x8 delta = v512_setall_u64((uint64)1 <<
(n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } template<int n> inline v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b) { v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template<int n> inline void v_rshr_pack_store(int* ptr, const v_int64x8& a) { v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // pack boolean diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index fbc6ad82e5..fed7cc261a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -225,32 +225,32 @@ These operations allow to reorder or recombine elements in one or multiple vecto Element-wise binary and unary operations. - Arithmetics: -@ref operator +(const v_reg &a, const v_reg &b) "+", -@ref operator -(const v_reg &a, const v_reg &b) "-", -@ref operator *(const v_reg &a, const v_reg &b) "*", -@ref operator /(const v_reg &a, const v_reg &b) "/", +@ref v_add(const v_reg &a, const v_reg &b) "+", +@ref v_sub(const v_reg &a, const v_reg &b) "-", +@ref v_mul(const v_reg &a, const v_reg &b) "*", +@ref v_div(const v_reg &a, const v_reg &b) "/", @ref v_mul_expand - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap - Bitwise shifts: -@ref operator <<(const v_reg &a, int s) "<<", -@ref operator >>(const v_reg &a, int s) ">>", +@ref v_shl(const v_reg &a, int s) "<<", +@ref v_shr(const v_reg &a, int s) ">>", @ref v_shl, @ref v_shr - Bitwise logic: -@ref operator &(const v_reg &a, const v_reg &b) "&", -@ref operator |(const v_reg &a, const v_reg &b) "|", -@ref operator ^(const v_reg &a, const v_reg &b) "^", -@ref operator ~(const v_reg &a) "~" +@ref v_and(const v_reg &a, const v_reg &b) "&", +@ref v_or(const v_reg &a, const v_reg &b) "|", +@ref v_xor(const v_reg &a, const v_reg &b) "^", +@ref v_not(const v_reg &a) "~" - Comparison: -@ref operator >(const v_reg &a, const v_reg &b) ">", -@ref operator >=(const v_reg &a, const v_reg &b) ">=", -@ref operator <(const v_reg &a, const v_reg &b) "<", -@ref operator <=(const v_reg &a, const v_reg &b) "<=", -@ref operator ==(const v_reg &a, const v_reg &b) "==", -@ref operator !=(const v_reg &a, const v_reg &b) "!=" +@ref v_gt(const v_reg &a, const v_reg &b) ">", +@ref v_ge(const v_reg &a, const v_reg &b) ">=", +@ref v_lt(const v_reg &a, const v_reg &b) "<", +@ref v_le(const v_reg &a, const v_reg &b) "<=", +@ref v_eq(const v_reg &a, const v_reg &b) "==", +@ref v_ne(const v_reg &a, const v_reg &b) "!=" - min/max: @ref v_min, @ref v_max @@ -573,50 +573,43 @@ enum { /** @brief Add values For all types. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_add(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Subtract values For all types. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_sub(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Multiply values For 16- and 32-bit integer types and floating types.
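Results are saturated for the integer types (the reference implementation below uses saturate_cast); for example, v_mul(v_setall_s16(300), v_setall_s16(200)) produces lanes of 32767 rather than 60000.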
*/ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_mul(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Divide values For floating types only. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_div(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise AND Only for integer types. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_and(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise OR Only for integer types. */ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_or(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise XOR Only for integer types.*/ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_xor(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise NOT Only for integer types.*/ -template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a); +template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a); #ifndef CV_DOXYGEN @@ -639,33 +632,26 @@ __CV_EXPAND(macro_name(double, __VA_ARGS__)) \ CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \ CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \ -#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \ +#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op, func) \ template<int n> inline \ -v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ return c; \ -} \ -template<int n> inline \ -v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return a; \ } -#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op) +#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op, func) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op, func) -CV__HAL_INTRIN_IMPL_BIN_OP(+) -CV__HAL_INTRIN_IMPL_BIN_OP(-) -CV__HAL_INTRIN_IMPL_BIN_OP(*) -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /) +CV__HAL_INTRIN_IMPL_BIN_OP(+, v_add) +CV__HAL_INTRIN_IMPL_BIN_OP(-, v_sub) +CV__HAL_INTRIN_IMPL_BIN_OP(*, v_mul) +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /, v_div) -#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \ +#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op, func) \ template<int n> CV_INLINE \ -v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ v_reg<_Tp, n> c; \ typedef typename V_TypeTraits<_Tp>::int_type itype; \ @@ -673,29 +659,20 @@
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ return c; \ -} \ -template<int n> CV_INLINE \ -v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return a; \ } -#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */ +#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op, func) \ +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) \ +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) /* TODO: FIXIT remove this after masks refactoring */ -CV__HAL_INTRIN_IMPL_BIT_OP(&) -CV__HAL_INTRIN_IMPL_BIT_OP(|) -CV__HAL_INTRIN_IMPL_BIT_OP(^) +CV__HAL_INTRIN_IMPL_BIT_OP(&, v_and) +CV__HAL_INTRIN_IMPL_BIT_OP(|, v_or) +CV__HAL_INTRIN_IMPL_BIT_OP(^, v_xor) -#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \ +#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy, dummy2) \ template<int n> CV_INLINE \ -v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ +v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ @@ -703,7 +680,7 @@ v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ return c; \ } \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~) +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~, v_not) #endif // !CV_DOXYGEN @@ -760,7 +737,6 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) * @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$. */ OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) -#define OPENCV_HAL_MATH_HAVE_LOG 1 /** * @brief Error function. @@ -771,9 +747,7 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp) //! @cond IGNORED OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) -#define OPENCV_HAL_MATH_HAVE_SIN 1 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) -#define OPENCV_HAL_MATH_HAVE_COS 1 //! @endcond /** @brief Absolute value of elements @@ -897,9 +871,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ +#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \ template<typename _Tp, int n> \ -inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ typedef typename V_TypeTraits<_Tp>::int_type itype; \ v_reg<_Tp, n> c; \ @@ -911,28 +885,28 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> /** @brief Less-than comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(<) +OPENCV_HAL_IMPL_CMP_OP(<, v_lt) /** @brief Greater-than comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(>) +OPENCV_HAL_IMPL_CMP_OP(>, v_gt) /** @brief Less-than or equal comparison For all types except 64-bit integer values.
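The result of a comparison is a per-lane bit mask: all bits of a lane are set when the predicate holds and cleared otherwise, so it can be passed directly to v_select.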
@@ -760,7 +737,6 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
  * @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$.
  */
 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
-#define OPENCV_HAL_MATH_HAVE_LOG 1
 
 /**
  * @brief Error function.
@@ -771,9 +747,7 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
 //! @cond IGNORED
 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
-#define OPENCV_HAL_MATH_HAVE_SIN 1
 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
-#define OPENCV_HAL_MATH_HAVE_COS 1
 //! @endcond
 
 /** @brief Absolute value of elements
@@ -897,9 +871,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
+#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \
 template<typename _Tp, int n> \
-inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 { \
     typedef typename V_TypeTraits<_Tp>::int_type itype; \
     v_reg<_Tp, n> c; \
@@ -911,28 +885,28 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
 
 /** @brief Less-than comparison
 
 For all types except 64-bit integer values. */
-OPENCV_HAL_IMPL_CMP_OP(<)
+OPENCV_HAL_IMPL_CMP_OP(<, v_lt)
 
 /** @brief Greater-than comparison
 
 For all types except 64-bit integer values. */
-OPENCV_HAL_IMPL_CMP_OP(>)
+OPENCV_HAL_IMPL_CMP_OP(>, v_gt)
 
 /** @brief Less-than or equal comparison
 
 For all types except 64-bit integer values. */
-OPENCV_HAL_IMPL_CMP_OP(<=)
+OPENCV_HAL_IMPL_CMP_OP(<=, v_le)
 
 /** @brief Greater-than or equal comparison
 
 For all types except 64-bit integer values. */
-OPENCV_HAL_IMPL_CMP_OP(>=)
+OPENCV_HAL_IMPL_CMP_OP(>=, v_ge)
 
 /** @brief Equal comparison */
-OPENCV_HAL_IMPL_CMP_OP(==)
+OPENCV_HAL_IMPL_CMP_OP(==, v_eq)
 
 /** @brief Not equal comparison */
-OPENCV_HAL_IMPL_CMP_OP(!=)
+OPENCV_HAL_IMPL_CMP_OP(!=, v_ne)
 
 template<int n> inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
@@ -1301,8 +1275,8 @@ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
+#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op, func) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, int imm) \
 { \
     v_reg<_Tp, n> c; \
     for( int i = 0; i < n; i++ ) \
@@ -1313,12 +1287,12 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg
 
 /** @brief Bitwise shift left
 
 For 16-, 32- and 64-bit integer values. */
-OPENCV_HAL_IMPL_SHIFT_OP(<< )
+OPENCV_HAL_IMPL_SHIFT_OP(<<, v_shl)
 
 /** @brief Bitwise shift right
 
 For 16-, 32- and 64-bit integer values. */
-OPENCV_HAL_IMPL_SHIFT_OP(>> )
+OPENCV_HAL_IMPL_SHIFT_OP(>>, v_shr)
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
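[Note: two details matter when porting callers of the comparison and shift operators above. Comparisons return a full-width per-lane mask (all bits set or all bits clear), so the result feeds `v_select` directly; and the shifts exist both as the run-time form `v_shr(a, imm)` defined here and the compile-time `v_shr<imm>(a)` form defined just below. A caller-side sketch, not part of the patch:

    // halve every lane, clamping negatives to zero first
    inline cv::v_int32x4 halve_nonneg(const cv::v_int32x4& v)
    {
        cv::v_int32x4 zero = cv::v_setzero_s32();
        cv::v_int32x4 neg  = cv::v_lt(v, zero);       // was: v < zero
        cv::v_int32x4 c    = cv::v_select(neg, zero, v);
        return cv::v_shr(c, 1);                       // was: c >> 1
    }
]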
@@ -2942,7 +2916,7 @@ OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
 template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
-{ return a << shift; }
+{ return v_shl(a, shift); }
 
 //! @name Left shift
 //! @{
@@ -2959,7 +2933,7 @@ OPENCV_HAL_IMPL_C_SHIFTL(int64)
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
 template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
-{ return a >> shift; }
+{ return v_shr(a, shift); }
 
 //! @name Right shift
 //! @{
@@ -3285,7 +3259,7 @@ inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
 
 template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a,
                                                            const v_reg<int, n>& b)
-{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
+{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); }
 template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
                                                            const v_reg<double, n/2>& c)
 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
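[Note: the generic f64 expansion rewritten above is easiest to read as scalar code. Judging from the `v_cvt_f64` / `v_cvt_f64_high` split, this fallback pairs the low half of the vector with the high half rather than adjacent lanes; a scalar model of that reading, an illustration only and not code from the patch:

    // n int32 lanes -> n/2 doubles
    static void dotprod_expand_f64_model(const int* a, const int* b, double* r, int n)
    {
        for (int i = 0; i < n / 2; i++)
            r[i] = (double)a[i] * b[i] + (double)a[i + n/2] * b[i + n/2];
    }
]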
diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
index db491cc137..45f53de8a2 100644
--- a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
@@ -746,53 +746,51 @@ OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
 /** Arithmetics **/
 #define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
-    { return _Tpvec(intrin(a.val, b.val)); } \
-    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-    { a.val = intrin(a.val, b.val); return a; }
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(intrin(a.val, b.val)); }
 
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint8x32, __lasx_xvsadd_bu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint8x32, __lasx_xvssub_bu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int8x32, __lasx_xvsadd_b)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int8x32, __lasx_xvssub_b)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint16x16, __lasx_xvsadd_hu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint16x16, __lasx_xvssub_hu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int16x16, __lasx_xvsadd_h)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int16x16, __lasx_xvssub_h)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint32x8, __lasx_xvadd_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint32x8, __lasx_xvsub_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_uint32x8, __lasx_xvmul_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int32x8, __lasx_xvadd_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int32x8, __lasx_xvsub_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_int32x8, __lasx_xvmul_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint64x4, __lasx_xvadd_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint64x4, __lasx_xvsub_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int64x4, __lasx_xvadd_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int64x4, __lasx_xvsub_d)
 
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
-OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
-OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
-OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
-OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float32x8, __lasx_xvfadd_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float32x8, __lasx_xvfsub_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float32x8, __lasx_xvfmul_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float32x8, __lasx_xvfdiv_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float64x4, __lasx_xvfadd_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float64x4, __lasx_xvfsub_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float64x4, __lasx_xvfmul_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float64x4, __lasx_xvfdiv_d)
 
 // saturating multiply 8-bit, 16-bit
-inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
+inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b)
 {
     v_uint16x16 c, d;
     v_mul_expand(a, b, c, d);
     return v_pack(c, d);
 }
-inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
+inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b)
 {
     v_int16x16 c, d;
     v_mul_expand(a, b, c, d);
     return v_pack(c, d);
 }
-inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
+inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b)
 {
     __m256i pl = __lasx_xvmul_h(a.val, b.val);
     __m256i ph = __lasx_xvmuh_hu(a.val, b.val);
@@ -800,7 +798,7 @@ inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
     __m256i p0 = __lasx_xvilvl_h(ph, pl);
     __m256i p1 = __lasx_xvilvh_h(ph, pl);
     return v_uint16x16(_v256_packs_epu32(p0, p1));
 }
-inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
+inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b)
 {
     __m256i pl = __lasx_xvmul_h(a.val, b.val);
     __m256i ph = __lasx_xvmuh_h(a.val, b.val);
@@ -808,14 +806,6 @@ inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
     __m256i p1 = __lasx_xvilvh_h(ph, pl);
     return v_int16x16(_lasx_packs_w(p0, p1));
 }
-inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
-{ a = a * b; return a; }
-inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
-{ a = a * b; return a; }
-inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
-{ a = a * b; return a; }
-inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
-{ a = a * b; return a; }
 
 /** Non-saturating arithmetics **/
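[Note: there is no native 8/16-bit saturating multiply, which is why the `v_mul` overloads above widen, multiply exactly, and let `v_pack` saturate on the way back down. A scalar model of one u8 lane, an illustration only:

    static unsigned char mul_sat_u8_model(unsigned char a, unsigned char b)
    {
        int wide = (int)a * (int)b;                       // exact product in the wide type
        return (unsigned char)(wide > 255 ? 255 : wide);  // the clamp v_pack applies
    }
]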
@@ -904,13 +894,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
 /** Bitwise shifts **/
 #define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
-    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+    inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
     { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
-    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+    inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
     { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
-    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+    inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
     { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
-    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+    inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
     { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
     template<int imm> \
     inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -932,10 +922,10 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
 
 /** Bitwise logic **/
 #define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \
-    OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \
-    OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
-    OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
-    inline _Tpvec operator ~ (const _Tpvec& a) \
+    OPENCV_HAL_IMPL_LASX_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix) \
+    OPENCV_HAL_IMPL_LASX_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix) \
+    OPENCV_HAL_IMPL_LASX_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix) \
+    inline _Tpvec v_not(const _Tpvec& a) \
     { return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
 
 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
@@ -948,16 +938,14 @@ OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1))
 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1))
 
 #define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
-    { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \
-    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-    { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); }
 
 #define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \
-    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \
-    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \
-    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \
-    inline _Tpvec operator ~ (const _Tpvec& a) \
+    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix, cast) \
+    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix, cast) \
+    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix, cast) \
+    inline _Tpvec v_not(const _Tpvec& a) \
     { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }
 
 OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
@@ -983,25 +971,25 @@ inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const
 
 /** Comparison **/
 #define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \
-    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a == b); } \
-    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
-    { return b > a; } \
-    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a < b); } \
-    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
-    { return b >= a; }
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_eq(a, b)); } \
+    inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
+    { return v_gt(b, a); } \
+    inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_lt(a, b)); } \
+    inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
+    { return v_ge(b, a); }
 
 #define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
-    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+    inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
     { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
-    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+    inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
     { \
         return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \
     } \
-    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+    inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
     { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
-    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+    inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
     { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \
     OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \
     OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
@@ -1011,37 +999,37 @@ OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
 OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu)
 
 #define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \
-    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+    inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
     { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
-    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a == b); }
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_eq(a, b)); }
 
 OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
 OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)
 
 #define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
     { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }
 
 #define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(v_eq, xvfcmp_ceq, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(v_ne, xvfcmp_cne, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(v_lt, xvfcmp_clt, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(v_le, xvfcmp_cle, _Tpvec, ssuffix)
 
 OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
 OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
 
-inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
+inline v_float32x8 v_gt(const v_float32x8 &a, const v_float32x8 &b)
 { return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }
-inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
+inline v_float32x8 v_ge(const v_float32x8 &a, const v_float32x8 &b)
 { return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }
 
-inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
+inline v_float64x4 v_gt(const v_float64x4 &a, const v_float64x4 &b)
 { return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }
-inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
+inline v_float64x4 v_ge(const v_float64x4 &a, const v_float64x4 &b)
 { return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }
 
 inline v_float32x8 v_not_nan(const v_float32x8& a)
@@ -1309,9 +1297,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a)
 { return v_reduce_sum(v_reinterpret_as_s32(a)); }
 
 inline int v_reduce_sum(const v_int16x16& a)
-{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
 inline unsigned v_reduce_sum(const v_uint16x16& a)
-{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); }
 
 inline float v_reduce_sum(const v_float32x8& a)
 {
@@ -1379,27 +1367,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
 inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
 {
     v_uint32x8 l, h;
-    v_expand(v_add_wrap(a - b, b - a), l, h);
-    return v_reduce_sum(l + h);
+    v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h);
+    return v_reduce_sum(v_add(l, h));
 }
 inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
 {
     v_uint32x8 l, h;
     v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
-    return v_reduce_sum(l + h);
+    return v_reduce_sum(v_add(l, h));
 }
 inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
 {
-    return v_reduce_sum(v_max(a, b) - v_min(a, b));
+    return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b)));
 }
 inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
 {
-    v_int32x8 m = a < b;
-    return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
+    v_int32x8 m = v_lt(a, b);
+    return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m)));
 }
 inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
 {
-    v_float32x8 a_b = a - b;
+    v_float32x8 a_b = v_sub(a, b);
     return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
 }
@@ -1503,9 +1491,9 @@ OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
     inline _Tpvec v_sqrt(const _Tpvec& x) \
     { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \
     inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
-    { return v_fma(a, a, b * b); } \
+    { return v_fma(a, a, v_mul(b, b)); } \
     inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
-    { return v_sqrt(v_fma(a, a, b*b)); }
+    { return v_sqrt(v_fma(a, a, v_mul(b, b))); }
 
 OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
 OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
@@ -1556,20 +1544,20 @@ inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
 { return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
 
 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
-{ return v_abs(a - b); }
+{ return v_abs(v_sub(a, b)); }
 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
-{ return v_abs(a - b); }
+{ return v_abs(v_sub(a, b)); }
 
 /** Saturating absolute difference **/
 inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
 {
-    v_int8x32 d = a - b;
-    v_int8x32 m = a < b;
-    return (d ^ m) - m;
+    v_int8x32 d = v_sub(a, b);
+    v_int8x32 m = v_lt(a, b);
+    return v_sub(v_xor(d, m), m);
 }
 inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
-{ return v_max(a, b) - v_min(a, b); }
+{ return v_sub(v_max(a, b), v_min(a, b)); }
 
 ////////// Conversions /////////
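[Note: `v_absdiffs` above leans on a two's-complement identity: with `m` all-ones exactly where `a < b`, `(d ^ m) - m` equals `d` where `m == 0` and `~d + 1 == -d` where `m == -1`. A scalar model, illustration only (it ignores the 8-bit saturation that the vector `v_sub` adds):

    static int absdiffs_model(int a, int b)
    {
        int d = a - b;
        int m = (a < b) ? -1 : 0;   // comparison mask, as v_lt produces
        return (d ^ m) - m;         // d unchanged, or bit-flip-plus-one == -d
    }
]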
@@ -1891,7 +1879,7 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
 { return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
-{ return v_dotprod(a, b) + c; }
+{ return v_add(v_dotprod(a, b), c); }
 
 // 32 >> 64
 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
@@ -1915,7 +1903,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
     return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
 }
 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
 {
@@ -1926,7 +1914,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
     return v_int32x8(__lasx_xvadd_w(prod0, prod1));
 }
 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 // 16 >> 64
 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
@@ -1938,7 +1926,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
     return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
 }
 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
 {
@@ -1950,13 +1938,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
 }
 
 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 // 32 >> 64f
 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
 { return v_cvt_f64(v_dotprod(a, b)); }
 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 //////// Fast Dot Product ////////
 
@@ -1993,7 +1981,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16&
     return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
 }
 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
 {
@@ -2004,7 +1992,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
     return v_int64x4(__lasx_xvadd_d(lo, hi));
 }
 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 // 32 >> 64f
 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
@@ -2024,7 +2012,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
     v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
     v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
     v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
-    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
+    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
 }
 
 inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
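[Note: the `v_matmul` hunk above is the standard splat-and-FMA formulation of a 4x4 matrix-vector product: each component of `v` is broadcast and multiplied into one matrix row, chained through fused multiply-adds (the 256-bit LASX version carries two such products side by side). A scalar model of one product, illustration only:

    // r = v[0]*m[0] + v[1]*m[1] + v[2]*m[2] + v[3]*m[3], with m[k] a row
    static void matmul4_model(const float v[4], const float m[4][4], float r[4])
    {
        for (int j = 0; j < 4; j++)
            r[j] = v[0]*m[0][j] + v[1]*m[1][j] + v[2]*m[2][j] + v[3]*m[3][j];
    }
]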
diff --git a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
index 6e3290426f..aa997070c3 100644
--- a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
@@ -525,53 +525,51 @@ OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)
 /** Arithmetics **/
 #define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
-    { return _Tpvec(intrin(a.val, b.val)); } \
-    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-    { a.val = intrin(a.val, b.val); return a; }
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(intrin(a.val, b.val)); }
 
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint8x16, __lsx_vsadd_bu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint8x16, __lsx_vssub_bu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int8x16, __lsx_vsadd_b)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int8x16, __lsx_vssub_b)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint16x8, __lsx_vsadd_hu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint16x8, __lsx_vssub_hu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int16x8, __lsx_vsadd_h)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int16x8, __lsx_vssub_h)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint32x4, __lsx_vadd_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint32x4, __lsx_vsub_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_uint32x4, __lsx_vmul_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int32x4, __lsx_vadd_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int32x4, __lsx_vsub_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_int32x4, __lsx_vmul_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint64x2, __lsx_vadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint64x2, __lsx_vsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int64x2, __lsx_vadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int64x2, __lsx_vsub_d)
 
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
-OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
-OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
-OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
-OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float32x4, __lsx_vfadd_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float32x4, __lsx_vfsub_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float32x4, __lsx_vfmul_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float32x4, __lsx_vfdiv_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float64x2, __lsx_vfadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float64x2, __lsx_vfsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float64x2, __lsx_vfmul_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float64x2, __lsx_vfdiv_d)
 
 // saturating multiply 8-bit, 16-bit
-inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
+inline v_uint8x16 v_mul(const v_uint8x16& a, const v_uint8x16& b)
 {
     v_uint16x8 c, d;
     v_mul_expand(a, b, c, d);
     return v_pack(c, d);
 }
-inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
+inline v_int8x16 v_mul(const v_int8x16& a, const v_int8x16& b)
 {
     v_int16x8 c, d;
     v_mul_expand(a, b, c, d);
     return v_pack(c, d);
 }
-inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
+inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b)
 {
     __m128i a0 = a.val, b0 = b.val;
     __m128i pev = __lsx_vmulwev_w_hu(a0, b0);
@@ -580,7 +578,7 @@ inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
     __m128i ph = __lsx_vilvh_w(pod, pev);
     return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
 }
-inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
+inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b)
 {
     __m128i a0 = a.val, b0 = b.val;
     __m128i pev = __lsx_vmulwev_w_h(a0, b0);
@@ -589,14 +587,6 @@ inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
     __m128i ph = __lsx_vilvh_w(pod, pev);
     return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
 }
-inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
-{ a = a * b; return a; }
-inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
-{ a = a * b; return a; }
-inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
-{ a = a * b; return a; }
-inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
-{ a = a * b; return a; }
 
 /** Non-saturating arithmetics **/
 
@@ -681,13 +671,13 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
 
 /** Bitwise shifts **/
 #define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
-    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+    inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \
     { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
-    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+    inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \
    { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
-    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+    inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \
     { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
-    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+    inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \
     { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
     template<int imm> \
     inline _Tpuvec v_shl(const _Tpuvec& a) \
@@ -708,10 +698,10 @@ OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)
 
 /** Bitwise logic **/
 #define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \
-    OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \
-    OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \
-    OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \
-    inline _Tpvec operator ~(const _Tpvec& a) \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(v_and, _Tpvec, __lsx_vand_##suffix) \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(v_or, _Tpvec, __lsx_vor_##suffix) \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(v_xor, _Tpvec, __lsx_vxor_##suffix) \
+    inline _Tpvec v_not(const _Tpvec& a) \
     { return _Tpvec(__lsx_vnori_b(a.val, 0)); } \
 
 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v)
@@ -724,18 +714,14 @@ OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v)
 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
 
 #define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
-    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \
-    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-    { __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \
-      a.val = cast(c); \
-      return a;}
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }
 
 #define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \
-    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \
-    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \
-    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \
-    inline _Tpvec operator ~ (const _Tpvec& a) \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_and, _Tpvec, __lsx_vand_v, cast) \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_or, _Tpvec, __lsx_vor_v, cast) \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_xor, _Tpvec, __lsx_vxor_v, cast) \
+    inline _Tpvec v_not(const _Tpvec& a) \
     { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \
 
 OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
@@ -760,23 +746,23 @@ inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const
 
 /** Comparison **/
 #define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \
-    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
-    { return ~( a == b ); } \
-    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
-    { return b > a ; } \
-    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a < b); } \
-    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
-    { return b >= a; } \
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_eq(a, b)); } \
+    inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
+    { return v_gt(b, a); } \
+    inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_lt(a, b)); } \
+    inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
+    { return v_ge(b, a); } \
 
 #define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
-    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+    inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \
     { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \
-    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+    inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \
     { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \
-    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+    inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \
     { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \
-    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+    inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \
    { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \
     OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \
     OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
@@ -786,37 +772,37 @@ OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu)
 OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu)
 
 #define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \
-    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+    inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
     { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \
-    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
-    { return ~(a == b); }
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { return v_not(v_eq(a, b)); }
 
 OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
 OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
 
 #define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
-    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
     { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \
 
 #define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \
-    OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(v_eq, vfcmp_ceq, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(v_ne, vfcmp_cne, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(v_lt, vfcmp_clt, _Tpvec, ssuffix) \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(v_le, vfcmp_cle, _Tpvec, ssuffix) \
 
 OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
 OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)
 
-inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
+inline v_float32x4 v_gt(const v_float32x4 &a, const v_float32x4 &b)
 { return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }
-inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
+inline v_float32x4 v_ge(const v_float32x4 &a, const v_float32x4 &b)
 { return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }
 
-inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
+inline v_float64x2 v_gt(const v_float64x2 &a, const v_float64x2 &b)
 { return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }
-inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
+inline v_float64x2 v_ge(const v_float64x2 &a, const v_float64x2 &b)
 { return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }
 
 inline v_float32x4 v_not_nan(const v_float32x4& a)
@@ -1188,7 +1174,7 @@ inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
 
 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
 {
-    v_float32x4 a_b = a - b;
+    v_float32x4 a_b = v_sub(a, b);
     return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff)));
 }
@@ -1295,9 +1281,9 @@ OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)
     inline _Tpvec v_sqrt(const _Tpvec& x) \
     { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \
     inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
-    { return v_fma(a, a, b * b); } \
+    { return v_fma(a, a, v_mul(b, b)); } \
     inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
-    { return v_sqrt(v_fma(a, a, b * b)); }
+    { return v_sqrt(v_fma(a, a, v_mul(b, b))); }
 
 OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
 OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)
@@ -1349,20 +1335,20 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
 { return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }
 
 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
-{ return v_abs(a - b); }
+{ return v_abs(v_sub(a, b)); }
 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
-{ return v_abs(a - b); }
+{ return v_abs(v_sub(a, b)); }
 
 /** Saturating absolute difference **/
 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
 {
-    v_int8x16 d = a - b;
-    v_int8x16 m = a < b;
-    return (d ^ m) - m;
+    v_int8x16 d = v_sub(a, b);
+    v_int8x16 m = v_lt(a, b);
+    return v_sub(v_xor(d, m), m);
 }
 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
-{ return v_max(a, b) - v_min(a, b); }
+{ return v_sub(v_max(a, b), v_min(a, b)); }
 
 ///////// Conversions /////////
@@ -1673,7 +1659,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
 }
 
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
-{ return v_dotprod_expand(a, b) + c ;}
+{ return v_add(v_dotprod_expand(a, b), c) ;}
 
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
 {
@@ -1685,7 +1671,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
     return v_int32x4(__lsx_vadd_w(prod0, prod1));
 }
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 // 16 >> 64
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
@@ -1698,7 +1684,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
     return v_uint64x2(__lsx_vadd_d(prod0, prod1));
 }
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
 {
@@ -1710,13 +1696,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
     return v_int64x2(__lsx_vadd_d(prod0, prod1));
 }
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 //32 >> 64f
 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
 { return v_cvt_f64(v_dotprod(a, b)); }
 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 ///////// Fast Dot Product //////
 
@@ -1755,7 +1741,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
     return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
 }
 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
 {
@@ -1767,7 +1753,7 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
     return v_int64x2(__lsx_vadd_d(lo, hi));
 }
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 // 32 >> 64f
 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
index 23d6ebd3d1..8d2c22b087 100644
--- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
@@ -345,53 +345,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
 }
 
 #define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
-inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
 { \
     return _Tpvec(intrin(a.val, b.val)); \
-} \
-inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-{ \
    a.val = intrin(a.val, b.val); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
-OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
-OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint8x16, msa_qaddq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint8x16, msa_qsubq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int8x16, msa_qaddq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int8x16, msa_qsubq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint16x8, msa_qaddq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint16x8, msa_qsubq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int16x8, msa_qaddq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int16x8, msa_qsubq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int32x4, msa_addq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int32x4, msa_subq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_int32x4, msa_mulq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint32x4, msa_addq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint32x4, msa_subq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_uint32x4, msa_mulq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float32x4, msa_addq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float32x4, msa_subq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float32x4, msa_mulq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int64x2, msa_addq_s64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int64x2, msa_subq_s64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint64x2, msa_addq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint64x2, msa_subq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float32x4, msa_divq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float64x2, msa_addq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float64x2, msa_subq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float64x2, msa_mulq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float64x2, msa_divq_f64)
 
 // saturating multiply 8-bit, 16-bit
 #define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
-inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
 { \
     _Tpwvec c, d; \
     v_mul_expand(a, b, c, d); \
     return v_pack(c, d); \
-} \
-inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
-{a = a * b; return a; }
+}
 
 OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
 OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
@@ -546,13 +539,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
     return v_int64x2(msa_hadd_s64(prod, prod));
 }
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 // 32 >> 64f
 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
 { return v_cvt_f64(v_dotprod(a, b)); }
 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 //////// Fast Dot Product ////////
 
@@ -596,10 +589,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b,
 { return v_dotprod_expand(a, b, c); }
 
 #define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
-OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
-OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
-OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
-inline _Tpvec operator ~ (const _Tpvec& a) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_and, _Tpvec, msa_andq_##suffix) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_or, _Tpvec, msa_orrq_##suffix) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(v_xor, _Tpvec, msa_eorq_##suffix) \
+inline _Tpvec v_not(const _Tpvec& a) \
 { \
     return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
 }
@@ -614,21 +607,16 @@ OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
 
 #define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
-inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
 { \
     return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
-} \
-inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
-{ \
-    a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
-OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
-OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_and, msa_andq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_or, msa_orrq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_xor, msa_eorq_s32)
 
-inline v_float32x4 operator ~ (const v_float32x4& a)
+inline v_float32x4 v_not(const v_float32x4& a)
 {
     return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
 }
@@ -659,21 +647,16 @@ OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
 OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
 
 #define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
-inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
 { \
     return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
-} \
-inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
-{ \
-    a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
-OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
-OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_and, msa_andq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_or, msa_orrq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_xor, msa_eorq_s64)
 
-inline v_float64x2 operator ~ (const v_float64x2& a)
+inline v_float64x2 v_not(const v_float64x2& a)
 {
     return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
 }
@@ -704,17 +687,17 @@ OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
 
 #define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
-inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
-inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
 
 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
@@ -821,9 +804,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
 
 // trade efficiency for convenience
 #define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
-inline _Tpvec operator << (const _Tpvec& a, int n) \
+inline _Tpvec v_shl(const _Tpvec& a, int n) \
 { return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
-inline _Tpvec operator >> (const _Tpvec& a, int n) \
+inline _Tpvec v_shr(const _Tpvec& a, int n) \
 { return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
 { return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
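[Note: the "trade efficiency for convenience" remark above concerns the run-time shift forms: MSA shifts only by a vector of counts, so the scalar `n` is first broadcast with `msa_dupq_n_*`, while the `template<int n>` forms below map to the immediate-count instructions. A caller-side sketch, not part of the patch:

    inline cv::v_uint16x8 shr3_both_ways(const cv::v_uint16x8& x)
    {
        cv::v_uint16x8 a = cv::v_shr(x, 3);  // run-time count: broadcast, then vector shift
        cv::v_uint16x8 b = cv::v_shr<3>(x);  // immediate form: a single shift instruction
        return cv::v_or(a, b);               // same value either way; OR just exercises both
    }
]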
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
index 5681ae211d..4900418df3 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
@@ -373,70 +373,50 @@ inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v,
 
 #define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
-inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
 { \
     return _Tpvec(intrin(a.val, b.val)); \
-} \
-inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-{ \
-    a.val = intrin(a.val, b.val); \
-    return a; \
 }
 
 #define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
-inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
 { \
     return _Tpvec(intrin(a.val, b.val, num)); \
-} \
-inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-{ \
-    a.val = intrin(a.val, b.val, num); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
-inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint8x16, vsaddu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint8x16, vssubu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int8x16, vsadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int8x16, vssub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint16x8, vsaddu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint16x8, vssubu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int16x8, vsadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int16x8, vssub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int32x4, vadd_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int32x4, vsub_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_int32x4, vmul_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint32x4, vadd_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint32x4, vsub_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_uint32x4, vmul_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int64x2, vadd_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int64x2, vsub_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint64x2, vadd_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint64x2, vsub_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float32x4, vfadd_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float32x4, vfsub_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float32x4, vfmul_vv_f32m1, 4)
+inline v_float32x4 v_div(const v_float32x4& a, const v_float32x4& b)
 {
     return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
 }
-inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
-{
-    a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
-    return a;
-}
 
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
-OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
-inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float64x2, vfadd_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float64x2, vfsub_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float64x2, vfmul_vv_f64m1, 2)
+inline v_float64x2 v_div(const v_float64x2& a, const v_float64x2& b)
 {
     return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
 }
-inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
-{
-    a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
-    return a;
-}
 
 // TODO: exp, log, sin, cos
 #define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
@@ -562,10 +542,10 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
 }
 
 #define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
-    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
-    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
-    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
-    inline _Tpvec operator ~ (const _Tpvec & a) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_and, _Tpvec, vand_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_or, _Tpvec, vor_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_xor, _Tpvec, vxor_vv_##suffix, num) \
+    inline _Tpvec v_not(const _Tpvec & a) \
     { \
         return _Tpvec(vnot_v_##suffix(a.val, num)); \
     }
@@ -580,41 +560,31 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
 
 #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
-inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
 { \
     return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
-} \
-inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
-{ \
-    a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
-OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
-OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_and, vand_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_or, vor_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_xor, vxor_vv_i32m1)
 
-inline v_float32x4 operator ~ (const v_float32x4& a)
+inline v_float32x4 v_not(const v_float32x4& a)
 {
     return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
 }
 
 #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
-inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
 { \
     return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
-} \
-inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
-{ \
-    a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
-OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
-OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_and, vand_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_or, vor_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_xor, vxor_vv_i64m1)
 
-inline v_float64x2 operator ~ (const v_float64x2& a)
+inline v_float64x2 v_not(const v_float64x2& a)
 {
     return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
 }
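[Note: unlike the other backends, RVV comparisons natively produce a `vbool` mask register; the wrappers in the next hunk immediately expand it to a value vector of 0 / -1 per lane via `vmerge_vxm_*`, so `v_eq`, `v_lt` and friends return the same all-bits-set mask convention the portable code expects. A per-lane scalar model, illustration only:

    static int cmp_lane_model(int a, int b)
    {
        // vmerge(mask, splat(0), -1): pick -1 where the mask bit is set, else 0
        return (a == b) ? -1 : 0;
    }
]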
const _Tpvec& b) \ { \ vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ @@ -1215,37 +1185,37 @@ OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_) OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_) //TODO: == -inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_eq(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_ne(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_lt(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_le(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_gt(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_ge(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, 
vmv_v_x_i32m1(0.0, 4), -1, 4); @@ -1259,37 +1229,37 @@ inline v_float32x4 v_not_nan(const v_float32x4& a) } //TODO: == -inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_eq(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_ne(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_lt(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_le(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_gt(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_ge(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); @@ -1331,13 +1301,13 @@ OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32) #define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ +inline _Tpvec v_shl(const _Tpvec& a, int n) \ { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \ template<int n> inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } #define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ +inline _Tpvec v_shr(const _Tpvec& a, int n) \ { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \ template<int n> inline _Tpvec v_shr(const _Tpvec& a) \ { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\ @@ -2037,13 +2007,11 @@ OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ auto res = mul(a.val, b.val, num); \ return _Tpvec(cvt(res, 0, num)); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1) OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1) @@ -2845,7 +2813,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a,
b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) { vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4); @@ -2854,7 +2822,7 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) { v_float64x2 res = v_dotprod_expand_fast(a, b); - return res + c; } + return v_add(res, c); } #endif ////// FP16 support /////// #if __riscv_v == 7000 diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 88b67ae250..ee4545db6b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -735,53 +735,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ - } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { \ - a.val = intrin(a.val, b.val); \ - return a; \ } -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint8x16, _mm_adds_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint8x16, _mm_subs_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int8x16, _mm_adds_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int8x16, _mm_subs_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint16x8, _mm_adds_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint16x8, _mm_subs_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int16x8, _mm_adds_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int16x8, _mm_subs_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint32x4, _mm_sub_epi32) 
+OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_uint32x4, _v128_mullo_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_int32x4, _v128_mullo_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float32x4, _mm_add_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float32x4, _mm_sub_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float32x4, _mm_mul_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float32x4, _mm_div_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float64x2, _mm_add_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float64x2, _mm_sub_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float64x2, _mm_mul_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float64x2, _mm_div_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint64x2, _mm_sub_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int64x2, _mm_sub_epi64) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8) @@ -845,7 +838,7 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { return v_int32x4(_mm_madd_epi16(a.val, b.val)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) @@ -872,7 +865,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) #endif } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) @@ -886,7 +879,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) return v_uint32x4(_mm_add_epi32(p0, p1)); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { @@ -899,7 +892,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) return v_int32x4(_mm_add_epi32(p0, p1)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) @@ -911,14 +904,14 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) v_expand(c, c0, c1); v_expand(d, d0, d1); - c0 += c1; d0 += d1; + c0 = v_add(c0, c1); d0 = v_add(d0, d1); return v_uint64x2(_mm_add_epi64( _mm_unpacklo_epi64(c0.val, d0.val), _mm_unpackhi_epi64(c0.val, d0.val) )); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return 
v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -931,7 +924,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) )); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) @@ -939,8 +932,8 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) #if CV_SSE4_1 return v_cvt_f64(v_dotprod(a, b)); #else - v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b); - v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b); + v_float64x2 c = v_mul(v_cvt_f64(a), v_cvt_f64(b)); + v_float64x2 d = v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b)); return v_float64x2(_mm_add_pd( _mm_unpacklo_pd(c.val, d.val), @@ -949,7 +942,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) #endif } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -957,13 +950,13 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) { return v_dotprod(a, b); } inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) { return v_dotprod(a, b); } inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod_fast(a, b) + c; } +{ return v_add(v_dotprod_fast(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) @@ -977,7 +970,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b return v_uint32x4(_mm_add_epi32(p0, p1)); } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) { @@ -994,7 +987,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) #endif } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) @@ -1006,34 +999,34 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b v_expand(c, c0, c1); v_expand(d, d0, d1); - c0 += c1; d0 += d1; - return c0 + d0; + c0 = v_add(c0, c1); d0 = v_add(d0, d1); + return v_add(c0, d0); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) { v_int32x4 prod = v_dotprod(a, b); v_int64x2 c, d; v_expand(prod, c, d); - return c + d; + return v_add(c, d); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ 
return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c); inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) -{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_and, _Tpvec, _mm_and_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_or, _Tpvec, _mm_or_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_xor, _Tpvec, _mm_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \ } @@ -1182,58 +1175,58 @@ inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) } #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ -inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_ne(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_ne(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_lt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ } \ -inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ } \ -inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_le(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ __m128i not_mask = _mm_set1_epi32(-1); \ __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ return _Tpuvec(_mm_xor_si128(res, not_mask)); \ } \ -inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_ge(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ __m128i not_mask = _mm_set1_epi32(-1); \ __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ return _Tpuvec(_mm_xor_si128(res, not_mask)); \ } \ -inline _Tpsvec operator < 
(const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_lt(const _Tpsvec& a, const _Tpsvec& b) \ { \ return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ } \ -inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { \ return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ } \ -inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_le(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_ge(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ @@ -1244,17 +1237,17 @@ OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) @@ -1262,26 +1255,28 @@ OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) #if CV_SSE4_1 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return ~(a == b); } +inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \ +{ return v_not(v_eq(a, b)); } #else #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \ return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return ~(a == b); } +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ +{ return v_not(v_eq(a, b)); } #endif -inline v_int64x2 operator > (const v_int64x2& a, const v_int64x2& b) +inline v_int64x2 v_gt(const v_int64x2& a, const v_int64x2& b) { __m128i s = _mm_srli_epi64(_mm_sub_epi64(b.val, a.val), 63); return v_int64x2(_mm_sub_epi64(_mm_setzero_si128(), s)); } -inline v_int64x2 operator < (const v_int64x2& a, const v_int64x2& b) -{ return b > a; } +inline v_int64x2 v_lt(const v_int64x2& a, const 
v_int64x2& b) +{ + return v_gt(b, a); +} OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2) OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2) @@ -1319,17 +1314,17 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) /** Absolute difference **/ inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { v_int8x16 d = v_sub_wrap(a, b); - v_int8x16 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x16 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { @@ -1337,25 +1332,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - v_int32x4 d = a - b; - v_int32x4 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x4 d = v_sub(a, b); + v_int32x4 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } /** Saturating absolute difference **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) { - v_int8x16 d = a - b; - v_int8x16 m = a < b; - return (d ^ m) - m; + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) @@ -1389,12 +1384,12 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpvec res = v_fma(a, a, b*b); \ + _Tpvec res = v_fma(a, a, v_mul(b, b)); \ return _Tpvec(_mm_sqrt_##suffix(res.val)); \ } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - return v_fma(a, a, b*b); \ + return v_fma(a, a, v_mul(b, b)); \ } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { \ @@ -1405,19 +1400,19 @@ OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32(( OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ -inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ } \ -inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ } \ -inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ } \ -inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(srai(a.val, imm)); \ } \ @@ -1719,9 +1714,9 @@ 
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_N OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32) inline int v_reduce_sum(const v_int16x8& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline unsigned v_reduce_sum(const v_uint16x8& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline uint64 v_reduce_sum(const v_uint64x2& a) { @@ -1778,13 +1773,13 @@ inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) { @@ -1813,15 +1808,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a) inline v_uint16x8 v_popcount(const v_uint16x8& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index e66563bede..fbe690461a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -513,48 +513,44 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) /* Element-wise binary and unary operations */ /** Arithmetics **/ #define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(intrin(a.val, b.val)); } \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ a.val = intrin(a.val, b.val); return a; } +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(intrin(a.val, b.val)); } -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, 
v_float32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint8x16, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint8x16, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int8x16, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int8x16, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint16x8, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint16x8, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int16x8, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int16x8, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_uint32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_int32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float32x4, vec_div) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float64x2, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float64x2, vec_div) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int64x2, vec_sub) // saturating multiply #define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8) @@ -596,9 +592,9 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul) /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \ -inline _Tpvec operator << (const _Tpvec& a, int imm) \ +inline _Tpvec v_shl(const _Tpvec& a, int imm) \ { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int imm) \ +inline _Tpvec v_shr(const _Tpvec& a, int imm) \ { return _Tpvec(shr(a.val, splfunc(imm))); } \ template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \ @@ -617,10 +613,10 @@ OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_and, _Tpvec, vec_and) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_or, _Tpvec, vec_or) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_xor, _Tpvec, vec_xor) \ +inline _Tpvec v_not(const _Tpvec& a) \ { return
_Tpvec(vec_not(a.val)); } OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16) @@ -650,17 +646,17 @@ OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c) /** Comparison **/ #define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpeq(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpne(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmplt(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpgt(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmple(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpge(a.val, b.val)); } OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16) @@ -1060,7 +1056,7 @@ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4) OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2) inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) -{ return a * b + c; } +{ return v_add(v_mul(a, b), c); } // TODO: exp, log, sin, cos @@ -1089,12 +1085,12 @@ inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) -{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); } +{ return v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b))); } inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Absolute difference for signed integers **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) @@ -1442,7 +1438,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) return v_int64x2(vec_add(even, odd)); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) @@ -1485,7 +1481,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) return v_uint64x2(vec_add(s0, s1)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -1495,13 +1491,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val))); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const
v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1531,7 +1527,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z))); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) @@ -1544,10 +1540,10 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) v_int32x4 prod = v_dotprod(a, b); v_int64x2 c, d; v_expand(prod, c, d); - return c + d; + return v_add(c, d); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp index 5d470d9419..3a8069ca91 100644 --- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp @@ -849,53 +849,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ } -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul) 
-OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint8x16, wasm_u8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint8x16, wasm_u8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int8x16, wasm_i8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int8x16, wasm_i8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint16x8, wasm_u16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint16x8, wasm_u16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int16x8, wasm_i16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int16x8, wasm_i16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_uint32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_int32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float32x4, wasm_f32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float32x4, wasm_f32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float32x4, wasm_f32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float32x4, wasm_f32x4_div) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float64x2, wasm_f64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float64x2, wasm_f64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float64x2, wasm_f64x2_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float64x2, wasm_f64x2_div) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \ -inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ -} \ -inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ -{ a = a * b; return a; } +} OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8) @@ -986,7 +979,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { @@ -1000,7 +993,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { - return v_dotprod(a, b) + c; + return v_add(v_dotprod(a, b), c); } // 8 >> 32 @@ -1010,13 +1003,13 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) v128_t a1 = wasm_u16x8_shr(a.val, 8); v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8); v128_t b1 = wasm_u16x8_shr(b.val, 8); - return v_uint32x4(( - v_dotprod(v_int16x8(a0), v_int16x8(b0)) + - v_dotprod(v_int16x8(a1), v_int16x8(b1))).val + return v_uint32x4((v_add( + v_dotprod(v_int16x8(a0), v_int16x8(b0)), + v_dotprod(v_int16x8(a1), v_int16x8(b1)))).val ); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const 
v_int8x16& a, const v_int8x16& b) { @@ -1024,13 +1017,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) v128_t a1 = wasm_i16x8_shr(a.val, 8); v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8); v128_t b1 = wasm_i16x8_shr(b.val, 8); - return v_int32x4( - v_dotprod(v_int16x8(a0), v_int16x8(b0)) + + return v_int32x4(v_add( + v_dotprod(v_int16x8(a0), v_int16x8(b0)), v_dotprod(v_int16x8(a1), v_int16x8(b1)) - ); + )); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) @@ -1039,13 +1032,13 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) v128_t a1 = wasm_u32x4_shr(a.val, 16); v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16); v128_t b1 = wasm_u32x4_shr(b.val, 16); - return v_uint64x2(( - v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + return v_uint64x2((v_add( + v_dotprod(v_int32x4(a0), v_int32x4(b0)), v_dotprod(v_int32x4(a1), v_int32x4(b1))).val - ); + )); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -1053,20 +1046,20 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) v128_t a1 = wasm_i32x4_shr(a.val, 16); v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16); v128_t b1 = wasm_i32x4_shr(b.val, 16); - return v_int64x2(( - v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + return v_int64x2((v_add( + v_dotprod(v_int32x4(a0), v_int32x4(b0)), v_dotprod(v_int32x4(a1), v_int32x4(b1))) - ); + )); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1109,10 +1102,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, { return v_dotprod_expand(a, b, c); } #define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_and, _Tpvec, wasm_v128_and) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_or, _Tpvec, wasm_v128_or) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_xor, _Tpvec, wasm_v128_xor) \ +inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(wasm_v128_not(a.val)); \ } @@ -1215,17 +1208,17 @@ OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000) OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000) #define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& 
a, const _Tpvec& b) \ { return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); } OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16) @@ -1238,10 +1231,10 @@ OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4) OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2) #define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); } +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_eq(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); } \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_ne(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); } OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64) OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64) @@ -1299,17 +1292,17 @@ OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul) /** Absolute difference **/ inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { v_int8x16 d = v_sub_wrap(a, b); - v_int8x16 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x16 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { @@ -1317,25 +1310,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - v_int32x4 d = a - b; - v_int32x4 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x4 d = v_sub(a, b); + v_int32x4 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } /** Saturating absolute difference **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) { - v_int8x16 d = a - b; - v_int8x16 m = a < b; - return (d ^ m) - m; + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } 
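The signed v_absdiff / v_absdiffs rewrites above keep the intrinsics' branch-free conditional-negation idiom, now spelled with the named wrappers (v_sub, v_lt, v_xor). A minimal scalar sketch of the same trick, with a hypothetical helper name that is not part of the patch:

    #include <cstdint>

    // m is all-ones exactly when a < b (the lane pattern v_lt produces);
    // (d ^ m) - m then negates d, since ~d + 1 == -d in two's complement.
    static inline uint32_t absdiff_scalar(int32_t a, int32_t b)
    {
        uint32_t d = (uint32_t)a - (uint32_t)b; // wraps, like the vector subtraction
        uint32_t m = a < b ? 0xFFFFFFFFu : 0u;  // scalar stand-in for the v_lt mask
        return (d ^ m) - m;                     // |a - b| without a branch
    }

The vector versions reinterpret the result as unsigned (v_reinterpret_as_u8/u32), which is why the wrap-around arithmetic is harmless.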
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) @@ -1345,12 +1338,12 @@ inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) @@ -1386,19 +1379,19 @@ OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4) OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2) #define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \ -inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \ } \ -inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \ } \ -inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \ } \ -inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \ } \ @@ -1694,7 +1687,7 @@ inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) v_expand(v_absdiff(a, b), l16, h16); v_expand(l16, l16_l32, l16_h32); v_expand(h16, h16_l32, h16_h32); - return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); + return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32))); } inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) { @@ -1703,19 +1696,19 @@ inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) v_expand(v_absdiff(a, b), l16, h16); v_expand(l16, l16_l32, l16_h32); v_expand(h16, h16_l32, h16_h32); - return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); + return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32))); } inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) { @@ -1744,15 +1737,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a) inline v_uint16x8 v_popcount(const v_uint16x8& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index 0452d46e83..0d115d6595 100644 ---
a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -1801,7 +1801,7 @@ INSTANTIATE_TEST_CASE_P(Core_CartToPolarToCart, ElemWiseTest, ::testing::Values( // Mixed Type Arithmetic Operations -typedef std::tuple<ElemWiseOpPtr, std::tuple<int, int>> SomeType; +typedef std::tuple<ElemWiseOpPtr, std::tuple<int, int>, int> SomeType; class ArithmMixedTest : public ::testing::TestWithParam<SomeType> {}; TEST_P(ArithmMixedTest, accuracy) @@ -1810,7 +1810,10 @@ TEST_P(ArithmMixedTest, accuracy) ElemWiseOpPtr op = std::get<0>(p); int srcDepth = std::get<0>(std::get<1>(p)); int dstDepth = std::get<1>(std::get<1>(p)); + int channels = std::get<2>(p); + int srcType = CV_MAKETYPE(srcDepth, channels); + int dstType = CV_MAKETYPE(dstDepth, channels); op->flags |= BaseElemWiseOp::MIXED_TYPE; int testIdx = 0; RNG rng((uint64)ARITHM_RNG_SEED); @@ -1825,15 +1828,15 @@ TEST_P(ArithmMixedTest, accuracy) int ninputs = op->ninputs; vector<Mat> src(ninputs); for(int i = 0; i < ninputs; i++ ) - src[i] = cvtest::randomMat(rng, size, srcDepth, minval, maxval, true); + src[i] = cvtest::randomMat(rng, size, srcType, minval, maxval, true); Mat dst0, dst, mask; if( haveMask ) { mask = cvtest::randomMat(rng, size, CV_8UC1, 0, 2, true); } - dst0 = cvtest::randomMat(rng, size, dstDepth, minval, maxval, false); - dst = cvtest::randomMat(rng, size, dstDepth, minval, maxval, true); + dst0 = cvtest::randomMat(rng, size, dstType, minval, maxval, false); + dst = cvtest::randomMat(rng, size, dstType, minval, maxval, true); cvtest::copy(dst, dst0); op->generateScalars(dstDepth, rng); @@ -1853,53 +1856,62 @@ INSTANTIATE_TEST_CASE_P(Core_AddMixed, ArithmMixedTest, ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_AddScalarMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new AddSOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_AddWeightedMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new AddWeightedOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_SubMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new SubOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_SubScalarMinusArgMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new SubRSOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_MulMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new MulOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_MulScalarMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new MulSOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U},
std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_DivMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new DivOp)), ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, std::tuple<int, int>{CV_8S, CV_16S}, std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); INSTANTIATE_TEST_CASE_P(Core_RecipMixed, ArithmMixedTest, ::testing::Combine(::testing::Values(ElemWiseOpPtr(new RecipOp)), - ::testing::Values(std::tuple<int, int>{CV_8U, CV_32F}, - std::tuple<int, int>{CV_8S, CV_32F}))); + ::testing::Values(std::tuple<int, int>{CV_8U, CV_16U}, + std::tuple<int, int>{CV_8S, CV_32F}), + ::testing::Values(1, 3, 4))); TEST(Core_ArithmMask, uninitialized) { diff --git a/modules/dnn/src/tflite/tflite_importer.cpp index 92bfeeef65..7e7f1d0503 100644 --- a/modules/dnn/src/tflite/tflite_importer.cpp +++ b/modules/dnn/src/tflite/tflite_importer.cpp @@ -271,7 +271,7 @@ TFLiteImporter::DispatchMap TFLiteImporter::buildDispatchMap() dispatch["DEPTHWISE_CONV_2D"] = &TFLiteImporter::parseDWConvolution; dispatch["ADD"] = dispatch["MUL"] = &TFLiteImporter::parseEltwise; dispatch["RELU"] = dispatch["PRELU"] = dispatch["HARD_SWISH"] = - dispatch["LOGISTIC"] = &TFLiteImporter::parseActivation; + dispatch["LOGISTIC"] = dispatch["LEAKY_RELU"] = &TFLiteImporter::parseActivation; dispatch["MAX_POOL_2D"] = dispatch["AVERAGE_POOL_2D"] = &TFLiteImporter::parsePooling; dispatch["MaxPoolingWithArgmax2D"] = &TFLiteImporter::parsePoolingWithArgmax; dispatch["MaxUnpooling2D"] = &TFLiteImporter::parseUnpooling; @@ -1029,6 +1029,7 @@ void TFLiteImporter::parseActivation(const Operator& op, const std::string& opco } void TFLiteImporter::parseActivation(const Operator& op, const std::string& opcode, LayerParams& activParams, bool isFused) { + float slope = 0.; if (opcode == "NONE") return; else if (opcode == "RELU6") @@ -1041,6 +1042,13 @@ void TFLiteImporter::parseActivation(const Operator& op, const std::string& opco activParams.type = "HardSwish"; else if (opcode == "LOGISTIC") activParams.type = "Sigmoid"; + else if (opcode == "LEAKY_RELU") + { + activParams.type = "ReLU"; + auto options = reinterpret_cast<const LeakyReluOptions*>(op.builtin_options()); + slope = options->alpha(); + activParams.set("negative_slope", slope); + } else CV_Error(Error::StsNotImplemented, "Unsupported activation " + opcode); @@ -1072,6 +1080,8 @@ void TFLiteImporter::parseActivation(const Operator& op, const std::string& opco y = 1.0f / (1.0f + std::exp(-x)); else if (opcode == "HARD_SWISH") y = x * max(0.f, min(1.f, x / 6.f + 0.5f)); + else if (opcode == "LEAKY_RELU") + y = x >= 0.f ? x : slope*x; else CV_Error(Error::StsNotImplemented, "Lookup table for " + opcode);
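For reference, the LEAKY_RELU change above reuses OpenCV's existing ReLU layer and only forwards the slope. A minimal sketch of the LayerParams the importer ends up producing (the slope value passed in is an illustrative assumption; in the real path it comes from LeakyReluOptions::alpha()):

// Sketch: the ReLU-with-negative_slope parameters that LEAKY_RELU maps to.
#include <opencv2/dnn.hpp>

cv::dnn::LayerParams makeLeakyReluParams(float slope /* e.g. 0.1f, illustrative */)
{
    cv::dnn::LayerParams lp;
    lp.type = "ReLU";                 // LEAKY_RELU maps onto the plain ReLU layer
    lp.set("negative_slope", slope);  // y = x >= 0 ? x : slope * x
    return lp;
}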
diff --git a/modules/dnn/test/test_tflite_importer.cpp index 31e30ea724..6f21fe34ef 100644 --- a/modules/dnn/test/test_tflite_importer.cppp +++ b/modules/dnn/test/test_tflite_importer.cpp @@ -271,6 +271,10 @@ TEST_P(Test_TFLite, global_max_pooling_2d) { testLayer("global_max_pooling_2d"); } +TEST_P(Test_TFLite, leakyRelu) { + testLayer("leakyRelu"); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_TFLite, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/features2d/src/fast.avx2.cpp index 72e7d66924..3d408a03df 100644 --- a/modules/features2d/src/fast.avx2.cpp +++ b/modules/features2d/src/fast.avx2.cpp @@ -157,7 +157,7 @@ public: q0 = v_max(q0, v_min(a, v0_)); q1 = v_min(q1, v_max(b, v0_)); } - q0 = v_max(q0, v_setzero_s16() - q1); + q0 = v_max(q0, v_sub(v_setzero_s16(), q1)); curr[j + k] = (uchar)(v_reduce_max(q0) - 1); } } diff --git a/modules/features2d/src/sift.simd.hpp index 2c5cf9f997..76ef3082ea 100644 --- a/modules/features2d/src/sift.simd.hpp +++ b/modules/features2d/src/sift.simd.hpp @@ -150,7 +150,7 @@ void findScaleSpaceExtrema( void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float scl, - int d, int n, Mat& dst, int row + const int d, const int n, Mat& dst, int row ); @@ -708,7 +708,7 @@ void findScaleSpaceExtrema( void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float scl, - int d, int n, Mat& dstMat, int row + const int d, const int n, Mat& dstMat, int row ) { CV_TRACE_FUNCTION(); @@ -725,7 +725,10 @@ void calcSIFTDescriptor( cos_t /= hist_width; sin_t /= hist_width; - int i, j, k, len = (radius*2+1)*(radius*2+1), histlen = (d+2)*(d+2)*(n+2); + int i, j, k; + const int len = (radius*2+1)*(radius*2+1); + const int len_hist = (d+2)*(d+2)*(n+2); + const int len_ddn = d * d * n; int rows = img.rows, cols = img.cols; cv::utils::BufferArea area; @@ -736,8 +739,8 @@ void calcSIFTDescriptor( area.allocate(W, len, CV_SIMD_WIDTH); area.allocate(RBin, len, CV_SIMD_WIDTH); area.allocate(CBin, len, CV_SIMD_WIDTH); - area.allocate(hist, histlen, CV_SIMD_WIDTH); - area.allocate(rawDst, len, CV_SIMD_WIDTH); + area.allocate(hist, len_hist, CV_SIMD_WIDTH); + area.allocate(rawDst, len_ddn, CV_SIMD_WIDTH); area.commit(); Mag = Y; @@ -771,10 +774,10 @@ void calcSIFTDescriptor( } } - len = k; - cv::hal::fastAtan2(Y, X, Ori, len, true); - cv::hal::magnitude32f(X, Y, Mag, len); - cv::hal::exp32f(W, W, len); + const int len_left = k; + cv::hal::fastAtan2(Y, X, Ori, len_left, true); + cv::hal::magnitude32f(X, Y, Mag, len_left); + cv::hal::exp32f(W, W, len_left); k = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) @@ -788,7 +791,7 @@ void calcSIFTDescriptor( const v_int32 __1 = vx_setall_s32(1); const v_int32 __d_plus_2 = vx_setall_s32(d+2); const v_int32 __n_plus_2 = vx_setall_s32(n+2); - for( ; k <= len - vecsize; k += vecsize ) + for( ; k <= len_left - vecsize; k += vecsize ) { v_float32 rbin = vx_load_aligned(RBin + k); v_float32 cbin = vx_load_aligned(CBin + k); @@ -839,7 +842,7 @@ void calcSIFTDescriptor( } } #endif - for( ; k < len; k++ ) + for( ; k < len_left; k++ ) { float rbin = RBin[k], cbin = CBin[k]; float obin = (Ori[k] - ori)*bins_per_rad; @@ -892,13 +895,12 @@ void calcSIFTDescriptor( // and scale the result, so that it can be easily converted // to byte array float nrm2 = 0; - len = d*d*n; k = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) { v_float32 __nrm2 =
vx_setzero_f32(); v_float32 __rawDst; - for( ; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() ) + for( ; k <= len_ddn - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() ) { __rawDst = vx_load_aligned(rawDst + k); __nrm2 = v_fma(__rawDst, __rawDst, __nrm2); @@ -906,10 +908,10 @@ void calcSIFTDescriptor( nrm2 = (float)v_reduce_sum(__nrm2); } #endif - for( ; k < len; k++ ) + for( ; k < len_ddn; k++ ) nrm2 += rawDst[k]*rawDst[k]; - float thr = std::sqrt(nrm2)*SIFT_DESCR_MAG_THR; + const float thr = std::sqrt(nrm2)*SIFT_DESCR_MAG_THR; i = 0, nrm2 = 0; #if 0 //CV_AVX2 @@ -920,7 +922,7 @@ void calcSIFTDescriptor( __m256 __dst; __m256 __nrm2 = _mm256_setzero_ps(); __m256 __thr = _mm256_set1_ps(thr); - for( ; i <= len - 8; i += 8 ) + for( ; i <= len_ddn - 8; i += 8 ) { __dst = _mm256_loadu_ps(&rawDst[i]); __dst = _mm256_min_ps(__dst, __thr); @@ -936,7 +938,7 @@ void calcSIFTDescriptor( nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7]; } #endif - for( ; i < len; i++ ) + for( ; i < len_ddn; i++ ) { float val = std::min(rawDst[i], thr); rawDst[i] = val; @@ -954,7 +956,7 @@ if( dstMat.type() == CV_32F ) v_float32 __min = vx_setzero_f32(); v_float32 __max = vx_setall_f32(255.0f); // max of uchar v_float32 __nrm2 = vx_setall_f32(nrm2); - for( k = 0; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() ) + for( k = 0; k <= len_ddn - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() ) { __dst = vx_load_aligned(rawDst + k); __dst = v_min(v_max(v_cvt_f32(v_round(v_mul(__dst, __nrm2))), __min), __max); @@ -965,7 +967,7 @@ if( dstMat.type() == CV_32F ) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" // iteration XX invokes undefined behavior #endif - for( ; k < len; k++ ) + for( ; k < len_ddn; k++ ) { dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2); } @@ -980,7 +982,7 @@ else // CV_8U v_float32 __dst0, __dst1; v_uint16 __pack01; v_float32 __nrm2 = vx_setall_f32(nrm2); - for( k = 0; k <= len - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 ) + for( k = 0; k <= len_ddn - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 ) { __dst0 = vx_load_aligned(rawDst + k); __dst1 = vx_load_aligned(rawDst + k + VTraits<v_float32>::vlanes()); @@ -994,7 +996,7 @@ else // CV_8U #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" // iteration XX invokes undefined behavior #endif - for( ; k < len; k++ ) + for( ; k < len_ddn; k++ ) { dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2); } @@ -1004,7 +1006,7 @@ else // CV_8U } #else float nrm1 = 0; - for( k = 0; k < len; k++ ) + for( k = 0; k < len_ddn; k++ ) { rawDst[k] *= nrm2; nrm1 += rawDst[k]; @@ -1013,7 +1015,7 @@ else // CV_8U if( dstMat.type() == CV_32F ) { float *dst = dstMat.ptr<float>(row); - for( k = 0; k < len; k++ ) + for( k = 0; k < len_ddn; k++ ) { dst[k] = std::sqrt(rawDst[k] * nrm1); } @@ -1021,7 +1023,7 @@ else // CV_8U else // CV_8U { uint8_t *dst = dstMat.ptr<uint8_t>(row); - for( k = 0; k < len; k++ ) + for( k = 0; k < len_ddn; k++ ) { dst[k] = saturate_cast<uchar>(std::sqrt(rawDst[k] * nrm1)*SIFT_INT_DESCR_FCTR); } }
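The renames above make the three different lengths explicit: len is the number of samples in the orientation window (it depends on the keypoint radius), len_hist is the interpolation-histogram size, and len_ddn is the final descriptor size. rawDst is now allocated with len_ddn instead of len, so the descriptor loops can no longer disagree with the buffer size. For the default SIFT layout the values work out as follows (an illustrative check only):

// Illustrative only: sizes for the default SIFT descriptor layout d = 4, n = 8.
#include <cstdio>

int main()
{
    const int d = 4, n = 8;                            // SIFT_DESCR_WIDTH, SIFT_DESCR_HIST_BINS
    const int len_hist = (d + 2) * (d + 2) * (n + 2);  // 360 floats for the interpolation histogram
    const int len_ddn  = d * d * n;                    // 128 floats for the final descriptor
    std::printf("len_hist=%d len_ddn=%d\n", len_hist, len_ddn);
    return 0;
}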
diff --git a/modules/features2d/test/test_sift.cpp index 731b31ac0f..d98f1c6b8a 100644 --- a/modules/features2d/test/test_sift.cpp +++ b/modules/features2d/test/test_sift.cpp @@ -30,5 +30,17 @@ TEST(Features2d_SIFT, descriptor_type) ASSERT_EQ(countNonZero(diff), 0) << "descriptors are not identical"; } +TEST(Features2d_SIFT, regression_26139) +{ + auto extractor = cv::SIFT::create(); + cv::Mat1b image{cv::Size{300, 300}, 0}; + std::vector<cv::KeyPoint> kps { + cv::KeyPoint(154.076813f, 136.160904f, 111.078636f, 216.195618f, 0.00000899323549f, 7) + }; + cv::Mat descriptors; + extractor->compute(image, kps, descriptors); // we expect no memory corruption + ASSERT_EQ(descriptors.size(), Size(128, 1)); +} + }} // namespace diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp index 77f091d55a..9063aafe2c 100644 --- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp +++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp @@ -222,9 +222,9 @@ enum ImwriteHDRCompressionFlags { @anchor imread -The function imread loads an image from the specified file and returns it. If the image cannot be -read (because of missing file, improper permissions, unsupported or invalid format), the function -returns an empty matrix ( Mat::data==NULL ). +The `imread` function loads an image from the specified file and returns an OpenCV matrix. If the image cannot be +read (because of a missing file, improper permissions, or unsupported/invalid format), the function +returns an empty matrix. Currently, the following file formats are supported: @@ -234,7 +234,7 @@ Currently, the following file formats are supported: - Portable Network Graphics - \*.png (see the *Note* section) - WebP - \*.webp (see the *Note* section) - AVIF - \*.avif (see the *Note* section) -- Portable image format - \*.pbm, \*.pgm, \*.ppm \*.pxm, \*.pnm (always supported) +- Portable image format - \*.pbm, \*.pgm, \*.ppm, \*.pxm, \*.pnm (always supported) - PFM files - \*.pfm (see the *Note* section) - Sun rasters - \*.sr, \*.ras (always supported) - TIFF files - \*.tiff, \*.tif (see the *Note* section) @@ -243,32 +243,31 @@ Currently, the following file formats are supported: - Raster and Vector geospatial data supported by GDAL (see the *Note* section) @note -- The function determines the type of an image by the content, not by the file extension. +- The function determines the type of an image by its content, not by the file extension. - In the case of color images, the decoded images will have the channels stored in **B G R** order. - When using IMREAD_GRAYSCALE, the codec's internal grayscale conversion will be used, if available. - Results may differ to the output of cvtColor() -- On Microsoft Windows\* OS and MacOSX\*, the codecs shipped with an OpenCV image (libjpeg, - libpng, libtiff, and libjasper) are used by default. So, OpenCV can always read JPEGs, PNGs, - and TIFFs. On MacOSX, there is also an option to use native MacOSX image readers. But beware - that currently these native image loaders give images with different pixel values because of - the color management embedded into MacOSX. -- On Linux\*, BSD flavors and other Unix-like open-source operating systems, OpenCV looks for - codecs supplied with an OS image. Install the relevant packages (do not forget the development - files, for example, "libjpeg-dev", in Debian\* and Ubuntu\*) to get the codec support or turn + Results may differ from the output of cvtColor(). +- On Microsoft Windows\* and Mac OS\*, the codecs shipped with OpenCV (libjpeg, libpng, libtiff, + and libjasper) are used by default. So, OpenCV can always read JPEGs, PNGs, and TIFFs. On Mac OS, + there is also an option to use native Mac OS image readers. However, beware that currently these + native image loaders give images with different pixel values because of the color management embedded + into Mac OS. +- On Linux\*, BSD flavors, and other Unix-like open-source operating systems, OpenCV looks for + codecs supplied with the OS.
Ensure the relevant packages are installed (including development + files, such as "libjpeg-dev" in Debian\* and Ubuntu\*) to get codec support, or turn on the OPENCV_BUILD_3RDPARTY_LIBS flag in CMake. -- In the case you set *WITH_GDAL* flag to true in CMake and @ref IMREAD_LOAD_GDAL to load the image, - then the [GDAL](http://www.gdal.org) driver will be used in order to decode the image, supporting - the following formats: [Raster](http://www.gdal.org/formats_list.html), - [Vector](http://www.gdal.org/ogr_formats.html). -- If EXIF information is embedded in the image file, the EXIF orientation will be taken into account - and thus the image will be rotated accordingly except if the flags @ref IMREAD_IGNORE_ORIENTATION +- If the *WITH_GDAL* flag is set to true in CMake and @ref IMREAD_LOAD_GDAL is used to load the image, + the [GDAL](http://www.gdal.org) driver will be used to decode the image, supporting + [Raster](http://www.gdal.org/formats_list.html) and [Vector](http://www.gdal.org/ogr_formats.html) formats. +- If EXIF information is embedded in the image file, the EXIF orientation will be taken into account, + and thus the image will be rotated accordingly unless the flags @ref IMREAD_IGNORE_ORIENTATION or @ref IMREAD_UNCHANGED are passed. -- Use the IMREAD_UNCHANGED flag to keep the floating point values from PFM image. -- By default number of pixels must be less than 2^30. Limit can be set using system - variable OPENCV_IO_MAX_IMAGE_PIXELS +- Use the IMREAD_UNCHANGED flag to preserve the floating-point values from PFM images. +- By default, the number of pixels must be less than 2^30. This limit can be changed by setting + the environment variable `OPENCV_IO_MAX_IMAGE_PIXELS`. See @ref tutorial_env_reference. -@param filename Name of file to be loaded. -@param flags Flag that can take values of cv::ImreadModes +@param filename Name of the file to be loaded. +@param flags Flag that can take values of `cv::ImreadModes`. */ CV_EXPORTS_W Mat imread( const String& filename, int flags = IMREAD_COLOR_BGR );
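The documented contract, an empty matrix on failure, is typically checked like this (the file name is an illustrative placeholder):

// Minimal usage sketch for the documented imread behavior.
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    cv::Mat img = cv::imread("input.png", cv::IMREAD_COLOR_BGR);  // "input.png" is a placeholder
    if (img.empty())  // empty matrix signals a missing, unreadable, or unsupported file
    {
        std::cerr << "failed to load image" << std::endl;
        return 1;
    }
    std::cout << img.cols << "x" << img.rows << std::endl;
    return 0;
}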
diff --git a/modules/imgcodecs/src/bitstrm.cpp index a8f91aa4dd..bb92d8a73b 100644 --- a/modules/imgcodecs/src/bitstrm.cpp +++ b/modules/imgcodecs/src/bitstrm.cpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html #include "precomp.hpp" #include "bitstrm.hpp" @@ -49,11 +11,6 @@ namespace cv const int BS_DEF_BLOCK_SIZE = 1<<15; -bool bsIsBigEndian( void ) -{ - return (((const int*)"\0\x1\x2\x3\x4\x5\x6\x7")[0] & 255) != 0; -} - ///////////////////////// RBaseStream //////////////////////////// bool RBaseStream::isOpened() diff --git a/modules/imgcodecs/src/bitstrm.hpp index dd78d5d3d6..391ade503d 100644 --- a/modules/imgcodecs/src/bitstrm.hpp +++ b/modules/imgcodecs/src/bitstrm.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html #ifndef _BITSTRM_H_ #define _BITSTRM_H_ @@ -183,13 +145,6 @@ public: bool putDWord( int val ); }; -inline unsigned BSWAP(unsigned v) -{ - return (v<<24)|((v&0xff00)<<8)|((v>>8)&0xff00)|((unsigned)v>>24); -} - -bool bsIsBigEndian( void ); - } #endif/*_BITSTRM_H_*/ diff --git a/modules/imgcodecs/src/grfmt_base.hpp b/modules/imgcodecs/src/grfmt_base.hpp index 9ae23b24df..f6b5ba1b27 100644 --- a/modules/imgcodecs/src/grfmt_base.hpp +++ b/modules/imgcodecs/src/grfmt_base.hpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html #ifndef _GRFMT_BASE_H_ #define _GRFMT_BASE_H_ @@ -55,75 +17,228 @@ class BaseImageEncoder; typedef Ptr<BaseImageEncoder> ImageEncoder; typedef Ptr<BaseImageDecoder> ImageDecoder; -///////////////////////////////// base class for decoders //////////////////////// -class BaseImageDecoder -{ +/** + * @brief Base class for image decoders. + * + * The BaseImageDecoder class provides an abstract interface for decoding various image formats. + * It defines common functionality like setting the image source, reading image headers, + * and handling EXIF metadata. Derived classes must implement methods for reading image headers + * and image data to handle format-specific decoding logic. + */ + +class BaseImageDecoder { public: + /** + * @brief Constructor for BaseImageDecoder. + * Initializes the object and sets default values for member variables. + */ BaseImageDecoder(); + + /** + * @brief Virtual destructor for BaseImageDecoder. + * Ensures proper cleanup of derived classes when deleted via a pointer to BaseImageDecoder. + */ virtual ~BaseImageDecoder() {} + /** + * @brief Get the width of the image. + * @return The width of the image (in pixels). + */ int width() const { return m_width; } + + /** + * @brief Get the height of the image. + * @return The height of the image (in pixels). + */ int height() const { return m_height; } + + /** + * @brief Get the number of frames in the image or animation. + * @return The number of frames in the image. + */ size_t getFrameCount() const { return m_frame_count; } + + /** + * @brief Get the type of the image (e.g., color format, depth). + * @return The type of the image. + */ virtual int type() const { return m_type; } + /** + * @brief Fetch a specific EXIF tag from the image's metadata. + * @param tag The EXIF tag to retrieve. + * @return The EXIF entry corresponding to the tag. + */ ExifEntry_t getExifTag(const ExifTagName tag) const; - virtual bool setSource( const String& filename ); - virtual bool setSource( const Mat& buf ); - virtual int setScale( const int& scale_denom ); - virtual bool readHeader() = 0; - virtual bool readData( Mat& img ) = 0; + /** + * @brief Set the image source from a file. + * @param filename The name of the file to load the image from. + * @return true if the source was successfully set, false otherwise. + */ + virtual bool setSource(const String& filename); + + /** + * @brief Set the image source from a memory buffer. + * @param buf The buffer containing the image data. + * @return true if the source was successfully set, false otherwise. + */ + virtual bool setSource(const Mat& buf); + + /** + * @brief Set the scale factor for the image. + * @param scale_denom The denominator of the scale factor (image is scaled down by 1/scale_denom). + * @return The scale factor that was set. + */ + virtual int setScale(const int& scale_denom); + + /** + * @brief Read the image header to extract basic properties (width, height, type). + * This is a pure virtual function that must be implemented by derived classes. + * @return true if the header was successfully read, false otherwise. + */ + virtual bool readHeader() = 0; + + /** + * @brief Read the image data into a Mat object. + * This is a pure virtual function that must be implemented by derived classes. + * @param img The Mat object where the image data will be stored. + * @return true if the data was successfully read, false otherwise.
+ */ + virtual bool readData(Mat& img) = 0; + + /** + * @brief Set whether to decode the image in RGB order instead of the default BGR. + * @param useRGB If true, the image will be decoded in RGB order. + */ virtual void setRGB(bool useRGB); - /// Called after readData to advance to the next page, if any. + /** + * @brief Advance to the next page or frame of the image, if applicable. + * The default implementation does nothing and returns false. + * @return true if there is another page/frame, false otherwise. + */ virtual bool nextPage() { return false; } + /** + * @brief Get the length of the format signature used to identify the image format. + * @return The length of the signature. + */ virtual size_t signatureLength() const; - virtual bool checkSignature( const String& signature ) const; + + /** + * @brief Check if the provided signature matches the expected format signature. + * @param signature The signature to check. + * @return true if the signature matches, false otherwise. + */ + virtual bool checkSignature(const String& signature) const; + + /** + * @brief Create and return a new instance of the derived image decoder. + * @return A new ImageDecoder object. + */ virtual ImageDecoder newDecoder() const; protected: - int m_width; // width of the image ( filled by readHeader ) - int m_height; // height of the image ( filled by readHeader ) - int m_type; - int m_scale_denom; - String m_filename; - String m_signature; - Mat m_buf; - bool m_buf_supported; - bool m_use_rgb; // flag of decode image as RGB order instead of BGR. - ExifReader m_exif; - size_t m_frame_count; + int m_width; ///< Width of the image (set by readHeader). + int m_height; ///< Height of the image (set by readHeader). + int m_type; ///< Image type (e.g., color depth, channel order). + int m_scale_denom; ///< Scale factor denominator for resizing the image. + String m_filename; ///< Name of the file that is being decoded. + String m_signature; ///< Signature for identifying the image format. + Mat m_buf; ///< Buffer holding the image data when loaded from memory. + bool m_buf_supported; ///< Flag indicating whether buffer-based loading is supported. + bool m_use_rgb; ///< Flag indicating whether to decode the image in RGB order. + ExifReader m_exif; ///< Object for reading EXIF metadata from the image. + size_t m_frame_count; ///< Number of frames in the image (for animations and multi-page images). }; -///////////////////////////// base class for encoders //////////////////////////// -class BaseImageEncoder -{ +/** + * @brief Base class for image encoders. + * + * The BaseImageEncoder class provides an abstract interface for encoding images in various formats. + * It defines common functionality like setting the destination (file or memory buffer), checking if + * the format supports a specific image depth, and writing image data. Derived classes must implement + * methods like writing the image data to handle format-specific encoding logic. + */ +class BaseImageEncoder { public: + /** + * @brief Constructor for BaseImageEncoder. + * Initializes the object and sets default values for member variables. + */ BaseImageEncoder(); - virtual ~BaseImageEncoder() {} - virtual bool isFormatSupported( int depth ) const; - virtual bool setDestination( const String& filename ); - virtual bool setDestination( std::vector<uchar>& buf ); - virtual bool write( const Mat& img, const std::vector<int>& params ) = 0; + /** + * @brief Virtual destructor for BaseImageEncoder.
+ * Ensures proper cleanup of derived classes when deleted via a pointer to BaseImageEncoder. + */ + virtual ~BaseImageEncoder() {} + + /** + * @brief Checks if the image format supports a specific image depth. + * @param depth The depth (bit depth) of the image. + * @return true if the format supports the specified depth, false otherwise. + */ + virtual bool isFormatSupported(int depth) const; + + /** + * @brief Set the destination for encoding as a file. + * @param filename The name of the file to which the image will be written. + * @return true if the destination was successfully set, false otherwise. + */ + virtual bool setDestination(const String& filename); + + /** + * @brief Set the destination for encoding as a memory buffer. + * @param buf A reference to the buffer where the encoded image data will be stored. + * @return true if the destination was successfully set, false otherwise. + */ + virtual bool setDestination(std::vector<uchar>& buf); + + /** + * @brief Encode and write the image data. + * This is a pure virtual function that must be implemented by derived classes. + * @param img The Mat object containing the image data to be encoded. + * @param params A vector of parameters controlling the encoding process (e.g., compression level). + * @return true if the image was successfully written, false otherwise. + */ + virtual bool write(const Mat& img, const std::vector<int>& params) = 0; + + /** + * @brief Encode and write multiple images (e.g., for animated formats). + * By default, this method returns false, indicating that the format does not support multi-image encoding. + * @param img_vec A vector of Mat objects containing the images to be encoded. + * @param params A vector of parameters controlling the encoding process. + * @return true if multiple images were successfully written, false otherwise. + */ virtual bool writemulti(const std::vector<Mat>& img_vec, const std::vector<int>& params); + /** + * @brief Get a description of the image encoder (e.g., the format it supports). + * @return A string describing the encoder. + */ virtual String getDescription() const; + + /** + * @brief Create and return a new instance of the derived image encoder. + * @return A new ImageEncoder object. + */ virtual ImageEncoder newEncoder() const; + /** + * @brief Throw an exception based on the last error encountered during encoding. + * This method can be used to propagate error conditions back to the caller. + */ virtual void throwOnEror() const; protected: - String m_description; - - String m_filename; - std::vector<uchar>* m_buf; - bool m_buf_supported; - - String m_last_error; + String m_description; ///< Description of the encoder (e.g., format name, capabilities). + String m_filename; ///< Destination file name for encoded data. + std::vector<uchar>* m_buf; ///< Pointer to the buffer for encoded data if using memory-based destination. + bool m_buf_supported; ///< Flag indicating whether buffer-based encoding is supported. + String m_last_error; ///< Stores the last error message encountered during encoding. }; }
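A minimal sketch of a decoder written against this interface (the format, its fixed 16x16 size, and the class name are purely illustrative; a real decoder parses its header from m_strm or m_buf and must also be registered with the codec list):

// Illustrative skeleton of a BaseImageDecoder subclass for a hypothetical format.
class MyFormatDecoder CV_FINAL : public BaseImageDecoder
{
public:
    MyFormatDecoder()
    {
        m_signature = "MYFT";     // hypothetical 4-byte magic
        m_buf_supported = true;   // allow decoding from memory buffers
    }

    bool readHeader() CV_OVERRIDE
    {
        m_width = 16; m_height = 16; m_type = CV_8UC1;  // assumed fixed-size header
        return true;
    }

    bool readData(Mat& img) CV_OVERRIDE
    {
        img.create(m_height, m_width, m_type);
        img.setTo(Scalar::all(0));  // stand-in for real pixel decoding
        return true;
    }

    ImageDecoder newDecoder() const CV_OVERRIDE { return makePtr<MyFormatDecoder>(); }
};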
diff --git a/modules/imgcodecs/src/grfmt_pfm.cpp index 61cab06714..baa0108081 100644 --- a/modules/imgcodecs/src/grfmt_pfm.cpp +++ b/modules/imgcodecs/src/grfmt_pfm.cpp @@ -81,20 +81,17 @@ PFMDecoder::~PFMDecoder() PFMDecoder::PFMDecoder() : m_scale_factor(0), m_swap_byte_order(false) { - m_strm.close(); + m_buf_supported = true; } bool PFMDecoder::readHeader() { - if (m_buf.empty()) { - if (!m_strm.open(m_filename)) { - return false; - } - } else { - if (!m_strm.open(m_buf)) { - return false; - } - } + if (!m_buf.empty()) + m_strm.open(m_buf); + else + m_strm.open(m_filename); + + if( !m_strm.isOpened()) return false; if (m_strm.getByte() != 'P') { CV_Error(Error::StsError, "Unexpected file type (expected P)"); @@ -177,6 +174,7 @@ void PFMDecoder::close() PFMEncoder::PFMEncoder() { m_description = "Portable image format - float (*.pfm)"; + m_buf_supported = true; } PFMEncoder::~PFMEncoder() diff --git a/modules/imgcodecs/src/grfmt_sunras.cpp index 852e735477..f2878d1760 100644 --- a/modules/imgcodecs/src/grfmt_sunras.cpp +++ b/modules/imgcodecs/src/grfmt_sunras.cpp @@ -1,44 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html #include "precomp.hpp" #include "grfmt_sunras.hpp" @@ -60,6 +22,7 @@ SunRasterDecoder::SunRasterDecoder() m_encoding = RAS_STANDARD; m_maptype = RMT_NONE; m_maplength = 0; + m_buf_supported = true; } @@ -82,7 +45,12 @@ bool SunRasterDecoder::readHeader() { bool result = false; - if( !m_strm.open( m_filename )) return false; + if (!m_buf.empty()) + m_strm.open(m_buf); + else + m_strm.open(m_filename); + + if( !m_strm.isOpened()) return false; try { @@ -389,6 +357,7 @@ bad_decoding_end: SunRasterEncoder::SunRasterEncoder() { m_description = "Sun raster files (*.sr;*.ras)"; + m_buf_supported = true; } @@ -408,7 +377,18 @@ bool SunRasterEncoder::write( const Mat& img, const std::vector<int>& ) int fileStep = (width*channels + 1) & -2; WMByteStream strm; - if( strm.open(m_filename) ) + if (m_buf) { + if (!strm.open(*m_buf)) { + return false; + } + else { + m_buf->reserve(height * fileStep + 32); + } + } + else + strm.open(m_filename); + + if( strm.isOpened() ) { CHECK_WRITE(strm.putBytes( fmtSignSunRas, (int)strlen(fmtSignSunRas) )); CHECK_WRITE(strm.putDWord( width )); diff --git a/modules/imgcodecs/test/test_avif.cpp index 68678599b2..0d8a718756 100644 --- a/modules/imgcodecs/test/test_avif.cpp +++ b/modules/imgcodecs/test/test_avif.cpp @@ -336,6 +336,7 @@ TEST_P(Imgcodecs_Avif_Animation_WriteDecodeSuite, encode_decode) { file.seekg(0, std::ios::beg); std::vector<unsigned char> buf(size); EXPECT_TRUE(file.read(reinterpret_cast<char*>(buf.data()), size)); + file.close(); EXPECT_EQ(0, remove(output.c_str())); std::vector<cv::Mat> anim; ASSERT_TRUE(cv::imdecodemulti(buf, imread_mode_, anim)); diff --git a/modules/imgproc/src/resize.cpp index 1b8b85a04b..71813d320d 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -2617,8 +2617,8 @@ public: v_uint32 r0, r1, r2, r3; v_expand(vx_load(S0), r0, r1); v_expand(vx_load(S1), r2, r3); - r0 += r2; r1 += r3; - v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0)); + r0 = v_add(r0, r2); r1 = v_add(r1, r3); + v_rshr_pack_store<2>(D, v_add(r0, v_rotate_left<1>(r1, r0))); } #else v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3))); diff --git a/modules/imgproc/src/sumpixels.simd.hpp index 208ffc1231..b4aafeaea2 100644 --- a/modules/imgproc/src/sumpixels.simd.hpp +++ b/modules/imgproc/src/sumpixels.simd.hpp @@ -130,9 +130,9 @@ struct Integral_SIMD el8 = v_add(el8, v_rotate_left<1>(el8)); el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_expand(el8, el4l, el4h); @@ -188,11 +188,11 @@ struct Integral_SIMD el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); #if
@@ -350,9 +350,9 @@ struct Integral_SIMD prev.val = _mm256_permute2x128_si256(el4h.val, el4h.val, 0x31); #else #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_expand(el8, el4l, el4h); @@ -364,7 +364,7 @@ struct Integral_SIMD prev = v_combine_high(el4h, el4h); #else v_int32 t = v_rotate_right<12>(el4h); - t |= v_rotate_left<4>(t); + t = v_or(t, v_rotate_left<4>(t)); prev = v_combine_low(t, t); #endif #endif @@ -442,9 +442,9 @@ struct Integral_SIMD el8 = v_add(el8, v_rotate_left<1>(el8)); el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; @@ -501,11 +501,11 @@ struct Integral_SIMD el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; @@ -590,13 +590,13 @@ struct Integral_SIMD el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); - el8_3 += v_rotate_left<4>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); - el8_3 += v_rotate_left<8>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3; @@ -663,9 +663,9 @@ struct Integral_SIMD prev.val = _mm256_permute2f128_ps(el4h.val, el4h.val, 0x31); #else #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; @@ -678,7 +678,7 @@ struct Integral_SIMD prev = v_combine_high(el4h, el4h); #else v_float32 t = v_rotate_right<12>(el4h); - t |= v_rotate_left<4>(t); + t = v_or(t, v_rotate_left<4>(t)); prev = v_combine_low(t, t); #endif #endif @@ -770,9 +770,9 @@ struct Integral_SIMD el8 = v_add(el8, v_rotate_left<1>(el8)); el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; @@ -843,11 +843,11 @@ struct Integral_SIMD el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); #if 
CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; @@ -958,13 +958,13 @@ struct Integral_SIMD el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); - el8_3 += v_rotate_left<4>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); - el8_3 += v_rotate_left<8>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3)); #endif #endif v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3; @@ -1058,9 +1058,9 @@ struct Integral_SIMD prev_1.val = prev_2.val = el4hh.val; #else #if CV_SIMD_WIDTH >= 32 - el8 += v_rotate_left<4>(el8); + el8 = v_add(el8, v_rotate_left<4>(el8)); #if CV_SIMD_WIDTH == 64 - el8 += v_rotate_left<8>(el8); + el8 = v_add(el8, v_rotate_left<8>(el8)); #endif #endif v_int32 el4li, el4hi; diff --git a/modules/ts/include/opencv2/ts/ocl_perf.hpp b/modules/ts/include/opencv2/ts/ocl_perf.hpp index aa87243a4f..89b224147a 100644 --- a/modules/ts/include/opencv2/ts/ocl_perf.hpp +++ b/modules/ts/include/opencv2/ts/ocl_perf.hpp @@ -64,7 +64,7 @@ using namespace perf; public: \ OCL##_##fixture##_##name() { } \ protected: \ - virtual void PerfTestBody(); \ + virtual void PerfTestBody() CV_OVERRIDE; \ }; \ TEST_F(OCL##_##fixture##_##name, name) { CV_TRACE_REGION("PERF_TEST: " #fixture "_" #name); declare.strategy(OCL_PERF_STRATEGY); RunPerfTestBody(); } \ void OCL##_##fixture##_##name::PerfTestBody() @@ -76,7 +76,7 @@ using namespace perf; public: \ OCL##_##fixture##_##name() { } \ protected: \ - virtual void PerfTestBody(); \ + virtual void PerfTestBody() CV_OVERRIDE; \ }; \ TEST_P(OCL##_##fixture##_##name, name) { CV_TRACE_REGION("PERF_TEST_P: " #fixture "_" #name); declare.strategy(OCL_PERF_STRATEGY); RunPerfTestBody(); } \ INSTANTIATE_TEST_CASE_P(/*none*/, OCL##_##fixture##_##name, params); \ diff --git a/modules/ts/include/opencv2/ts/ts_ext.hpp b/modules/ts/include/opencv2/ts/ts_ext.hpp index eebf4c594b..66e12d77d6 100644 --- a/modules/ts/include/opencv2/ts/ts_ext.hpp +++ b/modules/ts/include/opencv2/ts/ts_ext.hpp @@ -85,7 +85,7 @@ struct SkipThisTest : public ::testing::Test { };\ class test_case_name##test_name##_factory : public ::testing::internal::TestFactoryBase { \ public:\ - virtual ::testing::Test* CreateTest() { \ + virtual ::testing::Test* CreateTest() CV_OVERRIDE { \ try { \ return new GTEST_TEST_CLASS_NAME_(test_case_name, test_name); \ } catch (const cvtest::details::SkipTestExceptionBase& e) { \ @@ -150,7 +150,7 @@ struct SkipThisTest : public ::testing::Test { };\ class test_fixture##test_name##_factory : public ::testing::internal::TestFactoryBase { \ public:\ - virtual ::testing::Test* CreateTest() { \ + virtual ::testing::Test* CreateTest() CV_OVERRIDE { \ try { \ return new GTEST_TEST_CLASS_NAME_(test_fixture, test_name); \ } catch (const cvtest::details::SkipTestExceptionBase& e) { \ diff --git a/modules/ts/include/opencv2/ts/ts_gtest.h b/modules/ts/include/opencv2/ts/ts_gtest.h index 49eb3a5ec7..5cf6fc4537 100644 --- 
a/modules/ts/include/opencv2/ts/ts_gtest.h +++ b/modules/ts/include/opencv2/ts/ts_gtest.h @@ -8458,7 +8458,7 @@ class TestFactoryBase { template <class TestClass> class TestFactoryImpl : public TestFactoryBase { public: - virtual Test* CreateTest() { return new TestClass; } + virtual Test* CreateTest() override { return new TestClass; } }; #if GTEST_OS_WINDOWS @@ -11927,7 +11927,7 @@ class ParameterizedTestFactory : public internal::TestFactoryBase { typedef typename TestClass::ParamType ParamType; explicit ParameterizedTestFactory(ParamType parameter) : parameter_(parameter) {} - virtual Test* CreateTest() { + virtual Test* CreateTest() override { TestClass::SetParam(&parameter_); return new TestClass(); } @@ -11968,7 +11968,7 @@ class TestMetaFactory TestMetaFactory() {} - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) override { return new ParameterizedTestFactory<TestCase>(parameter); } @@ -12030,9 +12030,9 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { : test_case_name_(name), code_location_(code_location) {} // Test case base name for display purposes. - virtual const std::string& GetTestCaseName() const { return test_case_name_; } + virtual const std::string& GetTestCaseName() const override { return test_case_name_; } // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); } + virtual TypeId GetTestCaseTypeId() const override { return GetTypeId<TestCase>(); } // TEST_P macro uses AddTestPattern() to record information // about a single test in a LocalTestInfo structure. // test_case_name is the base name of the test case (without invocation @@ -12061,7 +12061,7 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { // This method should not be called more then once on any single // instance of a ParameterizedTestCaseInfoBase derived class. // UnitTest has a guard to prevent from calling this method more then once. - virtual void RegisterTests() { + virtual void RegisterTests() override { for (typename TestInfoContainer::iterator test_it = tests_.begin(); test_it != tests_.end(); ++test_it) { linked_ptr<TestInfo> test_info = *test_it; diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp index 4b59978f3c..c2ce7926bb 100644 --- a/modules/ts/include/opencv2/ts/ts_perf.hpp +++ b/modules/ts/include/opencv2/ts/ts_perf.hpp @@ -586,7 +586,7 @@ void PrintTo(const Size& sz, ::std::ostream* os); public:\ fixture() {}\ protected:\ - virtual void PerfTestBody();\ + virtual void PerfTestBody() CV_OVERRIDE;\ };\ TEST_F(fixture, testname){ CV__PERF_TEST_BODY_IMPL(#fixture "_" #testname); }\ }\ @@ -627,7 +627,7 @@ void PrintTo(const Size& sz, ::std::ostream* os); public:\ fixture##_##name() {}\ protected:\ - virtual void PerfTestBody();\ + virtual void PerfTestBody() CV_OVERRIDE;\ };\ CV__TEST_P(fixture##_##name, name, PerfTestBodyDummy,, CV__PERF_TEST_BODY_IMPL){} \ INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\ diff --git a/modules/video/src/hal_replacement.hpp new file mode 100644 index 0000000000..8d10ab39d1 --- /dev/null +++ b/modules/video/src/hal_replacement.hpp @@ -0,0 +1,101 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html.
+ +#ifndef OPENCV_VIDEO_HAL_REPLACEMENT_HPP +#define OPENCV_VIDEO_HAL_REPLACEMENT_HPP + +#include "opencv2/core/hal/interface.h" + +#if defined(__clang__) // clang or MSVC clang +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-parameter" +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +//! @addtogroup video_hal_interface +//! @note Define your functions to override default implementations: +//! @code +//! #undef cv_hal_LKOpticalFlowLevel +//! #define cv_hal_LKOpticalFlowLevel my_hal_LKOpticalFlowLevel +//! @endcode +//! @{ + +/** +@brief Lucas-Kanade optical flow for a single pyramid layer. See calcOpticalFlowPyrLK +@param prev_data previous frame image data +@param prev_data_step previous frame image data step +@param prev_deriv_data previous frame Scharr derivatives +@param prev_deriv_step previous frame Scharr derivatives step +@param next_data next frame image data +@param next_step next frame image step +@param width input images width +@param height input images height +@param cn source image channels +@param prev_points 2d points coordinates (x,y) on the previous frame +@param next_points points coordinates (x,y) on the next frame +@param point_count number of input points +@param status optical flow status for each point. Optional output, expected if not nullptr is provided +@param err optical flow estimation error for each point. Optional output, expected if not nullptr is provided +@param win_width optical flow window width +@param win_height optical flow window height +@param termination_count maximum algorithm iterations. 0 means unlimited +@param termination_epsilon maximal allowed algorithm error +@param get_min_eigen_vals return minimal eigen values as point errors in err buffer +@param min_eigen_vals_threshold eigen values threshold +**/ +inline int hal_ni_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_step, + const short* prev_deriv_data, size_t prev_deriv_step, + const uchar* next_data, size_t next_step, + int width, int height, int cn, + const float *prev_points, float *next_points, size_t point_count, + uchar *status, float *err, + const int win_width, const int win_height, + int termination_count, double termination_epsilon, + bool get_min_eigen_vals, + float min_eigen_vals_threshold) +{ + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +//! @cond IGNORED +#define cv_hal_LKOpticalFlowLevel hal_ni_LKOpticalFlowLevel +//! @endcond + +//! @} + +#if defined(__clang__) +#pragma clang diagnostic pop +#elif defined(_MSC_VER) +#pragma warning(pop) +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#include "custom_hal.hpp" + +//! @cond IGNORED +#define CALL_HAL_RET(name, fun, retval, ...) \ + int res = __CV_EXPAND(fun(__VA_ARGS__, &retval)); \ + if (res == CV_HAL_ERROR_OK) \ + return retval; \ + else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ + CV_Error_(cv::Error::StsInternal, \ + ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); + + +#define CALL_HAL(name, fun, ...) \ + int res = __CV_EXPAND(fun(__VA_ARGS__)); \ + if (res == CV_HAL_ERROR_OK) \ + return; \ + else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ + CV_Error_(cv::Error::StsInternal, \ + ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); +//! @endcond + +#endif
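A custom HAL would plug into this hook roughly as follows (my_lk_level and its internals are illustrative; only the signature and the macro override pattern are fixed by the header above):

// Illustrative override, as a custom HAL's custom_hal.hpp might define it.
inline int my_lk_level(const uchar *prev_data, size_t prev_data_step,
                       const short *prev_deriv_data, size_t prev_deriv_step,
                       const uchar *next_data, size_t next_step,
                       int width, int height, int cn,
                       const float *prev_points, float *next_points, size_t point_count,
                       uchar *status, float *err,
                       const int win_width, const int win_height,
                       int termination_count, double termination_epsilon,
                       bool get_min_eigen_vals, float min_eigen_vals_threshold)
{
    if (cn != 1)  // hypothetical restriction: accelerate single-channel input only
        return CV_HAL_ERROR_NOT_IMPLEMENTED;  // OpenCV falls back to its own code
    // ... vendor-optimized LK iterations over point_count points go here ...
    return CV_HAL_ERROR_OK;
}

#undef cv_hal_LKOpticalFlowLevel
#define cv_hal_LKOpticalFlowLevel my_lk_level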
diff --git a/modules/video/src/lkpyramid.cpp index 662ac13235..03de93ee08 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -49,6 +49,8 @@ #include "opencv2/3d.hpp" #endif +#include "hal_replacement.hpp" + #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) namespace @@ -182,11 +184,17 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const { CV_INSTRUMENT_REGION(); + const int W_BITS = 14, W_BITS1 = 14; + const float FLT_SCALE = 1.f/(1 << 20); + Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f); const Mat& I = *prevImg; const Mat& J = *nextImg; const Mat& derivI = *prevDeriv; + cv::AutoBuffer<Point2f> prevPtsScaledData(range.end - range.start); + Point2f* prevPtsScaled = prevPtsScaledData.data(); + int j, cn = I.channels(), cn2 = cn*2; cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2)); int derivDepth = DataType<deriv_type>::depth; @@ -208,7 +216,23 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const else nextPt = nextPts[ptidx]*2.f; nextPts[ptidx] = nextPt; + prevPtsScaled[ptidx-range.start] = prevPt; + } + CALL_HAL(LKOpticalFlowLevel, cv_hal_LKOpticalFlowLevel, + I.data, I.step, (const short*)derivI.data, derivI.step, J.data, J.step, + I.cols, I.rows, I.channels(), + (float*)prevPtsScaled, (float*)(nextPts+range.start), range.end-range.start, + (level == 0) ? status+range.start: nullptr, + err != nullptr ? err+range.start: nullptr, + winSize.width, winSize.height, criteria.maxCount, criteria.epsilon, + (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) != 0, + (float)minEigThreshold + ); + + for( int ptidx = range.start; ptidx < range.end; ptidx++ ) + { + Point2f prevPt = prevPtsScaled[ptidx-range.start]; Point2i iprevPt, inextPt; prevPt -= halfWin; iprevPt.x = cvFloor(prevPt.x); @@ -219,8 +243,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const { if( level == 0 ) { - if( status ) - status[ptidx] = false; + status[ptidx] = false; if( err ) err[ptidx] = 0; } @@ -229,8 +252,6 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const float a = prevPt.x - iprevPt.x; float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - const float FLT_SCALE = 1.f/(1 << 20); int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); @@ -477,14 +498,14 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const if( minEig < minEigThreshold || D < FLT_EPSILON ) { - if( level == 0 && status ) + if(level == 0) status[ptidx] = false; continue; } D = 1.f/D; - nextPt -= halfWin; + Point2f nextPt = nextPts[ptidx] - halfWin; Point2f prevDelta; for( j = 0; j < criteria.maxCount; j++ ) @@ -495,7 +516,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const if( inextPt.x < -winSize.width || inextPt.x >= J.cols || inextPt.y < -winSize.height || inextPt.y >= J.rows ) { - if( level == 0 && status ) + if( level == 0 ) status[ptidx] = false; break; } @@ -678,7 +699,6 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const prevDelta = delta; } - CV_Assert(status != NULL); if( status[ptidx] && err && level == 0 && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) == 0 ) { Point2f nextPoint = nextPts[ptidx] - halfWin; @@ -690,8 +710,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const if( inextPoint.x < -winSize.width || inextPoint.x >= J.cols || inextPoint.y < -winSize.height || inextPoint.y >= J.rows
diff --git a/platforms/linux/arm.toolchain.cmake b/platforms/linux/arm.toolchain.cmake
index 184997fba5..ddbad83e51 100644
--- a/platforms/linux/arm.toolchain.cmake
+++ b/platforms/linux/arm.toolchain.cmake
@@ -48,32 +48,23 @@ if(NOT DEFINED ARM_LINUX_SYSROOT AND DEFINED GNU_MACHINE)
   set(ARM_LINUX_SYSROOT /usr/${GNU_MACHINE}${FLOAT_ABI_SUFFIX})
 endif()

-if(NOT DEFINED CMAKE_CXX_FLAGS)
-  set(CMAKE_CXX_FLAGS "" CACHE INTERNAL "")
-  set(CMAKE_C_FLAGS "" CACHE INTERNAL "")
-  set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
-  set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")
-  set(CMAKE_EXE_LINKER_FLAGS "" CACHE INTERNAL "")
-
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections -Wa,--noexecstack -fsigned-char -Wno-psabi")
-  if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
-    set(CMAKE_CXX_FLAGS "-mthumb ${CMAKE_CXX_FLAGS}")
-    set(CMAKE_C_FLAGS "-mthumb ${CMAKE_C_FLAGS}")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,nocopyreloc")
+# == Compiler flags
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
+  set(CMAKE_CXX_FLAGS_INIT "-mthumb")
+  set(CMAKE_C_FLAGS_INIT "-mthumb")
+  set(common_ld_opt "-Wl,--fix-cortex-a8")
+  set(CMAKE_SHARED_LINKER_FLAGS_INIT "${common_ld_opt}")
+  set(CMAKE_MODULE_LINKER_FLAGS_INIT "${common_ld_opt}")
+  set(CMAKE_EXE_LINKER_FLAGS_INIT "${common_ld_opt} -Wl,-z,nocopyreloc")
+elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+  include("${CMAKE_CURRENT_LIST_DIR}/flags-aarch64.cmake")
+  if(COMMAND ocv_set_platform_flags)
+    ocv_set_platform_flags(CMAKE_CXX_FLAGS_INIT)
+    ocv_set_platform_flags(CMAKE_C_FLAGS_INIT)
   endif()
-  if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
-    set(ARM_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-sections -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now")
-  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64)
-    set(ARM_LINKER_FLAGS "-Wl,--no-undefined -Wl,--gc-sections -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now")
-  endif()
-  set(CMAKE_SHARED_LINKER_FLAGS "${ARM_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
-  set(CMAKE_MODULE_LINKER_FLAGS "${ARM_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
-  set(CMAKE_EXE_LINKER_FLAGS "${ARM_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
-else()
-  #message(WARNING "CMAKE_CXX_FLAGS='${CMAKE_CXX_FLAGS}' is defined")
 endif()
+
 if(USE_NEON)
   message(WARNING "You use obsolete variable USE_NEON to enable NEON instruction set. Use -DENABLE_NEON=ON instead.")
   set(ENABLE_NEON TRUE)
diff --git a/platforms/linux/flags-aarch64.cmake b/platforms/linux/flags-aarch64.cmake
new file mode 100644
index 0000000000..5aeb7a2b6a
--- /dev/null
+++ b/platforms/linux/flags-aarch64.cmake
@@ -0,0 +1,19 @@
+# see https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html#index-march
+function(ocv_set_platform_flags VAR)
+  unset(flags)
+  if(ENABLE_BF16)
+    set(flags "${flags}+bf16")
+  endif()
+  if(ENABLE_DOTPROD)
+    set(flags "${flags}+dotprod")
+  endif()
+  if(ENABLE_FP16)
+    set(flags "${flags}+fp16")
+  endif()
+  if(DEFINED ENABLE_NEON AND NOT ENABLE_NEON)
+    set(flags "${flags}+nosimd")
+  endif()
+  if(flags)
+    set(${VAR} "-march=armv8.2-a${flags}" PARENT_SCOPE)
+  endif()
+endfunction()
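flags-aarch64.cmake only assembles the -march string; whether the requested extensions actually reached the compiler can be checked against the standard ACLE predefined macros. A small standalone probe, illustrative and not part of the patch:

#include <cstdio>

int main()
{
// These macros are defined by the compiler per the ARM C Language Extensions
// when the corresponding -march feature modifier is in effect.
#if defined(__ARM_FEATURE_DOTPROD)
    std::puts("+dotprod is active");
#endif
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    std::puts("+fp16 is active");
#endif
#if defined(__ARM_FEATURE_BF16)
    std::puts("+bf16 is active");
#endif
#if !defined(__ARM_NEON)
    std::puts("Advanced SIMD is off (+nosimd or a non-ARM build)");
#endif
    return 0;
}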
diff --git a/platforms/linux/flags-riscv64.cmake b/platforms/linux/flags-riscv64.cmake
new file mode 100644
index 0000000000..4488cf5887
--- /dev/null
+++ b/platforms/linux/flags-riscv64.cmake
@@ -0,0 +1,9 @@
+# see https://gcc.gnu.org/onlinedocs/gcc/RISC-V-Options.html#index-march-14
+function(ocv_set_platform_flags VAR)
+  if(ENABLE_RVV OR RISCV_RVV_SCALABLE)
+    set(flags "-march=rv64gcv")
+  else()
+    set(flags "-march=rv64gc")
+  endif()
+  set(${VAR} "${flags}" PARENT_SCOPE)
+endfunction()
diff --git a/platforms/linux/riscv64-andes-gcc.toolchain.cmake b/platforms/linux/riscv64-andes-gcc.toolchain.cmake
index 9b9c0b5246..a18c3df9e1 100755
--- a/platforms/linux/riscv64-andes-gcc.toolchain.cmake
+++ b/platforms/linux/riscv64-andes-gcc.toolchain.cmake
@@ -10,16 +10,12 @@
 set(CMAKE_C_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-gcc)
 set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-g++)

 # fix toolchain macro
-
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ANDES=1")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ANDES=1")
-
 # enable rvp
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gc -mext-dsp")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc -mext-dsp")
+set(CMAKE_C_FLAGS_INIT "-march=rv64gc -mext-dsp -D__ANDES=1")
+set(CMAKE_CXX_FLAGS_INIT "-march=rv64gc -mext-dsp -D__ANDES=1")

 # fix segment address
-set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-Ttext-segment=0x50000")
-set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Ttext-segment=0x50000")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-Wl,-Ttext-segment=0x50000")
+set(CMAKE_SHARED_LINKER_FLAGS_INIT "-Wl,-Ttext-segment=0x50000")
diff --git a/platforms/linux/riscv64-clang.toolchain.cmake b/platforms/linux/riscv64-clang.toolchain.cmake
index 612be05eab..939350fcbd 100644
--- a/platforms/linux/riscv64-clang.toolchain.cmake
+++ b/platforms/linux/riscv64-clang.toolchain.cmake
@@ -17,8 +17,13 @@ set(CMAKE_ASM_COMPILER_TARGET ${CLANG_TARGET_TRIPLE})
 # Don't run the linker on compiler check
 set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)

-set(CMAKE_C_FLAGS "-march=rv64gc --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gc --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w ${CMAKE_CXX_FLAGS}")
+include("${CMAKE_CURRENT_LIST_DIR}/flags-riscv64.cmake")
+if(COMMAND ocv_set_platform_flags)
+  ocv_set_platform_flags(CMAKE_CXX_FLAGS_INIT)
+  ocv_set_platform_flags(CMAKE_C_FLAGS_INIT)
+endif()
+set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w")
+set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w")

 set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT})
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
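The RISC-V toolchains above now take their -march from flags-riscv64.cmake. The RVV choice can be verified the same way as the aarch64 one, since compilers predefine __riscv_vector when the V extension is enabled. Another illustrative probe, not part of the patch:

#include <cstdio>

int main()
{
// __riscv and __riscv_vector are standard predefined macros on RISC-V
// targets; the latter appears only with a V-enabled -march such as rv64gcv.
#if defined(__riscv) && defined(__riscv_vector)
    std::puts("RVV enabled (-march=rv64gcv)");
#elif defined(__riscv)
    std::puts("RISC-V without RVV (-march=rv64gc)");
#else
    std::puts("not a RISC-V build");
#endif
    return 0;
}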
diff --git a/platforms/linux/riscv64-gcc.toolchain.cmake b/platforms/linux/riscv64-gcc.toolchain.cmake
index c3a0e161e3..7a067d3f1a 100644
--- a/platforms/linux/riscv64-gcc.toolchain.cmake
+++ b/platforms/linux/riscv64-gcc.toolchain.cmake
@@ -1,10 +1,11 @@
 set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_VERSION 1)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
 set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")

-if(NOT DEFINED CMAKE_CXX_FLAGS) # guards toolchain multiple calls
-  set(CMAKE_C_FLAGS "-march=rv64gc")
-  set(CMAKE_CXX_FLAGS "-march=rv64gc")
+include("${CMAKE_CURRENT_LIST_DIR}/flags-riscv64.cmake")
+if(COMMAND ocv_set_platform_flags)
+  ocv_set_platform_flags(CMAKE_CXX_FLAGS_INIT)
+  ocv_set_platform_flags(CMAKE_C_FLAGS_INIT)
 endif()

 include("${CMAKE_CURRENT_LIST_DIR}/riscv-gnu.toolchain.cmake")
diff --git a/samples/dnn/models.yml b/samples/dnn/models.yml
index 99abf22b26..f2e9e98be2 100644
--- a/samples/dnn/models.yml
+++ b/samples/dnn/models.yml
@@ -74,7 +74,24 @@ yolov8l:
   rgb: true
   labels: "object_detection_classes_yolo.txt"
   postprocessing: "yolov8"
-  sample: "yolo_detector"
+  sample: "object_detection"
+
+# YOLOv5 object detection family from ultralytics (https://github.com/ultralytics/ultralytics)
+# May be used for all of YOLOv5n, YOLOv5s, YOLOv5m, YOLOv5l and YOLOv5x
+
+yolov5l:
+  load_info:
+    url: "https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov5l.onnx"
+    sha1: "9de7e54c524b7fe7577bbd4cdbbdaed53375c8f1"
+  model: "yolov5l.onnx"
+  mean: 0.0
+  scale: 0.00392
+  width: 640
+  height: 640
+  rgb: true
+  classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
+  sample: "object_detection"

 # YOLO4 object detection family from Darknet (https://github.com/AlexeyAB/darknet)
 # YOLO object detection family from Darknet (https://pjreddie.com/darknet/yolo/)
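For readers mapping the yolov5l keys onto API calls, a rough C++ equivalent of the preprocessing they describe is sketched below; it is not part of the patch, the file name input.jpg is a placeholder, and the Python sample derives the same blobFromImage call from the yml (scale 0.00392 is approximately 1/255, rgb: true becomes swapRB):

#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::dnn::Net net = cv::dnn::readNet("yolov5l.onnx"); // model: "yolov5l.onnx"
    cv::Mat img = cv::imread("input.jpg");               // hypothetical input image
    cv::Mat blob = cv::dnn::blobFromImage(
        img,
        0.00392,                // scale: 0.00392 (about 1/255)
        cv::Size(640, 640),     // width/height: 640
        cv::Scalar(),           // mean: 0.0
        true);                  // rgb: true -> swap OpenCV's BGR to RGB
    net.setInput(blob);
    cv::Mat out = net.forward();
    return out.empty() ? 1 : 0;
}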
diff --git a/samples/python/tracker.py b/samples/python/tracker.py
index 9e6f939275..58a652a002 100644
--- a/samples/python/tracker.py
+++ b/samples/python/tracker.py
@@ -23,6 +23,9 @@ USAGE:
                              [--nanotrack_backbone NANOTRACK_BACKBONE]
                              [--nanotrack_headneck NANOTRACK_TARGET]
                              [--vittrack_net VITTRACK_MODEL]
+                             [--tracking_score_threshold TRACKING_SCORE_THRESHOLD]
+                             [--backend BACKEND]
+                             [--target TARGET]
 '''

 # Python 2/3 compatibility
@@ -36,6 +40,11 @@ import argparse

 from video import create_capture, presets

+backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
+            cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
+targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
+           cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
+
 class App(object):

     def __init__(self, args):
@@ -51,15 +60,23 @@ class App(object):
             params.model = self.args.dasiamrpn_net
             params.kernel_cls1 = self.args.dasiamrpn_kernel_cls1
             params.kernel_r1 = self.args.dasiamrpn_kernel_r1
+            params.backend = args.backend
+            params.target = args.target
             tracker = cv.TrackerDaSiamRPN_create(params)
         elif self.trackerAlgorithm == 'nanotrack':
             params = cv.TrackerNano_Params()
             params.backbone = args.nanotrack_backbone
             params.neckhead = args.nanotrack_headneck
+            params.backend = args.backend
+            params.target = args.target
             tracker = cv.TrackerNano_create(params)
         elif self.trackerAlgorithm == 'vittrack':
             params = cv.TrackerVit_Params()
             params.net = args.vittrack_net
+            if args.tracking_score_threshold:
+                params.tracking_score_threshold = args.tracking_score_threshold
+            params.backend = args.backend
+            params.target = args.target
             tracker = cv.TrackerVit_create(params)
         else:
             sys.exit("Tracker {} is not recognized. Please use one of three available: mil, dasiamrpn, nanotrack.".format(self.trackerAlgorithm))
@@ -133,6 +149,24 @@ if __name__ == '__main__':
     parser.add_argument("--nanotrack_backbone", type=str, default="nanotrack_backbone_sim.onnx", help="Path to onnx model of NanoTrack backBone")
     parser.add_argument("--nanotrack_headneck", type=str, default="nanotrack_head_sim.onnx", help="Path to onnx model of NanoTrack headNeck")
     parser.add_argument("--vittrack_net", type=str, default="vitTracker.onnx", help="Path to onnx model of vittrack")
+    parser.add_argument('--tracking_score_threshold', type=float,
+                        help="Tracking score threshold (vittrack only): a bounding box is reported as found only if its score is at least this value")
+    parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
+                        help="Choose one of computation backends: "
+                             "%d: automatically (by default), "
+                             "%d: Halide language (http://halide-lang.org/), "
+                             "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+                             "%d: OpenCV implementation, "
+                             "%d: VKCOM, "
+                             "%d: CUDA" % backends)
+    parser.add_argument("--target", choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
+                        help="Choose one of target computation devices: "
+                             '%d: CPU target (by default), '
+                             '%d: OpenCL, '
+                             '%d: OpenCL fp16 (half-float precision), '
+                             '%d: VPU, '
+                             '%d: VULKAN, '
+                             '%d: CUDA, '
+                             '%d: CUDA fp16 (half-float precision)' % targets)
     args = parser.parse_args()

     App(args).run()
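The new --backend and --target flags write straight into the tracker Params structs, and the same integer fields exist on the C++ side, so the sample change maps one-to-one onto the native API. A minimal C++ counterpart, sketched under the assumption that the model paths (the sample defaults) exist at run time:

#include <opencv2/dnn.hpp>
#include <opencv2/video/tracking.hpp>

int main()
{
    cv::TrackerNano::Params params;
    params.backbone = "nanotrack_backbone_sim.onnx"; // sample default paths
    params.neckhead = "nanotrack_head_sim.onnx";
    params.backend = cv::dnn::DNN_BACKEND_OPENCV;    // what --backend selects
    params.target = cv::dnn::DNN_TARGET_CPU;         // what --target selects
    cv::Ptr<cv::TrackerNano> tracker = cv::TrackerNano::create(params);
    return tracker.empty() ? 1 : 0;
}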