Merge branch 'opencv:4.x' into simd-demosaicing
commit 54d0610f50
3  .github/workflows/PR-4.x.yaml (vendored)
@@ -31,6 +31,9 @@ jobs:
  Windows10-x64:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10.yaml@main

  Windows10-x64-UWP:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-UWP.yaml@main

  Windows10-ARM64:
    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-ARM64.yaml@main
46  3rdparty/carotene/hal/tegra_hal.hpp (vendored)
@@ -1932,4 +1932,50 @@ inline int TEGRA_GaussianBlurBinomial(const uchar* src_data, size_t src_step, uc

#endif // OPENCV_IMGPROC_HAL_INTERFACE_H

// The optimized branch was developed for old armv7 processors
#if defined(__ARM_ARCH) && (__ARM_ARCH == 7)
inline int TEGRA_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_step,
                                    const short* prev_deriv_data, size_t prev_deriv_step,
                                    const uchar* next_data, size_t next_step,
                                    int width, int height, int cn,
                                    const float *prev_points, float *next_points, size_t point_count,
                                    uchar *status, float *err,
                                    const int win_width, const int win_height,
                                    int termination_count, double termination_epsilon,
                                    bool get_min_eigen_vals,
                                    float min_eigen_vals_threshold)
{
    if (!CAROTENE_NS::isSupportedConfiguration())
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    CAROTENE_NS::pyrLKOptFlowLevel(CAROTENE_NS::Size2D(width, height), cn,
                                   prev_data, prev_data_step, prev_deriv_data, prev_deriv_step,
                                   next_data, next_step,
                                   point_count, prev_points, next_points,
                                   status, err, CAROTENE_NS::Size2D(win_width, win_height),
                                   termination_count, termination_epsilon,
                                   get_min_eigen_vals, min_eigen_vals_threshold);
    return CV_HAL_ERROR_OK;
}

#undef cv_hal_LKOpticalFlowLevel
#define cv_hal_LKOpticalFlowLevel TEGRA_LKOpticalFlowLevel
#endif // __ARM_ARCH == 7

#if 0 // OpenCV provides faster parallel implementation
inline int TEGRA_ScharrDeriv(const uchar* src_data, size_t src_step,
                             short* dst_data, size_t dst_step,
                             int width, int height, int cn)
{
    if (!CAROTENE_NS::isSupportedConfiguration())
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    CAROTENE_NS::ScharrDeriv(CAROTENE_NS::Size2D(width, height), cn, src_data, src_step, dst_data, dst_step);
    return CV_HAL_ERROR_OK;
}

#undef cv_hal_ScharrDeriv
#define cv_hal_ScharrDeriv TEGRA_ScharrDeriv
#endif

#endif
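For context on how an override such as cv_hal_LKOpticalFlowLevel takes effect: OpenCV's HAL hooks are plain macros that default to a stub returning CV_HAL_ERROR_NOT_IMPLEMENTED, a HAL header re-points the macro at its own function, and the generic code runs whenever the replacement declines. A minimal hedged sketch of that dispatch pattern, with hypothetical names and a simplified call site rather than OpenCV's actual internals:

#include <cstddef>

#define MY_HAL_ERROR_OK               0
#define MY_HAL_ERROR_NOT_IMPLEMENTED  1

// Default hook: always declines, so the generic path runs.
inline int default_scharr(const unsigned char*, std::size_t, short*, std::size_t, int, int, int)
{ return MY_HAL_ERROR_NOT_IMPLEMENTED; }

#define my_hal_scharrDeriv default_scharr

// A vendor HAL re-points the hook, exactly like the #undef/#define pairs above.
inline int vendor_scharr(const unsigned char*, std::size_t, short*, std::size_t, int, int, int)
{ return MY_HAL_ERROR_OK; }
#undef my_hal_scharrDeriv
#define my_hal_scharrDeriv vendor_scharr

int scharr_entry(const unsigned char* src, std::size_t sstep, short* dst, std::size_t dstep,
                 int w, int h, int cn)
{
    // Call site: try the HAL hook first, fall back to the reference implementation.
    if (my_hal_scharrDeriv(src, sstep, dst, dstep, w, h, cn) == MY_HAL_ERROR_OK)
        return 0;
    // ...generic implementation would run here...
    return 0;
}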
@@ -2485,7 +2485,7 @@ namespace CAROTENE_NS {
                           u8 *status, f32 *err,
                           const Size2D &winSize,
                           u32 terminationCount, f64 terminationEpsilon,
                           u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
                           bool getMinEigenVals,
                           f32 minEigThreshold);
}
53  3rdparty/carotene/src/opticalflow.cpp (vendored)
@@ -58,7 +58,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
                       u8 *status, f32 *err,
                       const Size2D &winSize,
                       u32 terminationCount, f64 terminationEpsilon,
                       u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
                       bool getMinEigenVals,
                       f32 minEigThreshold)
{
    internal::assertSupportedConfiguration();
@@ -74,32 +74,11 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,

    for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
    {
        f32 levscale = (1./(1 << level));
        u32 ptref = ptidx << 1;
        f32 prevPtX = prevPts[ptref+0]*levscale;
        f32 prevPtY = prevPts[ptref+1]*levscale;
        f32 nextPtX;
        f32 nextPtY;
        if( level == maxLevel )
        {
            if( useInitialFlow )
            {
                nextPtX = nextPts[ptref+0]*levscale;
                nextPtY = nextPts[ptref+1]*levscale;
            }
            else
            {
                nextPtX = prevPtX;
                nextPtY = prevPtY;
            }
        }
        else
        {
            nextPtX = nextPts[ptref+0]*2.f;
            nextPtY = nextPts[ptref+1]*2.f;
        }
        nextPts[ptref+0] = nextPtX;
        nextPts[ptref+1] = nextPtY;
        f32 prevPtX = prevPts[ptref+0];
        f32 prevPtY = prevPts[ptref+1];
        f32 nextPtX = nextPts[ptref+0];
        f32 nextPtY = nextPts[ptref+1];

        s32 iprevPtX, iprevPtY;
        s32 inextPtX, inextPtY;
@@ -111,13 +90,10 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
        if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
            iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
        {
            if( level == 0 )
            {
                if( status )
                    status[ptidx] = false;
                if( err )
                    err[ptidx] = 0;
            }
            if( status )
                status[ptidx] = false;
            if( err )
                err[ptidx] = 0;
            continue;
        }

@@ -333,7 +309,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,

        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
            if( level == 0 && status )
            if( status )
                status[ptidx] = false;
            continue;
        }
@@ -353,7 +329,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
            if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
                inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
            {
                if( level == 0 && status )
                if( status )
                    status[ptidx] = false;
                break;
            }
@@ -469,8 +445,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
            prevDeltaX = deltaX;
            prevDeltaY = deltaY;
        }

        if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
        if( status && status[ptidx] && err && !getMinEigenVals )
        {
            f32 nextPointX = nextPts[ptref+0] - halfWinX;
            f32 nextPointY = nextPts[ptref+1] - halfWinY;
@@ -526,9 +501,6 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
    (void)winSize;
    (void)terminationCount;
    (void)terminationEpsilon;
    (void)level;
    (void)maxLevel;
    (void)useInitialFlow;
    (void)getMinEigenVals;
    (void)minEigThreshold;
    (void)ptCount;
@@ -536,4 +508,3 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
}

}//CAROTENE_NS
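The deleted block above handled the pyramid-level bookkeeping inside pyrLKOptFlowLevel itself: scaling prevPts/nextPts by 1/(1 << level) and doubling coordinates when descending a level. With the single-level HAL entry point, that responsibility moves to the caller. A rough hedged sketch of what a caller would do between levels, assuming points are stored as interleaved x,y floats in full-resolution coordinates (hypothetical helper, not the actual OpenCV driver code):

#include <cstddef>

// Hypothetical per-level preparation: convert full-resolution coordinates to the
// current pyramid level before calling the single-level LK routine, mirroring the
// levscale = 1/(1 << level) logic that was removed from pyrLKOptFlowLevel.
void prepare_points_for_level(const float* prevFull, float* prevLvl,
                              float* nextLvl, std::size_t count,
                              int level, int maxLevel, bool useInitialFlow)
{
    const float levscale = 1.f / static_cast<float>(1 << level);
    for (std::size_t i = 0; i < 2 * count; ++i)
    {
        prevLvl[i] = prevFull[i] * levscale;
        if (level == maxLevel)
            // Topmost level: start from the initial guess or from the previous points.
            nextLvl[i] = useInitialFlow ? nextLvl[i] * levscale : prevLvl[i];
        else
            // Coming down one level: coordinates from the coarser level are doubled.
            nextLvl[i] = nextLvl[i] * 2.f;
    }
}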
32  3rdparty/fastcv/CMakeLists.txt (vendored, new file)
@@ -0,0 +1,32 @@
if(HAVE_FASTCV)
    set(FASTCV_HAL_VERSION 0.0.1 CACHE INTERNAL "")
    set(FASTCV_HAL_LIBRARIES "fastcv_hal" CACHE INTERNAL "")
    set(FASTCV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/include" CACHE INTERNAL "")
    set(FASTCV_HAL_HEADERS
        "${CMAKE_CURRENT_SOURCE_DIR}/include/fastcv_hal_core.hpp"
        "${CMAKE_CURRENT_SOURCE_DIR}/include/fastcv_hal_imgproc.hpp"
        CACHE INTERNAL "")

    file(GLOB FASTCV_HAL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")

    add_library(fastcv_hal STATIC ${FASTCV_HAL_FILES})

    target_include_directories(fastcv_hal PRIVATE
        ${CMAKE_SOURCE_DIR}/modules/core/include
        ${CMAKE_SOURCE_DIR}/modules/imgproc/include
        ${FASTCV_HAL_INCLUDE_DIRS} ${FastCV_INCLUDE_PATH})

    target_link_libraries(fastcv_hal PUBLIC ${FASTCV_LIBRARY})

    set_target_properties(fastcv_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})

    if(NOT BUILD_SHARED_LIBS)
        ocv_install_target(fastcv_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
    endif()

    if(ENABLE_SOLUTION_FOLDERS)
        set_target_properties(fastcv_hal PROPERTIES FOLDER "3rdparty")
    endif()
else()
    message(STATUS "FastCV is not available, disabling related HAL")
endif(HAVE_FASTCV)
43  3rdparty/fastcv/fastcv.cmake (vendored, new file)
@@ -0,0 +1,43 @@
function(download_fastcv root_dir)

    # Commit SHA in the opencv_3rdparty repo
    set(FASTCV_COMMIT "dc5d58018f3af915a8d209386d2c58c0501c0f2c")

    # Define actual FastCV versions
    if(ANDROID)
        if(AARCH64)
            message(STATUS "Download FastCV for Android aarch64")
            set(FCV_PACKAGE_NAME "fastcv_android_aarch64_2024_12_11.tgz")
            set(FCV_PACKAGE_HASH "9dac41e86597305f846212dae31a4a88")
        else()
            message(STATUS "Download FastCV for Android armv7")
            set(FCV_PACKAGE_NAME "fastcv_android_arm32_2024_12_11.tgz")
            set(FCV_PACKAGE_HASH "fe2d30334180b17e3031eee92aac43b6")
        endif()
    elseif(UNIX AND NOT APPLE AND NOT IOS AND NOT XROS)
        if(AARCH64)
            set(FCV_PACKAGE_NAME "fastcv_linux_aarch64_2024_12_11.tgz")
            set(FCV_PACKAGE_HASH "7b33ad833e6f15ab6d4ec64fa3c17acd")
        else()
            message("FastCV: fastcv lib for 32-bit Linux is not supported for now!")
        endif()
    endif(ANDROID)

    # Download Package
    set(OPENCV_FASTCV_URL "https://raw.githubusercontent.com/opencv/opencv_3rdparty/${FASTCV_COMMIT}/fastcv/")

    ocv_download( FILENAME ${FCV_PACKAGE_NAME}
                  HASH ${FCV_PACKAGE_HASH}
                  URL ${OPENCV_FASTCV_URL}
                  DESTINATION_DIR ${root_dir}
                  ID FASTCV
                  STATUS res
                  UNPACK
                  RELATIVE_URL)
    if(res)
        set(HAVE_FASTCV TRUE CACHE BOOL "FastCV status")
    else()
        message(WARNING "FastCV: package download failed!")
    endif()

endfunction()
222  3rdparty/fastcv/include/fastcv_hal_core.hpp (vendored, new file)
@ -0,0 +1,222 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#ifndef OPENCV_FASTCV_HAL_CORE_HPP_INCLUDED
|
||||
#define OPENCV_FASTCV_HAL_CORE_HPP_INCLUDED
|
||||
|
||||
#include <opencv2/core/base.hpp>
|
||||
|
||||
#undef cv_hal_lut
|
||||
#define cv_hal_lut fastcv_hal_lut
|
||||
#undef cv_hal_normHammingDiff8u
|
||||
#define cv_hal_normHammingDiff8u fastcv_hal_normHammingDiff8u
|
||||
#undef cv_hal_mul8u16u
|
||||
#define cv_hal_mul8u16u fastcv_hal_mul8u16u
|
||||
#undef cv_hal_sub8u32f
|
||||
#define cv_hal_sub8u32f fastcv_hal_sub8u32f
|
||||
#undef cv_hal_transpose2d
|
||||
#define cv_hal_transpose2d fastcv_hal_transpose2d
|
||||
#undef cv_hal_meanStdDev
|
||||
#define cv_hal_meanStdDev fastcv_hal_meanStdDev
|
||||
#undef cv_hal_flip
|
||||
#define cv_hal_flip fastcv_hal_flip
|
||||
#undef cv_hal_rotate90
|
||||
#define cv_hal_rotate90 fastcv_hal_rotate
|
||||
#undef cv_hal_addWeighted8u
|
||||
#define cv_hal_addWeighted8u fastcv_hal_addWeighted8u
|
||||
#undef cv_hal_mul8u
|
||||
#define cv_hal_mul8u fastcv_hal_mul8u
|
||||
#undef cv_hal_mul16s
|
||||
#define cv_hal_mul16s fastcv_hal_mul16s
|
||||
#undef cv_hal_mul32f
|
||||
#define cv_hal_mul32f fastcv_hal_mul32f
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief look-up table transform of an array.
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param src_type Source image type
|
||||
/// @param lut_data Pointer to lookup table
|
||||
/// @param lut_channel_size Size of each channel in bytes
|
||||
/// @param lut_channels Number of channels in lookup table
|
||||
/// @param dst_data Destination data
|
||||
/// @param dst_step Destination step
|
||||
/// @param width Width of images
|
||||
/// @param height Height of images
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_lut(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
size_t src_type,
|
||||
const uchar* lut_data,
|
||||
size_t lut_channel_size,
|
||||
size_t lut_channels,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Hamming distance between two vectors
|
||||
/// @param a pointer to first vector data
|
||||
/// @param b pointer to second vector data
|
||||
/// @param n length of vectors
|
||||
/// @param cellSize how many bits of the vectors will be added and treated as a single bit, can be 1 (standard Hamming distance), 2 or 4
|
||||
/// @param result pointer to result output
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_mul8u16u(
|
||||
const uchar * src1_data,
|
||||
size_t src1_step,
|
||||
const uchar * src2_data,
|
||||
size_t src2_step,
|
||||
ushort * dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_sub8u32f(
|
||||
const uchar *src1_data,
|
||||
size_t src1_step,
|
||||
const uchar *src2_data,
|
||||
size_t src2_step,
|
||||
float *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_transpose2d(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
int element_size);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_meanStdDev(
|
||||
const uchar * src_data,
|
||||
size_t src_step,
|
||||
int width,
|
||||
int height,
|
||||
int src_type,
|
||||
double * mean_val,
|
||||
double * stddev_val,
|
||||
uchar * mask,
|
||||
size_t mask_step);
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Flips a 2D array around vertical, horizontal, or both axes
|
||||
/// @param src_type source and destination image type
|
||||
/// @param src_data source image data
|
||||
/// @param src_step source image step
|
||||
/// @param src_width source and destination image width
|
||||
/// @param src_height source and destination image height
|
||||
/// @param dst_data destination image data
|
||||
/// @param dst_step destination image step
|
||||
/// @param flip_mode 0 flips around x-axis, 1 around y-axis, -1 both
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_flip(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int flip_mode);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Rotates a 2D array in multiples of 90 degrees.
|
||||
/// @param src_type source and destination image type
|
||||
/// @param src_data source image data
|
||||
/// @param src_step source image step
|
||||
/// @param src_width source image width
///        If angle is 180 it is also the destination image width
///        If angle is 90 or 270 it is also the destination image height
/// @param src_height source image height
///        If angle is 180 it is also the destination image height
///        If angle is 90 or 270 it is also the destination image width
|
||||
/// @param dst_data destination image data
|
||||
/// @param dst_step destination image step
|
||||
/// @param angle clockwise angle for rotation in degrees from set [90, 180, 270]
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_rotate(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int angle);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief weighted sum of two arrays using formula: dst[i] = a * src1[i] + b * src2[i] + c
|
||||
/// @param src1_data first source image data
|
||||
/// @param src1_step first source image step
|
||||
/// @param src2_data second source image data
|
||||
/// @param src2_step second source image step
|
||||
/// @param dst_data destination image data
|
||||
/// @param dst_step destination image step
|
||||
/// @param width width of the images
|
||||
/// @param height height of the images
|
||||
/// @param scalars numbers a, b, and c
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_addWeighted8u(
|
||||
const uchar* src1_data,
|
||||
size_t src1_step,
|
||||
const uchar* src2_data,
|
||||
size_t src2_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
const double scalars[3]);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_mul8u(
|
||||
const uchar *src1_data,
|
||||
size_t src1_step,
|
||||
const uchar *src2_data,
|
||||
size_t src2_step,
|
||||
uchar *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_mul16s(
|
||||
const short *src1_data,
|
||||
size_t src1_step,
|
||||
const short *src2_data,
|
||||
size_t src2_step,
|
||||
short *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_mul32f(
|
||||
const float *src1_data,
|
||||
size_t src1_step,
|
||||
const float *src2_data,
|
||||
size_t src2_step,
|
||||
float *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale);
|
||||
|
||||
#endif
|
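Taken together, these overrides mean ordinary OpenCV calls pick up the FastCV path automatically when the build includes this HAL; nothing changes at the API level. A hedged usage sketch with the standard OpenCV API (nothing FastCV-specific in the calling code):

#include <opencv2/core.hpp>

int main()
{
    // 8-bit single-channel source and a 256-entry lookup table.
    cv::Mat src(480, 640, CV_8UC1, cv::Scalar(42));
    cv::Mat lut(1, 256, CV_8UC1);
    for (int i = 0; i < 256; ++i)
        lut.at<uchar>(i) = static_cast<uchar>(255 - i);

    cv::Mat dst;
    // With the FastCV HAL built in, cv::LUT routes through cv_hal_lut, i.e.
    // fastcv_hal_lut, for large-enough CV_8UC1 inputs; otherwise the HAL returns
    // CV_HAL_ERROR_NOT_IMPLEMENTED and OpenCV's own implementation runs.
    cv::LUT(src, lut, dst);

    cv::Mat flipped;
    cv::flip(src, flipped, 1);  // flip around the y-axis; may hit fastcv_hal_flip
    return 0;
}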
268  3rdparty/fastcv/include/fastcv_hal_imgproc.hpp (vendored, new file)
@ -0,0 +1,268 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#ifndef OPENCV_FASTCV_HAL_IMGPROC_HPP_INCLUDED
|
||||
#define OPENCV_FASTCV_HAL_IMGPROC_HPP_INCLUDED
|
||||
|
||||
#include <opencv2/core/base.hpp>
|
||||
|
||||
#undef cv_hal_medianBlur
|
||||
#define cv_hal_medianBlur fastcv_hal_medianBlur
|
||||
#undef cv_hal_sobel
|
||||
#define cv_hal_sobel fastcv_hal_sobel
|
||||
#undef cv_hal_boxFilter
|
||||
#define cv_hal_boxFilter fastcv_hal_boxFilter
|
||||
#undef cv_hal_adaptiveThreshold
|
||||
#define cv_hal_adaptiveThreshold fastcv_hal_adaptiveThreshold
|
||||
#undef cv_hal_gaussianBlurBinomial
|
||||
#define cv_hal_gaussianBlurBinomial fastcv_hal_gaussianBlurBinomial
|
||||
#undef cv_hal_warpPerspective
|
||||
#define cv_hal_warpPerspective fastcv_hal_warpPerspective
|
||||
#undef cv_hal_pyrdown
|
||||
#define cv_hal_pyrdown fastcv_hal_pyrdown
|
||||
#undef cv_hal_cvtBGRtoHSV
|
||||
#define cv_hal_cvtBGRtoHSV fastcv_hal_cvtBGRtoHSV
|
||||
#undef cv_hal_cvtBGRtoYUVApprox
|
||||
#define cv_hal_cvtBGRtoYUVApprox fastcv_hal_cvtBGRtoYUVApprox
|
||||
#undef cv_hal_canny
|
||||
#define cv_hal_canny fastcv_hal_canny
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Calculate medianBlur filter
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param width Source image width
|
||||
/// @param height Source image height
|
||||
/// @param depth Depths of source and destination image
|
||||
/// @param cn Number of channels
|
||||
/// @param ksize Size of kernel
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_medianBlur(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int depth,
|
||||
int cn,
|
||||
int ksize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Computes Sobel derivatives
|
||||
///
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param width Source image width
|
||||
/// @param height Source image height
|
||||
/// @param src_depth Depth of source image
|
||||
/// @param dst_depth Depths of destination image
|
||||
/// @param cn Number of channels
|
||||
/// @param margin_left Left margins for source image
|
||||
/// @param margin_top Top margins for source image
|
||||
/// @param margin_right Right margins for source image
|
||||
/// @param margin_bottom Bottom margins for source image
|
||||
/// @param dx orders of the derivative x
|
||||
/// @param dy orders of the derivative y
|
||||
/// @param ksize Size of kernel
|
||||
/// @param scale Scale factor for the computed derivative values
|
||||
/// @param delta Delta value that is added to the results prior to storing them in dst
|
||||
/// @param border_type Border type
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_sobel(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int src_depth,
|
||||
int dst_depth,
|
||||
int cn,
|
||||
int margin_left,
|
||||
int margin_top,
|
||||
int margin_right,
|
||||
int margin_bottom,
|
||||
int dx,
|
||||
int dy,
|
||||
int ksize,
|
||||
double scale,
|
||||
double delta,
|
||||
int border_type);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int fastcv_hal_boxFilter(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int src_depth,
|
||||
int dst_depth,
|
||||
int cn,
|
||||
int margin_left,
|
||||
int margin_top,
|
||||
int margin_right,
|
||||
int margin_bottom,
|
||||
size_t ksize_width,
|
||||
size_t ksize_height,
|
||||
int anchor_x,
|
||||
int anchor_y,
|
||||
bool normalize,
|
||||
int border_type);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_adaptiveThreshold(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double maxValue,
|
||||
int adaptiveMethod,
|
||||
int thresholdType,
|
||||
int blockSize,
|
||||
double C);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Blurs an image using a Gaussian filter.
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param width Source image width
|
||||
/// @param height Source image height
|
||||
/// @param depth Depth of source and destination image
|
||||
/// @param cn Number of channels
|
||||
/// @param margin_left Left margins for source image
|
||||
/// @param margin_top Top margins for source image
|
||||
/// @param margin_right Right margins for source image
|
||||
/// @param margin_bottom Bottom margins for source image
|
||||
/// @param ksize Kernel size
|
||||
/// @param border_type Border type
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_gaussianBlurBinomial(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int depth,
|
||||
int cn,
|
||||
size_t margin_left,
|
||||
size_t margin_top,
|
||||
size_t margin_right,
|
||||
size_t margin_bottom,
|
||||
size_t ksize,
|
||||
int border_type);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Applies a perspective transformation to an image.
|
||||
///
|
||||
/// @param src_type Source and destination image type
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param src_width Source image width
|
||||
/// @param src_height Source image height
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param dst_width Destination image width
|
||||
/// @param dst_height Destination image height
|
||||
/// @param M 3x3 matrix with transform coefficients
|
||||
/// @param interpolation Interpolation mode (CV_HAL_INTER_NEAREST, ...)
|
||||
/// @param border_type Border processing mode (CV_HAL_BORDER_REFLECT, ...)
|
||||
/// @param border_value Values to use for CV_HAL_BORDER_CONSTANT mode
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_warpPerspective(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int dst_width,
|
||||
int dst_height,
|
||||
const double M[9],
|
||||
int interpolation,
|
||||
int border_type,
|
||||
const double border_value[4]);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_pyrdown(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int dst_width,
|
||||
int dst_height,
|
||||
int depth,
|
||||
int cn,
|
||||
int border_type);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_cvtBGRtoHSV(
|
||||
const uchar * src_data,
|
||||
size_t src_step,
|
||||
uchar * dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int depth,
|
||||
int scn,
|
||||
bool swapBlue,
|
||||
bool isFullRange,
|
||||
bool isHSV);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_cvtBGRtoYUVApprox(
|
||||
const uchar * src_data,
|
||||
size_t src_step,
|
||||
uchar * dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int depth,
|
||||
int scn,
|
||||
bool swapBlue,
|
||||
bool isCbCr);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Canny edge detector
|
||||
/// @param src_data Source image data
|
||||
/// @param src_step Source image step
|
||||
/// @param dst_data Destination image data
|
||||
/// @param dst_step Destination image step
|
||||
/// @param width Source image width
|
||||
/// @param height Source image height
|
||||
/// @param cn Number of channels
|
||||
/// @param lowThreshold low threshold value
/// @param highThreshold high threshold value
/// @param ksize Kernel size for the Sobel operator
/// @param L2gradient Flag indicating use of the L2 or L1 norm
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int fastcv_hal_canny(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
int cn,
|
||||
double lowThreshold,
|
||||
double highThreshold,
|
||||
int ksize,
|
||||
bool L2gradient);
|
||||
|
||||
#endif
|
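As with the core header, these imgproc overrides are transparent to callers: the FastCV kernels are tried first and OpenCV falls back whenever a parameter combination is not supported. A small hedged example using the plain OpenCV API:

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat gray(480, 640, CV_8UC1, cv::Scalar(128));

    cv::Mat blurred, edges;
    // cv_hal_medianBlur / cv_hal_canny resolve to the fastcv_hal_* functions when
    // this HAL is compiled in; unsupported cases fall back to OpenCV's own code.
    cv::medianBlur(gray, blurred, 5);
    cv::Canny(blurred, edges, 50.0, 150.0, 3, false);
    return 0;
}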
84  3rdparty/fastcv/include/fastcv_hal_utils.hpp (vendored, new file)
@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#ifndef OPENCV_FASTCV_HAL_UTILS_HPP_INCLUDED
|
||||
#define OPENCV_FASTCV_HAL_UTILS_HPP_INCLUDED
|
||||
|
||||
#include "fastcv.h"
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
|
||||
#define INITIALIZATION_CHECK \
|
||||
{ \
|
||||
if (!FastCvContext::getContext().isInitialized) \
|
||||
{ \
|
||||
return CV_HAL_ERROR_UNKNOWN; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CV_HAL_RETURN(status, func) \
|
||||
{ \
|
||||
if( status == FASTCV_SUCCESS ) \
|
||||
{ \
|
||||
CV_LOG_DEBUG(NULL, "FastCV HAL for "<<#func<<" run successfully!"); \
|
||||
return CV_HAL_ERROR_OK; \
|
||||
} \
|
||||
else if(status == FASTCV_EBADPARAM || status == FASTCV_EUNALIGNPARAM || \
|
||||
status == FASTCV_EUNSUPPORTED || status == FASTCV_EHWQDSP || \
|
||||
status == FASTCV_EHWGPU) \
|
||||
{ \
|
||||
CV_LOG_DEBUG(NULL, "FastCV status:"<<getFastCVErrorString(status) \
|
||||
<<", Switching to default OpenCV solution!"); \
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
CV_LOG_ERROR(NULL,"FastCV error:"<<getFastCVErrorString(status)); \
|
||||
return CV_HAL_ERROR_UNKNOWN; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CV_HAL_RETURN_NOT_IMPLEMENTED(reason) \
|
||||
{ \
|
||||
CV_LOG_DEBUG(NULL,"Switching to default OpenCV\nInfo: "<<reason); \
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED; \
|
||||
}
|
||||
|
||||
#define FCV_KernelSize_SHIFT 3
|
||||
#define FCV_MAKETYPE(ksize,depth) ((ksize<<FCV_KernelSize_SHIFT) + depth)
|
||||
#define FCV_CMP_EQ(val1,val2) (fabs(val1 - val2) < FLT_EPSILON)
|
||||
|
||||
const char* getFastCVErrorString(int status);
|
||||
const char* borderToString(int border);
|
||||
const char* interpolationToString(int interpolation);
|
||||
|
||||
struct FastCvContext
|
||||
{
|
||||
public:
|
||||
// initialize at first call
|
||||
// Defines a static local variable context. Variable is created only once.
|
||||
static FastCvContext& getContext()
|
||||
{
|
||||
static FastCvContext context;
|
||||
return context;
|
||||
}
|
||||
|
||||
FastCvContext()
|
||||
{
|
||||
if (fcvSetOperationMode(FASTCV_OP_CPU_PERFORMANCE) != 0)
|
||||
{
|
||||
CV_LOG_WARNING(NULL, "Failed to switch FastCV operation mode");
|
||||
isInitialized = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_LOG_INFO(NULL, "FastCV Operation Mode Switched");
|
||||
isInitialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
bool isInitialized;
|
||||
};
|
||||
|
||||
#endif
|
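The three macros above define the common control flow for every wrapper in this HAL: check that FastCV initialized, run the FastCV kernel, then translate its status into a HAL return code. A minimal hedged sketch of a wrapper built from them (hypothetical function, shown only to illustrate the macro usage; the real wrappers live in the .cpp files below):

#include "fastcv_hal_utils.hpp"   // macros, fcvStatus
#include <opencv2/core/base.hpp>  // CV_HAL_ERROR_* codes, uchar

// Hypothetical wrapper illustrating the pattern used throughout this HAL.
int fastcv_hal_example_copy(const uchar* src, size_t src_step,
                            uchar* dst, size_t dst_step,
                            int width, int height)
{
    // Decline early for cases FastCV does not handle.
    if (width <= 0 || height <= 0)
        CV_HAL_RETURN_NOT_IMPLEMENTED("empty input, switching to default OpenCV");

    // Returns CV_HAL_ERROR_UNKNOWN if the FastCV context never initialized.
    INITIALIZATION_CHECK;

    fcvStatus status = FASTCV_SUCCESS;
    // ... call the appropriate fcv* kernel here and store its status ...
    (void)src; (void)src_step; (void)dst; (void)dst_step;

    // Maps FASTCV_SUCCESS to CV_HAL_ERROR_OK, recoverable errors to
    // CV_HAL_ERROR_NOT_IMPLEMENTED (OpenCV fallback), everything else to UNKNOWN.
    CV_HAL_RETURN(status, hal_example_copy);
}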
574  3rdparty/fastcv/src/fastcv_hal_core.cpp (vendored, new file)
@ -0,0 +1,574 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "fastcv_hal_core.hpp"
|
||||
#include "fastcv_hal_utils.hpp"
|
||||
#include <opencv2/core/core.hpp>
|
||||
#include <opencv2/core/base.hpp>
|
||||
|
||||
|
||||
class ParallelTableLookup : public cv::ParallelLoopBody
|
||||
{
|
||||
public:
|
||||
|
||||
ParallelTableLookup(const uchar* src_data_, int width_, size_t src_step_, const uchar* lut_data_, uchar* dst_data_, size_t dst_step_) :
|
||||
cv::ParallelLoopBody(), src_data(src_data_), width(width_), src_step(src_step_), lut_data(lut_data_), dst_data(dst_data_), dst_step(dst_step_)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void operator()(const cv::Range& range) const CV_OVERRIDE
|
||||
{
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
for (int y = range.start; y < range.end; y++) {
|
||||
status = fcvTableLookupu8((uint8_t*)src_data + y * src_step, width, 1, src_step, (uint8_t*)lut_data, (uint8_t*)dst_data + y * dst_step, dst_step);
|
||||
if(status != FASTCV_SUCCESS)
|
||||
CV_LOG_ERROR(NULL,"FastCV error:"<<getFastCVErrorString(status));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const uchar* src_data;
|
||||
int width;
|
||||
size_t src_step;
|
||||
const uchar* lut_data;
|
||||
uchar* dst_data;
|
||||
size_t dst_step;
|
||||
};
|
||||
|
||||
int fastcv_hal_lut(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
size_t src_type,
|
||||
const uchar* lut_data,
|
||||
size_t lut_channel_size,
|
||||
size_t lut_channels,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height)
|
||||
{
|
||||
if((width*height)<=(320*240))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution!");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
fcvStatus status;
|
||||
if (src_type == CV_8UC1 && lut_channels == 1 && lut_channel_size == 1)
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height),
|
||||
ParallelTableLookup(src_data, width, src_step, lut_data, dst_data, dst_step));
|
||||
status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_lut);
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Multi-channel input is not supported");
|
||||
}
|
||||
}
|
||||
|
||||
int fastcv_hal_normHammingDiff8u(
|
||||
const uchar* a,
|
||||
const uchar* b,
|
||||
int n,
|
||||
int cellSize,
|
||||
int* result)
|
||||
{
|
||||
fcvStatus status;
|
||||
|
||||
if (cellSize != 1)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("NORM_HAMMING2 cellSize:%d is not supported", cellSize));
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
uint32_t dist = 0;
|
||||
|
||||
dist = fcvHammingDistanceu8((uint8_t*)a, (uint8_t*)b, n);
|
||||
|
||||
*result = dist;
|
||||
status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_normHammingDiff8u);
|
||||
}
|
||||
|
||||
int fastcv_hal_mul8u16u(
|
||||
const uchar* src1_data,
|
||||
size_t src1_step,
|
||||
const uchar* src2_data,
|
||||
size_t src2_step,
|
||||
ushort* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale)
|
||||
{
|
||||
if(scale != 1.0)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Scale factor not supported");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
|
||||
if (src1_step < (size_t)width && src2_step < (size_t)width)
|
||||
{
|
||||
src1_step = width*sizeof(uchar);
|
||||
src2_step = width*sizeof(uchar);
|
||||
dst_step = width*sizeof(ushort);
|
||||
}
|
||||
|
||||
status = fcvElementMultiplyu8u16_v2(src1_data, width, height, src1_step,
|
||||
src2_data, src2_step, dst_data, dst_step);
|
||||
|
||||
CV_HAL_RETURN(status,hal_multiply);
|
||||
}
|
||||
|
||||
int fastcv_hal_sub8u32f(
|
||||
const uchar* src1_data,
|
||||
size_t src1_step,
|
||||
const uchar* src2_data,
|
||||
size_t src2_step,
|
||||
float* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height)
|
||||
{
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
|
||||
if (src1_step < (size_t)width && src2_step < (size_t)width)
|
||||
{
|
||||
src1_step = width*sizeof(uchar);
|
||||
src2_step = width*sizeof(uchar);
|
||||
dst_step = width*sizeof(float);
|
||||
}
|
||||
|
||||
status = fcvImageDiffu8f32_v2(src1_data, src2_data, width, height, src1_step,
|
||||
src2_step, dst_data, dst_step);
|
||||
|
||||
CV_HAL_RETURN(status,hal_subtract);
|
||||
|
||||
}
|
||||
|
||||
int fastcv_hal_transpose2d(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
int element_size)
|
||||
{
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
if (src_data == dst_data)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("In-place not supported");
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
|
||||
switch (element_size)
|
||||
{
|
||||
case 1:
|
||||
status = fcvTransposeu8_v2(src_data, src_width, src_height, src_step,
|
||||
dst_data, dst_step);
|
||||
break;
|
||||
case 2:
|
||||
status = fcvTransposeu16_v2((const uint16_t*)src_data, src_width, src_height,
|
||||
src_step, (uint16_t*)dst_data, dst_step);
|
||||
break;
|
||||
case 4:
|
||||
status = fcvTransposef32_v2((const float32_t*)src_data, src_width, src_height,
|
||||
src_step, (float32_t*)dst_data, dst_step);
|
||||
break;
|
||||
default:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("srcType not supported");
|
||||
}
|
||||
|
||||
CV_HAL_RETURN(status,hal_transpose);
|
||||
}
|
||||
|
||||
int fastcv_hal_meanStdDev(
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int width,
|
||||
int height,
|
||||
int src_type,
|
||||
double* mean_val,
|
||||
double* stddev_val,
|
||||
uchar* mask,
|
||||
size_t mask_step)
|
||||
{
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
CV_UNUSED(mask_step);
|
||||
|
||||
if(src_type != CV_8UC1)
|
||||
{
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("src type not supported");
|
||||
}
|
||||
else if(mask != nullptr)
|
||||
{
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("mask not supported");
|
||||
}
|
||||
else if(mean_val == nullptr && stddev_val == nullptr)
|
||||
{
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("null ptr for mean and stddev");
|
||||
}
|
||||
|
||||
float32_t mean, variance;
|
||||
|
||||
fcvStatus status = fcvImageIntensityStats_v2(src_data, src_step, 0, 0, width, height,
|
||||
&mean, &variance, FASTCV_BIASED_VARIANCE_ESTIMATOR);
|
||||
|
||||
if(mean_val != nullptr)
|
||||
*mean_val = mean;
|
||||
if(stddev_val != nullptr)
|
||||
*stddev_val = std::sqrt(variance);
|
||||
|
||||
CV_HAL_RETURN(status,hal_meanStdDev);
|
||||
}
|
||||
|
||||
int fastcv_hal_flip(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int flip_mode)
|
||||
{
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
if(src_type!=CV_8UC1 && src_type!=CV_16UC1 && src_type!=CV_8UC3)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Data type is not supported, Switching to default OpenCV solution!");
|
||||
|
||||
if((src_width*src_height)<=(640*480))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution!");
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
fcvFlipDir dir;
|
||||
|
||||
switch (flip_mode)
|
||||
{
|
||||
//Flip around X-Axis: Vertical Flip or FLIP_ROWS
|
||||
case 0:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution due to low perf!");
|
||||
dir = FASTCV_FLIP_VERT;
|
||||
break;
|
||||
|
||||
//Flip around Y-Axis: Horizontal Flip or FLIP_COLS
|
||||
case 1:
|
||||
dir = FASTCV_FLIP_HORIZ;
|
||||
break;
|
||||
|
||||
//Flip around both X and Y-Axis or FLIP_BOTH
|
||||
case -1:
|
||||
dir = FASTCV_FLIP_BOTH;
|
||||
break;
|
||||
default:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Invalid flip_mode, Switching to default OpenCV solution!");
|
||||
}
|
||||
|
||||
if(src_type==CV_8UC1)
|
||||
fcvFlipu8(src_data, src_width, src_height, src_step, dst_data, dst_step, dir);
|
||||
else if(src_type==CV_16UC1)
|
||||
fcvFlipu16((uint16_t*)src_data, src_width, src_height, src_step, (uint16_t*)dst_data, dst_step, dir);
|
||||
else if(src_type==CV_8UC3)
|
||||
status = fcvFlipRGB888u8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data, dst_step, dir);
|
||||
else
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Data type:%d is not supported, Switching to default OpenCV solution!", src_type));
|
||||
|
||||
CV_HAL_RETURN(status, hal_flip);
|
||||
}
|
||||
|
||||
int fastcv_hal_rotate(
|
||||
int src_type,
|
||||
const uchar* src_data,
|
||||
size_t src_step,
|
||||
int src_width,
|
||||
int src_height,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int angle)
|
||||
{
|
||||
if((src_width*src_height)<(120*80))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution for lower resolution!");
|
||||
|
||||
fcvStatus status;
|
||||
fcvRotateDegree degree;
|
||||
|
||||
if (src_type != CV_8UC1 && src_type != CV_8UC2)
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("src_type:%d is not supported", src_type));
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
switch (angle)
|
||||
{
|
||||
case 90:
|
||||
degree = FASTCV_ROTATE_90;
|
||||
break;
|
||||
case 180:
|
||||
degree = FASTCV_ROTATE_180;
|
||||
break;
|
||||
case 270:
|
||||
degree = FASTCV_ROTATE_270;
|
||||
break;
|
||||
default:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Rotation angle:%d is not supported", angle));
|
||||
}
|
||||
|
||||
switch(src_type)
|
||||
{
|
||||
case CV_8UC1:
|
||||
status = fcvRotateImageu8(src_data, src_width, src_height, src_step, dst_data, dst_step, degree);
|
||||
break;
|
||||
case CV_8UC2:
|
||||
status = fcvRotateImageInterleavedu8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data,
|
||||
dst_step, degree);
|
||||
break;
|
||||
default:
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("src_type:%d is not supported", src_type));
|
||||
}
|
||||
CV_HAL_RETURN(status, hal_rotate);
|
||||
}
|
||||
|
||||
int fastcv_hal_addWeighted8u(
|
||||
const uchar* src1_data,
|
||||
size_t src1_step,
|
||||
const uchar* src2_data,
|
||||
size_t src2_step,
|
||||
uchar* dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
const double scalars[3])
|
||||
{
|
||||
if( (scalars[0] < -128.0f) || (scalars[0] >= 128.0f) ||
|
||||
(scalars[1] < -128.0f) || (scalars[1] >= 128.0f) ||
|
||||
(scalars[2] < -(1<<23))|| (scalars[2] >= 1<<23))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED(
|
||||
cv::format("Alpha:%f,Beta:%f,Gamma:%f is not supported because it's too large or too small\n",
|
||||
scalars[0],scalars[1],scalars[2]));
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
|
||||
if (height == 1)
|
||||
{
|
||||
src1_step = width*sizeof(uchar);
|
||||
src2_step = width*sizeof(uchar);
|
||||
dst_step = width*sizeof(uchar);
|
||||
|
||||
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
|
||||
int rangeWidth = range.end - range.start;
|
||||
const uint8_t *src1 = src1_data + range.start;
|
||||
const uint8_t *src2 = src2_data + range.start;
|
||||
uint8_t *dst = dst_data + range.start;
|
||||
fcvAddWeightedu8_v2(src1, rangeWidth, height, src1_step, src2, src2_step,
|
||||
scalars[0], scalars[1], scalars[2], dst, dst_step);
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
|
||||
int rangeHeight = range.end - range.start;
|
||||
const uint8_t *src1 = src1_data + range.start * src1_step;
|
||||
const uint8_t *src2 = src2_data + range.start * src2_step;
|
||||
uint8_t *dst = dst_data + range.start * dst_step;
|
||||
fcvAddWeightedu8_v2(src1, width, rangeHeight, src1_step, src2, src2_step,
|
||||
scalars[0], scalars[1], scalars[2], dst, dst_step);
|
||||
});
|
||||
}
|
||||
|
||||
CV_HAL_RETURN(status, hal_addWeighted8u_v2);
|
||||
}
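fastcv_hal_addWeighted8u backs the standard cv::addWeighted call for 8-bit inputs, within the alpha/beta/gamma ranges checked above. A hedged usage example with the ordinary OpenCV API:

#include <opencv2/core.hpp>

int main()
{
    cv::Mat a(480, 640, CV_8UC1, cv::Scalar(100));
    cv::Mat b(480, 640, CV_8UC1, cv::Scalar(50));
    cv::Mat dst;
    // dst = 0.7*a + 0.3*b + 10; with this HAL built in and coefficients in range,
    // the call is served by fcvAddWeightedu8_v2 via fastcv_hal_addWeighted8u.
    cv::addWeighted(a, 0.7, b, 0.3, 10.0, dst);
    return 0;
}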
|
||||
|
||||
int fastcv_hal_mul8u(
|
||||
const uchar *src1_data,
|
||||
size_t src1_step,
|
||||
const uchar *src2_data,
|
||||
size_t src2_step,
|
||||
uchar *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale)
|
||||
{
|
||||
int8_t sF;
|
||||
|
||||
if(FCV_CMP_EQ(scale,1.0)) { sF = 0; }
|
||||
else if(scale > 1.0)
|
||||
{
|
||||
if(FCV_CMP_EQ(scale,2.0)) { sF = -1; }
|
||||
else if(FCV_CMP_EQ(scale,4.0)) { sF = -2; }
|
||||
else if(FCV_CMP_EQ(scale,8.0)) { sF = -3; }
|
||||
else if(FCV_CMP_EQ(scale,16.0)) { sF = -4; }
|
||||
else if(FCV_CMP_EQ(scale,32.0)) { sF = -5; }
|
||||
else if(FCV_CMP_EQ(scale,64.0)) { sF = -6; }
|
||||
else if(FCV_CMP_EQ(scale,128.0)) { sF = -7; }
|
||||
else if(FCV_CMP_EQ(scale,256.0)) { sF = -8; }
|
||||
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
}
|
||||
else if(scale > 0 && scale < 1.0)
|
||||
{
|
||||
if(FCV_CMP_EQ(scale,1/2.0)) { sF = 1; }
|
||||
else if(FCV_CMP_EQ(scale,1/4.0)) { sF = 2; }
|
||||
else if(FCV_CMP_EQ(scale,1/8.0)) { sF = 3; }
|
||||
else if(FCV_CMP_EQ(scale,1/16.0)) { sF = 4; }
|
||||
else if(FCV_CMP_EQ(scale,1/32.0)) { sF = 5; }
|
||||
else if(FCV_CMP_EQ(scale,1/64.0)) { sF = 6; }
|
||||
else if(FCV_CMP_EQ(scale,1/128.0)) { sF = 7; }
|
||||
else if(FCV_CMP_EQ(scale,1/256.0)) { sF = 8; }
|
||||
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
}
|
||||
else
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
int nStripes = cv::getNumThreads();
|
||||
|
||||
if(height == 1)
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
|
||||
int rangeWidth = range.end - range.start;
|
||||
const uchar* yS1 = src1_data + static_cast<size_t>(range.start);
|
||||
const uchar* yS2 = src2_data + static_cast<size_t>(range.start);
|
||||
uchar* yD = dst_data + static_cast<size_t>(range.start);
|
||||
fcvElementMultiplyu8(yS1, rangeWidth, 1, 0, yS2, 0, sF,
|
||||
FASTCV_CONVERT_POLICY_SATURATE, yD, 0);
|
||||
}, nStripes);
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
|
||||
int rangeHeight = range.end - range.start;
|
||||
const uchar* yS1 = src1_data + static_cast<size_t>(range.start)*src1_step;
|
||||
const uchar* yS2 = src2_data + static_cast<size_t>(range.start)*src2_step;
|
||||
uchar* yD = dst_data + static_cast<size_t>(range.start)*dst_step;
|
||||
fcvElementMultiplyu8(yS1, width, rangeHeight, src1_step, yS2, src2_step,
|
||||
sF, FASTCV_CONVERT_POLICY_SATURATE, yD, dst_step);
|
||||
}, nStripes);
|
||||
}
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_mul8u);
|
||||
}
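The ladder of FCV_CMP_EQ checks above encodes the requested multiplication scale as an int8 shift sF with effective scale 2^(-sF), so only exact powers of two between 1/256 and 256 are accepted and everything else falls back to OpenCV. A hedged sketch of the same mapping written directly (hypothetical helper, equivalent in spirit to the if/else chain):

#include <cmath>
#include <cstdint>

// Returns true and sets sF when 'scale' is an exact power of two in [1/256, 256],
// so that scale == std::ldexp(1.0, -sF); mirrors the FCV_CMP_EQ ladder above.
bool scaleToShift(double scale, int8_t& sF)
{
    for (int s = -8; s <= 8; ++s)
    {
        if (std::fabs(scale - std::ldexp(1.0, -s)) < 1e-12)
        {
            sF = static_cast<int8_t>(s);
            return true;
        }
    }
    return false;  // caller should fall back to the generic OpenCV path
}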
|
||||
|
||||
int fastcv_hal_mul16s(
|
||||
const short *src1_data,
|
||||
size_t src1_step,
|
||||
const short *src2_data,
|
||||
size_t src2_step,
|
||||
short *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale)
|
||||
{
|
||||
int8_t sF;
|
||||
|
||||
if(FCV_CMP_EQ(scale,1.0)) { sF = 0; }
|
||||
else if(scale > 1.0)
|
||||
{
|
||||
if(FCV_CMP_EQ(scale,2.0)) { sF = -1; }
|
||||
else if(FCV_CMP_EQ(scale,4.0)) { sF = -2; }
|
||||
else if(FCV_CMP_EQ(scale,8.0)) { sF = -3; }
|
||||
else if(FCV_CMP_EQ(scale,16.0)) { sF = -4; }
|
||||
else if(FCV_CMP_EQ(scale,32.0)) { sF = -5; }
|
||||
else if(FCV_CMP_EQ(scale,64.0)) { sF = -6; }
|
||||
else if(FCV_CMP_EQ(scale,128.0)) { sF = -7; }
|
||||
else if(FCV_CMP_EQ(scale,256.0)) { sF = -8; }
|
||||
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
}
|
||||
else if(scale > 0 && scale < 1.0)
|
||||
{
|
||||
if(FCV_CMP_EQ(scale,1/2.0)) { sF = 1; }
|
||||
else if(FCV_CMP_EQ(scale,1/4.0)) { sF = 2; }
|
||||
else if(FCV_CMP_EQ(scale,1/8.0)) { sF = 3; }
|
||||
else if(FCV_CMP_EQ(scale,1/16.0)) { sF = 4; }
|
||||
else if(FCV_CMP_EQ(scale,1/32.0)) { sF = 5; }
|
||||
else if(FCV_CMP_EQ(scale,1/64.0)) { sF = 6; }
|
||||
else if(FCV_CMP_EQ(scale,1/128.0)) { sF = 7; }
|
||||
else if(FCV_CMP_EQ(scale,1/256.0)) { sF = 8; }
|
||||
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
}
|
||||
else
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
int nStripes = cv::getNumThreads();
|
||||
|
||||
if(height == 1)
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
|
||||
int rangeWidth = range.end - range.start;
|
||||
const short* yS1 = src1_data + static_cast<size_t>(range.start);
|
||||
const short* yS2 = src2_data + static_cast<size_t>(range.start);
|
||||
short* yD = dst_data + static_cast<size_t>(range.start);
|
||||
fcvElementMultiplys16(yS1, rangeWidth, 1, 0, yS2, 0, sF,
|
||||
FASTCV_CONVERT_POLICY_SATURATE, yD, 0);
|
||||
}, nStripes);
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
|
||||
int rangeHeight = range.end - range.start;
|
||||
const short* yS1 = src1_data + static_cast<size_t>(range.start) * (src1_step/sizeof(short));
|
||||
const short* yS2 = src2_data + static_cast<size_t>(range.start) * (src2_step/sizeof(short));
|
||||
short* yD = dst_data + static_cast<size_t>(range.start) * (dst_step/sizeof(short));
|
||||
fcvElementMultiplys16(yS1, width, rangeHeight, src1_step, yS2, src2_step,
|
||||
sF, FASTCV_CONVERT_POLICY_SATURATE, yD, dst_step);
|
||||
}, nStripes);
|
||||
}
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_mul16s);
|
||||
}
|
||||
|
||||
int fastcv_hal_mul32f(
|
||||
const float *src1_data,
|
||||
size_t src1_step,
|
||||
const float *src2_data,
|
||||
size_t src2_step,
|
||||
float *dst_data,
|
||||
size_t dst_step,
|
||||
int width,
|
||||
int height,
|
||||
double scale)
|
||||
{
|
||||
if(!FCV_CMP_EQ(scale,1.0))
|
||||
CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
|
||||
|
||||
INITIALIZATION_CHECK;
|
||||
|
||||
int nStripes = cv::getNumThreads();
|
||||
|
||||
if(height == 1)
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
|
||||
int rangeWidth = range.end - range.start;
|
||||
const float* yS1 = src1_data + static_cast<size_t>(range.start);
|
||||
const float* yS2 = src2_data + static_cast<size_t>(range.start);
|
||||
float* yD = dst_data + static_cast<size_t>(range.start);
|
||||
fcvElementMultiplyf32(yS1, rangeWidth, 1, 0, yS2, 0, yD, 0);
|
||||
}, nStripes);
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
|
||||
int rangeHeight = range.end - range.start;
|
||||
const float* yS1 = src1_data + static_cast<size_t>(range.start) * (src1_step/sizeof(float));
|
||||
const float* yS2 = src2_data + static_cast<size_t>(range.start) * (src2_step/sizeof(float));
|
||||
float* yD = dst_data + static_cast<size_t>(range.start) * (dst_step/sizeof(float));
|
||||
fcvElementMultiplyf32(yS1, width, rangeHeight, src1_step,
|
||||
yS2, src2_step, yD, dst_step);
|
||||
}, nStripes);
|
||||
}
|
||||
|
||||
fcvStatus status = FASTCV_SUCCESS;
|
||||
CV_HAL_RETURN(status, hal_mul32f);
|
||||
}
|
1050  3rdparty/fastcv/src/fastcv_hal_imgproc.cpp (vendored, new file)
File diff suppressed because it is too large
56  3rdparty/fastcv/src/fastcv_hal_utils.cpp (vendored, new file)
@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "fastcv_hal_utils.hpp"
|
||||
|
||||
const char* getFastCVErrorString(int status)
|
||||
{
|
||||
switch(status)
|
||||
{
|
||||
case FASTCV_SUCCESS: return "Successful";
|
||||
case FASTCV_EFAIL: return "General failure";
|
||||
case FASTCV_EUNALIGNPARAM: return "Unaligned pointer parameter";
|
||||
case FASTCV_EBADPARAM: return "Bad parameters";
|
||||
case FASTCV_EINVALSTATE: return "Called at invalid state";
|
||||
case FASTCV_ENORES: return "Insufficient resources, memory, thread, etc";
|
||||
case FASTCV_EUNSUPPORTED: return "Unsupported feature";
|
||||
case FASTCV_EHWQDSP: return "Hardware QDSP failed to respond";
|
||||
case FASTCV_EHWGPU: return "Hardware GPU failed to respond";
|
||||
default: return "Unknown FastCV Error";
|
||||
}
|
||||
}
|
||||
|
||||
const char* borderToString(int border)
|
||||
{
|
||||
switch (border)
|
||||
{
|
||||
case 0: return "BORDER_CONSTANT";
|
||||
case 1: return "BORDER_REPLICATE";
|
||||
case 2: return "BORDER_REFLECT";
|
||||
case 3: return "BORDER_WRAP";
|
||||
case 4: return "BORDER_REFLECT_101";
|
||||
case 5: return "BORDER_TRANSPARENT";
|
||||
default: return "Unknown border type";
|
||||
}
|
||||
}
|
||||
|
||||
const char* interpolationToString(int interpolation)
|
||||
{
|
||||
switch (interpolation)
|
||||
{
|
||||
case 0: return "INTER_NEAREST";
|
||||
case 1: return "INTER_LINEAR";
|
||||
case 2: return "INTER_CUBIC";
|
||||
case 3: return "INTER_AREA";
|
||||
case 4: return "INTER_LANCZOS4";
|
||||
case 5: return "INTER_LINEAR_EXACT";
|
||||
case 6: return "INTER_NEAREST_EXACT";
|
||||
case 7: return "INTER_MAX";
|
||||
case 8: return "WARP_FILL_OUTLIERS";
|
||||
case 16: return "WARP_INVERSE_MAP";
|
||||
case 32: return "WARP_RELATIVE_MAP";
|
||||
default: return "Unknown interpolation type";
|
||||
}
|
||||
}
|
10  3rdparty/ffmpeg/ffmpeg.cmake (vendored)
@@ -1,8 +1,8 @@
# Binaries branch name: ffmpeg/4.x_20240522
# Binaries were created for OpenCV: 8393885a39dac1e650bf5d0aaff84c04ad8bcdd3
ocv_update(FFMPEG_BINARIES_COMMIT "394dca6ceb3085c979415e6385996b6570e94153")
ocv_update(FFMPEG_FILE_HASH_BIN32 "bdfbd1efb295f3e54c07d2cb7a843bf9")
ocv_update(FFMPEG_FILE_HASH_BIN64 "bfef029900f788480a363d6dc05c4f0e")
# Binaries branch name: ffmpeg/4.x_20241226
# Binaries were created for OpenCV: 09892c9d1706f40342bda0bc404580f63492d9f8
ocv_update(FFMPEG_BINARIES_COMMIT "d63d7c154c57242bf2283be61166be2bd30ec47e")
ocv_update(FFMPEG_FILE_HASH_BIN32 "642b94d032a8292b07550126934173f6")
ocv_update(FFMPEG_FILE_HASH_BIN64 "a8c3560c8f20e1ae465bef81580fa92c")
ocv_update(FFMPEG_FILE_HASH_CMAKE "8862c87496e2e8c375965e1277dee1c7")

function(download_win_ffmpeg script_var)
7  3rdparty/hal_rvv/hal_rvv.hpp (vendored)
@@ -19,4 +19,9 @@
#include "version/hal_rvv_071.hpp"
#endif

#endif
#if defined(__riscv_v) && __riscv_v == 1000000
#include "hal_rvv_1p0/merge.hpp" // core
#include "hal_rvv_1p0/mean.hpp" // core
#endif

#endif
228  3rdparty/hal_rvv/hal_rvv_1p0/mean.hpp (vendored, new file)
@ -0,0 +1,228 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
#ifndef OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
namespace cv { namespace cv_hal_rvv {
|
||||
|
||||
#undef cv_hal_meanStdDev
|
||||
#define cv_hal_meanStdDev cv::cv_hal_rvv::meanStdDev
|
||||
|
||||
inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
|
||||
inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
|
||||
inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
|
||||
|
||||
inline int meanStdDev(const uchar* src_data, size_t src_step, int width, int height,
|
||||
int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
|
||||
switch (src_type)
|
||||
{
|
||||
case CV_8UC1:
|
||||
return meanStdDev_8UC1(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
|
||||
case CV_8UC4:
|
||||
return meanStdDev_8UC4(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
|
||||
case CV_32FC1:
|
||||
return meanStdDev_32FC1(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
|
||||
default:
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
}
|
||||
}
|
||||
|
||||
inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
|
||||
int nz = 0;
|
||||
int vlmax = __riscv_vsetvlmax_e64m8();
|
||||
vuint64m8_t vec_sum = __riscv_vmv_v_x_u64m8(0, vlmax);
|
||||
vuint64m8_t vec_sqsum = __riscv_vmv_v_x_u64m8(0, vlmax);
|
||||
if (mask) {
|
||||
for (int i = 0; i < height; ++i) {
|
||||
const uchar* src_row = src_data + i * src_step;
|
||||
const uchar* mask_row = mask + i * mask_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width; j += vl) {
|
||||
vl = __riscv_vsetvl_e8m1(width - j);
|
||||
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
|
||||
auto vmask_u8 = __riscv_vle8_v_u8m1(mask_row+j, vl);
|
||||
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
|
||||
auto vmask = __riscv_vmseq_vx_u8m1_b8(vmask_u8, 1, vl);
|
||||
vec_sum = __riscv_vwaddu_wv_u64m8_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
nz += __riscv_vcpop_m_b8(vmask, vl);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < height; i++) {
|
||||
const uchar* src_row = src_data + i * src_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width; j += vl) {
|
||||
vl = __riscv_vsetvl_e8m1(width - j);
|
||||
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
|
||||
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
|
||||
vec_sum = __riscv_vwaddu_wv_u64m8_tu(vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
}
|
||||
}
|
||||
nz = height * width;
|
||||
}
|
||||
if (nz == 0) {
|
||||
if (mean_val) *mean_val = 0.0;
|
||||
if (stddev_val) *stddev_val = 0.0;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
auto zero = __riscv_vmv_s_x_u64m1(0, vlmax);
|
||||
auto vec_red = __riscv_vmv_v_x_u64m1(0, vlmax);
|
||||
auto vec_reddev = __riscv_vmv_v_x_u64m1(0, vlmax);
|
||||
vec_red = __riscv_vredsum(vec_sum, zero, vlmax);
|
||||
vec_reddev = __riscv_vredsum(vec_sqsum, zero, vlmax);
|
||||
double sum = __riscv_vmv_x(vec_red);
|
||||
double mean = sum / nz;
|
||||
if (mean_val) {
|
||||
*mean_val = mean;
|
||||
}
|
||||
if (stddev_val) {
|
||||
double sqsum = __riscv_vmv_x(vec_reddev);
|
||||
double variance = std::max((sqsum / nz) - (mean * mean), 0.0);
|
||||
double stddev = std::sqrt(variance);
|
||||
*stddev_val = stddev;
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
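As a plain-C++ cross-check for the vectorized path above, a scalar reference that computes the same quantities: a running sum and squared sum, then mean and standard deviation via var = E[x^2] - mean^2. The function name and the unmasked-only scope are assumptions for the sketch.

// Scalar reference for the unmasked 8UC1 case; illustrative only.
#include <algorithm>
#include <cmath>
#include <cstddef>

inline void meanStdDev_8UC1_ref(const unsigned char* src, size_t step,
                                int width, int height,
                                double* mean_val, double* stddev_val)
{
    double sum = 0.0, sqsum = 0.0;
    for (int i = 0; i < height; ++i)
    {
        const unsigned char* row = src + i * step;
        for (int j = 0; j < width; ++j)
        {
            double v = row[j];
            sum += v;
            sqsum += v * v;
        }
    }
    const double n = double(width) * height;
    const double mean = sum / n;
    if (mean_val)   *mean_val = mean;
    if (stddev_val) *stddev_val = std::sqrt(std::max(sqsum / n - mean * mean, 0.0));
}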
|
||||
inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
|
||||
int nz = 0;
|
||||
int vlmax = __riscv_vsetvlmax_e64m8();
|
||||
vuint64m8_t vec_sum = __riscv_vmv_v_x_u64m8(0, vlmax);
|
||||
vuint64m8_t vec_sqsum = __riscv_vmv_v_x_u64m8(0, vlmax);
|
||||
if (mask) {
|
||||
for (int i = 0; i < height; ++i) {
|
||||
const uchar* src_row = src_data + i * src_step;
|
||||
const uchar* mask_row = mask + i * mask_step;
|
||||
int j = 0, jm = 0, vl, vlm;
|
||||
for ( ; j < width*4; j += vl, jm += vlm) {
|
||||
vl = __riscv_vsetvl_e8m1(width*4 - j);
|
||||
vlm = __riscv_vsetvl_e8mf4(width - jm);
|
||||
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
|
||||
auto vmask_u8mf4 = __riscv_vle8_v_u8mf4(mask_row + jm, vlm);
|
||||
auto vmask_u32 = __riscv_vzext_vf4(vmask_u8mf4, vlm);
|
||||
// 0 -> 0000; 1 -> 1111
|
||||
vmask_u32 = __riscv_vmul(vmask_u32, 0b00000001000000010000000100000001, vlm);
|
||||
auto vmask_u8 = __riscv_vreinterpret_u8m1(vmask_u32);
|
||||
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
|
||||
auto vmask = __riscv_vmseq_vx_u8m1_b8(vmask_u8, 1, vl);
|
||||
vec_sum = __riscv_vwaddu_wv_u64m8_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
nz += __riscv_vcpop_m_b8(vmask, vl);
|
||||
}
|
||||
}
|
||||
nz /= 4;
|
||||
} else {
|
||||
for (int i = 0; i < height; i++) {
|
||||
const uchar* src_row = src_data + i * src_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width*4; j += vl) {
|
||||
vl = __riscv_vsetvl_e8m1(width*4 - j);
|
||||
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
|
||||
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
|
||||
vec_sum = __riscv_vwaddu_wv_u64m8_tu(vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
}
|
||||
}
|
||||
nz = height * width;
|
||||
}
|
||||
if (nz == 0) {
|
||||
if (mean_val) *mean_val = 0.0;
|
||||
if (stddev_val) *stddev_val = 0.0;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
uint64_t s[256], sq[256], sum[4] = {0}, sqsum[4] = {0};
|
||||
__riscv_vse64(s, vec_sum, vlmax);
|
||||
__riscv_vse64(sq, vec_sqsum, vlmax);
|
||||
for (int i = 0; i < vlmax; ++i)
|
||||
{
|
||||
sum[i % 4] += s[i];
|
||||
sqsum[i % 4] += sq[i];
|
||||
}
|
||||
if (mean_val) {
|
||||
mean_val[0] = (double)sum[0] / nz;
|
||||
mean_val[1] = (double)sum[1] / nz;
|
||||
mean_val[2] = (double)sum[2] / nz;
|
||||
mean_val[3] = (double)sum[3] / nz;
|
||||
}
|
||||
if (stddev_val) {
|
||||
stddev_val[0] = std::sqrt(std::max(((double)sqsum[0] / nz) - (mean_val[0] * mean_val[0]), 0.0));
|
||||
stddev_val[1] = std::sqrt(std::max(((double)sqsum[1] / nz) - (mean_val[1] * mean_val[1]), 0.0));
|
||||
stddev_val[2] = std::sqrt(std::max(((double)sqsum[2] / nz) - (mean_val[2] * mean_val[2]), 0.0));
|
||||
stddev_val[3] = std::sqrt(std::max(((double)sqsum[3] / nz) - (mean_val[3] * mean_val[3]), 0.0));
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
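Two details of the 8UC4 path above may be easier to see in scalar form: the per-pixel mask byte is replicated to all four channel bytes by multiplying the widened mask by 0x01010101, and the interleaved accumulator is folded back into four per-channel sums with index % 4, since an interleaved 4-channel byte keeps its channel position modulo 4 across lanes. A small scalar illustration of the folding step (the 16-lane accumulator size is arbitrary for the sketch):

// Folding an interleaved accumulator back into 4 per-channel sums;
// the 16-lane accumulator below is only an example size.
#include <cstdint>

inline void foldPerChannel(const uint64_t lanes[16], uint64_t channel_sum[4])
{
    for (int c = 0; c < 4; ++c) channel_sum[c] = 0;
    for (int i = 0; i < 16; ++i)
        channel_sum[i % 4] += lanes[i];  // lane i holds data of channel (i % 4)
}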
|
||||
inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height,
|
||||
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
|
||||
int nz = 0;
|
||||
int vlmax = __riscv_vsetvlmax_e64m4();
|
||||
vfloat64m4_t vec_sum = __riscv_vfmv_v_f_f64m4(0, vlmax);
|
||||
vfloat64m4_t vec_sqsum = __riscv_vfmv_v_f_f64m4(0, vlmax);
|
||||
src_step /= sizeof(float);
|
||||
if (mask) {
|
||||
for (int i = 0; i < height; ++i) {
|
||||
const float* src_row0 = reinterpret_cast<const float*>(src_data) + i * src_step;
|
||||
const uchar* mask_row = mask + i * mask_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width; j += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(width - j);
|
||||
auto vec_pixel = __riscv_vle32_v_f32m2(src_row0 + j, vl);
|
||||
auto vmask_u8 = __riscv_vle8_v_u8mf2(mask_row + j, vl);
|
||||
auto vmask_u32 = __riscv_vzext_vf4(vmask_u8, vl);
|
||||
auto vmask = __riscv_vmseq_vx_u32m2_b16(vmask_u32, 1, vl);
|
||||
vec_sum = __riscv_vfwadd_wv_f64m4_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vfwmacc_vv_f64m4_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
nz += __riscv_vcpop_m_b16(vmask, vl);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < height; i++) {
|
||||
const float* src_row0 = reinterpret_cast<const float*>(src_data) + i * src_step;
|
||||
int j = 0, vl;
|
||||
for ( ; j < width; j += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(width - j);
|
||||
auto vec_pixel = __riscv_vle32_v_f32m2(src_row0 + j, vl);
|
||||
vec_sum = __riscv_vfwadd_wv_f64m4_tu(vec_sum, vec_sum, vec_pixel, vl);
|
||||
vec_sqsum = __riscv_vfwmacc_vv_f64m4_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
|
||||
}
|
||||
}
|
||||
nz = height * width;
|
||||
}
|
||||
if (nz == 0) {
|
||||
if (mean_val) *mean_val = 0.0;
|
||||
if (stddev_val) *stddev_val = 0.0;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
auto zero = __riscv_vfmv_v_f_f64m1(0, vlmax);
|
||||
auto vec_red = __riscv_vfmv_v_f_f64m1(0, vlmax);
|
||||
auto vec_reddev = __riscv_vfmv_v_f_f64m1(0, vlmax);
|
||||
vec_red = __riscv_vfredusum(vec_sum, zero, vlmax);
|
||||
vec_reddev = __riscv_vfredusum(vec_sqsum, zero, vlmax);
|
||||
double sum = __riscv_vfmv_f(vec_red);
|
||||
double mean = sum / nz;
|
||||
if (mean_val) {
|
||||
*mean_val = mean;
|
||||
}
|
||||
if (stddev_val) {
|
||||
double sqsum = __riscv_vfmv_f(vec_reddev);
|
||||
double variance = std::max((sqsum / nz) - (mean * mean), 0.0);
|
||||
double stddev = std::sqrt(variance);
|
||||
*stddev_val = stddev;
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
#endif
|
397
3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp
vendored
Normal file
@ -0,0 +1,397 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
namespace cv { namespace cv_hal_rvv {
|
||||
|
||||
#undef cv_hal_merge8u
|
||||
#define cv_hal_merge8u cv::cv_hal_rvv::merge8u
|
||||
#undef cv_hal_merge16u
|
||||
#define cv_hal_merge16u cv::cv_hal_rvv::merge16u
|
||||
#undef cv_hal_merge32s
|
||||
#define cv_hal_merge32s cv::cv_hal_rvv::merge32s
|
||||
#undef cv_hal_merge64s
|
||||
#define cv_hal_merge64s cv::cv_hal_rvv::merge64s
|
||||
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i = 0;
|
||||
int vl = __riscv_vsetvlmax_e8m1();
|
||||
if( k == 1 )
|
||||
{
|
||||
const uchar* src0 = src[0];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++)
|
||||
dst[i*cn] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
|
||||
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 3, sizeof(uchar)*cn, d, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
dst[i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const uchar *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
i = 0;
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
|
||||
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn + 2, sizeof(uchar)*cn, c, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn + 3, sizeof(uchar)*cn, d, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[k+i*cn] = src0[i];
|
||||
dst[k+i*cn+1] = src1[i];
|
||||
dst[k+i*cn+2] = src2[i];
|
||||
dst[k+i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
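A minimal usage sketch for merge8u: interleaving three single-channel planes into one packed 3-channel buffer via the strided stores above. The plane lengths and values are arbitrary; only merge8u itself comes from this header.

// Interleave B, G, R planes of length 4 into a BGRBGR... buffer.
#include <cstdio>

static void mergeExample()
{
    const unsigned char b[4] = { 1, 2, 3, 4 };
    const unsigned char g[4] = { 10, 20, 30, 40 };
    const unsigned char r[4] = { 100, 110, 120, 130 };
    const unsigned char* planes[3] = { b, g, r };
    unsigned char packed[12];

    cv::cv_hal_rvv::merge8u(planes, packed, /*len=*/4, /*cn=*/3);

    for (int i = 0; i < 12; ++i)
        std::printf("%d ", packed[i]);   // prints 1 10 100 2 20 110 3 30 120 4 40 130
    std::printf("\n");
}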
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i = 0;
|
||||
int vl = __riscv_vsetvlmax_e16m1();
|
||||
if( k == 1 )
|
||||
{
|
||||
const ushort* src0 = src[0];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++)
|
||||
dst[i*cn] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
|
||||
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 3, sizeof(ushort)*cn, d, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
dst[i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const uint16_t *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
i = 0;
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
|
||||
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn + 2, sizeof(ushort)*cn, c, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn + 3, sizeof(ushort)*cn, d, vl);
|
||||
}
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[k+i*cn] = src0[i];
|
||||
dst[k+i*cn+1] = src1[i];
|
||||
dst[k+i*cn+2] = src2[i];
|
||||
dst[k+i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline int merge32s(const int** src, int* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i, j;
|
||||
if( k == 1 )
|
||||
{
|
||||
const int* src0 = src[0];
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( i = j = 0; i < len; i++, j += cn )
|
||||
dst[j] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
const int *src0 = src[0], *src1 = src[1];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
const int *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const int *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const int *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
for( i = 0, j = k; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline int merge64s(const int64** src, int64* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i, j;
|
||||
if( k == 1 )
|
||||
{
|
||||
const int64* src0 = src[0];
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( i = j = 0; i < len; i++, j += cn )
|
||||
dst[j] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
{
|
||||
const int64 *src0 = src[0], *src1 = src[1];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
{
|
||||
const int64 *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i];
|
||||
dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const int64 *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
i = j = 0;
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const int64 *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
for( i = 0, j = k; i < len; i++, j += cn )
|
||||
{
|
||||
dst[j] = src0[i]; dst[j+1] = src1[i];
|
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
#endif
|
28
3rdparty/kleidicv/CMakeLists.txt
vendored
@ -1,23 +1,11 @@
|
||||
project(kleidicv_hal)
|
||||
|
||||
set(KLEIDICV_SOURCE_PATH "" CACHE PATH "Directory containing KleidiCV sources")
|
||||
ocv_update(KLEIDICV_SRC_COMMIT "0.1.0")
|
||||
ocv_update(KLEIDICV_SRC_HASH "9388f28cf2fbe3338197b2b57d491468")
|
||||
|
||||
if(KLEIDICV_SOURCE_PATH)
|
||||
set(THE_ROOT "${KLEIDICV_SOURCE_PATH}")
|
||||
else()
|
||||
ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
|
||||
HASH ${KLEIDICV_SRC_HASH}
|
||||
URL
|
||||
"${OPENCV_KLEIDICV_URL}"
|
||||
"$ENV{OPENCV_KLEIDICV_URL}"
|
||||
"https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
|
||||
DESTINATION_DIR "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/"
|
||||
ID KLEIDICV
|
||||
STATUS res
|
||||
UNPACK RELATIVE_URL)
|
||||
set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}")
|
||||
if(HAVE_KLEIDICV)
|
||||
option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some Clang versions in NDK
|
||||
include("${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
|
||||
# HACK to suppress adapters/opencv/kleidicv_hal.cpp:343:12: warning: unused function 'from_opencv' [-Wunused-function]
|
||||
target_compile_options( kleidicv_hal PRIVATE
|
||||
$<TARGET_PROPERTY:kleidicv,COMPILE_OPTIONS>
|
||||
"-Wno-old-style-cast" "-Wno-unused-function"
|
||||
)
|
||||
endif()
|
||||
|
||||
include("${THE_ROOT}/adapters/opencv/CMakeLists.txt")
|
||||
|
21
3rdparty/kleidicv/kleidicv.cmake
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
function(download_kleidicv root_var)
|
||||
set(${root_var} "" PARENT_SCOPE)
|
||||
|
||||
ocv_update(KLEIDICV_SRC_COMMIT "0.3.0")
|
||||
ocv_update(KLEIDICV_SRC_HASH "51a77b0185c2bac2a968a2163869b1ed")
|
||||
|
||||
set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv")
|
||||
ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
|
||||
HASH ${KLEIDICV_SRC_HASH}
|
||||
URL
|
||||
"${OPENCV_KLEIDICV_URL}"
|
||||
"$ENV{OPENCV_KLEIDICV_URL}"
|
||||
"https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
|
||||
DESTINATION_DIR ${THE_ROOT}
|
||||
ID KLEIDICV
|
||||
STATUS res
|
||||
UNPACK RELATIVE_URL)
|
||||
if(res)
|
||||
set(${root_var} "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}" PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
48
3rdparty/ndsrvp/include/imgproc.hpp
vendored
@ -5,6 +5,8 @@
|
||||
#ifndef OPENCV_NDSRVP_IMGPROC_HPP
|
||||
#define OPENCV_NDSRVP_IMGPROC_HPP
|
||||
|
||||
struct cvhalFilter2D;
|
||||
|
||||
namespace cv {
|
||||
|
||||
namespace ndsrvp {
|
||||
@ -71,6 +73,52 @@ int threshold(const uchar* src_data, size_t src_step,
|
||||
#undef cv_hal_threshold
|
||||
#define cv_hal_threshold (cv::ndsrvp::threshold)
|
||||
|
||||
// ################ filter ################
|
||||
|
||||
int filterInit(cvhalFilter2D **context,
|
||||
uchar *kernel_data, size_t kernel_step,
|
||||
int kernel_type, int kernel_width,
|
||||
int kernel_height, int max_width, int max_height,
|
||||
int src_type, int dst_type, int borderType,
|
||||
double delta, int anchor_x, int anchor_y,
|
||||
bool allowSubmatrix, bool allowInplace);
|
||||
|
||||
#undef cv_hal_filterInit
|
||||
#define cv_hal_filterInit (cv::ndsrvp::filterInit)
|
||||
|
||||
int filter(cvhalFilter2D *context,
|
||||
const uchar *src_data, size_t src_step,
|
||||
uchar *dst_data, size_t dst_step,
|
||||
int width, int height,
|
||||
int full_width, int full_height,
|
||||
int offset_x, int offset_y);
|
||||
|
||||
#undef cv_hal_filter
|
||||
#define cv_hal_filter (cv::ndsrvp::filter)
|
||||
|
||||
int filterFree(cvhalFilter2D *context);
|
||||
|
||||
#undef cv_hal_filterFree
|
||||
#define cv_hal_filterFree (cv::ndsrvp::filterFree)
|
||||
|
||||
// ################ medianBlur ################
|
||||
|
||||
int medianBlur(const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step,
|
||||
int width, int height, int depth, int cn, int ksize);
|
||||
|
||||
#undef cv_hal_medianBlur
|
||||
#define cv_hal_medianBlur (cv::ndsrvp::medianBlur)
|
||||
|
||||
// ################ bilateralFilter ################
|
||||
|
||||
int bilateralFilter(const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step, int width, int height, int depth,
|
||||
int cn, int d, double sigma_color, double sigma_space, int border_type);
|
||||
|
||||
#undef cv_hal_bilateralFilter
|
||||
#define cv_hal_bilateralFilter (cv::ndsrvp::bilateralFilter)
|
||||
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
||||
|
270
3rdparty/ndsrvp/src/bilateralFilter.cpp
vendored
Normal file
@ -0,0 +1,270 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "ndsrvp_hal.hpp"
|
||||
#include "opencv2/imgproc/hal/interface.h"
|
||||
#include "cvutils.hpp"
|
||||
|
||||
namespace cv {
|
||||
|
||||
namespace ndsrvp {
|
||||
|
||||
static void bilateralFilterProcess(uchar* dst_data, size_t dst_step, uchar* pad_data, size_t pad_step,
|
||||
int width, int height, int cn, int radius, int maxk,
|
||||
int* space_ofs, float *space_weight, float *color_weight)
|
||||
{
|
||||
int i, j, k;
|
||||
|
||||
for( i = 0; i < height; i++ )
|
||||
{
|
||||
const uchar* sptr = pad_data + (i + radius) * pad_step + radius * cn;
|
||||
uchar* dptr = dst_data + i * dst_step;
|
||||
|
||||
if( cn == 1 )
|
||||
{
|
||||
std::vector<float> buf(width + width, 0.0);
|
||||
float *sum = &buf[0];
|
||||
float *wsum = sum + width;
|
||||
k = 0;
|
||||
for(; k <= maxk-4; k+=4)
|
||||
{
|
||||
const uchar* ksptr0 = sptr + space_ofs[k];
|
||||
const uchar* ksptr1 = sptr + space_ofs[k+1];
|
||||
const uchar* ksptr2 = sptr + space_ofs[k+2];
|
||||
const uchar* ksptr3 = sptr + space_ofs[k+3];
|
||||
j = 0;
|
||||
for (; j < width; j++)
|
||||
{
|
||||
int rval = sptr[j];
|
||||
|
||||
int val = ksptr0[j];
|
||||
float w = space_weight[k] * color_weight[std::abs(val - rval)];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
|
||||
val = ksptr1[j];
|
||||
w = space_weight[k+1] * color_weight[std::abs(val - rval)];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
|
||||
val = ksptr2[j];
|
||||
w = space_weight[k+2] * color_weight[std::abs(val - rval)];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
|
||||
val = ksptr3[j];
|
||||
w = space_weight[k+3] * color_weight[std::abs(val - rval)];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
}
|
||||
}
|
||||
for(; k < maxk; k++)
|
||||
{
|
||||
const uchar* ksptr = sptr + space_ofs[k];
|
||||
j = 0;
|
||||
for (; j < width; j++)
|
||||
{
|
||||
int val = ksptr[j];
|
||||
float w = space_weight[k] * color_weight[std::abs(val - sptr[j])];
|
||||
wsum[j] += w;
|
||||
sum[j] += val * w;
|
||||
}
|
||||
}
|
||||
j = 0;
|
||||
for (; j < width; j++)
|
||||
{
|
||||
// overflow is not possible here => there is no need to use cv::saturate_cast
|
||||
ndsrvp_assert(fabs(wsum[j]) > 0);
|
||||
dptr[j] = (uchar)(sum[j] / wsum[j] + 0.5);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ndsrvp_assert( cn == 3 );
|
||||
std::vector<float> buf(width * 3 + width);
|
||||
float *sum_b = &buf[0];
|
||||
float *sum_g = sum_b + width;
|
||||
float *sum_r = sum_g + width;
|
||||
float *wsum = sum_r + width;
|
||||
k = 0;
|
||||
for(; k <= maxk-4; k+=4)
|
||||
{
|
||||
const uchar* ksptr0 = sptr + space_ofs[k];
|
||||
const uchar* ksptr1 = sptr + space_ofs[k+1];
|
||||
const uchar* ksptr2 = sptr + space_ofs[k+2];
|
||||
const uchar* ksptr3 = sptr + space_ofs[k+3];
|
||||
const uchar* rsptr = sptr;
|
||||
j = 0;
|
||||
for(; j < width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
|
||||
{
|
||||
int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
|
||||
|
||||
int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
|
||||
float w = space_weight[k] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
|
||||
b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
|
||||
w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
|
||||
b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
|
||||
w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
|
||||
b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
|
||||
w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
}
|
||||
}
|
||||
for(; k < maxk; k++)
|
||||
{
|
||||
const uchar* ksptr = sptr + space_ofs[k];
|
||||
const uchar* rsptr = sptr;
|
||||
j = 0;
|
||||
for(; j < width; j++, ksptr += 3, rsptr += 3)
|
||||
{
|
||||
int b = ksptr[0], g = ksptr[1], r = ksptr[2];
|
||||
float w = space_weight[k] * color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])];
|
||||
wsum[j] += w;
|
||||
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
|
||||
}
|
||||
}
|
||||
j = 0;
|
||||
for(; j < width; j++)
|
||||
{
|
||||
ndsrvp_assert(fabs(wsum[j]) > 0);
|
||||
wsum[j] = 1.f / wsum[j];
|
||||
*(dptr++) = (uchar)(sum_b[j] * wsum[j] + 0.5);
|
||||
*(dptr++) = (uchar)(sum_g[j] * wsum[j] + 0.5);
|
||||
*(dptr++) = (uchar)(sum_r[j] * wsum[j] + 0.5);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int bilateralFilter(const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step, int width, int height, int depth,
|
||||
int cn, int d, double sigma_color, double sigma_space, int border_type)
|
||||
{
|
||||
if( depth != CV_8U || !(cn == 1 || cn == 3) || src_data == dst_data)
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
int i, j, maxk, radius;
|
||||
|
||||
if( sigma_color <= 0 )
|
||||
sigma_color = 1;
|
||||
if( sigma_space <= 0 )
|
||||
sigma_space = 1;
|
||||
|
||||
double gauss_color_coeff = -0.5/(sigma_color * sigma_color);
|
||||
double gauss_space_coeff = -0.5/(sigma_space * sigma_space);
|
||||
|
||||
if( d <= 0 )
|
||||
radius = (int)(sigma_space * 1.5 + 0.5);
|
||||
else
|
||||
radius = d / 2;
|
||||
|
||||
radius = MAX(radius, 1);
|
||||
d = radius * 2 + 1;
|
||||
|
||||
// not enough submatrix info
|
||||
// fetch original image data
|
||||
const uchar *ogn_data = src_data;
|
||||
int ogn_step = src_step;
|
||||
|
||||
// ROI fully used in the computation
|
||||
int cal_width = width + d - 1;
|
||||
int cal_height = height + d - 1;
|
||||
int cal_x = 0 - radius; // negative if left border exceeded
|
||||
int cal_y = 0 - radius; // negative if top border exceeded
|
||||
|
||||
// calculate source border
|
||||
std::vector<uchar> padding;
|
||||
padding.resize(cal_width * cal_height * cn);
|
||||
uchar* pad_data = &padding[0];
|
||||
int pad_step = cal_width * cn;
|
||||
|
||||
uchar* pad_ptr;
|
||||
const uchar* ogn_ptr;
|
||||
std::vector<uchar> vec_zeros(cn, 0);
|
||||
for(i = 0; i < cal_height; i++)
|
||||
{
|
||||
int y = borderInterpolate(i + cal_y, height, border_type);
|
||||
if(y < 0) {
|
||||
memset(pad_data + i * pad_step, 0, cn * cal_width);
|
||||
continue;
|
||||
}
|
||||
|
||||
// left border
|
||||
j = 0;
|
||||
for(; j + cal_x < 0; j++)
|
||||
{
|
||||
int x = borderInterpolate(j + cal_x, width, border_type);
|
||||
if(x < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + x * cn;
|
||||
pad_ptr = pad_data + i * pad_step + j * cn;
|
||||
memcpy(pad_ptr, ogn_ptr, cn);
|
||||
}
|
||||
|
||||
// center
|
||||
int rborder = MIN(cal_width, width - cal_x);
|
||||
ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cn;
|
||||
pad_ptr = pad_data + i * pad_step + j * cn;
|
||||
memcpy(pad_ptr, ogn_ptr, cn * (rborder - j));
|
||||
|
||||
// right border
|
||||
j = rborder;
|
||||
for(; j < cal_width; j++)
|
||||
{
|
||||
int x = borderInterpolate(j + cal_x, width, border_type);
|
||||
if(x < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + x * cn;
|
||||
pad_ptr = pad_data + i * pad_step + j * cn;
|
||||
memcpy(pad_ptr, ogn_ptr, cn);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> _color_weight(cn * 256);
|
||||
std::vector<float> _space_weight(d * d);
|
||||
std::vector<int> _space_ofs(d * d);
|
||||
float* color_weight = &_color_weight[0];
|
||||
float* space_weight = &_space_weight[0];
|
||||
int* space_ofs = &_space_ofs[0];
|
||||
|
||||
// initialize color-related bilateral filter coefficients
|
||||
|
||||
for( i = 0; i < 256 * cn; i++ )
|
||||
color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
|
||||
|
||||
// initialize space-related bilateral filter coefficients
|
||||
for( i = -radius, maxk = 0; i <= radius; i++ )
|
||||
{
|
||||
j = -radius;
|
||||
|
||||
for( ; j <= radius; j++ )
|
||||
{
|
||||
double r = std::sqrt((double)i * i + (double)j * j);
|
||||
if( r > radius )
|
||||
continue;
|
||||
space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
|
||||
space_ofs[maxk++] = (int)(i * pad_step + j * cn);
|
||||
}
|
||||
}
|
||||
|
||||
bilateralFilterProcess(dst_data, dst_step, pad_data, pad_step, width, height, cn, radius, maxk, space_ofs, space_weight, color_weight);
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
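The two lookup tables built above follow the standard bilateral weights: color_weight[d] = exp(-d^2 / (2*sigma_color^2)), indexed by absolute intensity difference, and space_weight[k] = exp(-r^2 / (2*sigma_space^2)) for each in-radius spatial offset. A scalar sketch of the combined weight of one neighbour, with assumed names and illustrative arguments:

// Weight of one neighbour (nb) relative to the centre pixel (cp) in the
// single-channel case; sigma values and pixel values are illustrative.
#include <cmath>
#include <cstdlib>

inline float bilateralWeight(int cp, int nb, double dx, double dy,
                             double sigma_color, double sigma_space)
{
    const double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
    const double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
    const double r2 = dx * dx + dy * dy;        // squared spatial distance
    const int    cd = std::abs(nb - cp);        // intensity difference
    return float(std::exp(cd * cd * gauss_color_coeff) *
                 std::exp(r2 * gauss_space_coeff));
}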
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
34
3rdparty/ndsrvp/src/cvutils.cpp
vendored
@ -73,6 +73,40 @@ int borderInterpolate(int p, int len, int borderType)
|
||||
return p;
|
||||
}
|
||||
|
||||
int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType)
|
||||
{
|
||||
int16x4_t vzero = (int16x4_t){0, 0, 0, 0};
|
||||
int16x4_t vone = (int16x4_t){1, 1, 1, 1};
|
||||
int16x4_t vlen = (int16x4_t){len, len, len, len};
|
||||
if(borderType == CV_HAL_BORDER_REPLICATE)
|
||||
vp = (int16x4_t)__nds__bpick(0, __nds__bpick((long)(vlen - 1), (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
|
||||
else if(borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101)
|
||||
{
|
||||
int16x4_t vdelta = (borderType == CV_HAL_BORDER_REFLECT_101) ? vone : vzero;
|
||||
if(len == 1)
|
||||
return vzero;
|
||||
do
|
||||
{
|
||||
int16x4_t vneg = -vp - 1 + vdelta;
|
||||
int16x4_t vpos = vlen - 1 - (vp - vlen) - vdelta;
|
||||
vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
|
||||
}
|
||||
while( (long)(vp >= vlen) || (long)(vp < 0) );
|
||||
}
|
||||
else if(borderType == CV_HAL_BORDER_WRAP)
|
||||
{
|
||||
ndsrvp_assert(len > 0);
|
||||
int16x4_t vneg = vp - ((vp - vlen + 1) / vlen) * vlen;
|
||||
int16x4_t vpos = vp % vlen;
|
||||
vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
|
||||
}
|
||||
else if(borderType == CV_HAL_BORDER_CONSTANT)
|
||||
vp = (int16x4_t)__nds__bpick((long)-vone, (long)vp, (long)(vp < 0 || vp >= vlen));
|
||||
else
|
||||
ndsrvp_error(Error::StsBadArg, "borderInterpolate_vector(): Unknown/unsupported border type");
|
||||
return vp;
|
||||
}
|
||||
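A few concrete values may help when reading both the scalar and the vector border helpers; for len = 5, the common modes map out-of-range coordinates as sketched below. The checks are only illustrative and assume the HAL interface header providing the CV_HAL_BORDER_* constants is visible.

// Expected borderInterpolate results for len = 5; illustrative only.
#include <cassert>

static void borderInterpolateExamples()
{
    // REPLICATE: aaaaa|abcde|eeeee
    assert(cv::ndsrvp::borderInterpolate(-2, 5, CV_HAL_BORDER_REPLICATE) == 0);
    assert(cv::ndsrvp::borderInterpolate( 6, 5, CV_HAL_BORDER_REPLICATE) == 4);
    // REFLECT_101: edcb|abcde|dcba
    assert(cv::ndsrvp::borderInterpolate(-2, 5, CV_HAL_BORDER_REFLECT_101) == 2);
    assert(cv::ndsrvp::borderInterpolate( 6, 5, CV_HAL_BORDER_REFLECT_101) == 2);
    // CONSTANT: out-of-range positions report -1 so callers substitute zeros
    assert(cv::ndsrvp::borderInterpolate(-1, 5, CV_HAL_BORDER_CONSTANT) == -1);
}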
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
||||
|
160
3rdparty/ndsrvp/src/cvutils.hpp
vendored
@ -14,6 +14,7 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <climits>
|
||||
#include <algorithm>
|
||||
|
||||
@ -26,16 +27,26 @@ namespace ndsrvp {
|
||||
void* fastMalloc(size_t size);
|
||||
void fastFree(void* ptr);
|
||||
int borderInterpolate(int p, int len, int borderType);
|
||||
int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType);
|
||||
|
||||
#ifndef MAX
|
||||
# define MAX(a,b) ((a) < (b) ? (b) : (a))
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
# define MIN(a,b) ((a) > (b) ? (b) : (a))
|
||||
#endif
|
||||
|
||||
#define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT)
|
||||
#define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
|
||||
|
||||
#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)
|
||||
#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
|
||||
|
||||
#define CV_MALLOC_ALIGN 64
|
||||
|
||||
inline size_t getElemSize(int type) { return (size_t)CV_ELEM_SIZE(type); }
|
||||
|
||||
// error codes
|
||||
|
||||
enum Error{
|
||||
@ -69,6 +80,135 @@ inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b)
|
||||
return (int32x2_t)__nds__bpick((long)a, __nds__bpick((long)(b - 1), (long)x, (long)(x < b)), (long)(x >= a));
|
||||
}
|
||||
|
||||
// expand
|
||||
|
||||
/*
|
||||
[0] [1] [2] [3] [4] [5] [6] [7]
|
||||
810 [ 0 ] [ 1 ] [ 4 ] [ 5 ]
|
||||
832 [ 2 ] [ 3 ] [ 6 ] [ 7 ]
|
||||
bb [ 0 ] [ 1 ] [ 2 ] [ 3 ]
|
||||
tt [ 4 ] [ 5 ] [ 6 ] [ 7 ]
|
||||
*/
|
||||
|
||||
inline void ndsrvp_u8_u16_expand8(const unsigned long vs, ushort* dst)
|
||||
{
|
||||
unsigned long vs810 = __nds__zunpkd810(vs);
|
||||
unsigned long vs832 = __nds__zunpkd832(vs);
|
||||
*(unsigned long*)dst = __nds__pkbb32(vs832, vs810);
|
||||
*(unsigned long*)(dst + 4) = __nds__pktt32(vs832, vs810);
|
||||
}
|
||||
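The lane diagram above is easiest to verify against a scalar reference: expand8 widens eight packed u8 values to eight u16 values while preserving their original order, whereas the eswap8 variant below it emits the pair-swapped order shown in its own diagram. A scalar sketch of the expand case, assuming the same little-endian lane layout the packed intrinsics operate on:

// Scalar reference for ndsrvp_u8_u16_expand8: widen 8 bytes to 8 ushorts,
// preserving element order; illustrative only.
inline void u8_u16_expand8_ref(const unsigned char src[8], unsigned short dst[8])
{
    for (int i = 0; i < 8; ++i)
        dst[i] = src[i];
}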
|
||||
/*
|
||||
[0] [1] [2] [3] [4] [5] [6] [7]
|
||||
820 [ 0 ] [ 2 ] [ 4 ] [ 6 ]
|
||||
831 [ 1 ] [ 3 ] [ 5 ] [ 7 ]
|
||||
bb [ 0 ] [ 2 ] [ 1 ] [ 3 ]
|
||||
tt [ 4 ] [ 6 ] [ 5 ] [ 7 ]
|
||||
*/
|
||||
|
||||
inline void ndsrvp_u8_u16_eswap8(const unsigned long vs, ushort* dst)
|
||||
{
|
||||
unsigned long vs820 = __nds__zunpkd820(vs);
|
||||
unsigned long vs831 = __nds__zunpkd831(vs);
|
||||
*(unsigned long*)dst = __nds__pkbb32(vs831, vs820);
|
||||
*(unsigned long*)(dst + 4) = __nds__pktt32(vs831, vs820);
|
||||
}
|
||||
|
||||
/*
|
||||
[0] [1] [2] [3] [4] [5] [6] [7]
|
||||
820 [ 0 ] [ 2 ] [ 4 ] [ 6 ]
|
||||
831 [ 1 ] [ 3 ] [ 5 ] [ 7 ]
|
||||
bb [ 0 ] [ 2 ] [ 1 ] [ 3 ]
|
||||
tt [ 4 ] [ 6 ] [ 5 ] [ 7 ]
|
||||
bbbb[ 0 ] [ 1 ]
|
||||
bbtt[ 2 ] [ 3 ]
|
||||
ttbb[ 4 ] [ 5 ]
|
||||
tttt[ 6 ] [ 7 ]
|
||||
*/
|
||||
|
||||
|
||||
inline void ndsrvp_u8_u32_expand8(const unsigned long vs, uint* dst)
|
||||
{
|
||||
unsigned long vs820 = __nds__zunpkd820(vs);
|
||||
unsigned long vs831 = __nds__zunpkd831(vs);
|
||||
unsigned long vsbb = __nds__pkbb32(vs831, vs820);
|
||||
unsigned long vstt = __nds__pktt32(vs831, vs820);
|
||||
*(unsigned long*)dst = __nds__pkbb16(0, vsbb);
|
||||
*(unsigned long*)(dst + 2) = __nds__pktt16(0, vsbb);
|
||||
*(unsigned long*)(dst + 4) = __nds__pkbb16(0, vstt);
|
||||
*(unsigned long*)(dst + 6) = __nds__pktt16(0, vstt);
|
||||
}
|
||||
|
||||
// float replacement
|
||||
|
||||
inline void ndsrvp_f32_add8(const float* a, const float* b, float* c)
|
||||
{
|
||||
c[0] = a[0] + b[0];
|
||||
c[1] = a[1] + b[1];
|
||||
c[2] = a[2] + b[2];
|
||||
c[3] = a[3] + b[3];
|
||||
c[4] = a[4] + b[4];
|
||||
c[5] = a[5] + b[5];
|
||||
c[6] = a[6] + b[6];
|
||||
c[7] = a[7] + b[7];
|
||||
}
|
||||
|
||||
/*
|
||||
[1] [8] [23]
|
||||
[24] [8]
|
||||
*/
|
||||
|
||||
inline void ndsrvp_f32_u8_mul8(const float* a, const unsigned long b, float* c) // experimental, not bit exact
|
||||
{
|
||||
const int mask_frac = 0x007FFFFF;
|
||||
const int mask_sign = 0x7FFFFFFF;
|
||||
const int mask_lead = 0x40000000;
|
||||
const int ofs_exp = 23;
|
||||
|
||||
uint32x2_t va01 = *(uint32x2_t*)a;
|
||||
uint32x2_t va23 = *(uint32x2_t*)(a + 2);
|
||||
uint32x2_t va45 = *(uint32x2_t*)(a + 4);
|
||||
uint32x2_t va67 = *(uint32x2_t*)(a + 6);
|
||||
|
||||
uint32x2_t vaexp01 = va01 >> ofs_exp;
|
||||
uint32x2_t vaexp23 = va23 >> ofs_exp;
|
||||
uint32x2_t vaexp45 = va45 >> ofs_exp;
|
||||
uint32x2_t vaexp67 = va67 >> ofs_exp;
|
||||
|
||||
uint32x2_t vafrac01 = ((va01 << 7) & mask_sign) | mask_lead;
|
||||
uint32x2_t vafrac23 = ((va23 << 7) & mask_sign) | mask_lead;
|
||||
uint32x2_t vafrac45 = ((va45 << 7) & mask_sign) | mask_lead;
|
||||
uint32x2_t vafrac67 = ((va67 << 7) & mask_sign) | mask_lead;
|
||||
|
||||
int16x4_t vb[2]; // fake signed for signed multiply
|
||||
ndsrvp_u8_u16_eswap8(b, (ushort*)vb);
|
||||
|
||||
vafrac01 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac01, (unsigned long)vb[0]);
|
||||
vafrac23 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac23, (unsigned long)vb[0]);
|
||||
vafrac45 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac45, (unsigned long)vb[1]);
|
||||
vafrac67 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac67, (unsigned long)vb[1]);
|
||||
|
||||
uint32x2_t vaclz01 = __nds__v_clz32(vafrac01) - 8;
|
||||
uint32x2_t vaclz23 = __nds__v_clz32(vafrac23) - 8;
|
||||
uint32x2_t vaclz45 = __nds__v_clz32(vafrac45) - 8;
|
||||
uint32x2_t vaclz67 = __nds__v_clz32(vafrac67) - 8;
|
||||
|
||||
vaexp01 += 8 - vaclz01;
|
||||
vaexp23 += 8 - vaclz23;
|
||||
vaexp45 += 8 - vaclz45;
|
||||
vaexp67 += 8 - vaclz67;
|
||||
|
||||
vafrac01 <<= vaclz01;
|
||||
vafrac23 <<= vaclz23;
|
||||
vafrac45 <<= vaclz45;
|
||||
vafrac67 <<= vaclz67;
|
||||
|
||||
*(uint32x2_t*)c = (vaexp01 << ofs_exp) | (vafrac01 & mask_frac);
|
||||
*(uint32x2_t*)(c + 2) = (vaexp23 << ofs_exp) | (vafrac23 & mask_frac);
|
||||
*(uint32x2_t*)(c + 4) = (vaexp45 << ofs_exp) | (vafrac45 & mask_frac);
|
||||
*(uint32x2_t*)(c + 6) = (vaexp67 << ofs_exp) | (vafrac67 & mask_frac);
|
||||
}
|
||||
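The bit-twiddling routine above approximates, lane by lane, the product of a float and an 8-bit integer by operating on the exponent and mantissa fields directly, and is explicitly marked as not bit exact. The straightforward scalar computation it stands in for looks like this (the name and the unpacked byte array are assumptions for the sketch; the real routine takes the eight bytes packed into one register):

// What ndsrvp_f32_u8_mul8 approximates: c[i] = a[i] * b[i] for 8 lanes.
inline void f32_u8_mul8_ref(const float a[8], const unsigned char b[8], float c[8])
{
    for (int i = 0; i < 8; ++i)
        c[i] = a[i] * float(b[i]);
}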
|
||||
// saturate
|
||||
|
||||
template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
|
||||
@ -94,6 +234,26 @@ template<> inline short saturate_cast<short>(double v) { return saturate_cas
|
||||
template<> inline int saturate_cast<int>(float v) { return (int)lrintf(v); }
|
||||
template<> inline int saturate_cast<int>(double v) { return (int)lrint(v); }
|
||||
|
||||
inline double cast_ptr_to_double(const uchar* v, int depth) {
|
||||
switch (depth) {
|
||||
case CV_8U: return (double)*(uchar*)v;
|
||||
case CV_8S: return (double)*(char*)v;
|
||||
case CV_16U: return (double)*(ushort*)v;
|
||||
case CV_16S: return (double)*(short*)v;
|
||||
case CV_32S: return (double)*(int*)v;
|
||||
case CV_32F: return (double)*(float*)v;
|
||||
case CV_64F: return (double)*(double*)v;
|
||||
case CV_16F: return (double)*(float*)v;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename _Tp>
|
||||
inline _Tp data_at(const uchar* data, int step, int y, int x, int cn)
|
||||
{
|
||||
return ((_Tp*)(data + y * step))[x * cn];
|
||||
}
|
||||
|
||||
// align
|
||||
|
||||
inline long align(size_t v, int n)
|
||||
|
321
3rdparty/ndsrvp/src/filter.cpp
vendored
Normal file
@ -0,0 +1,321 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "ndsrvp_hal.hpp"
|
||||
#include "opencv2/imgproc/hal/interface.h"
|
||||
#include "cvutils.hpp"
|
||||
|
||||
namespace cv {
|
||||
|
||||
namespace ndsrvp {
|
||||
|
||||
class FilterData
|
||||
{
|
||||
public:
|
||||
FilterData(uchar *_kernel_data, size_t _kernel_step, int _kernel_type, int _src_type, int _dst_type, int _borderType,
|
||||
int _kernel_width, int _kernel_height, int _max_width, int _max_height, double _delta, int _anchor_x, int _anchor_y)
|
||||
: kernel_data(_kernel_data), kernel_step(_kernel_step), kernel_type(_kernel_type), src_type(_src_type), dst_type(_dst_type), borderType(_borderType),
|
||||
kernel_width(_kernel_width), kernel_height(_kernel_height), max_width(_max_width), max_height(_max_height), delta(_delta), anchor_x(_anchor_x), anchor_y(_anchor_y)
|
||||
{
|
||||
}
|
||||
|
||||
uchar *kernel_data;
|
||||
size_t kernel_step; // bytes between rows(height)
|
||||
int kernel_type, src_type, dst_type, borderType;
|
||||
int kernel_width, kernel_height;
|
||||
int max_width, max_height;
|
||||
double delta;
|
||||
int anchor_x, anchor_y;
|
||||
std::vector<uchar> coords;
|
||||
std::vector<float> coeffs;
|
||||
int nz;
|
||||
std::vector<uchar> padding;
|
||||
};
|
||||
|
||||
static int countNonZero(const FilterData* ctx)
|
||||
{
|
||||
int i, j, nz = 0;
|
||||
const uchar* ker_row = ctx->kernel_data;
|
||||
for( i = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
|
||||
{
|
||||
for( j = 0; j < ctx->kernel_width; j++ )
|
||||
{
|
||||
if( ((float*)ker_row)[j] != 0.0 )
|
||||
nz++;
|
||||
}
|
||||
}
|
||||
return nz;
|
||||
}
|
||||
|
||||
static void preprocess2DKernel(FilterData* ctx)
|
||||
{
|
||||
int i, j, k, nz = countNonZero(ctx), ktype = ctx->kernel_type;
|
||||
if(nz == 0)
|
||||
nz = 1; // (0, 0) == 0 by default
|
||||
ndsrvp_assert( ktype == CV_32F );
|
||||
|
||||
ctx->coords.resize(nz * 2);
|
||||
ctx->coeffs.resize(nz);
|
||||
|
||||
const uchar* ker_row = ctx->kernel_data;
|
||||
for( i = k = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
|
||||
{
|
||||
for( j = 0; j < ctx->kernel_width; j++ )
|
||||
{
|
||||
float val = ((float*)ker_row)[j];
|
||||
if( val == 0.0 )
|
||||
continue;
|
||||
ctx->coords[k * 2] = j;
|
||||
ctx->coords[k * 2 + 1] = i;
|
||||
ctx->coeffs[k++] = val;
|
||||
}
|
||||
}
|
||||
|
||||
ctx->nz = k;
|
||||
}
|
||||
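preprocess2DKernel turns the dense 32F kernel into a sparse representation: coords holds the (x, y) positions of the non-zero taps and coeffs holds their values, so the inner filter loop only visits nz taps. A standalone restatement of that step on a plain row-major float array, without the FilterData context, as a sketch:

// Standalone restatement of the sparsification step; illustrative only.
#include <vector>

struct SparseKernel
{
    std::vector<unsigned char> coords; // x,y pairs, same order as ctx->coords
    std::vector<float> coeffs;
};

inline SparseKernel sparsify(const float* kernel, int kw, int kh)
{
    SparseKernel out;
    for (int y = 0; y < kh; ++y)
        for (int x = 0; x < kw; ++x)
        {
            float v = kernel[y * kw + x];
            if (v == 0.0f)
                continue;
            out.coords.push_back((unsigned char)x);
            out.coords.push_back((unsigned char)y);
            out.coeffs.push_back(v);
        }
    return out;
}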
|
||||
int filterInit(cvhalFilter2D **context,
|
||||
uchar *kernel_data, size_t kernel_step,
|
||||
int kernel_type, int kernel_width,
|
||||
int kernel_height, int max_width, int max_height,
|
||||
int src_type, int dst_type, int borderType,
|
||||
double delta, int anchor_x, int anchor_y,
|
||||
bool allowSubmatrix, bool allowInplace)
|
||||
{
|
||||
int sdepth = CV_MAT_DEPTH(src_type), ddepth = CV_MAT_DEPTH(dst_type);
|
||||
int cn = CV_MAT_CN(src_type), kdepth = kernel_type;
|
||||
|
||||
(void)allowSubmatrix;
|
||||
(void)allowInplace;
|
||||
|
||||
if(delta - (int)delta != 0.0)
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
if(kdepth != CV_32F || (sdepth != CV_8U && sdepth != CV_16U) || ddepth != sdepth)
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
FilterData *ctx = new FilterData(kernel_data, kernel_step, kernel_type, src_type, dst_type, borderType,
|
||||
kernel_width, kernel_height, max_width, max_height, delta, anchor_x, anchor_y);
|
||||
|
||||
*context = (cvhalFilter2D*)ctx;
|
||||
|
||||
ndsrvp_assert(cn == CV_MAT_CN(dst_type) && ddepth >= sdepth);
|
||||
|
||||
preprocess2DKernel(ctx);
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
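A minimal sketch of the HAL filter lifecycle these entry points implement: filterInit builds the context from the kernel, filter is then called per image (or per tile, with offsets into the full image), and filterFree releases the context. The image size, kernel and anchor below are arbitrary, and the usual OpenCV type and border constants are assumed to be visible from the interface header.

// Hypothetical driver for the cv::ndsrvp filter HAL; values are illustrative.
#include <cstddef>

static int runFilterOnce(const unsigned char* src, size_t src_step,
                         unsigned char* dst, size_t dst_step,
                         int width, int height)
{
    float kernel[9] = { 0, 0.2f, 0, 0.2f, 0.2f, 0.2f, 0, 0.2f, 0 };

    cvhalFilter2D* ctx = nullptr;
    int res = cv::ndsrvp::filterInit(&ctx,
                                     reinterpret_cast<unsigned char*>(kernel),
                                     3 * sizeof(float), CV_32F, 3, 3,
                                     width, height, CV_8UC1, CV_8UC1,
                                     CV_HAL_BORDER_REPLICATE,
                                     /*delta=*/0.0, /*anchor_x=*/1, /*anchor_y=*/1,
                                     /*allowSubmatrix=*/false, /*allowInplace=*/false);
    if (res != CV_HAL_ERROR_OK)
        return res;

    // whole image processed in one call, so the ROI offsets are zero
    res = cv::ndsrvp::filter(ctx, src, src_step, dst, dst_step,
                             width, height, width, height, 0, 0);
    cv::ndsrvp::filterFree(ctx);
    return res;
}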
|
||||
int filter(cvhalFilter2D *context,
|
||||
const uchar *src_data, size_t src_step,
|
||||
uchar *dst_data, size_t dst_step,
|
||||
int width, int height,
|
||||
int full_width, int full_height,
|
||||
int offset_x, int offset_y)
|
||||
{
|
||||
FilterData *ctx = (FilterData*)context;
|
||||
|
||||
int cn = CV_MAT_CN(ctx->src_type);
|
||||
int cnes = CV_ELEM_SIZE(ctx->src_type);
|
||||
int ddepth = CV_MAT_DEPTH(ctx->dst_type);
|
||||
float delta_sat = (uchar)(ctx->delta);
|
||||
if(ddepth == CV_8U)
|
||||
delta_sat = (float)saturate_cast<uchar>(ctx->delta);
|
||||
else if(ddepth == CV_16U)
|
||||
delta_sat = (float)saturate_cast<ushort>(ctx->delta);
|
||||
|
||||
// fetch original image data
|
||||
const uchar *ogn_data = src_data - offset_y * src_step - offset_x * cnes;
|
||||
int ogn_step = src_step;
|
||||
|
||||
// ROI fully used in the computation
|
||||
int cal_width = width + ctx->kernel_width - 1;
|
||||
int cal_height = height + ctx->kernel_height - 1;
|
||||
int cal_x = offset_x - ctx->anchor_x; // negative if left border exceeded
|
||||
int cal_y = offset_y - ctx->anchor_y; // negative if top border exceeded
|
||||
|
||||
// calculate source border
|
||||
ctx->padding.resize(cal_width * cal_height * cnes);
|
||||
uchar* pad_data = &ctx->padding[0];
|
||||
int pad_step = cal_width * cnes;
|
||||
|
||||
uchar* pad_ptr;
|
||||
const uchar* ogn_ptr;
|
||||
std::vector<uchar> vec_zeros(cnes, 0);
|
||||
for(int i = 0; i < cal_height; i++)
|
||||
{
|
||||
int y = borderInterpolate(i + cal_y, full_height, ctx->borderType);
|
||||
if(y < 0) {
|
||||
memset(pad_data + i * pad_step, 0, cnes * cal_width);
|
||||
continue;
|
||||
}
|
||||
|
||||
// left border
|
||||
int j = 0;
|
||||
int16x4_t vj = {0, 1, 2, 3};
|
||||
vj += saturate_cast<short>(cal_x);
|
||||
for(; j + cal_x < -4; j += 4, vj += 4)
|
||||
{
|
||||
int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
|
||||
for(int k = 0; k < 4; k++) {
|
||||
if(vx[k] < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes);
|
||||
}
|
||||
}
|
||||
for(; j + cal_x < 0; j++)
|
||||
{
|
||||
int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
|
||||
if(x < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + x * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + j * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes);
|
||||
}
|
||||
|
||||
// center
|
||||
int rborder = MIN(cal_width, full_width - cal_x);
|
||||
ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + j * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes * (rborder - j));
|
||||
|
||||
// right border
|
||||
j = rborder;
|
||||
vj = (int16x4_t){0, 1, 2, 3} + saturate_cast<short>(cal_x + rborder);
|
||||
for(; j <= cal_width - 4; j += 4, vj += 4)
|
||||
{
|
||||
int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
|
||||
for(int k = 0; k < 4; k++) {
|
||||
if(vx[k] < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes);
|
||||
}
|
||||
}
|
||||
for(; j < cal_width; j++)
|
||||
{
|
||||
int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
|
||||
if(x < 0) // border constant return value -1
|
||||
ogn_ptr = &vec_zeros[0];
|
||||
else
|
||||
ogn_ptr = ogn_data + y * ogn_step + x * cnes;
|
||||
pad_ptr = pad_data + i * pad_step + j * cnes;
|
||||
memcpy(pad_ptr, ogn_ptr, cnes);
|
||||
}
|
||||
}
|
||||
|
||||
// prepare the pointers
|
||||
int i, k, count, nz = ctx->nz;
|
||||
const uchar* ker_pts = &ctx->coords[0];
|
||||
const float* ker_cfs = &ctx->coeffs[0];
|
||||
|
||||
if( ddepth == CV_8U )
|
||||
{
|
||||
std::vector<uchar*> src_ptrarr;
|
||||
src_ptrarr.resize(nz);
|
||||
uchar** src_ptrs = &src_ptrarr[0];
|
||||
uchar* dst_row = dst_data;
|
||||
uchar* pad_row = pad_data;
|
||||
|
||||
for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
|
||||
{
|
||||
for( k = 0; k < nz; k++ )
|
||||
src_ptrs[k] = (uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes;
|
||||
|
||||
i = 0;
|
||||
for( ; i <= width * cnes - 8; i += 8 )
|
||||
{
|
||||
float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat};
|
||||
for( k = 0; k < nz; k++ ) {
|
||||
float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
|
||||
// experimental code
|
||||
// ndsrvp_f32_u8_mul8(vker_cfs, *(unsigned long*)(src_ptrs[k] + i), vker_cfs);
|
||||
// ndsrvp_f32_add8(vs0, vker_cfs, vs0);
|
||||
vs0[0] += vker_cfs[0] * src_ptrs[k][i];
|
||||
vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
|
||||
vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
|
||||
vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
|
||||
vs0[4] += vker_cfs[4] * src_ptrs[k][i + 4];
|
||||
vs0[5] += vker_cfs[5] * src_ptrs[k][i + 5];
|
||||
vs0[6] += vker_cfs[6] * src_ptrs[k][i + 6];
|
||||
vs0[7] += vker_cfs[7] * src_ptrs[k][i + 7];
|
||||
}
|
||||
dst_row[i] = saturate_cast<uchar>(vs0[0]);
|
||||
dst_row[i + 1] = saturate_cast<uchar>(vs0[1]);
|
||||
dst_row[i + 2] = saturate_cast<uchar>(vs0[2]);
|
||||
dst_row[i + 3] = saturate_cast<uchar>(vs0[3]);
|
||||
dst_row[i + 4] = saturate_cast<uchar>(vs0[4]);
|
||||
dst_row[i + 5] = saturate_cast<uchar>(vs0[5]);
|
||||
dst_row[i + 6] = saturate_cast<uchar>(vs0[6]);
|
||||
dst_row[i + 7] = saturate_cast<uchar>(vs0[7]);
|
||||
}
|
||||
for( ; i < width * cnes; i++ )
|
||||
{
|
||||
float s0 = delta_sat;
|
||||
for( k = 0; k < nz; k++ ) {
|
||||
s0 += ker_cfs[k] * src_ptrs[k][i];
|
||||
}
|
||||
dst_row[i] = saturate_cast<uchar>(s0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( ddepth == CV_16U )
|
||||
{
|
||||
std::vector<ushort*> src_ptrarr;
|
||||
src_ptrarr.resize(nz);
|
||||
ushort** src_ptrs = &src_ptrarr[0];
|
||||
uchar* dst_row = dst_data;
|
||||
uchar* pad_row = pad_data;
|
||||
|
||||
for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
|
||||
{
|
||||
for( k = 0; k < nz; k++ )
|
||||
src_ptrs[k] = (ushort*)((uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes);
|
||||
|
||||
i = 0;
|
||||
for( ; i <= width * cn - 4; i += 4 )
|
||||
{
|
||||
float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat};
|
||||
for( k = 0; k < nz; k++ ) {
|
||||
float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
|
||||
vs0[0] += vker_cfs[0] * src_ptrs[k][i];
|
||||
vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
|
||||
vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
|
||||
vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
|
||||
}
|
||||
ushort* dst_row_ptr = (ushort*)dst_row;
|
||||
dst_row_ptr[i] = saturate_cast<ushort>(vs0[0]);
|
||||
dst_row_ptr[i + 1] = saturate_cast<ushort>(vs0[1]);
|
||||
dst_row_ptr[i + 2] = saturate_cast<ushort>(vs0[2]);
|
||||
dst_row_ptr[i + 3] = saturate_cast<ushort>(vs0[3]);
|
||||
}
|
||||
for( ; i < width * cn; i++ )
|
||||
{
|
||||
float s0 = delta_sat;
|
||||
for( k = 0; k < nz; k++ ) {
|
||||
s0 += ker_cfs[k] * src_ptrs[k][i];
|
||||
}
|
||||
((ushort*)dst_row)[i] = saturate_cast<ushort>(s0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
int filterFree(cvhalFilter2D *context) {
|
||||
FilterData *ctx = (FilterData*)context;
|
||||
delete ctx;
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
300
3rdparty/ndsrvp/src/medianBlur.cpp
vendored
Normal file
@ -0,0 +1,300 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "ndsrvp_hal.hpp"
|
||||
#include "opencv2/imgproc/hal/interface.h"
|
||||
#include "cvutils.hpp"
|
||||
|
||||
namespace cv {
|
||||
|
||||
namespace ndsrvp {
|
||||
|
||||
struct operators_minmax_t {
|
||||
inline void vector(uint8x8_t & a, uint8x8_t & b) const {
|
||||
uint8x8_t t = a;
|
||||
a = __nds__v_umin8(a, b);
|
||||
b = __nds__v_umax8(t, b);
|
||||
}
|
||||
inline void scalar(uchar & a, uchar & b) const {
|
||||
uchar t = a;
|
||||
a = __nds__umin8(a, b);
|
||||
b = __nds__umax8(t, b);
|
||||
}
|
||||
inline void vector(int8x8_t & a, int8x8_t & b) const {
|
||||
int8x8_t t = a;
|
||||
a = __nds__v_smin8(a, b);
|
||||
b = __nds__v_smax8(t, b);
|
||||
}
|
||||
inline void scalar(schar & a, schar & b) const {
|
||||
schar t = a;
|
||||
a = __nds__smin8(a, b);
|
||||
b = __nds__smax8(t, b);
|
||||
}
|
||||
inline void vector(uint16x4_t & a, uint16x4_t & b) const {
|
||||
uint16x4_t t = a;
|
||||
a = __nds__v_umin16(a, b);
|
||||
b = __nds__v_umax16(t, b);
|
||||
}
|
||||
inline void scalar(ushort & a, ushort & b) const {
|
||||
ushort t = a;
|
||||
a = __nds__umin16(a, b);
|
||||
b = __nds__umax16(t, b);
|
||||
}
|
||||
inline void vector(int16x4_t & a, int16x4_t & b) const {
|
||||
int16x4_t t = a;
|
||||
a = __nds__v_smin16(a, b);
|
||||
b = __nds__v_smax16(t, b);
|
||||
}
|
||||
inline void scalar(short & a, short & b) const {
|
||||
short t = a;
|
||||
a = __nds__smin16(a, b);
|
||||
b = __nds__smax16(t, b);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename WT, typename VT> // type, widen type, vector type
|
||||
static void
|
||||
medianBlur_SortNet( const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step,
|
||||
int width, int height, int cn, int ksize )
|
||||
{
|
||||
const T* src = (T*)src_data;
|
||||
T* dst = (T*)dst_data;
|
||||
int sstep = (int)(src_step / sizeof(T));
|
||||
int dstep = (int)(dst_step / sizeof(T));
|
||||
int i, j, k;
|
||||
operators_minmax_t op;
|
||||
|
||||
if( ksize == 3 )
|
||||
{
|
||||
if( width == 1 || height == 1 )
|
||||
{
|
||||
int len = width + height - 1;
|
||||
int sdelta = height == 1 ? cn : sstep;
|
||||
int sdelta0 = height == 1 ? 0 : sstep - cn;
|
||||
int ddelta = height == 1 ? cn : dstep;
|
||||
|
||||
for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
|
||||
for( j = 0; j < cn; j++, src++ )
|
||||
{
|
||||
T p0 = src[i > 0 ? -sdelta : 0];
|
||||
T p1 = src[0];
|
||||
T p2 = src[i < len - 1 ? sdelta : 0];
|
||||
|
||||
op.scalar(p0, p1); op.scalar(p1, p2); op.scalar(p0, p1);
|
||||
dst[j] = (T)p1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
width *= cn;
|
||||
for( i = 0; i < height; i++, dst += dstep )
|
||||
{
|
||||
const T* row0 = src + std::max(i - 1, 0)*sstep;
|
||||
const T* row1 = src + i*sstep;
|
||||
const T* row2 = src + std::min(i + 1, height-1)*sstep;
|
||||
int limit = cn;
|
||||
|
||||
for(j = 0;; )
|
||||
{
|
||||
for( ; j < limit; j++ )
|
||||
{
|
||||
int j0 = j >= cn ? j - cn : j;
|
||||
int j2 = j < width - cn ? j + cn : j;
|
||||
T p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
|
||||
T p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
|
||||
T p6 = row2[j0], p7 = row2[j], p8 = row2[j2];
|
||||
|
||||
op.scalar(p1, p2); op.scalar(p4, p5); op.scalar(p7, p8); op.scalar(p0, p1);
|
||||
op.scalar(p3, p4); op.scalar(p6, p7); op.scalar(p1, p2); op.scalar(p4, p5);
|
||||
op.scalar(p7, p8); op.scalar(p0, p3); op.scalar(p5, p8); op.scalar(p4, p7);
|
||||
op.scalar(p3, p6); op.scalar(p1, p4); op.scalar(p2, p5); op.scalar(p4, p7);
|
||||
op.scalar(p4, p2); op.scalar(p6, p4); op.scalar(p4, p2);
|
||||
dst[j] = (T)p4;
|
||||
}
|
||||
|
||||
if( limit == width )
|
||||
break;
|
||||
|
||||
int nlanes = 8 / sizeof(T);
|
||||
|
||||
for( ; (cn % nlanes == 0) && (j <= width - nlanes - cn); j += nlanes ) // alignment
|
||||
{
|
||||
VT p0 = *(VT*)(row0+j-cn), p1 = *(VT*)(row0+j), p2 = *(VT*)(row0+j+cn);
|
||||
VT p3 = *(VT*)(row1+j-cn), p4 = *(VT*)(row1+j), p5 = *(VT*)(row1+j+cn);
|
||||
VT p6 = *(VT*)(row2+j-cn), p7 = *(VT*)(row2+j), p8 = *(VT*)(row2+j+cn);
|
||||
|
||||
op.vector(p1, p2); op.vector(p4, p5); op.vector(p7, p8); op.vector(p0, p1);
|
||||
op.vector(p3, p4); op.vector(p6, p7); op.vector(p1, p2); op.vector(p4, p5);
|
||||
op.vector(p7, p8); op.vector(p0, p3); op.vector(p5, p8); op.vector(p4, p7);
|
||||
op.vector(p3, p6); op.vector(p1, p4); op.vector(p2, p5); op.vector(p4, p7);
|
||||
op.vector(p4, p2); op.vector(p6, p4); op.vector(p4, p2);
|
||||
*(VT*)(dst+j) = p4;
|
||||
}
|
||||
|
||||
limit = width;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( ksize == 5 )
|
||||
{
|
||||
if( width == 1 || height == 1 )
|
||||
{
|
||||
int len = width + height - 1;
|
||||
int sdelta = height == 1 ? cn : sstep;
|
||||
int sdelta0 = height == 1 ? 0 : sstep - cn;
|
||||
int ddelta = height == 1 ? cn : dstep;
|
||||
|
||||
for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
|
||||
for( j = 0; j < cn; j++, src++ )
|
||||
{
|
||||
int i1 = i > 0 ? -sdelta : 0;
|
||||
int i0 = i > 1 ? -sdelta*2 : i1;
|
||||
int i3 = i < len-1 ? sdelta : 0;
|
||||
int i4 = i < len-2 ? sdelta*2 : i3;
|
||||
T p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];
|
||||
|
||||
op.scalar(p0, p1); op.scalar(p3, p4); op.scalar(p2, p3); op.scalar(p3, p4); op.scalar(p0, p2);
|
||||
op.scalar(p2, p4); op.scalar(p1, p3); op.scalar(p1, p2);
|
||||
dst[j] = (T)p2;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
width *= cn;
|
||||
for( i = 0; i < height; i++, dst += dstep )
|
||||
{
|
||||
const T* row[5];
|
||||
row[0] = src + std::max(i - 2, 0)*sstep;
|
||||
row[1] = src + std::max(i - 1, 0)*sstep;
|
||||
row[2] = src + i*sstep;
|
||||
row[3] = src + std::min(i + 1, height-1)*sstep;
|
||||
row[4] = src + std::min(i + 2, height-1)*sstep;
|
||||
int limit = cn*2;
|
||||
|
||||
for(j = 0;; )
|
||||
{
|
||||
for( ; j < limit; j++ )
|
||||
{
|
||||
T p[25];
|
||||
int j1 = j >= cn ? j - cn : j;
|
||||
int j0 = j >= cn*2 ? j - cn*2 : j1;
|
||||
int j3 = j < width - cn ? j + cn : j;
|
||||
int j4 = j < width - cn*2 ? j + cn*2 : j3;
|
||||
for( k = 0; k < 5; k++ )
|
||||
{
|
||||
const T* rowk = row[k];
|
||||
p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
|
||||
p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
|
||||
p[k*5+4] = rowk[j4];
|
||||
}
|
||||
|
||||
op.scalar(p[1], p[2]); op.scalar(p[0], p[1]); op.scalar(p[1], p[2]); op.scalar(p[4], p[5]); op.scalar(p[3], p[4]);
|
||||
op.scalar(p[4], p[5]); op.scalar(p[0], p[3]); op.scalar(p[2], p[5]); op.scalar(p[2], p[3]); op.scalar(p[1], p[4]);
|
||||
op.scalar(p[1], p[2]); op.scalar(p[3], p[4]); op.scalar(p[7], p[8]); op.scalar(p[6], p[7]); op.scalar(p[7], p[8]);
|
||||
op.scalar(p[10], p[11]); op.scalar(p[9], p[10]); op.scalar(p[10], p[11]); op.scalar(p[6], p[9]); op.scalar(p[8], p[11]);
|
||||
op.scalar(p[8], p[9]); op.scalar(p[7], p[10]); op.scalar(p[7], p[8]); op.scalar(p[9], p[10]); op.scalar(p[0], p[6]);
|
||||
op.scalar(p[4], p[10]); op.scalar(p[4], p[6]); op.scalar(p[2], p[8]); op.scalar(p[2], p[4]); op.scalar(p[6], p[8]);
|
||||
op.scalar(p[1], p[7]); op.scalar(p[5], p[11]); op.scalar(p[5], p[7]); op.scalar(p[3], p[9]); op.scalar(p[3], p[5]);
|
||||
op.scalar(p[7], p[9]); op.scalar(p[1], p[2]); op.scalar(p[3], p[4]); op.scalar(p[5], p[6]); op.scalar(p[7], p[8]);
|
||||
op.scalar(p[9], p[10]); op.scalar(p[13], p[14]); op.scalar(p[12], p[13]); op.scalar(p[13], p[14]); op.scalar(p[16], p[17]);
|
||||
op.scalar(p[15], p[16]); op.scalar(p[16], p[17]); op.scalar(p[12], p[15]); op.scalar(p[14], p[17]); op.scalar(p[14], p[15]);
|
||||
op.scalar(p[13], p[16]); op.scalar(p[13], p[14]); op.scalar(p[15], p[16]); op.scalar(p[19], p[20]); op.scalar(p[18], p[19]);
|
||||
op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[21], p[23]); op.scalar(p[22], p[24]);
|
||||
op.scalar(p[22], p[23]); op.scalar(p[18], p[21]); op.scalar(p[20], p[23]); op.scalar(p[20], p[21]); op.scalar(p[19], p[22]);
|
||||
op.scalar(p[22], p[24]); op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[12], p[18]);
|
||||
op.scalar(p[16], p[22]); op.scalar(p[16], p[18]); op.scalar(p[14], p[20]); op.scalar(p[20], p[24]); op.scalar(p[14], p[16]);
|
||||
op.scalar(p[18], p[20]); op.scalar(p[22], p[24]); op.scalar(p[13], p[19]); op.scalar(p[17], p[23]); op.scalar(p[17], p[19]);
|
||||
op.scalar(p[15], p[21]); op.scalar(p[15], p[17]); op.scalar(p[19], p[21]); op.scalar(p[13], p[14]); op.scalar(p[15], p[16]);
|
||||
op.scalar(p[17], p[18]); op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[0], p[12]);
|
||||
op.scalar(p[8], p[20]); op.scalar(p[8], p[12]); op.scalar(p[4], p[16]); op.scalar(p[16], p[24]); op.scalar(p[12], p[16]);
|
||||
op.scalar(p[2], p[14]); op.scalar(p[10], p[22]); op.scalar(p[10], p[14]); op.scalar(p[6], p[18]); op.scalar(p[6], p[10]);
|
||||
op.scalar(p[10], p[12]); op.scalar(p[1], p[13]); op.scalar(p[9], p[21]); op.scalar(p[9], p[13]); op.scalar(p[5], p[17]);
|
||||
op.scalar(p[13], p[17]); op.scalar(p[3], p[15]); op.scalar(p[11], p[23]); op.scalar(p[11], p[15]); op.scalar(p[7], p[19]);
|
||||
op.scalar(p[7], p[11]); op.scalar(p[11], p[13]); op.scalar(p[11], p[12]);
|
||||
dst[j] = (T)p[12];
|
||||
}
|
||||
|
||||
if( limit == width )
|
||||
break;
|
||||
|
||||
int nlanes = 8 / sizeof(T);
|
||||
|
||||
for( ; (cn % nlanes == 0) && (j <= width - nlanes - cn*2); j += nlanes )
|
||||
{
|
||||
VT p0 = *(VT*)(row[0]+j-cn*2), p5 = *(VT*)(row[1]+j-cn*2), p10 = *(VT*)(row[2]+j-cn*2), p15 = *(VT*)(row[3]+j-cn*2), p20 = *(VT*)(row[4]+j-cn*2);
|
||||
VT p1 = *(VT*)(row[0]+j-cn*1), p6 = *(VT*)(row[1]+j-cn*1), p11 = *(VT*)(row[2]+j-cn*1), p16 = *(VT*)(row[3]+j-cn*1), p21 = *(VT*)(row[4]+j-cn*1);
|
||||
VT p2 = *(VT*)(row[0]+j-cn*0), p7 = *(VT*)(row[1]+j-cn*0), p12 = *(VT*)(row[2]+j-cn*0), p17 = *(VT*)(row[3]+j-cn*0), p22 = *(VT*)(row[4]+j-cn*0);
|
||||
VT p3 = *(VT*)(row[0]+j+cn*1), p8 = *(VT*)(row[1]+j+cn*1), p13 = *(VT*)(row[2]+j+cn*1), p18 = *(VT*)(row[3]+j+cn*1), p23 = *(VT*)(row[4]+j+cn*1);
|
||||
VT p4 = *(VT*)(row[0]+j+cn*2), p9 = *(VT*)(row[1]+j+cn*2), p14 = *(VT*)(row[2]+j+cn*2), p19 = *(VT*)(row[3]+j+cn*2), p24 = *(VT*)(row[4]+j+cn*2);
|
||||
|
||||
op.vector(p1, p2); op.vector(p0, p1); op.vector(p1, p2); op.vector(p4, p5); op.vector(p3, p4);
|
||||
op.vector(p4, p5); op.vector(p0, p3); op.vector(p2, p5); op.vector(p2, p3); op.vector(p1, p4);
|
||||
op.vector(p1, p2); op.vector(p3, p4); op.vector(p7, p8); op.vector(p6, p7); op.vector(p7, p8);
|
||||
op.vector(p10, p11); op.vector(p9, p10); op.vector(p10, p11); op.vector(p6, p9); op.vector(p8, p11);
|
||||
op.vector(p8, p9); op.vector(p7, p10); op.vector(p7, p8); op.vector(p9, p10); op.vector(p0, p6);
|
||||
op.vector(p4, p10); op.vector(p4, p6); op.vector(p2, p8); op.vector(p2, p4); op.vector(p6, p8);
|
||||
op.vector(p1, p7); op.vector(p5, p11); op.vector(p5, p7); op.vector(p3, p9); op.vector(p3, p5);
|
||||
op.vector(p7, p9); op.vector(p1, p2); op.vector(p3, p4); op.vector(p5, p6); op.vector(p7, p8);
|
||||
op.vector(p9, p10); op.vector(p13, p14); op.vector(p12, p13); op.vector(p13, p14); op.vector(p16, p17);
|
||||
op.vector(p15, p16); op.vector(p16, p17); op.vector(p12, p15); op.vector(p14, p17); op.vector(p14, p15);
|
||||
op.vector(p13, p16); op.vector(p13, p14); op.vector(p15, p16); op.vector(p19, p20); op.vector(p18, p19);
|
||||
op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p21, p23); op.vector(p22, p24);
|
||||
op.vector(p22, p23); op.vector(p18, p21); op.vector(p20, p23); op.vector(p20, p21); op.vector(p19, p22);
|
||||
op.vector(p22, p24); op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p12, p18);
|
||||
op.vector(p16, p22); op.vector(p16, p18); op.vector(p14, p20); op.vector(p20, p24); op.vector(p14, p16);
|
||||
op.vector(p18, p20); op.vector(p22, p24); op.vector(p13, p19); op.vector(p17, p23); op.vector(p17, p19);
|
||||
op.vector(p15, p21); op.vector(p15, p17); op.vector(p19, p21); op.vector(p13, p14); op.vector(p15, p16);
|
||||
op.vector(p17, p18); op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p0, p12);
|
||||
op.vector(p8, p20); op.vector(p8, p12); op.vector(p4, p16); op.vector(p16, p24); op.vector(p12, p16);
|
||||
op.vector(p2, p14); op.vector(p10, p22); op.vector(p10, p14); op.vector(p6, p18); op.vector(p6, p10);
|
||||
op.vector(p10, p12); op.vector(p1, p13); op.vector(p9, p21); op.vector(p9, p13); op.vector(p5, p17);
|
||||
op.vector(p13, p17); op.vector(p3, p15); op.vector(p11, p23); op.vector(p11, p15); op.vector(p7, p19);
|
||||
op.vector(p7, p11); op.vector(p11, p13); op.vector(p11, p12);
|
||||
*(VT*)(dst+j) = p12;
|
||||
}
|
||||
|
||||
limit = width;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int medianBlur(const uchar* src_data, size_t src_step,
|
||||
uchar* dst_data, size_t dst_step,
|
||||
int width, int height, int depth, int cn, int ksize)
|
||||
{
|
||||
bool useSortNet = ((ksize == 3) || (ksize == 5 && ( depth > CV_8U || cn == 2 || cn > 4 )));
|
||||
|
||||
if( useSortNet )
|
||||
{
|
||||
uchar* src_data_rep;
std::vector<uchar> src_data_copy; // declared at function scope so the copy outlives the if-block (in-place case)
if( dst_data == src_data ) {
src_data_copy.resize(src_step * height);
memcpy(src_data_copy.data(), src_data, src_step * height);
src_data_rep = src_data_copy.data();
}
else {
src_data_rep = (uchar*)src_data;
}
|
||||
|
||||
if( depth == CV_8U )
|
||||
medianBlur_SortNet<uchar, int, uint8x8_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
|
||||
else if( depth == CV_8S )
|
||||
medianBlur_SortNet<schar, int, int8x8_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
|
||||
else if( depth == CV_16U )
|
||||
medianBlur_SortNet<ushort, int, uint16x4_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
|
||||
else if( depth == CV_16S )
|
||||
medianBlur_SortNet<short, int, int16x4_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
|
||||
else
|
||||
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
else return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
} // namespace ndsrvp
|
||||
|
||||
} // namespace cv
|
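For reference, the 9-point exchange network used in the ksize == 3 path above can be checked in isolation. The sketch below replays the same nineteen min/max exchanges on plain scalars, with std::min/std::max standing in for the __nds__ min/max intrinsics; the vector path applies the identical sequence lane-wise, which is why op.vector mirrors op.scalar exactly.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// min/max exchange, the scalar counterpart of operators_minmax_t::scalar
static void mm(uint8_t& a, uint8_t& b) {
    uint8_t t = a;
    a = std::min(a, b);
    b = std::max(t, b);
}

// Same exchange order as the ksize == 3 branch of medianBlur_SortNet above.
static uint8_t median9(uint8_t p0, uint8_t p1, uint8_t p2,
                       uint8_t p3, uint8_t p4, uint8_t p5,
                       uint8_t p6, uint8_t p7, uint8_t p8) {
    mm(p1, p2); mm(p4, p5); mm(p7, p8); mm(p0, p1);
    mm(p3, p4); mm(p6, p7); mm(p1, p2); mm(p4, p5);
    mm(p7, p8); mm(p0, p3); mm(p5, p8); mm(p4, p7);
    mm(p3, p6); mm(p1, p4); mm(p2, p5); mm(p4, p7);
    mm(p4, p2); mm(p6, p4); mm(p4, p2);
    return p4;                      // the median ends up in p4
}

int main() {
    assert(median9(9, 1, 8, 2, 7, 3, 6, 4, 5) == 5);   // median of 1..9
}
```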
1714
3rdparty/zlib-ng/CMakeLists.txt
vendored
File diff suppressed because it is too large
2
3rdparty/zlib-ng/LICENSE.md
vendored
@ -1,4 +1,4 @@
|
||||
(C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
(C) 1995-2024 Jean-loup Gailly and Mark Adler
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
|
30
3rdparty/zlib-ng/README.md
vendored
@ -21,7 +21,6 @@ Features
|
||||
* Support for CPU intrinsics when available
|
||||
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
|
||||
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
|
||||
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
|
||||
* Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
|
||||
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
|
||||
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
|
||||
@ -95,20 +94,21 @@ make test
|
||||
Build Options
|
||||
-------------
|
||||
|
||||
| CMake | configure | Description | Default |
|
||||
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
|
||||
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
|
||||
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
|
||||
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
|
||||
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
|
||||
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
|
||||
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
|
||||
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
|
||||
| WITH_GTEST | | Build gtest_zlib | ON |
|
||||
| WITH_FUZZERS | | Build test/fuzz | OFF |
|
||||
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
|
||||
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
|
||||
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
|
||||
| CMake | configure | Description | Default |
|
||||
|:---------------------------|:-------------------------|:------------------------------------------------------------------------------------|---------|
|
||||
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
|
||||
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
|
||||
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
|
||||
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
|
||||
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
|
||||
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
|
||||
| WITH_RUNTIME_CPU_DETECTION | | Compiles with runtime CPU detection | ON |
|
||||
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
|
||||
| WITH_GTEST | | Build gtest_zlib | ON |
|
||||
| WITH_FUZZERS | | Build test/fuzz | OFF |
|
||||
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
|
||||
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
|
||||
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
|
||||
|
||||
|
||||
Install
|
||||
|
54
3rdparty/zlib-ng/adler32.c
vendored
@ -7,70 +7,24 @@
|
||||
#include "functable.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
|
||||
uint32_t sum2;
|
||||
unsigned n;
|
||||
|
||||
/* split Adler-32 into component sums */
|
||||
sum2 = (adler >> 16) & 0xffff;
|
||||
adler &= 0xffff;
|
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */
|
||||
if (UNLIKELY(len == 1))
|
||||
return adler32_len_1(adler, buf, sum2);
|
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */
|
||||
if (UNLIKELY(buf == NULL))
|
||||
return 1L;
|
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */
|
||||
if (UNLIKELY(len < 16))
|
||||
return adler32_len_16(adler, buf, len, sum2);
|
||||
|
||||
/* do length NMAX blocks -- requires just one modulo operation */
|
||||
while (len >= NMAX) {
|
||||
len -= NMAX;
|
||||
#ifdef UNROLL_MORE
|
||||
n = NMAX / 16; /* NMAX is divisible by 16 */
|
||||
#else
|
||||
n = NMAX / 8; /* NMAX is divisible by 8 */
|
||||
#endif
|
||||
do {
|
||||
#ifdef UNROLL_MORE
|
||||
DO16(adler, sum2, buf); /* 16 sums unrolled */
|
||||
buf += 16;
|
||||
#else
|
||||
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
|
||||
buf += 8;
|
||||
#endif
|
||||
} while (--n);
|
||||
adler %= BASE;
|
||||
sum2 %= BASE;
|
||||
}
|
||||
|
||||
/* do remaining bytes (less than NMAX, still just one modulo) */
|
||||
return adler32_len_64(adler, buf, len, sum2);
|
||||
}
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
|
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
|
||||
return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
|
||||
return functable.adler32(adler, buf, len);
|
||||
return FUNCTABLE_CALL(adler32)(adler, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
|
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
|
||||
return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
|
||||
return functable.adler32(adler, buf, len);
|
||||
return FUNCTABLE_CALL(adler32)(adler, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
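The hunk above strips the generic Adler-32 loop out of adler32.c (it reappears under arch/generic/adler32_c.c later in this diff) and routes the exported entry points through FUNCTABLE_CALL. For orientation, every implementation the functable can select must produce the plain RFC 1950 value; a minimal reference sketch, independent of zlib-ng's internals:

```cpp
#include <cstddef>
#include <cstdint>

// Reference (unoptimized) Adler-32: a = 1 + sum of bytes, b = sum of the
// running a values, both modulo 65521; the result packs b into the high
// 16 bits and a into the low 16 bits.
static uint32_t adler32_ref(uint32_t adler, const uint8_t* buf, size_t len) {
    const uint32_t BASE = 65521u;            // largest prime below 2^16
    uint32_t a = adler & 0xffffu;
    uint32_t b = (adler >> 16) & 0xffffu;
    for (size_t i = 0; i < len; ++i) {
        a = (a + buf[i]) % BASE;
        b = (b + a) % BASE;
    }
    return (b << 16) | a;
}
```

With the conventional starting value adler = 1 and an empty buffer this returns 1, matching the early-outs kept in the generic code; the NMAX blocking in the optimized loop only defers the expensive modulo, it does not change the result.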
11
3rdparty/zlib-ng/adler32_fold.h
vendored
@ -1,11 +0,0 @@
|
||||
/* adler32_fold.h -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_FOLD_H_
|
||||
#define ADLER32_FOLD_H_
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
|
||||
#endif
|
2
3rdparty/zlib-ng/arch/.gitignore
vendored
@ -1,2 +0,0 @@
|
||||
# ignore Makefiles; they're all automatically generated
|
||||
Makefile
|
7
3rdparty/zlib-ng/arch/arm/Makefile.in
vendored
@ -25,7 +25,6 @@ all: \
|
||||
crc32_acle.o crc32_acle.lo \
|
||||
slide_hash_neon.o slide_hash_neon.lo \
|
||||
slide_hash_armv6.o slide_hash_armv6.lo \
|
||||
insert_string_acle.o insert_string_acle.lo
|
||||
|
||||
adler32_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
@ -69,12 +68,6 @@ slide_hash_armv6.o:
|
||||
slide_hash_armv6.lo:
|
||||
$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
|
||||
|
||||
insert_string_acle.o:
|
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
insert_string_acle.lo:
|
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
|
4
3rdparty/zlib-ng/arch/arm/adler32_neon.c
vendored
@ -7,8 +7,8 @@
|
||||
*/
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
|
||||
static const uint16_t ALIGNED_(16) taps[64] = {
|
||||
|
17
3rdparty/zlib-ng/arch/arm/arm_features.c
vendored
@ -1,4 +1,4 @@
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
#include "arm_features.h"
|
||||
|
||||
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
|
||||
@ -11,6 +11,11 @@
|
||||
# ifndef ID_AA64ISAR0_CRC32_VAL
|
||||
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
|
||||
# endif
|
||||
#elif defined(__OpenBSD__) && defined(__aarch64__)
|
||||
# include <machine/armreg.h>
|
||||
# include <machine/cpu.h>
|
||||
# include <sys/sysctl.h>
|
||||
# include <sys/types.h>
|
||||
#elif defined(__APPLE__)
|
||||
# if !defined(_DARWIN_C_SOURCE)
|
||||
# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
|
||||
@ -30,6 +35,16 @@ static int arm_has_crc32() {
|
||||
#elif defined(__FreeBSD__) && defined(__aarch64__)
|
||||
return getenv("QEMU_EMULATING") == NULL
|
||||
&& ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
|
||||
#elif defined(__OpenBSD__) && defined(__aarch64__)
|
||||
int hascrc32 = 0;
|
||||
int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
|
||||
uint64_t isar0 = 0;
|
||||
size_t len = sizeof(isar0);
|
||||
if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
|
||||
if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE)
|
||||
hascrc32 = 1;
|
||||
}
|
||||
return hascrc32;
|
||||
#elif defined(__APPLE__)
|
||||
int hascrc32;
|
||||
size_t size = sizeof(hascrc32);
|
||||
|
6
3rdparty/zlib-ng/arch/arm/arm_features.h
vendored
@ -2,8 +2,8 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ARM_H_
|
||||
#define ARM_H_
|
||||
#ifndef ARM_FEATURES_H_
|
||||
#define ARM_FEATURES_H_
|
||||
|
||||
struct arm_cpu_features {
|
||||
int has_simd;
|
||||
@ -13,4 +13,4 @@ struct arm_cpu_features {
|
||||
|
||||
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
|
||||
|
||||
#endif /* ARM_H_ */
|
||||
#endif /* ARM_FEATURES_H_ */
|
||||
|
65
3rdparty/zlib-ng/arch/arm/arm_functions.h
vendored
Normal file
@ -0,0 +1,65 @@
|
||||
/* arm_functions.h -- ARM implementations for arch-specific functions.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ARM_FUNCTIONS_H_
|
||||
#define ARM_FUNCTIONS_H_
|
||||
|
||||
#ifdef ARM_NEON
|
||||
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
uint32_t chunksize_neon(void);
|
||||
uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
|
||||
# ifdef HAVE_BUILTIN_CTZLL
|
||||
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
|
||||
uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
void slide_hash_neon(deflate_state *s);
|
||||
void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
|
||||
#ifdef ARM_SIMD
|
||||
void slide_hash_armv6(deflate_state *s);
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
// ARM - SIMD
|
||||
# if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD)
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_armv6
|
||||
# endif
|
||||
// ARM - NEON
|
||||
# if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON
|
||||
# undef native_adler32
|
||||
# define native_adler32 adler32_neon
|
||||
# undef native_chunkmemset_safe
|
||||
# define native_chunkmemset_safe chunkmemset_safe_neon
|
||||
# undef native_chunksize
|
||||
# define native_chunksize chunksize_neon
|
||||
# undef native_inflate_fast
|
||||
# define native_inflate_fast inflate_fast_neon
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_neon
|
||||
# ifdef HAVE_BUILTIN_CTZLL
|
||||
# undef native_compare256
|
||||
# define native_compare256 compare256_neon
|
||||
# undef native_longest_match
|
||||
# define native_longest_match longest_match_neon
|
||||
# undef native_longest_match_slow
|
||||
# define native_longest_match_slow longest_match_slow_neon
|
||||
# endif
|
||||
# endif
|
||||
// ARM - ACLE
|
||||
# if defined(ARM_ACLE) && defined(__ARM_ACLE) && defined(__ARM_FEATURE_CRC32)
|
||||
# undef native_crc32
|
||||
# define native_crc32 crc32_acle
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif /* ARM_FUNCTIONS_H_ */
|
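arm_functions.h relies on a last-definition-wins pattern: when DISABLE_RUNTIME_CPU_DETECTION is set, every native_* macro starts out pointing at the generic implementation (see arch/generic/generic_functions.h further down in this diff) and is then re-pointed at the strongest variant the compile target guarantees. A reduced, self-contained illustration of the pattern, using hypothetical function names rather than zlib-ng's:

```cpp
#include <cstdint>
#include <cstdio>

[[maybe_unused]] static uint32_t checksum_c(const uint8_t*, size_t)    { return 1; } // portable fallback
[[maybe_unused]] static uint32_t checksum_simd(const uint8_t*, size_t) { return 2; } // arch-specific variant

// Default to the portable version, then let stronger targets override it,
// mirroring the #undef / #define chain in arm_functions.h.
#define native_checksum checksum_c
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#  undef  native_checksum
#  define native_checksum checksum_simd
#endif

int main() {
    const uint8_t buf[4] = {0, 1, 2, 3};
    std::printf("%u\n", (unsigned)native_checksum(buf, sizeof(buf)));   // 2 on a NEON build, 1 otherwise
}
```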
4
3rdparty/zlib-ng/arch/arm/chunkset_neon.c
vendored
@ -4,8 +4,8 @@
|
||||
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
#include "zbuild.h"
|
||||
#include "arch/generic/chunk_permute_table.h"
|
||||
|
||||
typedef uint8x16_t chunk_t;
|
||||
|
||||
|
5
3rdparty/zlib-ng/arch/arm/compare256_neon.c
vendored
@ -3,8 +3,9 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "deflate.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
|
||||
|
2
3rdparty/zlib-ng/arch/arm/crc32_acle.c
vendored
@ -7,7 +7,7 @@
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
#include "acle_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
|
||||
Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
|
||||
Z_REGISTER uint32_t c;
|
||||
|
24
3rdparty/zlib-ng/arch/arm/insert_string_acle.c
vendored
@ -1,24 +0,0 @@
|
||||
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
#include "acle_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#define HASH_CALC(s, h, val) \
|
||||
h = __crc32w(0, val)
|
||||
|
||||
#define HASH_CALC_VAR h
|
||||
#define HASH_CALC_VAR_INIT uint32_t h = 0
|
||||
|
||||
#define UPDATE_HASH Z_TARGET_CRC update_hash_acle
|
||||
#define INSERT_STRING Z_TARGET_CRC insert_string_acle
|
||||
#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle
|
||||
|
||||
#include "../../insert_string_tpl.h"
|
||||
#endif
|
7
3rdparty/zlib-ng/arch/arm/neon_intrins.h
vendored
@ -25,6 +25,13 @@
|
||||
out.val[3] = vqsubq_u16(a.val[3], b); \
|
||||
} while (0)
|
||||
|
||||
# if defined(__clang__) && defined(__arm__) && defined(__ANDROID__)
|
||||
/* Clang for 32-bit Android has too strict alignment requirement (:256) for x4 NEON intrinsics */
|
||||
# undef ARM_NEON_HASLD4
|
||||
# undef vld1q_u16_x4
|
||||
# undef vld1q_u8_x4
|
||||
# undef vst1q_u16_x4
|
||||
# endif
|
||||
|
||||
# ifndef ARM_NEON_HASLD4
|
||||
|
||||
|
4
3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
vendored
@ -5,8 +5,8 @@
|
||||
|
||||
#if defined(ARM_SIMD)
|
||||
#include "acle_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/* SIMD version of hash_chain rebase */
|
||||
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
|
||||
|
4
3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
vendored
@ -10,8 +10,8 @@
|
||||
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/* SIMD version of hash_chain rebase */
|
||||
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
|
||||
|
57
3rdparty/zlib-ng/arch/generic/Makefile.in
vendored
@ -1,5 +1,6 @@
|
||||
# Makefile for zlib
|
||||
# Makefile for zlib-ng
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# Copyright (C) 2024 Hans Kristian Rosbach
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
@ -11,12 +12,62 @@ SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all:
|
||||
all: \
|
||||
adler32_c.o adler32_c.lo \
|
||||
adler32_fold_c.o adler32_fold_c.lo \
|
||||
chunkset_c.o chunkset_c.lo \
|
||||
compare256_c.o compare256_c.lo \
|
||||
crc32_braid_c.o crc32_braid_c.lo \
|
||||
crc32_fold_c.o crc32_fold_c.lo \
|
||||
slide_hash_c.o slide_hash_c.lo
|
||||
|
||||
|
||||
adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
|
||||
|
||||
adler32_c.lo: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
|
||||
|
||||
adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
|
||||
|
||||
adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
|
||||
|
||||
chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
|
||||
|
||||
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
|
||||
|
||||
compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
|
||||
|
||||
compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
|
||||
|
||||
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
|
||||
|
||||
crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
|
||||
|
||||
crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
|
||||
|
||||
crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
|
||||
|
||||
slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
|
||||
|
||||
slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
|
||||
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~ \
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
|
54
3rdparty/zlib-ng/arch/generic/adler32_c.c
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "functable.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
|
||||
uint32_t sum2;
|
||||
unsigned n;
|
||||
|
||||
/* split Adler-32 into component sums */
|
||||
sum2 = (adler >> 16) & 0xffff;
|
||||
adler &= 0xffff;
|
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */
|
||||
if (UNLIKELY(len == 1))
|
||||
return adler32_len_1(adler, buf, sum2);
|
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */
|
||||
if (UNLIKELY(buf == NULL))
|
||||
return 1L;
|
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */
|
||||
if (UNLIKELY(len < 16))
|
||||
return adler32_len_16(adler, buf, len, sum2);
|
||||
|
||||
/* do length NMAX blocks -- requires just one modulo operation */
|
||||
while (len >= NMAX) {
|
||||
len -= NMAX;
|
||||
#ifdef UNROLL_MORE
|
||||
n = NMAX / 16; /* NMAX is divisible by 16 */
|
||||
#else
|
||||
n = NMAX / 8; /* NMAX is divisible by 8 */
|
||||
#endif
|
||||
do {
|
||||
#ifdef UNROLL_MORE
|
||||
DO16(adler, sum2, buf); /* 16 sums unrolled */
|
||||
buf += 16;
|
||||
#else
|
||||
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
|
||||
buf += 8;
|
||||
#endif
|
||||
} while (--n);
|
||||
adler %= BASE;
|
||||
sum2 %= BASE;
|
||||
}
|
||||
|
||||
/* do remaining bytes (less than NMAX, still just one modulo) */
|
||||
return adler32_len_64(adler, buf, len, sum2);
|
||||
}
|
@ -5,12 +5,11 @@
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "functable.h"
|
||||
#include "adler32_fold.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
adler = functable.adler32(adler, src, len);
|
||||
adler = FUNCTABLE_CALL(adler32)(adler, src, len);
|
||||
memcpy(dst, src, len);
|
||||
return adler;
|
||||
}
|
@ -5,6 +5,7 @@
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "deflate.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
/* ALIGNED, byte comparison */
|
@ -8,43 +8,9 @@
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "functable.h"
|
||||
#include "crc32_braid_p.h"
|
||||
#include "crc32_braid_tbl.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
|
||||
return (const uint32_t *)crc_table;
|
||||
}
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
|
||||
if (buf == NULL) return 0;
|
||||
|
||||
return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
|
||||
if (buf == NULL) return 0;
|
||||
|
||||
return functable.crc32(crc, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
|
||||
return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
|
||||
return PREFIX(crc32_z)(crc, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
/*
|
||||
A CRC of a message is computed on N braids of words in the message, where
|
||||
each word consists of W bytes (4 or 8). If N is 3, for example, then three
|
||||
@ -66,24 +32,6 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t
|
||||
level. Your mileage may vary.
|
||||
*/
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
# define ZSWAPWORD(word) (word)
|
||||
# define BRAID_TABLE crc_braid_table
|
||||
#elif BYTE_ORDER == BIG_ENDIAN
|
||||
# if W == 8
|
||||
# define ZSWAPWORD(word) ZSWAP64(word)
|
||||
# elif W == 4
|
||||
# define ZSWAPWORD(word) ZSWAP32(word)
|
||||
# endif
|
||||
# define BRAID_TABLE crc_braid_big_table
|
||||
#else
|
||||
# error "No endian defined"
|
||||
#endif
|
||||
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
|
||||
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef W
|
||||
/*
|
||||
@ -112,7 +60,7 @@ static z_word_t crc_word(z_word_t data) {
|
||||
|
||||
/* ========================================================================= */
|
||||
Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
|
||||
Z_REGISTER uint32_t c;
|
||||
uint32_t c;
|
||||
|
||||
/* Pre-condition the CRC */
|
||||
c = (~crc) & 0xffffffff;
|
@ -3,11 +3,9 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "functable.h"
|
||||
|
||||
#include "crc32_fold.h"
|
||||
|
||||
#include <limits.h>
|
||||
#include "crc32.h"
|
||||
|
||||
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
|
||||
crc->value = CRC32_INITIAL_VALUE;
|
||||
@ -15,7 +13,7 @@ Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
|
||||
}
|
||||
|
||||
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
crc->value = functable.crc32(crc->value, src, len);
|
||||
crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
|
||||
memcpy(dst, src, len);
|
||||
}
|
||||
|
||||
@ -25,7 +23,7 @@ Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, ui
|
||||
* same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The
|
||||
* init_crc is an unused argument in this context */
|
||||
Z_UNUSED(init_crc);
|
||||
crc->value = functable.crc32(crc->value, src, len);
|
||||
crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
|
106
3rdparty/zlib-ng/arch/generic/generic_functions.h
vendored
Normal file
@ -0,0 +1,106 @@
|
||||
/* generic_functions.h -- generic C implementations for arch-specific functions.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef GENERIC_FUNCTIONS_H_
|
||||
#define GENERIC_FUNCTIONS_H_
|
||||
|
||||
#include "zendian.h"
|
||||
|
||||
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
|
||||
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
|
||||
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
|
||||
|
||||
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
|
||||
|
||||
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
|
||||
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
|
||||
|
||||
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
|
||||
uint32_t chunksize_c(void);
|
||||
uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
|
||||
|
||||
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
|
||||
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
|
||||
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
|
||||
# ifdef HAVE_BUILTIN_CTZ
|
||||
uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
|
||||
# endif
|
||||
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef void (*slide_hash_func)(deflate_state *s);
|
||||
|
||||
void slide_hash_c(deflate_state *s);
|
||||
|
||||
uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
|
||||
# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
|
||||
# ifdef HAVE_BUILTIN_CTZ
|
||||
uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
# endif
|
||||
|
||||
uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
|
||||
# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
|
||||
# ifdef UNALIGNED64_OK
|
||||
uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
# endif
|
||||
|
||||
|
||||
// Select generic implementation for longest_match, longest_match_slow and compare256 functions.
|
||||
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
# define longest_match_generic longest_match_unaligned_64
|
||||
# define longest_match_slow_generic longest_match_slow_unaligned_64
|
||||
# define compare256_generic compare256_unaligned_64
|
||||
# elif defined(HAVE_BUILTIN_CTZ)
|
||||
# define longest_match_generic longest_match_unaligned_32
|
||||
# define longest_match_slow_generic longest_match_slow_unaligned_32
|
||||
# define compare256_generic compare256_unaligned_32
|
||||
# else
|
||||
# define longest_match_generic longest_match_unaligned_16
|
||||
# define longest_match_slow_generic longest_match_slow_unaligned_16
|
||||
# define compare256_generic compare256_unaligned_16
|
||||
# endif
|
||||
#else
|
||||
# define longest_match_generic longest_match_c
|
||||
# define longest_match_slow_generic longest_match_slow_c
|
||||
# define compare256_generic compare256_c
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
// Generic code
|
||||
# define native_adler32 adler32_c
|
||||
# define native_adler32_fold_copy adler32_fold_copy_c
|
||||
# define native_chunkmemset_safe chunkmemset_safe_c
|
||||
# define native_chunksize chunksize_c
|
||||
# define native_crc32 PREFIX(crc32_braid)
|
||||
# define native_crc32_fold crc32_fold_c
|
||||
# define native_crc32_fold_copy crc32_fold_copy_c
|
||||
# define native_crc32_fold_final crc32_fold_final_c
|
||||
# define native_crc32_fold_reset crc32_fold_reset_c
|
||||
# define native_inflate_fast inflate_fast_c
|
||||
# define native_slide_hash slide_hash_c
|
||||
# define native_longest_match longest_match_generic
|
||||
# define native_longest_match_slow longest_match_slow_generic
|
||||
# define native_compare256 compare256_generic
|
||||
#endif
|
||||
|
||||
#endif
|
@ -1,6 +1,6 @@
|
||||
/* slide_hash.c -- slide hash table C implementation
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
#include <altivec.h>
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
|
||||
typedef vector unsigned char chunk_t;
|
||||
|
||||
|
@ -5,8 +5,10 @@
|
||||
|
||||
#ifdef POWER9
|
||||
#include <altivec.h>
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zendian.h"
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "deflate.h"
|
||||
#include "zendian.h"
|
||||
|
||||
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
|
||||
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
|
||||
|
7
3rdparty/zlib-ng/arch/power/power_features.c
vendored
@ -1,16 +1,19 @@
|
||||
/* power_features.c - POWER feature check
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef HAVE_SYS_AUXV_H
|
||||
# include <sys/auxv.h>
|
||||
#endif
|
||||
#ifdef POWER_NEED_AUXVEC_H
|
||||
# include <linux/auxvec.h>
|
||||
#endif
|
||||
#ifdef __FreeBSD__
|
||||
# include <machine/cpu.h>
|
||||
#endif
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
#include "power_features.h"
|
||||
|
||||
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
|
||||
|
6
3rdparty/zlib-ng/arch/power/power_features.h
vendored
@ -4,8 +4,8 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef POWER_H_
|
||||
#define POWER_H_
|
||||
#ifndef POWER_FEATURES_H_
|
||||
#define POWER_FEATURES_H_
|
||||
|
||||
struct power_cpu_features {
|
||||
int has_altivec;
|
||||
@ -15,4 +15,4 @@ struct power_cpu_features {
|
||||
|
||||
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
|
||||
|
||||
#endif /* POWER_H_ */
|
||||
#endif /* POWER_FEATURES_H_ */
|
||||
|
67
3rdparty/zlib-ng/arch/power/power_functions.h
vendored
Normal file
@ -0,0 +1,67 @@
|
||||
/* power_functions.h -- POWER implementations for arch-specific functions.
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef POWER_FUNCTIONS_H_
|
||||
#define POWER_FUNCTIONS_H_
|
||||
|
||||
#ifdef PPC_VMX
|
||||
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
void slide_hash_vmx(deflate_state *s);
|
||||
#endif
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
uint32_t chunksize_power8(void);
|
||||
uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
void slide_hash_power8(deflate_state *s);
|
||||
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
|
||||
#ifdef POWER9
|
||||
uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
|
||||
uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
// Power - VMX
|
||||
# if defined(PPC_VMX) && defined(__ALTIVEC__)
|
||||
# undef native_adler32
|
||||
# define native_adler32 adler32_vmx
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_vmx
|
||||
# endif
|
||||
// Power8 - VSX
|
||||
# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
|
||||
# undef native_adler32
|
||||
# define native_adler32 adler32_power8
|
||||
# undef native_chunkmemset_safe
|
||||
# define native_chunkmemset_safe chunkmemset_safe_power8
|
||||
# undef native_chunksize
|
||||
# define native_chunksize chunksize_power8
|
||||
# undef native_inflate_fast
|
||||
# define native_inflate_fast inflate_fast_power8
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_power8
|
||||
# endif
|
||||
# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
|
||||
# undef native_crc32
|
||||
# define native_crc32 crc32_power8
|
||||
# endif
|
||||
// Power9
|
||||
# if defined(POWER9) && defined(_ARCH_PWR9)
|
||||
# undef native_compare256
|
||||
# define native_compare256 compare256_power9
|
||||
# undef native_longest_match
|
||||
# define native_longest_match longest_match_power9
|
||||
# undef native_longest_match_slow
|
||||
# define native_longest_match_slow longest_match_slow_power9
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif /* POWER_FUNCTIONS_H_ */
|
4
3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
vendored
@ -9,8 +9,8 @@
|
||||
#include <riscv_vector.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
|
||||
/* split Adler-32 into component sums */
|
||||
|
4
3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
vendored
@ -6,7 +6,9 @@
|
||||
|
||||
#ifdef RISCV_RVV
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "deflate.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
11
3rdparty/zlib-ng/arch/riscv/riscv_features.c
vendored
@ -1,10 +1,13 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/auxv.h>
|
||||
#include <sys/utsname.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
|
||||
# include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "riscv_features.h"
|
||||
|
||||
#define ISA_V_HWCAP (1 << ('v' - 'a'))
|
||||
@ -33,7 +36,11 @@ void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *fea
|
||||
}
|
||||
|
||||
void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
|
||||
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
|
||||
unsigned long hw_cap = getauxval(AT_HWCAP);
|
||||
#else
|
||||
unsigned long hw_cap = 0;
|
||||
#endif
|
||||
features->has_rvv = hw_cap & ISA_V_HWCAP;
|
||||
}
|
||||
|
||||
|
6
3rdparty/zlib-ng/arch/riscv/riscv_features.h
vendored
@ -6,8 +6,8 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef RISCV_H_
|
||||
#define RISCV_H_
|
||||
#ifndef RISCV_FEATURES_H_
|
||||
#define RISCV_FEATURES_H_
|
||||
|
||||
struct riscv_cpu_features {
|
||||
int has_rvv;
|
||||
@ -15,4 +15,4 @@ struct riscv_cpu_features {
|
||||
|
||||
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
|
||||
|
||||
#endif /* RISCV_H_ */
|
||||
#endif /* RISCV_FEATURES_H_ */
|
||||
|
49
3rdparty/zlib-ng/arch/riscv/riscv_functions.h
vendored
Normal file
@ -0,0 +1,49 @@
|
||||
/* riscv_functions.h -- RISCV implementations for arch-specific functions.
|
||||
*
|
||||
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
|
||||
* Contributed by Alex Chiang <alex.chiang@sifive.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef RISCV_FUNCTIONS_H_
|
||||
#define RISCV_FUNCTIONS_H_
|
||||
|
||||
#ifdef RISCV_RVV
|
||||
uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
uint32_t chunksize_rvv(void);
|
||||
uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
|
||||
|
||||
uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
|
||||
void slide_hash_rvv(deflate_state *s);
|
||||
void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
// RISCV - RVV
|
||||
# if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__)
|
||||
# undef native_adler32
|
||||
# define native_adler32 adler32_rvv
|
||||
# undef native_adler32_fold_copy
|
||||
# define native_adler32_fold_copy adler32_fold_copy_rvv
|
||||
# undef native_chunkmemset_safe
|
||||
# define native_chunkmemset_safe chunkmemset_safe_rvv
|
||||
# undef native_chunksize
|
||||
# define native_chunksize chunksize_rvv
|
||||
# undef native_compare256
|
||||
# define native_compare256 compare256_rvv
|
||||
# undef native_inflate_fast
|
||||
# define native_inflate_fast inflate_fast_rvv
|
||||
# undef native_longest_match
|
||||
# define native_longest_match longest_match_rvv
|
||||
# undef native_longest_match_slow
|
||||
# define native_longest_match_slow longest_match_slow_rvv
|
||||
# undef native_slide_hash
|
||||
# define native_slide_hash slide_hash_rvv
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif /* RISCV_FUNCTIONS_H_ */
|
10
3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
vendored
@ -8,18 +8,16 @@
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
|
||||
size_t vl;
|
||||
while (entries > 0) {
|
||||
vl = __riscv_vsetvl_e16m4(entries);
|
||||
vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
|
||||
vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
|
||||
vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
|
||||
v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
|
||||
__riscv_vse16_v_u16m4(table, v_tab, vl);
|
||||
vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl);
|
||||
__riscv_vse16_v_u16m4(table, v_diff, vl);
|
||||
table += vl, entries -= vl;
|
||||
}
|
||||
}
|
||||
|
48
3rdparty/zlib-ng/arch/s390/Makefile.in
vendored
Normal file
48
3rdparty/zlib-ng/arch/s390/Makefile.in
vendored
Normal file
@ -0,0 +1,48 @@
# Makefile for zlib-ng
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h

CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
VGFMAFLAG=
NOLTOFLAG=

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

s390_features.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c

s390_features.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c

dfltcc_deflate.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c

dfltcc_deflate.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c

dfltcc_inflate.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c

dfltcc_inflate.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c

crc32-vx.o:
	$(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c

crc32-vx.lo:
	$(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean: clean
	rm -f Makefile
277
3rdparty/zlib-ng/arch/s390/README.md
vendored
Normal file
277
3rdparty/zlib-ng/arch/s390/README.md
vendored
Normal file
@ -0,0 +1,277 @@
# Introduction

This directory contains SystemZ deflate hardware acceleration support.
It can be enabled using the following build commands:

    $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
    $ make

or

    $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
    $ make

When built like this, zlib-ng will compress using hardware on level 1 and
using software on all other levels. Decompression will always happen in
hardware. In order to enable hardware compression for levels 1-6
(i.e. to make it used by default) one could add
`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.

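The mask is interpreted bit-per-level: bit *n* enables the hardware path for
compression level *n*, so the default `0x2` covers only level 1 and `0x7e`
covers levels 1-6. A minimal illustration of that check, mirroring the
`level_mask` test in `dfltcc_deflate.c` further down in this commit:

```
/* Illustrative only: the same bit test that
 * dfltcc_can_deflate_with_params() applies to its level_mask. */
static int level_uses_hardware(unsigned int level_mask, int level) {
    return (level_mask & (1U << level)) != 0;
}

/* level_uses_hardware(0x02, 1) -> 1, level_uses_hardware(0x02, 6) -> 0,
 * level_uses_hardware(0x7e, 6) -> 1 */
```
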
SystemZ deflate hardware acceleration is available on [IBM z15](
https://www.ibm.com/products/z15) and newer machines under the name [
"Integrated Accelerator for zEnterprise Data Compression"](
https://www.ibm.com/support/z-content-solutions/compression/). The
programming interface to it is a machine instruction called DEFLATE
CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
the code and the rest of this document refer to this feature simply as
"DFLTCC".

# Performance

Performance figures are published [here](
https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
). The compression speed-up can be as high as 110x and the decompression
speed-up can be as high as 15x.

# Limitations

Two DFLTCC compression calls with identical inputs are not guaranteed to
produce identical outputs. Therefore care should be taken when using
hardware compression when reproducible results are desired. In
particular, the zlib-ng-specific `zng_deflateSetParams` call allows setting
the `Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
particular stream.

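For callers that need bit-exact output, the request might look like the
following sketch. It assumes the zlib-ng native (`zng_*`) API; the
`zng_deflate_param_value` field names are taken from memory of `zlib-ng.h`
and should be verified against the installed header before relying on them.

```
/* Sketch: disable DFLTCC for one stream by requesting reproducible
 * compression. The zng_deflate_param_value layout is assumed here and
 * should be checked against zlib-ng.h. */
#include "zlib-ng.h"

static int request_reproducible(zng_stream *strm) {
    int reproducible = 1;
    zng_deflate_param_value param;
    param.param = Z_DEFLATE_REPRODUCIBLE;
    param.buf = &reproducible;
    param.size = sizeof(reproducible);
    return zng_deflateSetParams(strm, &param, 1); /* Z_OK on success */
}
```
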
DFLTCC does not support every single zlib-ng feature, in particular:

* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
* `inflateMark()`
* `inflatePrime()`
* `inflateSyncPoint()`

When used, these functions will either switch to software, or, in case
this is not possible, gracefully fail.

# Code structure

All SystemZ-specific code lives in the `arch/s390` directory and is
integrated with the rest of zlib-ng using hook macros, as sketched below.

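The general shape of that integration, shown here with abridged names rather
than the literal zlib-ng sources: each hook either expands to the s390
implementation or to a cheap software default.

```
/* Simplified sketch of the hook-macro pattern (not the literal zlib-ng
 * code; macro names and default definitions are abridged). */
#ifdef S390_DFLTCC_DEFLATE
#  include "arch/s390/dfltcc_deflate.h"       /* provides DEFLATE_HOOK etc. */
#else
#  define DEFLATE_HOOK(strm, flush, bstate) 0 /* never take the HW path */
#  define DEFLATE_NEED_CHECKSUM(strm) 1       /* always checksum in SW */
#endif

/* Inside deflate(): try the hardware first, otherwise fall through to the
 * regular software compression functions, roughly:
 *
 *   if (!DEFLATE_HOOK(strm, flush, &bstate))
 *       bstate = software_compress(s, flush);
 */
```
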
## Hook macros

DFLTCC takes as arguments a parameter block, an input buffer, an output
buffer, and a window. Parameter blocks are stored alongside zlib states;
buffers are forwarded from the caller; and the window - which must be
4k-aligned and is always 64k large - is managed using the `PAD_WINDOW()`,
`WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW`, `DEFLATE_ADJUST_WINDOW_SIZE()`
and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.

Software and hardware window formats do not match, therefore,
`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
and `inflateGetDictionary()` need special handling, which is triggered using
`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.

`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
`INFLATE_RESET_KEEP_HOOK()` macros.

`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
calls gracefully fail.

`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
software compression mid-stream using `deflateParams()`. Switching
normally entails flushing the current block, which might not be possible
in low memory situations. `deflateParams()` uses the `DEFLATE_DONE()` hook
in order to detect and gracefully handle such situations.

The algorithm implemented in hardware has a different compression ratio
than the one implemented in software. The `DEFLATE_BOUND_ADJUST_COMPLEN()`
and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
return the correct results for the hardware implementation.

Actual compression and decompression are handled by the `DEFLATE_HOOK()` and
`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
window on its own, calling `updatewindow()` is suppressed using the
`INFLATE_NEED_UPDATEWINDOW()` macro.

In addition to compression, DFLTCC computes CRC-32 and Adler-32
checksums, therefore, whenever it's used, software checksumming is
suppressed using the `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
macros.

While software always produces reproducible compression results, this
is not the case for DFLTCC. Therefore, zlib-ng users are given the
ability to specify whether or not reproducible compression results
are required. While it is always possible to specify this setting
before the compression begins, it is not always possible to do so in
the middle of a deflate stream - the exact conditions for that are
determined by the `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.

## SystemZ-specific code

When zlib-ng is built with DFLTCC, the hooks described above are
converted to calls to functions, which are implemented in
`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
categories:

* Base DFLTCC support, e.g. wrapping the machine instruction - `dfltcc()`.
* Translating between software and hardware data formats, e.g.
  `dfltcc_deflate_set_dictionary()`.
* Translating between software and hardware state machines, e.g.
  `dfltcc_deflate()` and `dfltcc_inflate()`.

The functions from the first two categories are fairly simple, however,
various quirks in both software and hardware state machines make the
functions from the third category quite complicated.

### `dfltcc_deflate()` function

This function is called by `deflate()` and has the following
responsibilities:

* Checking whether DFLTCC can be used with the current stream. If this
  is not the case, then it returns `0`, making `deflate()` use some
  other function in order to compress in software. Otherwise it returns
  `1`.
* Block management and Huffman table generation. DFLTCC ends blocks only
  when explicitly instructed to do so by the software. Furthermore,
  whether to use fixed or dynamic Huffman tables must also be determined
  by the software. Since looking at data in order to gather statistics
  would negate performance benefits, the following approach is used: the
  first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
  block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
  dynamic blocks (see the sketch after this list).
* Writing EOBS. The Block Closing Control bit in the parameter block
  instructs DFLTCC to write EOBS, however, certain conditions need to be
  met: input data length must be non-zero or Continuation Flag must be
  set. To put this in simpler terms, DFLTCC will silently refuse to
  write EOBS if this is the only thing that it is asked to do. Since the
  code has to be able to emit EOBS in software anyway, in order to avoid
  tricky corner cases Block Closing Control is never used. Whether to
  write EOBS is instead controlled by the `soft_bcc` variable.
* Triggering block post-processing. Depending on flush mode, `deflate()`
  must perform various additional actions when a block or a stream ends.
  `dfltcc_deflate()` informs `deflate()` about this using the
  `block_state *result` parameter.
* Converting software state fields into hardware parameter block fields,
  and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
  and Sub-Byte Boundary. Certain fields cannot be translated and must
  persist untouched in the parameter block between calls, for example,
  Continuation Flag or Continuation State Buffer.
* Handling flush modes and low-memory situations. These aspects are
  quite intertwined and pervasive. The general idea here is that the
  code must not do anything in software - whether explicitly by e.g.
  calling `send_eobs()`, or implicitly - by returning to `deflate()`
  with certain return and `*result` values, when Continuation Flag is
  set.
* Ending streams. When a new block is started and flush mode is
  `Z_FINISH`, the Block Header Final parameter block bit is used to mark
  this block as final. However, sometimes an empty final block is
  needed, and, unfortunately, just like with EOBS, DFLTCC will silently
  refuse to do this. The general idea of the DFLTCC implementation is to
  rely as much as possible on the existing code. Here, in order to do
  this, the code pretends that it does not support DFLTCC, which makes
  `deflate()` call a software compression function, which writes an
  empty final block. Whether this is required is controlled by the
  `need_empty_block` variable.
* Error handling. This is simply converting
  Operation-Ending-Supplemental Code to string. Errors can only happen
  due to things like memory corruption, and therefore they don't affect
  the `deflate()` return code.

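A simplified model of that block-management policy, distilled from the
`dfltcc_reset_deflate_state()` and `dfltcc_deflate()` sources included later
in this commit (the constants match the defaults in `dfltcc_detail.h`):

```
/* Simplified model only - the real logic lives in dfltcc_deflate().
 * The first DFLTCC_FIRST_FHT_BLOCK_SIZE bytes go into a fixed-table
 * block; after that a new dynamic-table block is opened roughly every
 * DFLTCC_BLOCK_SIZE bytes of input. */
#include <stdint.h>

#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
#define DFLTCC_BLOCK_SIZE 1048576

struct block_policy {
    uint64_t total_in;        /* bytes consumed so far */
    uint64_t block_threshold; /* close the current block past this point */
};

static void policy_init(struct block_policy *p) {
    p->total_in = 0;
    p->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
}

/* 1 = the block being opened should use a fixed Huffman table. */
static int opens_with_fht(const struct block_policy *p) {
    return p->total_in == 0 && p->block_threshold > 0;
}

/* Called when a block is closed: schedule the next block boundary. */
static void on_block_closed(struct block_policy *p) {
    p->block_threshold = p->total_in + DFLTCC_BLOCK_SIZE;
}
```
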
### `dfltcc_inflate()` function

This function is called by `inflate()` from the `TYPEDO` state (that is,
when all the metadata is parsed and the stream is positioned at the type
bits of the deflate block header) and it's responsible for the following:

* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
  Unfortunately, there is no way to ask DFLTCC to stop decompressing on
  block or tree boundary.
* `inflate()` decompression loop management. This is controlled using
  the return value, which can be either `DFLTCC_INFLATE_BREAK` or
  `DFLTCC_INFLATE_CONTINUE`.
* Converting software state fields into hardware parameter block fields,
  and vice versa. For example, `whave` and History Length or `wnext` and
  History Offset.
* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
  and is controlled by the `last` state field.
* Error handling. Like deflate, error handling comprises
  Operation-Ending-Supplemental Code to string conversion. Unlike
  deflate, errors may happen due to bad inputs, therefore they are
  propagated to `inflate()` by setting the `mode` field to `MEM` or `BAD`.

# Testing

Given the complexity of the DFLTCC machine instruction, it is not clear
whether QEMU TCG will ever support it. At the time of writing, one has to
have access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
DFLTCC is a non-privileged instruction, neither special VM/LPAR
configuration nor root are required.

zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
testing. There is no official IBM Z GitHub Actions runner, so we build
one inspired by `anup-kodlekere/gaplib`.
Future updates to actions-runner might need an updated patch. The .NET
version-number patch has been split into a separate file to avoid
constantly having to change the main patch.

## Configuring the builder.

### Install prerequisites.

```
sudo dnf install podman
```

### Add actions-runner service.

```
sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
sudo systemctl daemon-reload
```

### Create a config file; this needs a GitHub personal access token.

```
# Create file /etc/actions-runner
repo=<owner>/<name>
access_token=<ghp_***>
```

The access token should have the repo scope; consult
https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
for details.

### Autostart actions-runner.

```
$ sudo systemctl enable --now actions-runner
```

## Rebuilding the container

In order to update the `gaplib-actions-runner` podman container, e.g. to get the
latest OS security fixes, follow these steps:
```
# Stop actions-runner service
sudo systemctl stop actions-runner

# Delete old container
sudo podman container rm gaplib-actions-runner

# Delete old image
sudo podman image rm localhost/zlib-ng/actions-runner

# Build image
sudo podman build --squash -f Dockerfile.zlib-ng --tag zlib-ng/actions-runner --build-arg .

# Build container
sudo podman create --name=gaplib-actions-runner --env-file=/etc/actions-runner --init --interactive --volume=actions-runner-temp:/home/actions-runner zlib-ng/actions-runner

# Start actions-runner service
sudo systemctl start actions-runner
```
222
3rdparty/zlib-ng/arch/s390/crc32-vx.c
vendored
Normal file
222
3rdparty/zlib-ng/arch/s390/crc32-vx.c
vendored
Normal file
@ -0,0 +1,222 @@
|
||||
/*
|
||||
* Hardware-accelerated CRC-32 variants for Linux on z Systems
|
||||
*
|
||||
* Use the z/Architecture Vector Extension Facility to accelerate the
|
||||
* computing of bitreflected CRC-32 checksums.
|
||||
*
|
||||
* This CRC-32 implementation algorithm is bitreflected and processes
|
||||
* the least-significant bit first (Little-Endian).
|
||||
*
|
||||
* This code was originally written by Hendrik Brueckner
|
||||
* <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
|
||||
* relicensed under the zlib license.
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "arch_functions.h"
|
||||
|
||||
#include <vecintrin.h>
|
||||
|
||||
typedef unsigned char uv16qi __attribute__((vector_size(16)));
|
||||
typedef unsigned int uv4si __attribute__((vector_size(16)));
|
||||
typedef unsigned long long uv2di __attribute__((vector_size(16)));
|
||||
|
||||
static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
|
||||
/*
|
||||
* The CRC-32 constant block contains reduction constants to fold and
|
||||
* process particular chunks of the input data stream in parallel.
|
||||
*
|
||||
* For the CRC-32 variants, the constants are precomputed according to
|
||||
* these definitions:
|
||||
*
|
||||
* R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
|
||||
* R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
|
||||
* R3 = [(x128+32 mod P'(x) << 32)]' << 1
|
||||
* R4 = [(x128-32 mod P'(x) << 32)]' << 1
|
||||
* R5 = [(x64 mod P'(x) << 32)]' << 1
|
||||
* R6 = [(x32 mod P'(x) << 32)]' << 1
|
||||
*
|
||||
* The bitreflected Barret reduction constant, u', is defined as
|
||||
* the bit reversal of floor(x**64 / P(x)).
|
||||
*
|
||||
* where P(x) is the polynomial in the normal domain and the P'(x) is the
|
||||
* polynomial in the reversed (bitreflected) domain.
|
||||
*
|
||||
* CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
|
||||
*
|
||||
* P(x) = 0x04C11DB7
|
||||
* P'(x) = 0xEDB88320
|
||||
*/
|
||||
const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
|
||||
const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
|
||||
const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
|
||||
const uv2di r5 = {0, 0x163CD6124}; /* R5 */
|
||||
const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
|
||||
const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
|
||||
|
||||
/*
|
||||
* Load the initial CRC value.
|
||||
*
|
||||
* The CRC value is loaded into the rightmost word of the
|
||||
* vector register and is later XORed with the LSB portion
|
||||
* of the loaded input data.
|
||||
*/
|
||||
uv2di v0 = {0, 0};
|
||||
v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
|
||||
|
||||
/* Load a 64-byte data chunk and XOR with CRC */
|
||||
uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
|
||||
uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
|
||||
uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
|
||||
uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
|
||||
|
||||
v1 ^= v0;
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
|
||||
while (len >= 64) {
|
||||
/* Load the next 64-byte data chunk */
|
||||
uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
|
||||
uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
|
||||
uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
|
||||
uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
|
||||
|
||||
/*
|
||||
* Perform a GF(2) multiplication of the doublewords in V1 with
|
||||
* the R1 and R2 reduction constants in V0. The intermediate result
|
||||
* is then folded (accumulated) with the next data chunk in PART1 and
|
||||
* stored in V1. Repeat this step for the register contents
|
||||
* in V2, V3, and V4 respectively.
|
||||
*/
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
|
||||
v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
|
||||
v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
|
||||
v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
|
||||
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
|
||||
* and R4 and accumulating the next 128-bit chunk until a single 128-bit
|
||||
* value remains.
|
||||
*/
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
|
||||
|
||||
while (len >= 16) {
|
||||
/* Load next data chunk */
|
||||
v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
|
||||
|
||||
/* Fold next data chunk */
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
|
||||
|
||||
buf += 16;
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up a vector register for byte shifts. The shift value must
|
||||
* be loaded in bits 1-4 in byte element 7 of a vector register.
|
||||
* Shift by 8 bytes: 0x40
|
||||
* Shift by 4 bytes: 0x20
|
||||
*/
|
||||
uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
v9 = vec_insert((unsigned char)0x40, v9, 7);
|
||||
|
||||
/*
|
||||
* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
|
||||
* to move R4 into the rightmost doubleword and set the leftmost
|
||||
* doubleword to 0x1.
|
||||
*/
|
||||
v0 = vec_srb(r4r3, (uv2di)v9);
|
||||
v0[0] = 1;
|
||||
|
||||
/*
|
||||
* Compute GF(2) product of V1 and V0. The rightmost doubleword
|
||||
* of V1 is multiplied with R4. The leftmost doubleword of V1 is
|
||||
* multiplied by 0x1 and is then XORed with rightmost product.
|
||||
* Implicitly, the intermediate leftmost product becomes padded
|
||||
*/
|
||||
v1 = (uv2di)vec_gfmsum_128(v0, v1);
|
||||
|
||||
/*
|
||||
* Now do the final 32-bit fold by multiplying the rightmost word
|
||||
* in V1 with R5 and XOR the result with the remaining bits in V1.
|
||||
*
|
||||
* To achieve this by a single VGFMAG, right shift V1 by a word
|
||||
* and store the result in V2 which is then accumulated. Use the
|
||||
* vector unpack instruction to load the rightmost half of the
|
||||
* doubleword into the rightmost doubleword element of V1; the other
|
||||
* half is loaded in the leftmost doubleword.
|
||||
* The vector register with CONST_R5 contains the R5 constant in the
|
||||
* rightmost doubleword and the leftmost doubleword is zero to ignore
|
||||
* the leftmost product of V1.
|
||||
*/
|
||||
v9 = vec_insert((unsigned char)0x20, v9, 7);
|
||||
v2 = vec_srb(v1, (uv2di)v9);
|
||||
v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
|
||||
v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
|
||||
|
||||
/*
|
||||
* Apply a Barret reduction to compute the final 32-bit CRC value.
|
||||
*
|
||||
* The input values to the Barret reduction are the degree-63 polynomial
|
||||
* in V1 (R(x)), degree-32 generator polynomial, and the reduction
|
||||
* constant u. The Barret reduction result is the CRC value of R(x) mod
|
||||
* P(x).
|
||||
*
|
||||
* The Barret reduction algorithm is defined as:
|
||||
*
|
||||
* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
|
||||
* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
|
||||
* 3. C(x) = R(x) XOR T2(x) mod x^32
|
||||
*
|
||||
* Note: The leftmost doubleword of vector register containing
|
||||
* CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
|
||||
* is zero and does not contribute to the final result.
|
||||
*/
|
||||
|
||||
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
|
||||
v2 = vec_unpackl((uv4si)v1);
|
||||
v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
|
||||
|
||||
/*
|
||||
* Compute the GF(2) product of the CRC polynomial with T1(x) in
|
||||
* V2 and XOR the intermediate result, T2(x), with the value in V1.
|
||||
* The final result is stored in word element 2 of V2.
|
||||
*/
|
||||
v2 = vec_unpackl((uv4si)v2);
|
||||
v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
|
||||
|
||||
return ((uv4si)v2)[2];
|
||||
}
|
||||
|
||||
#define VX_MIN_LEN 64
|
||||
#define VX_ALIGNMENT 16L
|
||||
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
|
||||
|
||||
uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
|
||||
size_t prealign, aligned, remaining;
|
||||
|
||||
if (len < VX_MIN_LEN + VX_ALIGN_MASK)
|
||||
return PREFIX(crc32_braid)(crc, buf, len);
|
||||
|
||||
if ((uintptr_t)buf & VX_ALIGN_MASK) {
|
||||
prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
|
||||
len -= prealign;
|
||||
crc = PREFIX(crc32_braid)(crc, buf, prealign);
|
||||
buf += prealign;
|
||||
}
|
||||
aligned = len & ~VX_ALIGN_MASK;
|
||||
remaining = len & VX_ALIGN_MASK;
|
||||
|
||||
crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
|
||||
|
||||
if (remaining)
|
||||
crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);
|
||||
|
||||
return crc;
|
||||
}
|
119
3rdparty/zlib-ng/arch/s390/dfltcc_common.h
vendored
Normal file
119
3rdparty/zlib-ng/arch/s390/dfltcc_common.h
vendored
Normal file
@ -0,0 +1,119 @@
|
||||
#ifndef DFLTCC_COMMON_H
|
||||
#define DFLTCC_COMMON_H
|
||||
|
||||
#include "zutil.h"
|
||||
|
||||
/*
|
||||
Parameter Block for Query Available Functions.
|
||||
*/
|
||||
struct dfltcc_qaf_param {
|
||||
char fns[16];
|
||||
char reserved1[8];
|
||||
char fmts[2];
|
||||
char reserved2[6];
|
||||
} ALIGNED_(8);
|
||||
|
||||
/*
|
||||
Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
|
||||
*/
|
||||
struct dfltcc_param_v0 {
|
||||
uint16_t pbvn; /* Parameter-Block-Version Number */
|
||||
uint8_t mvn; /* Model-Version Number */
|
||||
uint8_t ribm; /* Reserved for IBM use */
|
||||
uint32_t reserved32 : 31;
|
||||
uint32_t cf : 1; /* Continuation Flag */
|
||||
uint8_t reserved64[8];
|
||||
uint32_t nt : 1; /* New Task */
|
||||
uint32_t reserved129 : 1;
|
||||
uint32_t cvt : 1; /* Check Value Type */
|
||||
uint32_t reserved131 : 1;
|
||||
uint32_t htt : 1; /* Huffman-Table Type */
|
||||
uint32_t bcf : 1; /* Block-Continuation Flag */
|
||||
uint32_t bcc : 1; /* Block Closing Control */
|
||||
uint32_t bhf : 1; /* Block Header Final */
|
||||
uint32_t reserved136 : 1;
|
||||
uint32_t reserved137 : 1;
|
||||
uint32_t dhtgc : 1; /* DHT Generation Control */
|
||||
uint32_t reserved139 : 5;
|
||||
uint32_t reserved144 : 5;
|
||||
uint32_t sbb : 3; /* Sub-Byte Boundary */
|
||||
uint8_t oesc; /* Operation-Ending-Supplemental Code */
|
||||
uint32_t reserved160 : 12;
|
||||
uint32_t ifs : 4; /* Incomplete-Function Status */
|
||||
uint16_t ifl; /* Incomplete-Function Length */
|
||||
uint8_t reserved192[8];
|
||||
uint8_t reserved256[8];
|
||||
uint8_t reserved320[4];
|
||||
uint16_t hl; /* History Length */
|
||||
uint32_t reserved368 : 1;
|
||||
uint16_t ho : 15; /* History Offset */
|
||||
uint32_t cv; /* Check Value */
|
||||
uint32_t eobs : 15; /* End-of-block Symbol */
|
||||
uint32_t reserved431: 1;
|
||||
uint8_t eobl : 4; /* End-of-block Length */
|
||||
uint32_t reserved436 : 12;
|
||||
uint32_t reserved448 : 4;
|
||||
uint16_t cdhtl : 12; /* Compressed-Dynamic-Huffman Table
|
||||
Length */
|
||||
uint8_t reserved464[6];
|
||||
uint8_t cdht[288]; /* Compressed-Dynamic-Huffman Table */
|
||||
uint8_t reserved[24];
|
||||
uint8_t ribm2[8]; /* Reserved for IBM use */
|
||||
uint8_t csb[1152]; /* Continuation-State Buffer */
|
||||
} ALIGNED_(8);
|
||||
|
||||
/*
|
||||
Extension of inflate_state and deflate_state.
|
||||
*/
|
||||
struct dfltcc_state {
|
||||
struct dfltcc_param_v0 param; /* Parameter block. */
|
||||
struct dfltcc_qaf_param af; /* Available functions. */
|
||||
char msg[64]; /* Buffer for strm->msg */
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
struct dfltcc_state common;
|
||||
uint16_t level_mask; /* Levels on which to use DFLTCC */
|
||||
uint32_t block_size; /* New block each X bytes */
|
||||
size_t block_threshold; /* New block after total_in > X */
|
||||
uint32_t dht_threshold; /* New block only if avail_in >= X */
|
||||
} arch_deflate_state;
|
||||
|
||||
typedef struct {
|
||||
struct dfltcc_state common;
|
||||
} arch_inflate_state;
|
||||
|
||||
/*
|
||||
History buffer size.
|
||||
*/
|
||||
#define HB_BITS 15
|
||||
#define HB_SIZE (1 << HB_BITS)
|
||||
|
||||
/*
|
||||
Sizes of deflate block parts.
|
||||
*/
|
||||
#define DFLTCC_BLOCK_HEADER_BITS 3
|
||||
#define DFLTCC_HLITS_COUNT_BITS 5
|
||||
#define DFLTCC_HDISTS_COUNT_BITS 5
|
||||
#define DFLTCC_HCLENS_COUNT_BITS 4
|
||||
#define DFLTCC_MAX_HCLENS 19
|
||||
#define DFLTCC_HCLEN_BITS 3
|
||||
#define DFLTCC_MAX_HLITS 286
|
||||
#define DFLTCC_MAX_HDISTS 30
|
||||
#define DFLTCC_MAX_HLIT_HDIST_BITS 7
|
||||
#define DFLTCC_MAX_SYMBOL_BITS 16
|
||||
#define DFLTCC_MAX_EOBS_BITS 15
|
||||
#define DFLTCC_MAX_PADDING_BITS 7
|
||||
|
||||
#define DEFLATE_BOUND_COMPLEN(source_len) \
|
||||
((DFLTCC_BLOCK_HEADER_BITS + \
|
||||
DFLTCC_HLITS_COUNT_BITS + \
|
||||
DFLTCC_HDISTS_COUNT_BITS + \
|
||||
DFLTCC_HCLENS_COUNT_BITS + \
|
||||
DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
|
||||
(DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
|
||||
(source_len) * DFLTCC_MAX_SYMBOL_BITS + \
|
||||
DFLTCC_MAX_EOBS_BITS + \
|
||||
DFLTCC_MAX_PADDING_BITS) >> 3)
|
||||
|
||||
#endif
|
383
3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c
vendored
Normal file
383
3rdparty/zlib-ng/arch/s390/dfltcc_deflate.c
vendored
Normal file
@ -0,0 +1,383 @@
|
||||
/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
|
||||
|
||||
/*
|
||||
Use the following commands to build zlib-ng with DFLTCC compression support:
|
||||
|
||||
$ ./configure --with-dfltcc-deflate
|
||||
or
|
||||
|
||||
$ cmake -DWITH_DFLTCC_DEFLATE=1 .
|
||||
|
||||
and then
|
||||
|
||||
$ make
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
#include "trees_emit.h"
|
||||
#include "dfltcc_deflate.h"
|
||||
#include "dfltcc_detail.h"
|
||||
|
||||
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
arch_deflate_state *dfltcc_state = &state->arch;
|
||||
|
||||
dfltcc_reset_state(&dfltcc_state->common);
|
||||
|
||||
/* Initialize tuning parameters */
|
||||
dfltcc_state->level_mask = DFLTCC_LEVEL_MASK;
|
||||
dfltcc_state->block_size = DFLTCC_BLOCK_SIZE;
|
||||
dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
|
||||
dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
|
||||
}
|
||||
|
||||
static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
|
||||
int reproducible) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
arch_deflate_state *dfltcc_state = &state->arch;
|
||||
|
||||
/* Unsupported compression settings */
|
||||
if ((dfltcc_state->level_mask & (1 << level)) == 0)
|
||||
return 0;
|
||||
if (window_bits != HB_BITS)
|
||||
return 0;
|
||||
if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY)
|
||||
return 0;
|
||||
if (reproducible)
|
||||
return 0;
|
||||
|
||||
/* Unsupported hardware */
|
||||
if (!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_GDHT) ||
|
||||
!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_CMPR) ||
|
||||
!is_bit_set(dfltcc_state->common.af.fmts, DFLTCC_FMT0))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
|
||||
return dfltcc_can_deflate_with_params(strm, state->level, state->w_bits, state->strategy, state->reproducible);
|
||||
}
|
||||
|
||||
static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
size_t avail_in = strm->avail_in;
|
||||
|
||||
dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL);
|
||||
}
|
||||
|
||||
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
size_t avail_in = strm->avail_in;
|
||||
size_t avail_out = strm->avail_out;
|
||||
dfltcc_cc cc;
|
||||
|
||||
cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
|
||||
param, &strm->next_out, &avail_out,
|
||||
&strm->next_in, &avail_in, state->window);
|
||||
strm->total_in += (strm->avail_in - avail_in);
|
||||
strm->total_out += (strm->avail_out - avail_out);
|
||||
strm->avail_in = avail_in;
|
||||
strm->avail_out = avail_out;
|
||||
return cc;
|
||||
}
|
||||
|
||||
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
|
||||
send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
|
||||
PREFIX(flush_pending)(strm);
|
||||
if (state->pending != 0) {
|
||||
/* The remaining data is located in pending_out[0:pending]. If someone
|
||||
* calls put_byte() - this might happen in deflate() - the byte will be
|
||||
* placed into pending_buf[pending], which is incorrect. Move the
|
||||
* remaining data to the beginning of pending_buf so that put_byte() is
|
||||
* usable again.
|
||||
*/
|
||||
memmove(state->pending_buf, state->pending_out, state->pending);
|
||||
state->pending_out = state->pending_buf;
|
||||
}
|
||||
#ifdef ZLIB_DEBUG
|
||||
state->compressed_len += param->eobl;
|
||||
#endif
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
arch_deflate_state *dfltcc_state = &state->arch;
|
||||
struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
|
||||
uInt masked_avail_in;
|
||||
dfltcc_cc cc;
|
||||
int need_empty_block;
|
||||
int soft_bcc;
|
||||
int no_flush;
|
||||
|
||||
if (!PREFIX(dfltcc_can_deflate)(strm)) {
|
||||
/* Clear history. */
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
again:
|
||||
masked_avail_in = 0;
|
||||
soft_bcc = 0;
|
||||
no_flush = flush == Z_NO_FLUSH;
|
||||
|
||||
/* No input data. Return, except when Continuation Flag is set, which means
|
||||
* that DFLTCC has buffered some output in the parameter block and needs to
|
||||
* be called again in order to flush it.
|
||||
*/
|
||||
if (strm->avail_in == 0 && !param->cf) {
|
||||
/* A block is still open, and the hardware does not support closing
|
||||
* blocks without adding data. Thus, close it manually.
|
||||
*/
|
||||
if (!no_flush && param->bcf) {
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
}
|
||||
/* Let one of deflate_* functions write a trailing empty block. */
|
||||
if (flush == Z_FINISH)
|
||||
return 0;
|
||||
/* Clear history. */
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0;
|
||||
/* Trigger block post-processing if necessary. */
|
||||
*result = no_flush ? need_more : block_done;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* There is an open non-BFINAL block, we are not going to close it just
|
||||
* yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
|
||||
* more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
|
||||
* DHT in order to adapt to a possibly changed input data distribution.
|
||||
*/
|
||||
if (param->bcf && no_flush &&
|
||||
strm->total_in > dfltcc_state->block_threshold &&
|
||||
strm->avail_in >= dfltcc_state->dht_threshold) {
|
||||
if (param->cf) {
|
||||
/* We need to flush the DFLTCC buffer before writing the
|
||||
* End-of-block Symbol. Mask the input data and proceed as usual.
|
||||
*/
|
||||
masked_avail_in += strm->avail_in;
|
||||
strm->avail_in = 0;
|
||||
no_flush = 0;
|
||||
} else {
|
||||
/* DFLTCC buffer is empty, so we can manually write the
|
||||
* End-of-block Symbol right away.
|
||||
*/
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
|
||||
}
|
||||
}
|
||||
|
||||
/* No space for compressed data. If we proceed, dfltcc_cmpr() will return
|
||||
* DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
|
||||
* set BCF=1, which is wrong. Avoid complications and return early.
|
||||
*/
|
||||
if (strm->avail_out == 0) {
|
||||
*result = need_more;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* The caller gave us too much data. Pass only one block worth of
|
||||
* uncompressed data to DFLTCC and mask the rest, so that on the next
|
||||
* iteration we start a new block.
|
||||
*/
|
||||
if (no_flush && strm->avail_in > dfltcc_state->block_size) {
|
||||
masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
|
||||
strm->avail_in = dfltcc_state->block_size;
|
||||
}
|
||||
|
||||
/* When we have an open non-BFINAL deflate block and caller indicates that
|
||||
* the stream is ending, we need to close an open deflate block and open a
|
||||
* BFINAL one.
|
||||
*/
|
||||
need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
|
||||
|
||||
/* Translate stream to parameter block */
|
||||
param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
|
||||
if (!no_flush)
|
||||
/* We need to close a block. Always do this in software - when there is
|
||||
* no input data, the hardware will not honor BCC. */
|
||||
soft_bcc = 1;
|
||||
if (flush == Z_FINISH && !param->bcf)
|
||||
/* We are about to open a BFINAL block, set Block Header Final bit
|
||||
* until the stream ends.
|
||||
*/
|
||||
param->bhf = 1;
|
||||
/* DFLTCC-CMPR will write to next_out, so make sure that buffers with
|
||||
* higher precedence are empty.
|
||||
*/
|
||||
Assert(state->pending == 0, "There must be no pending bytes");
|
||||
Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
|
||||
param->sbb = (unsigned int)state->bi_valid;
|
||||
if (param->sbb > 0)
|
||||
*strm->next_out = (unsigned char)state->bi_buf;
|
||||
/* Honor history and check value */
|
||||
param->nt = 0;
|
||||
if (state->wrap == 1)
|
||||
param->cv = strm->adler;
|
||||
else if (state->wrap == 2)
|
||||
param->cv = ZSWAP32(state->crc_fold.value);
|
||||
|
||||
/* When opening a block, choose a Huffman-Table Type */
|
||||
if (!param->bcf) {
|
||||
if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
|
||||
param->htt = HTT_FIXED;
|
||||
else {
|
||||
param->htt = HTT_DYNAMIC;
|
||||
dfltcc_gdht(strm);
|
||||
}
|
||||
}
|
||||
|
||||
/* Deflate */
|
||||
do {
|
||||
cc = dfltcc_cmpr(strm);
|
||||
if (strm->avail_in < 4096 && masked_avail_in > 0)
|
||||
/* We are about to call DFLTCC with a small input buffer, which is
|
||||
* inefficient. Since there is masked data, there will be at least
|
||||
* one more DFLTCC call, so skip the current one and make the next
|
||||
* one handle more data.
|
||||
*/
|
||||
break;
|
||||
} while (cc == DFLTCC_CC_AGAIN);
|
||||
|
||||
/* Translate parameter block to stream */
|
||||
strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
|
||||
state->bi_valid = param->sbb;
|
||||
if (state->bi_valid == 0)
|
||||
state->bi_buf = 0; /* Avoid accessing next_out */
|
||||
else
|
||||
state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
|
||||
if (state->wrap == 1)
|
||||
strm->adler = param->cv;
|
||||
else if (state->wrap == 2)
|
||||
state->crc_fold.value = ZSWAP32(param->cv);
|
||||
|
||||
/* Unmask the input data */
|
||||
strm->avail_in += masked_avail_in;
|
||||
masked_avail_in = 0;
|
||||
|
||||
/* If we encounter an error, it means there is a bug in DFLTCC call */
|
||||
Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
|
||||
|
||||
/* Update Block-Continuation Flag. It will be used to check whether to call
|
||||
* GDHT the next time.
|
||||
*/
|
||||
if (cc == DFLTCC_CC_OK) {
|
||||
if (soft_bcc) {
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
|
||||
} else
|
||||
param->bcf = 1;
|
||||
if (flush == Z_FINISH) {
|
||||
if (need_empty_block)
|
||||
/* Make the current deflate() call also close the stream */
|
||||
return 0;
|
||||
else {
|
||||
bi_windup(state);
|
||||
*result = finish_done;
|
||||
}
|
||||
} else {
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0; /* Clear history */
|
||||
*result = flush == Z_NO_FLUSH ? need_more : block_done;
|
||||
}
|
||||
} else {
|
||||
param->bcf = 1;
|
||||
*result = need_more;
|
||||
}
|
||||
if (strm->avail_in != 0 && strm->avail_out != 0)
|
||||
goto again; /* deflate() must use all input or all output */
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
Switching between hardware and software compression.
|
||||
|
||||
DFLTCC does not support all zlib settings, e.g. generation of non-compressed
|
||||
blocks or alternative window sizes. When such settings are applied on the
|
||||
fly with deflateParams, we need to convert between hardware and software
|
||||
window formats.
|
||||
*/
|
||||
static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
|
||||
return strm->total_in > 0 || param->nt == 0 || param->hl > 0;
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
int could_deflate = PREFIX(dfltcc_can_deflate)(strm);
|
||||
int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible);
|
||||
|
||||
if (can_deflate == could_deflate)
|
||||
/* We continue to work in the same mode - no changes needed */
|
||||
return Z_OK;
|
||||
|
||||
if (!dfltcc_was_deflate_used(strm))
|
||||
/* DFLTCC was not used yet - no changes needed */
|
||||
return Z_OK;
|
||||
|
||||
/* For now, do not convert between window formats - simply get rid of the old data instead */
|
||||
*flush = Z_FULL_FLUSH;
|
||||
return Z_OK;
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
|
||||
/* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might
|
||||
* close the block without resetting the compression state. Detect this
|
||||
* situation and return that deflation is not done.
|
||||
*/
|
||||
if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
|
||||
return 0;
|
||||
|
||||
/* Return that deflation is not done if DFLTCC is used and either it
|
||||
* buffered some data (Continuation Flag is set), or has not written EOBS
|
||||
* yet (Block-Continuation Flag is set).
|
||||
*/
|
||||
return !PREFIX(dfltcc_can_deflate)(strm) || (!param->cf && !param->bcf);
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
|
||||
return reproducible != state->reproducible && !dfltcc_was_deflate_used(strm);
|
||||
}
|
||||
|
||||
/*
|
||||
Preloading history.
|
||||
*/
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
|
||||
const unsigned char *dictionary, uInt dict_length) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
|
||||
append_history(param, state->window, dictionary, dict_length);
|
||||
state->strstart = 1; /* Add FDICT to zlib header */
|
||||
state->block_start = state->strstart; /* Make deflate_stored happy */
|
||||
return Z_OK;
|
||||
}
|
||||
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &state->arch.common.param;
|
||||
|
||||
if (dictionary)
|
||||
get_history(param, state->window, dictionary);
|
||||
if (dict_length)
|
||||
*dict_length = param->hl;
|
||||
return Z_OK;
|
||||
}
|
58
3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h
vendored
Normal file
58
3rdparty/zlib-ng/arch/s390/dfltcc_deflate.h
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
#ifndef DFLTCC_DEFLATE_H
|
||||
#define DFLTCC_DEFLATE_H
|
||||
|
||||
#include "deflate.h"
|
||||
#include "dfltcc_common.h"
|
||||
|
||||
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
|
||||
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
|
||||
const unsigned char *dictionary, uInt dict_length);
|
||||
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
|
||||
|
||||
#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
|
||||
do { \
|
||||
if (PREFIX(dfltcc_can_deflate)((strm))) \
|
||||
return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
|
||||
do { \
|
||||
if (PREFIX(dfltcc_can_deflate)((strm))) \
|
||||
return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
|
||||
|
||||
#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
|
||||
do { \
|
||||
int err; \
|
||||
\
|
||||
err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
|
||||
if (err == Z_STREAM_ERROR) \
|
||||
return err; \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
|
||||
|
||||
#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
|
||||
do { \
|
||||
if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
|
||||
(complen) = DEFLATE_BOUND_COMPLEN(source_len); \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
|
||||
|
||||
#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
|
||||
|
||||
#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
|
||||
|
||||
#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
|
||||
|
||||
#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
|
||||
|
||||
#endif
|
275
3rdparty/zlib-ng/arch/s390/dfltcc_detail.h
vendored
Normal file
275
3rdparty/zlib-ng/arch/s390/dfltcc_detail.h
vendored
Normal file
@ -0,0 +1,275 @@
|
||||
#include "zbuild.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
#include <sys/sdt.h>
|
||||
#endif
|
||||
|
||||
/*
|
||||
Tuning parameters.
|
||||
*/
|
||||
#ifndef DFLTCC_LEVEL_MASK
|
||||
#define DFLTCC_LEVEL_MASK 0x2
|
||||
#endif
|
||||
#ifndef DFLTCC_BLOCK_SIZE
|
||||
#define DFLTCC_BLOCK_SIZE 1048576
|
||||
#endif
|
||||
#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
|
||||
#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
|
||||
#endif
|
||||
#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
|
||||
#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
|
||||
#endif
|
||||
#ifndef DFLTCC_RIBM
|
||||
#define DFLTCC_RIBM 0
|
||||
#endif
|
||||
|
||||
#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
|
||||
|
||||
#define DFLTCC_SIZEOF_QAF 32
|
||||
static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
|
||||
|
||||
static inline int is_bit_set(const char *bits, int n) {
|
||||
return bits[n / 8] & (1 << (7 - (n % 8)));
|
||||
}
|
||||
|
||||
static inline void clear_bit(char *bits, int n) {
|
||||
bits[n / 8] &= ~(1 << (7 - (n % 8)));
|
||||
}
|
||||
|
||||
#define DFLTCC_FACILITY 151
|
||||
|
||||
static inline int is_dfltcc_enabled(void) {
|
||||
uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
|
||||
Z_REGISTER uint8_t r0 __asm__("r0");
|
||||
|
||||
memset(facilities, 0, sizeof(facilities));
|
||||
r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
|
||||
/* STFLE is supported since z9-109 and only in z/Architecture mode. When
|
||||
* compiling with -m31, gcc defaults to ESA mode, however, since the kernel
|
||||
* is 64-bit, it's always z/Architecture mode at runtime.
|
||||
*/
|
||||
__asm__ volatile(
|
||||
#ifndef __clang__
|
||||
".machinemode push\n"
|
||||
".machinemode zarch\n"
|
||||
#endif
|
||||
"stfle %[facilities]\n"
|
||||
#ifndef __clang__
|
||||
".machinemode pop\n"
|
||||
#endif
|
||||
: [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
|
||||
return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
|
||||
}
|
||||
|
||||
#define DFLTCC_FMT0 0
|
||||
|
||||
#define CVT_CRC32 0
|
||||
#define CVT_ADLER32 1
|
||||
#define HTT_FIXED 0
|
||||
#define HTT_DYNAMIC 1
|
||||
|
||||
#define DFLTCC_SIZEOF_GDHT_V0 384
|
||||
#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
|
||||
static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
|
||||
static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
|
||||
|
||||
static inline z_const char *oesc_msg(char *buf, int oesc) {
|
||||
if (oesc == 0x00)
|
||||
return NULL; /* Successful completion */
|
||||
else {
|
||||
sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
|
||||
return buf;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
C wrapper for the DEFLATE CONVERSION CALL instruction.
|
||||
*/
|
||||
typedef enum {
|
||||
DFLTCC_CC_OK = 0,
|
||||
DFLTCC_CC_OP1_TOO_SHORT = 1,
|
||||
DFLTCC_CC_OP2_TOO_SHORT = 2,
|
||||
DFLTCC_CC_OP2_CORRUPT = 2,
|
||||
DFLTCC_CC_AGAIN = 3,
|
||||
} dfltcc_cc;
|
||||
|
||||
#define DFLTCC_QAF 0
|
||||
#define DFLTCC_GDHT 1
|
||||
#define DFLTCC_CMPR 2
|
||||
#define DFLTCC_XPND 4
|
||||
#define HBT_CIRCULAR (1 << 7)
|
||||
#define DFLTCC_FN_MASK ((1 << 7) - 1)
|
||||
|
||||
/* Return lengths of high (starting at param->ho) and low (starting at 0) fragments of the circular history buffer. */
|
||||
static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) {
|
||||
*hl_high = MIN(param->hl, HB_SIZE - param->ho);
|
||||
*hl_low = param->hl - *hl_high;
|
||||
}
|
||||
|
||||
/* Notify instrumentation about an upcoming read/write access to the circular history buffer. */
|
||||
static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) {
|
||||
size_t hl_high, hl_low;
|
||||
|
||||
get_history_lengths(param, &hl_high, &hl_low);
|
||||
instrument_read_write(hist + param->ho, hl_high);
|
||||
instrument_read_write(hist, hl_low);
|
||||
}
|
||||
|
||||
/* Notify MSan about a completed write to the circular history buffer. */
|
||||
static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) {
|
||||
size_t hl_high, hl_low;
|
||||
|
||||
get_history_lengths(param, &hl_high, &hl_low);
|
||||
__msan_unpoison(hist + param->ho, hl_high);
|
||||
__msan_unpoison(hist, hl_low);
|
||||
}
|
||||
|
||||
static inline dfltcc_cc dfltcc(int fn, void *param,
|
||||
unsigned char **op1, size_t *len1,
|
||||
z_const unsigned char **op2, size_t *len2, void *hist) {
|
||||
unsigned char *t2 = op1 ? *op1 : NULL;
|
||||
unsigned char *orig_t2 = t2;
|
||||
size_t t3 = len1 ? *len1 : 0;
|
||||
z_const unsigned char *t4 = op2 ? *op2 : NULL;
|
||||
size_t t5 = len2 ? *len2 : 0;
|
||||
Z_REGISTER int r0 __asm__("r0");
|
||||
Z_REGISTER void *r1 __asm__("r1");
|
||||
Z_REGISTER unsigned char *r2 __asm__("r2");
|
||||
Z_REGISTER size_t r3 __asm__("r3");
|
||||
Z_REGISTER z_const unsigned char *r4 __asm__("r4");
|
||||
Z_REGISTER size_t r5 __asm__("r5");
|
||||
int cc;
|
||||
|
||||
/* Insert pre-instrumentation for DFLTCC. */
|
||||
switch (fn & DFLTCC_FN_MASK) {
|
||||
case DFLTCC_QAF:
|
||||
instrument_write(param, DFLTCC_SIZEOF_QAF);
|
||||
break;
|
||||
case DFLTCC_GDHT:
|
||||
instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0);
|
||||
instrument_read(t4, t5);
|
||||
break;
|
||||
case DFLTCC_CMPR:
|
||||
case DFLTCC_XPND:
|
||||
instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
|
||||
instrument_read(t4, t5);
|
||||
instrument_write(t2, t3);
|
||||
instrument_read_write_hist(param, hist);
|
||||
break;
|
||||
}
|
||||
|
||||
r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5;
|
||||
__asm__ volatile(
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
|
||||
#endif
|
||||
".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
|
||||
#endif
|
||||
"ipm %[cc]\n"
|
||||
: [r2] "+r" (r2)
|
||||
, [r3] "+r" (r3)
|
||||
, [r4] "+r" (r4)
|
||||
, [r5] "+r" (r5)
|
||||
, [cc] "=r" (cc)
|
||||
: [r0] "r" (r0)
|
||||
, [r1] "r" (r1)
|
||||
, [hist] "r" (hist)
|
||||
#ifdef HAVE_SYS_SDT_H
|
||||
, STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
|
||||
#endif
|
||||
: "cc", "memory");
|
||||
t2 = r2; t3 = r3; t4 = r4; t5 = r5;
|
||||
|
||||
/* Insert post-instrumentation for DFLTCC. */
|
||||
switch (fn & DFLTCC_FN_MASK) {
|
||||
case DFLTCC_QAF:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_QAF);
|
||||
break;
|
||||
case DFLTCC_GDHT:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
|
||||
break;
|
||||
case DFLTCC_CMPR:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
|
||||
__msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
|
||||
msan_unpoison_hist(param, hist);
|
||||
break;
|
||||
case DFLTCC_XPND:
|
||||
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
|
||||
__msan_unpoison(orig_t2, t2 - orig_t2);
|
||||
msan_unpoison_hist(param, hist);
|
||||
break;
|
||||
}
|
||||
|
||||
if (op1)
|
||||
*op1 = t2;
|
||||
if (len1)
|
||||
*len1 = t3;
|
||||
if (op2)
|
||||
*op2 = t4;
|
||||
if (len2)
|
||||
*len2 = t5;
|
||||
return (cc >> 28) & 3;
|
||||
}
|
||||
|
||||
#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
|
||||
|
||||
static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
|
||||
/* Initialize available functions */
|
||||
if (is_dfltcc_enabled()) {
|
||||
dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL);
|
||||
memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af));
|
||||
} else
|
||||
memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
|
||||
|
||||
/* Initialize parameter block */
|
||||
memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
|
||||
dfltcc_state->param.nt = 1;
|
||||
dfltcc_state->param.ribm = DFLTCC_RIBM;
|
||||
}
|
||||
|
||||
static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
|
||||
memcpy(dst, src, ALIGN_UP(size, 8) + extension_size);
|
||||
}
|
||||
|
||||
static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
|
||||
const unsigned char *buf, uInt count) {
|
||||
size_t offset;
|
||||
size_t n;
|
||||
|
||||
/* Do not use more than 32K */
|
||||
if (count > HB_SIZE) {
|
||||
buf += count - HB_SIZE;
|
||||
count = HB_SIZE;
|
||||
}
|
||||
offset = (param->ho + param->hl) % HB_SIZE;
|
||||
if (offset + count <= HB_SIZE)
|
||||
/* Circular history buffer does not wrap - copy one chunk */
|
||||
memcpy(history + offset, buf, count);
|
||||
else {
|
||||
/* Circular history buffer wraps - copy two chunks */
|
||||
n = HB_SIZE - offset;
|
||||
memcpy(history + offset, buf, n);
|
||||
memcpy(history, buf + n, count - n);
|
||||
}
|
||||
n = param->hl + count;
|
||||
if (n <= HB_SIZE)
|
||||
/* All history fits into buffer - no need to discard anything */
|
||||
param->hl = n;
|
||||
else {
|
||||
/* History does not fit into buffer - discard extra bytes */
|
||||
param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE;
|
||||
param->hl = HB_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
|
||||
unsigned char *buf) {
|
||||
size_t hl_high, hl_low;
|
||||
|
||||
get_history_lengths(param, &hl_high, &hl_low);
|
||||
memcpy(buf, history + param->ho, hl_high);
|
||||
memcpy(buf + hl_high, history, hl_low);
|
||||
}
|
191
3rdparty/zlib-ng/arch/s390/dfltcc_inflate.c
vendored
Normal file
@ -0,0 +1,191 @@
/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */

/*
   Use the following commands to build zlib-ng with DFLTCC decompression support:

        $ ./configure --with-dfltcc-inflate
   or

        $ cmake -DWITH_DFLTCC_INFLATE=1 .

   and then

        $ make
*/

#include "zbuild.h"
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "dfltcc_inflate.h"
#include "dfltcc_detail.h"

void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;

    dfltcc_reset_state(&state->arch.common);
}

int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = &state->arch.common;

    /* Unsupported hardware */
    return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
}

static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &state->arch.common.param;
    size_t avail_in = strm->avail_in;
    size_t avail_out = strm->avail_out;
    dfltcc_cc cc;

    cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
                param, &strm->next_out, &avail_out,
                &strm->next_in, &avail_in, state->window);
    strm->avail_in = avail_in;
    strm->avail_out = avail_out;
    return cc;
}

dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = &state->arch.common;
    struct dfltcc_param_v0 *param = &dfltcc_state->param;
    dfltcc_cc cc;

    if (flush == Z_BLOCK || flush == Z_TREES) {
        /* DFLTCC does not support stopping on block boundaries */
        if (PREFIX(dfltcc_inflate_disable)(strm)) {
            *ret = Z_STREAM_ERROR;
            return DFLTCC_INFLATE_BREAK;
        } else
            return DFLTCC_INFLATE_SOFTWARE;
    }

    if (state->last) {
        if (state->bits != 0) {
            strm->next_in++;
            strm->avail_in--;
            state->bits = 0;
        }
        state->mode = CHECK;
        return DFLTCC_INFLATE_CONTINUE;
    }

    if (strm->avail_in == 0 && !param->cf)
        return DFLTCC_INFLATE_BREAK;

    /* if window not in use yet, initialize */
    if (state->wsize == 0)
        state->wsize = 1U << state->wbits;

    /* Translate stream to parameter block */
    param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
    param->sbb = state->bits;
    if (param->hl)
        param->nt = 0; /* Honor history for the first block */
    if (state->wrap & 4)
        param->cv = state->flags ? ZSWAP32(state->check) : state->check;

    /* Inflate */
    do {
        cc = dfltcc_xpnd(strm);
    } while (cc == DFLTCC_CC_AGAIN);

    /* Translate parameter block to stream */
    strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
    state->last = cc == DFLTCC_CC_OK;
    state->bits = param->sbb;
    if (state->wrap & 4)
        strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
    if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
        /* Report an error if stream is corrupted */
        state->mode = BAD;
        return DFLTCC_INFLATE_CONTINUE;
    }
    state->mode = TYPEDO;
    /* Break if operands are exhausted, otherwise continue looping */
    return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
        DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
}

int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;

    return !state->arch.common.param.nt;
}

/*
   Rotates a circular buffer.
   The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
*/
static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
    unsigned char *p = pivot;
    unsigned char tmp;

    while (p != start) {
        tmp = *start;
        *start = *p;
        *p = tmp;

        start++;
        p++;

        if (p == end)
            p = pivot;
        else if (start == pivot)
            pivot = p;
    }
}

int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = &state->arch.common;
    struct dfltcc_param_v0 *param = &dfltcc_state->param;

    if (!PREFIX(dfltcc_can_inflate)(strm))
        return 0;
    if (PREFIX(dfltcc_was_inflate_used)(strm))
        /* DFLTCC has already decompressed some data. Since there is not
         * enough information to resume decompression in software, the call
         * must fail.
         */
        return 1;
    /* DFLTCC was not used yet - decompress in software */
    memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
    /* Convert the window from the hardware to the software format */
    rotate(state->window, state->window + param->ho, state->window + HB_SIZE);
    state->whave = state->wnext = MIN(param->hl, state->wsize);
    return 0;
}

/*
   Preloading history.
*/
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
                                                     const unsigned char *dictionary, uInt dict_length) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &state->arch.common.param;

    /* if window not in use yet, initialize */
    if (state->wsize == 0)
        state->wsize = 1U << state->wbits;

    append_history(param, state->window, dictionary, dict_length);
    state->havedict = 1;
    return Z_OK;
}

int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
                                                     unsigned char *dictionary, uInt *dict_length) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &state->arch.common.param;

    if (dictionary && state->window)
        get_history(param, state->window, dictionary);
    if (dict_length)
        *dict_length = param->hl;
    return Z_OK;
}
67
3rdparty/zlib-ng/arch/s390/dfltcc_inflate.h
vendored
Normal file
@ -0,0 +1,67 @@
#ifndef DFLTCC_INFLATE_H
#define DFLTCC_INFLATE_H

#include "dfltcc_common.h"

void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
typedef enum {
    DFLTCC_INFLATE_CONTINUE,
    DFLTCC_INFLATE_BREAK,
    DFLTCC_INFLATE_SOFTWARE,
} dfltcc_inflate_action;
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
                                                     const unsigned char *dictionary, uInt dict_length);
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
                                                     unsigned char *dictionary, uInt* dict_length);

#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)

#define INFLATE_PRIME_HOOK(strm, bits, value) \
    do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)

#define INFLATE_TYPEDO_HOOK(strm, flush) \
    if (PREFIX(dfltcc_can_inflate)((strm))) { \
        dfltcc_inflate_action action; \
\
        RESTORE(); \
        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
        LOAD(); \
        if (action == DFLTCC_INFLATE_CONTINUE) \
            break; \
        else if (action == DFLTCC_INFLATE_BREAK) \
            goto inf_leave; \
    }

#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))

#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))

#define INFLATE_MARK_HOOK(strm) \
    do { \
        if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
    } while (0)

#define INFLATE_SYNC_POINT_HOOK(strm) \
    do { \
        if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
    } while (0)

#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
    do { \
        if (PREFIX(dfltcc_can_inflate)((strm))) \
            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
    } while (0)

#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
    do { \
        if (PREFIX(dfltcc_can_inflate)((strm))) \
            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
    } while (0)

#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)

#endif
14
3rdparty/zlib-ng/arch/s390/s390_features.c
vendored
Normal file
@ -0,0 +1,14 @@
#include "zbuild.h"
#include "s390_features.h"

#ifdef HAVE_SYS_AUXV_H
#  include <sys/auxv.h>
#endif

#ifndef HWCAP_S390_VXRS
#define HWCAP_S390_VXRS HWCAP_S390_VX
#endif

void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
    features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
}
14
3rdparty/zlib-ng/arch/s390/s390_features.h
vendored
Normal file
@ -0,0 +1,14 @@
/* s390_features.h -- check for s390 features.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef S390_FEATURES_H_
#define S390_FEATURES_H_

struct s390_cpu_features {
    int has_vx;
};

void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);

#endif
20
3rdparty/zlib-ng/arch/s390/s390_functions.h
vendored
Normal file
@ -0,0 +1,20 @@
/* s390_functions.h -- s390 implementations for arch-specific functions.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef S390_FUNCTIONS_H_
#define S390_FUNCTIONS_H_

#ifdef S390_CRC32_VX
uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
#endif


#ifdef DISABLE_RUNTIME_CPU_DETECTION
#  if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
#    undef native_crc32
#    define native_crc32 = crc32_s390_vx
#  endif
#endif

#endif
47
3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.Dockerfile
vendored
Normal file
@ -0,0 +1,47 @@
# Self-Hosted IBM Z Github Actions Runner.

FROM almalinux:9

RUN dnf update -y -q && \
    dnf install -y -q --enablerepo=crb wget git which sudo jq \
    cmake make automake autoconf m4 libtool ninja-build python3-pip \
    gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \
    glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel

RUN dnf install -y -q dotnet-sdk-6.0 && \
    echo "Using SDK - `dotnet --version`"

COPY runner-s390x.patch /tmp/runner.patch
COPY runner-global.json /tmp/global.json

RUN cd /tmp && \
    git clone -q https://github.com/actions/runner && \
    cd runner && \
    git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -b build && \
    git apply /tmp/runner.patch && \
    cp -f /tmp/global.json src/global.json


RUN cd /tmp/runner/src && \
    ./dev.sh layout && \
    ./dev.sh package && \
    rm -rf /root/.dotnet /root/.nuget

RUN useradd -c "Action Runner" -m actions-runner && \
    usermod -L actions-runner

RUN tar -xf /tmp/runner/_package/*.tar.gz -C /home/actions-runner && \
    chown -R actions-runner:actions-runner /home/actions-runner

#VOLUME /home/actions-runner

RUN rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \
    dnf clean all

USER actions-runner

# Scripts.
COPY fs/ /
WORKDIR /home/actions-runner
ENTRYPOINT ["/usr/bin/entrypoint"]
CMD ["/usr/bin/actions-runner"]
18
3rdparty/zlib-ng/arch/s390/self-hosted-builder/actions-runner.service
vendored
Normal file
@ -0,0 +1,18 @@
[Unit]
Description=Podman container: Gaplib Github Actions Runner
Wants=network-online.target
After=network-online.target
StartLimitIntervalSec=1
RequiresMountsFor=/run/user/1001/containers

[Service]
Environment=PODMAN_SYSTEMD_UNIT=%n
Restart=always
TimeoutStopSec=61
ExecStart=/usr/bin/podman start gaplib-actions-runner
ExecStop=/usr/bin/podman stop -t 1 gaplib-actions-runner
ExecStopPost=/usr/bin/podman stop -t 1 gaplib-actions-runner
Type=forking

[Install]
WantedBy=default.target
5
3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-global.json
vendored
Normal file
@ -0,0 +1,5 @@
{
    "sdk": {
        "version": "6.0.421"
    }
}
243
3rdparty/zlib-ng/arch/s390/self-hosted-builder/runner-s390x.patch
vendored
Normal file
@ -0,0 +1,243 @@
diff --git a/src/Directory.Build.props b/src/Directory.Build.props
index 9db5fac..f02e235 100644
--- a/src/Directory.Build.props
+++ b/src/Directory.Build.props
@@ -44,6 +44,9 @@
 <PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-arm64'">
 <DefineConstants>$(DefineConstants);ARM64</DefineConstants>
 </PropertyGroup>
+ <PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-s390x'">
+ <DefineConstants>$(DefineConstants);S390X</DefineConstants>
+ </PropertyGroup>

 <!-- Set TRACE/DEBUG vars -->
 <PropertyGroup>
diff --git a/src/Misc/externals.sh b/src/Misc/externals.sh
index 383221e..1555f67 100755
--- a/src/Misc/externals.sh
+++ b/src/Misc/externals.sh
@@ -189,3 +189,8 @@ if [[ "$PACKAGERUNTIME" == "linux-arm" ]]; then
 acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-armv7l.tar.gz" node16 fix_nested_dir
 acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-armv7l.tar.gz" node20 fix_nested_dir
 fi
+
+if [[ "$PACKAGERUNTIME" == "linux-s390x" ]]; then
+ acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-s390x.tar.gz" node16 fix_nested_dir
+ acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-s390x.tar.gz" node20 fix_nested_dir
+fi
diff --git a/src/Misc/layoutroot/config.sh b/src/Misc/layoutroot/config.sh
index 14cc6ba..9b5b8e6 100755
--- a/src/Misc/layoutroot/config.sh
+++ b/src/Misc/layoutroot/config.sh
@@ -20,25 +20,29 @@ then

 message="Execute sudo ./bin/installdependencies.sh to install any missing Dotnet Core 6.0 dependencies."

- ldd ./bin/libcoreclr.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
- fi
+ ARCH=`uname -m`
+ if [ "${ARCH}" != "s390x" -a "${ARCH}" != "ppc64le" ]
+ then
+ ldd ./bin/libcoreclr.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi

- ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
- fi
+ ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi

- ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
+ ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi
 fi

 if ! [ -x "$(command -v ldconfig)" ]; then
diff --git a/src/Runner.Common/Constants.cs b/src/Runner.Common/Constants.cs
index 177e3c9..9545981 100644
--- a/src/Runner.Common/Constants.cs
+++ b/src/Runner.Common/Constants.cs
@@ -58,7 +58,8 @@ namespace GitHub.Runner.Common
 X86,
 X64,
 Arm,
- Arm64
+ Arm64,
+ S390x
 }

 public static class Runner
@@ -81,6 +82,8 @@ namespace GitHub.Runner.Common
 public static readonly Architecture PlatformArchitecture = Architecture.Arm;
 #elif ARM64
 public static readonly Architecture PlatformArchitecture = Architecture.Arm64;
+#elif S390X
+ public static readonly Architecture PlatformArchitecture = Architecture.S390x;
 #else
 public static readonly Architecture PlatformArchitecture = Architecture.X64;
 #endif
diff --git a/src/Runner.Common/Util/VarUtil.cs b/src/Runner.Common/Util/VarUtil.cs
index 97273a1..2a34430 100644
--- a/src/Runner.Common/Util/VarUtil.cs
+++ b/src/Runner.Common/Util/VarUtil.cs
@@ -53,6 +53,8 @@ namespace GitHub.Runner.Common.Util
 return "ARM";
 case Constants.Architecture.Arm64:
 return "ARM64";
+ case Constants.Architecture.S390x:
+ return "S390X";
 default:
 throw new NotSupportedException(); // Should never reach here.
 }
diff --git a/src/Test/L0/ConstantGenerationL0.cs b/src/Test/L0/ConstantGenerationL0.cs
index 2042485..a9d8b46 100644
--- a/src/Test/L0/ConstantGenerationL0.cs
+++ b/src/Test/L0/ConstantGenerationL0.cs
@@ -20,6 +20,7 @@ namespace GitHub.Runner.Common.Tests
 "linux-x64",
 "linux-arm",
 "linux-arm64",
+ "linux-s390x",
 "osx-x64",
 "osx-arm64"
 };
diff --git a/src/Test/L0/Listener/SelfUpdaterL0.cs b/src/Test/L0/Listener/SelfUpdaterL0.cs
index 26ba65e..6791df3 100644
--- a/src/Test/L0/Listener/SelfUpdaterL0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterL0.cs
@@ -1,4 +1,4 @@
-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -16,6 +16,7 @@ using Xunit;

 namespace GitHub.Runner.Common.Tests.Listener
 {
+#if !S390X // Self-update is not currently supported on S390X
 public sealed class SelfUpdaterL0
 {
 private Mock<IRunnerServer> _runnerServer;
@@ -291,5 +292,6 @@ namespace GitHub.Runner.Common.Tests.Listener
 }
 }
 }
+#endif
 }
 #endif
diff --git a/src/Test/L0/Listener/SelfUpdaterV2L0.cs b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
index 5115a6b..dd8d198 100644
--- a/src/Test/L0/Listener/SelfUpdaterV2L0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
@@ -1,4 +1,4 @@
-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
 using System;
 using System.Collections.Generic;
 using System.IO;
diff --git a/src/Test/L0/Worker/StepHostL0.cs b/src/Test/L0/Worker/StepHostL0.cs
index f6b5889..26f8e21 100644
--- a/src/Test/L0/Worker/StepHostL0.cs
+++ b/src/Test/L0/Worker/StepHostL0.cs
@@ -31,7 +31,7 @@ namespace GitHub.Runner.Common.Tests.Worker
 return hc;
 }

-#if OS_LINUX
+#if OS_LINUX && !S390X
 [Fact]
 [Trait("Level", "L0")]
 [Trait("Category", "Worker")]
diff --git a/src/dev.sh b/src/dev.sh
index fa637d1..8c66f37 100755
--- a/src/dev.sh
+++ b/src/dev.sh
@@ -54,6 +54,7 @@ elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
 case $CPU_NAME in
 armv7l) RUNTIME_ID="linux-arm";;
 aarch64) RUNTIME_ID="linux-arm64";;
+ s390x) RUNTIME_ID="linux-s390x";;
 esac
 fi
 elif [[ "$CURRENT_PLATFORM" == 'darwin' ]]; then
@@ -80,7 +81,7 @@ if [[ "$CURRENT_PLATFORM" == 'windows' ]]; then
 exit 1
 fi
 elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
- if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') ]]; then
+ if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') && ("$RUNTIME_ID" != 'linux-s390x') ]]; then
 echo "Failed: Can't build $RUNTIME_ID package $CURRENT_PLATFORM" >&2
 exit 1
 fi
@@ -199,7 +200,8 @@ function package ()
 popd > /dev/null
 }

-if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet") ]]; then
+if [[ "${RUNTIME_ID}" != "linux-s390x" && ((! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet")) ]]; then
+

 # Download dotnet SDK to ../_dotnetsdk directory
 heading "Ensure Dotnet SDK"
@@ -224,8 +226,10 @@ if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTN
 echo "${DOTNETSDK_VERSION}" > "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}"
 fi

-echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
-export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+if [[ -d "${DOTNETSDK_INSTALLDIR}" ]]; then
+ echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
+ export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+fi

 heading "Dotnet SDK Version"
 dotnet --version
diff --git a/src/dir.proj b/src/dir.proj
index 056a312..8370922 100644
--- a/src/dir.proj
+++ b/src/dir.proj
@@ -41,8 +41,18 @@
 </ItemGroup>

 <Target Name="Build" DependsOnTargets="GenerateConstant">
- <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" />
- <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);RuntimeIdentifier=$(PackageRuntime);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
+ <PropertyGroup>
+ <!-- Normally we want to publish a self-contained app for $(PackageRuntime) -->
+ <PublishRuntimeIdentifier>RuntimeIdentifier=$(PackageRuntime)</PublishRuntimeIdentifier>
+ <!-- However, on s390x there are no apphost or runtime packages on nuget.org, so self-contained publishing is not supported.
+ Perform a non-self-contained publish using the current runtime identifier (normally something like rhel.8-s390x) instead.
+ In addition, when not using an explicit runtime identifier, the SDK will copy runtime assets from dependent packages;
+ as this would confuse the expected layout, disable that behavior as well. -->
+ <PublishRuntimeIdentifier Condition="'$(PackageRuntime)' == 'linux-s390x'">SelfContained=false;CopyLocalRuntimeTargetAssets=false</PublishRuntimeIdentifier>
+ </PropertyGroup>
+
+ <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" Properties="$(PublishRuntimeIdentifier)" />
+ <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);$(PublishRuntimeIdentifier);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
 <Exec Command="%22$(DesktopMSBuild)%22 Runner.Service/Windows/RunnerService.csproj /p:Configuration=$(BUILDCONFIG) /p:PackageRuntime=$(PackageRuntime) /p:OutputPath=%22$(MSBuildProjectDirectory)/../_layout/bin%22" ConsoleToMSBuild="true" Condition="'$(PackageRuntime)' == 'win-x64' Or '$(PackageRuntime)' == 'win-x86' Or '$(PackageRuntime)' == 'win-arm64'" />
 </Target>
11
3rdparty/zlib-ng/arch/x86/Makefile.in
vendored
@ -35,7 +35,6 @@ all: \
chunkset_ssse3.o chunkset_ssse3.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
insert_string_sse42.o insert_string_sse42.lo \
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
slide_hash_avx2.o slide_hash_avx2.lo \
@ -77,12 +76,6 @@ compare256_sse2.o:
compare256_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

insert_string_sse42.o:
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

insert_string_sse42.lo:
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

crc32_pclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

@ -90,10 +83,10 @@ crc32_pclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_vpclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

crc32_vpclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

slide_hash_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

17
3rdparty/zlib-ng/arch/x86/adler32_avx2.c
vendored
@ -9,24 +9,15 @@

#ifdef X86_AVX2

#include "../../zbuild.h"
#include "zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "../../adler32_p.h"
#include "adler32_p.h"
#include "adler32_avx2_p.h"
#include "x86_intrins.h"

#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);

#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif

static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
@ -44,9 +35,9 @@ rem_peel:
}
} else if (len < 32) {
if (COPY) {
return copy_sub32(adler, dst, src, len);
return adler32_fold_copy_sse42(adler, dst, src, len);
} else {
return sub32(adler, src, len);
return adler32_ssse3(adler, src, len);
}
}

13
3rdparty/zlib-ng/arch/x86/adler32_avx512.c
vendored
@ -8,10 +8,9 @@

#ifdef X86_AVX512

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "../../cpu_features.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
@ -33,13 +32,7 @@ rem_peel:
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
}

#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}

__m512i vbuf, vs1_0, vs3;

21
3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
vendored
@ -9,11 +9,10 @@

#ifdef X86_AVX512VNNI

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
@ -28,20 +27,10 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size

rem_peel:
if (len < 32)
#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif

if (len < 64)
#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif

const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
@ -135,11 +124,7 @@ rem_peel_copy:
__m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
_mm256_mask_storeu_epi8(dst, storemask, copy_vec);

#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}

const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,

5
3rdparty/zlib-ng/arch/x86/adler32_sse42.c
vendored
@ -6,9 +6,8 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"
#include <immintrin.h>

4
3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
vendored
@ -6,8 +6,8 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"

#ifdef X86_SSSE3

14
3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c
vendored
@ -4,10 +4,7 @@

#include "zbuild.h"

/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
 * code size by sharing the chunkcopy functions, which will certainly compile
 * to identical machine code */
#if defined(X86_SSSE3) && defined(X86_SSE2)
#if defined(X86_SSSE3)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"

@ -19,8 +16,6 @@ typedef __m128i chunk_t;
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL

static const lut_rem_pair perm_idx_lut[13] = {
{0, 1}, /* 3 */
@ -83,14 +78,11 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
return ret_vec;
}

extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);

#define CHUNKSIZE chunksize_ssse3
#define CHUNKMEMSET chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#define CHUNKCOPY chunkcopy_ssse3
#define CHUNKUNROLL chunkunroll_ssse3

#include "chunkset_tpl.h"

5
3rdparty/zlib-ng/arch/x86/compare256_avx2.c
vendored
@ -3,8 +3,9 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"

#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)

5
3rdparty/zlib-ng/arch/x86/compare256_sse2.c
vendored
@ -3,8 +3,9 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"

#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

@ -26,27 +26,26 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i xmm_crc_part = _mm_setzero_si128();
#ifdef COPY
char ALIGNED_(16) partial_buf[16] = { 0 };
#else
#ifndef COPY
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
int32_t first = init_crc != 0;

/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
 * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
 * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
 * by definition can be up to 15 bytes + one full vector load. */
assert(len >= 31 || first == 0);
/* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
 * for the aligning load that occurs. If there's an initial CRC, to carry it forward through
 * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
 * up to 15 bytes + one full vector load. */
assert(len >= 16 || first == 0);
#endif
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

if (len < 16) {
#ifdef COPY
if (len == 0)
return;

memcpy(partial_buf, src, len);
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
#ifdef COPY
memcpy(dst, partial_buf, len);
#endif
goto partial;
@ -63,9 +62,23 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint

if (algn_diff < 4 && init_crc != 0) {
xmm_t0 = xmm_crc_part;
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
if (len >= 32) {
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
} else {
memcpy(partial_buf, src + 16, len - 16);
xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
src += 16;
len -= 16;
#ifdef COPY
dst -= algn_diff;
#endif
goto partial;
}

src += 16;
len -= 16;
}

26
3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
vendored
@ -17,7 +17,7 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "zbuild.h"

#include <immintrin.h>
#include <wmmintrin.h>
@ -26,8 +26,9 @@
# include <immintrin.h>
#endif

#include "../../crc32_fold.h"
#include "../../crc32_braid_p.h"
#include "crc32.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "x86_intrins.h"
#include <assert.h>

@ -350,11 +351,22 @@ Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
return crc->value;
}

static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) {
uint32_t c = (~crc) & 0xffffffff;

while (len) {
len--;
DO1;
}

return c ^ 0xffffffff;
}

Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
/* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
 * these short lengths might also prove to be effective */
if (len < 64)
return PREFIX(crc32_braid)(crc32, buf, len);
/* For lens smaller than ~12, crc32_small method is faster.
 * But there are also minimum requirements for the pclmul functions due to alignment */
if (len < 16)
return crc32_small(crc32, buf, len);

crc32_fold ALIGNED_(16) crc_state;
CRC32_FOLD_RESET(&crc_state);

2
3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
vendored
@ -3,7 +3,7 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
#ifdef X86_VPCLMULQDQ_CRC

#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy

24
3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
vendored
@ -1,24 +0,0 @@

/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 */

#ifdef X86_SSE42
#include "../../zbuild.h"
#include <nmmintrin.h>
#include "../../deflate.h"

#define HASH_CALC(s, h, val)\
    h = _mm_crc32_u32(h, val)

#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0

#define UPDATE_HASH update_hash_sse42
#define INSERT_STRING insert_string_sse42
#define QUICK_INSERT_STRING quick_insert_string_sse42

#include "../../insert_string_tpl.h"
#endif
4
3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
vendored
@ -9,8 +9,8 @@

 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"

#include <immintrin.h>

4
3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
vendored
@ -8,8 +8,8 @@

 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"

#include <immintrin.h>
#include <assert.h>

24
3rdparty/zlib-ng/arch/x86/x86_features.c
vendored
@ -7,7 +7,7 @@

 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"
#include "zbuild.h"
#include "x86_features.h"

#ifdef _MSC_VER
@ -15,6 +15,13 @@
#else
// Newer versions of GCC and clang come with cpuid.h
#  include <cpuid.h>
#  ifdef X86_HAVE_XSAVE_INTRIN
#    if __GNUC__ == 8
#      include <xsaveintrin.h>
#    else
#      include <immintrin.h>
#    endif
#  endif
#endif

#include <string.h>
@ -29,6 +36,7 @@ static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx,
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}
@ -43,12 +51,13 @@ static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx,
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}

static inline uint64_t xgetbv(unsigned int xcr) {
#ifdef _MSC_VER
#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
return _xgetbv(xcr);
#else
uint32_t eax, edx;
@ -90,7 +99,16 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {

// check AVX512 bits if the OS supports saving ZMM registers
if (features->has_os_save_zmm) {
features->has_avx512 = ebx & 0x00010000;
features->has_avx512f = ebx & 0x00010000;
if (features->has_avx512f) {
// According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
// AVX512(DQ,BW,VL).
features->has_avx512dq = ebx & 0x00020000;
features->has_avx512bw = ebx & 0x40000000;
features->has_avx512vl = ebx & 0x80000000;
}
features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
&& features->has_avx512vl;
features->has_avx512vnni = ecx & 0x800;
}
}

14
3rdparty/zlib-ng/arch/x86/x86_features.h
vendored
@ -1,14 +1,18 @@

/* x86_features.h -- check for CPU features
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_

struct x86_cpu_features {
int has_avx2;
int has_avx512;
int has_avx512f;
int has_avx512dq;
int has_avx512bw;
int has_avx512vl;
int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
int has_avx512vnni;
int has_sse2;
int has_ssse3;
@ -21,4 +25,4 @@ struct x86_cpu_features {

void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);

#endif /* CPU_H_ */
#endif /* X86_FEATURES_H_ */

172
3rdparty/zlib-ng/arch/x86/x86_functions.h
vendored
Normal file
@ -0,0 +1,172 @@
/* x86_functions.h -- x86 implementations for arch-specific functions.
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FUNCTIONS_H_
#define X86_FUNCTIONS_H_

#ifdef X86_SSE2
uint32_t chunksize_sse2(void);
uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);

#  ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
void slide_hash_sse2(deflate_state *s);
#  endif
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
#endif

#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif

#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_AVX2
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx2(void);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);

#  ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
void slide_hash_avx2(deflate_state *s);
#  endif
void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_PCLMULQDQ_CRC
uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef X86_VPCLMULQDQ_CRC
uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif


#ifdef DISABLE_RUNTIME_CPU_DETECTION
// X86 - SSE2
#  if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_sse2
#    undef native_chunksize
#    define native_chunksize chunksize_sse2
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_sse2
#    undef native_slide_hash
#    define native_slide_hash slide_hash_sse2
#    ifdef HAVE_BUILTIN_CTZ
#      undef native_compare256
#      define native_compare256 compare256_sse2
#      undef native_longest_match
#      define native_longest_match longest_match_sse2
#      undef native_longest_match_slow
#      define native_longest_match_slow longest_match_slow_sse2
#    endif
#endif
// X86 - SSSE3
#  if defined(X86_SSSE3) && defined(__SSSE3__)
#    undef native_adler32
#    define native_adler32 adler32_ssse3
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_ssse3
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_ssse3
#  endif
// X86 - SSE4.2
#  if defined(X86_SSE42) && defined(__SSE4_2__)
#    undef native_adler32_fold_copy
#    define native_adler32_fold_copy adler32_fold_copy_sse42
#  endif

// X86 - PCLMUL
#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__)
#  undef native_crc32
#  define native_crc32 crc32_pclmulqdq
#  undef native_crc32_fold
#  define native_crc32_fold crc32_fold_pclmulqdq
#  undef native_crc32_fold_copy
#  define native_crc32_fold_copy crc32_fold_pclmulqdq_copy
#  undef native_crc32_fold_final
#  define native_crc32_fold_final crc32_fold_pclmulqdq_final
#  undef native_crc32_fold_reset
#  define native_crc32_fold_reset crc32_fold_pclmulqdq_reset
#endif
// X86 - AVX
#  if defined(X86_AVX2) && defined(__AVX2__)
#    undef native_adler32
#    define native_adler32 adler32_avx2
#    undef native_adler32_fold_copy
#    define native_adler32_fold_copy adler32_fold_copy_avx2
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_avx2
#    undef native_chunksize
#    define native_chunksize chunksize_avx2
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_avx2
#    undef native_slide_hash
#    define native_slide_hash slide_hash_avx2
#    ifdef HAVE_BUILTIN_CTZ
#      undef native_compare256
#      define native_compare256 compare256_avx2
#      undef native_longest_match
#      define native_longest_match longest_match_avx2
#      undef native_longest_match_slow
#      define native_longest_match_slow longest_match_slow_avx2
#    endif
#  endif

// X86 - AVX512 (F,DQ,BW,Vl)
#  if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
#    undef native_adler32
#    define native_adler32 adler32_avx512
#    undef native_adler32_fold_copy
#    define native_adler32_fold_copy adler32_fold_copy_avx512
// X86 - AVX512 (VNNI)
#    if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
#      undef native_adler32
#      define native_adler32 adler32_avx512_vnni
#      undef native_adler32_fold_copy
#      define native_adler32_fold_copy adler32_fold_copy_avx512_vnni
#    endif
// X86 - VPCLMULQDQ
#    if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
#      undef native_crc32
#      define native_crc32 crc32_vpclmulqdq
#      undef native_crc32_fold
#      define native_crc32_fold crc32_fold_vpclmulqdq
#      undef native_crc32_fold_copy
#      define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy
#      undef native_crc32_fold_final
#      define native_crc32_fold_final crc32_fold_vpclmulqdq_final
#      undef native_crc32_fold_reset
#      define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset
#    endif
#  endif
#endif

#endif /* X86_FUNCTIONS_H_ */
4
3rdparty/zlib-ng/arch/x86/x86_intrins.h
vendored
@ -7,7 +7,7 @@

#ifdef __AVX2__
#include <immintrin.h>

#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
 || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
__m128i r;
@ -29,7 +29,7 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
/* GCC <9 is missing some AVX512 intrinsics.
 */
#ifdef __AVX512F__
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>

#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \

29
3rdparty/zlib-ng/arch_functions.h
vendored
Normal file
@ -0,0 +1,29 @@
/* arch_functions.h -- Arch-specific function prototypes.
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef CPU_FUNCTIONS_H_
#define CPU_FUNCTIONS_H_

#include "zbuild.h"
#include "zutil.h"
#include "crc32.h"
#include "deflate.h"
#include "fallback_builtins.h"

#include "arch/generic/generic_functions.h"

#if defined(X86_FEATURES)
#  include "arch/x86/x86_functions.h"
#elif defined(ARM_FEATURES)
#  include "arch/arm/arm_functions.h"
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
#  include "arch/power/power_functions.h"
#elif defined(S390_FEATURES)
#  include "arch/s390/s390_functions.h"
#elif defined(RISCV_FEATURES)
#  include "arch/riscv/riscv_functions.h"
#endif

#endif
8
3rdparty/zlib-ng/chunkset_tpl.h
vendored
8
3rdparty/zlib-ng/chunkset_tpl.h
vendored
@ -5,7 +5,7 @@
|
||||
#include "zbuild.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
|
||||
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
|
||||
#endif
|
||||
|
||||
@ -25,7 +25,7 @@ Z_INTERNAL uint32_t CHUNKSIZE(void) {
|
||||
without iteration, which will hopefully make the branch prediction more
|
||||
reliable. */
|
||||
#ifndef HAVE_CHUNKCOPY
|
||||
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
Assert(len > 0, "chunkcopy should never have a length 0");
|
||||
chunk_t chunk;
|
||||
int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
|
||||
@ -54,7 +54,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
least 258 bytes of output space available (258 being the maximum length
|
||||
output from a single token; see inflate_fast()'s assumptions below). */
|
||||
#ifndef HAVE_CHUNKUNROLL
|
||||
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
|
||||
static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
|
||||
unsigned char const *from = out - *dist;
|
||||
chunk_t chunk;
|
||||
while (*dist < *len && *dist < sizeof(chunk_t)) {
|
||||
@ -98,7 +98,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
|
||||
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
|
||||
Assert(dist > 0, "chunkmemset cannot have a distance 0");
|
||||
/* Only AVX2 */
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
|
||||
if (len <= 16) {
|
||||
return chunkmemset_ssse3(out, dist, len);
|
||||
}
|
||||
|
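The chunkset hunks drop the X86_SSE2 term from the SSSE3 guards (SSSE3 support implies SSE2) and change CHUNKCOPY and CHUNKUNROLL from exported Z_INTERNAL functions to static inline ones. The core idea behind CHUNKCOPY is to copy in whole chunk_t-sized units, handling the length's remainder first so that every following store is a full chunk. Below is a portable sketch of that partitioning with a fixed 16-byte chunk; the real template works on a SIMD chunk_t and relies on the caller guaranteeing slack space in the output buffer, which this version does not.

/* Standalone sketch of the "remainder first, then whole chunks" copy
 * behind CHUNKCOPY. Illustrative only; not the template's actual code. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define CHUNK 16

static uint8_t *chunkcopy_sketch(uint8_t *out, const uint8_t *from, unsigned len) {
    /* Bytes in the final, partial chunk (always 1..CHUNK). */
    unsigned align = ((len - 1) % CHUNK) + 1;
    memcpy(out, from, align);       /* handle the remainder up front */
    out += align;
    from += align;
    len -= align;
    while (len > 0) {               /* everything left is whole chunks */
        memcpy(out, from, CHUNK);
        out += CHUNK;
        from += CHUNK;
        len -= CHUNK;
    }
    return out;
}

int main(void) {
    const uint8_t src[40] = "chunk-copy demonstration, 40 bytes long";
    uint8_t dst[40];
    chunkcopy_sketch(dst, src, sizeof src);
    printf("%.39s\n", (const char *)dst);
    return 0;
}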
115
3rdparty/zlib-ng/cmake/detect-arch.c
vendored
Normal file
@ -0,0 +1,115 @@
// archdetect.c -- Detect compiler architecture and raise preprocessor error
// containing a simple arch identifier.
// Copyright (C) 2019 Hans Kristian Rosbach
// Licensed under the Zlib license, see LICENSE.md for details

// x86_64
#if defined(__x86_64__) || defined(_M_X64)
    #error archfound x86_64

// x86
#elif defined(__i386) || defined(_M_IX86)
    #error archfound i686

// ARM
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
    #error archfound aarch64
#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM)
    #if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__)
        #error archfound armv8
    #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
        #error archfound armv7
    #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__)
        #error archfound armv6
    #elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
        #error archfound armv5
    #elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__)
        #error archfound armv4
    #elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__)
        #error archfound armv3
    #elif defined(__ARM_ARCH_2__)
        #error archfound armv2
    #endif

// PowerPC
#elif defined(__powerpc__) || defined(_ppc__) || defined(__PPC__)
    #if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__)
        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
            #error archfound powerpc64le
        #else
            #error archfound powerpc64
        #endif
    #else
        #error archfound powerpc
    #endif

// --------------- Less common architectures alphabetically below ---------------

// ALPHA
#elif defined(__alpha__) || defined(__alpha)
    #error archfound alpha

// Blackfin
#elif defined(__BFIN__)
    #error archfound blackfin

// Itanium
#elif defined(__ia64) || defined(_M_IA64)
    #error archfound ia64

// MIPS
#elif defined(__mips__) || defined(__mips)
    #error archfound mips

// Motorola 68000-series
#elif defined(__m68k__)
    #error archfound m68k

// SuperH
#elif defined(__sh__)
    #error archfound sh

// SPARC
#elif defined(__sparc__) || defined(__sparc)
    #if defined(__sparcv9) || defined(__sparc_v9__)
        #error archfound sparc9
    #elif defined(__sparcv8) || defined(__sparc_v8__)
        #error archfound sparc8
    #endif

// SystemZ
#elif defined(__370__)
    #error archfound s370
#elif defined(__s390__)
    #error archfound s390
#elif defined(__s390x) || defined(__zarch__)
    #error archfound s390x

// PARISC
#elif defined(__hppa__)
    #error archfound parisc

// RS-6000
#elif defined(__THW_RS6000)
    #error archfound rs6000

// RISC-V
#elif defined(__riscv)
    #if __riscv_xlen == 64
        #error archfound riscv64
    #elif __riscv_xlen == 32
        #error archfound riscv32
    #endif

// LOONGARCH
#elif defined(__loongarch_lp64)
    #error archfound loongarch64

// Emscripten (WebAssembly)
#elif defined(__EMSCRIPTEN__)
    #error archfound wasm32

// return 'unrecognized' if we do not know what architecture this is
#else
    #error archfound unrecognized
#endif
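detect-arch.c is never meant to compile: whichever branch the preprocessor selects stops the build with an "#error archfound <id>" message, so the build system only has to run the compiler on the file and scrape the identifier out of the error text (zlib-ng's CMake presumably does this with a try_compile-style probe and a regex over the captured output; that wiring is not part of this diff). The same trick in miniature, with an invented token rather than archfound:

/* probe_bits.c -- illustrative only, not part of the patch. Like
 * detect-arch.c it is expected to fail to compile: the selected branch
 * aborts with an #error whose text carries the answer, which the caller
 * then extracts from the compiler's error output. */
#if defined(__LP64__) || defined(_WIN64)
    #error bitsfound 64
#else
    #error bitsfound 32
#endif

A shell-level equivalent of the consuming side would be something along the lines of "cc -c probe_bits.c 2>&1 | grep -o 'bitsfound [0-9]*'", again purely as an illustration of the technique.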
Some files were not shown because too many files have changed in this diff.