Merge branch 'opencv:4.x' into simd-demosaicing

This commit is contained in:
_Ayaka 2025-01-16 18:02:59 +08:00 committed by GitHub
commit 54d0610f50
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
548 changed files with 33717 additions and 16044 deletions

View File

@ -31,6 +31,9 @@ jobs:
Windows10-x64:
uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10.yaml@main
Windows10-x64-UWP:
uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-UWP.yaml@main
Windows10-ARM64:
uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-ARM64.yaml@main

View File

@ -1932,4 +1932,50 @@ inline int TEGRA_GaussianBlurBinomial(const uchar* src_data, size_t src_step, uc
#endif // OPENCV_IMGPROC_HAL_INTERFACE_H
// The optimized branch was developed for old armv7 processors
#if defined(__ARM_ARCH) && (__ARM_ARCH == 7)
inline int TEGRA_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_step,
const short* prev_deriv_data, size_t prev_deriv_step,
const uchar* next_data, size_t next_step,
int width, int height, int cn,
const float *prev_points, float *next_points, size_t point_count,
uchar *status, float *err,
const int win_width, const int win_height,
int termination_count, double termination_epsilon,
bool get_min_eigen_vals,
float min_eigen_vals_threshold)
{
if (!CAROTENE_NS::isSupportedConfiguration())
return CV_HAL_ERROR_NOT_IMPLEMENTED;
CAROTENE_NS::pyrLKOptFlowLevel(CAROTENE_NS::Size2D(width, height), cn,
prev_data, prev_data_step, prev_deriv_data, prev_deriv_step,
next_data, next_step,
point_count, prev_points, next_points,
status, err, CAROTENE_NS::Size2D(win_width, win_height),
termination_count, termination_epsilon,
get_min_eigen_vals, min_eigen_vals_threshold);
return CV_HAL_ERROR_OK;
}
#undef cv_hal_LKOpticalFlowLevel
#define cv_hal_LKOpticalFlowLevel TEGRA_LKOpticalFlowLevel
#endif // __ARM_ARCH=7
#if 0 // OpenCV provides fater parallel implementation
inline int TEGRA_ScharrDeriv(const uchar* src_data, size_t src_step,
short* dst_data, size_t dst_step,
int width, int height, int cn)
{
if (!CAROTENE_NS::isSupportedConfiguration())
return CV_HAL_ERROR_NOT_IMPLEMENTED;
CAROTENE_NS::ScharrDeriv(CAROTENE_NS::Size2D(width, height), cn, src_data, src_step, dst_data, dst_step);
return CV_HAL_ERROR_OK;
}
#undef cv_hal_ScharrDeriv
#define cv_hal_ScharrDeriv TEGRA_ScharrDeriv
#endif
#endif

View File

@ -2485,7 +2485,7 @@ namespace CAROTENE_NS {
u8 *status, f32 *err,
const Size2D &winSize,
u32 terminationCount, f64 terminationEpsilon,
u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
bool getMinEigenVals,
f32 minEigThreshold);
}

View File

@ -58,7 +58,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
u8 *status, f32 *err,
const Size2D &winSize,
u32 terminationCount, f64 terminationEpsilon,
u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
bool getMinEigenVals,
f32 minEigThreshold)
{
internal::assertSupportedConfiguration();
@ -74,32 +74,11 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
{
f32 levscale = (1./(1 << level));
u32 ptref = ptidx << 1;
f32 prevPtX = prevPts[ptref+0]*levscale;
f32 prevPtY = prevPts[ptref+1]*levscale;
f32 nextPtX;
f32 nextPtY;
if( level == maxLevel )
{
if( useInitialFlow )
{
nextPtX = nextPts[ptref+0]*levscale;
nextPtY = nextPts[ptref+1]*levscale;
}
else
{
nextPtX = prevPtX;
nextPtY = prevPtY;
}
}
else
{
nextPtX = nextPts[ptref+0]*2.f;
nextPtY = nextPts[ptref+1]*2.f;
}
nextPts[ptref+0] = nextPtX;
nextPts[ptref+1] = nextPtY;
f32 prevPtX = prevPts[ptref+0];
f32 prevPtY = prevPts[ptref+1];
f32 nextPtX = nextPts[ptref+0];
f32 nextPtY = nextPts[ptref+1];
s32 iprevPtX, iprevPtY;
s32 inextPtX, inextPtY;
@ -111,13 +90,10 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
{
if( level == 0 )
{
if( status )
status[ptidx] = false;
if( err )
err[ptidx] = 0;
}
if( status )
status[ptidx] = false;
if( err )
err[ptidx] = 0;
continue;
}
@ -333,7 +309,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
if( minEig < minEigThreshold || D < FLT_EPSILON )
{
if( level == 0 && status )
if( status )
status[ptidx] = false;
continue;
}
@ -353,7 +329,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
{
if( level == 0 && status )
if( status )
status[ptidx] = false;
break;
}
@ -469,8 +445,7 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
prevDeltaX = deltaX;
prevDeltaY = deltaY;
}
if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
if( status && status[ptidx] && err && !getMinEigenVals )
{
f32 nextPointX = nextPts[ptref+0] - halfWinX;
f32 nextPointY = nextPts[ptref+1] - halfWinY;
@ -526,9 +501,6 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
(void)winSize;
(void)terminationCount;
(void)terminationEpsilon;
(void)level;
(void)maxLevel;
(void)useInitialFlow;
(void)getMinEigenVals;
(void)minEigThreshold;
(void)ptCount;
@ -536,4 +508,3 @@ void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
}
}//CAROTENE_NS

32
3rdparty/fastcv/CMakeLists.txt vendored Normal file
View File

@ -0,0 +1,32 @@
if(HAVE_FASTCV)
set(FASTCV_HAL_VERSION 0.0.1 CACHE INTERNAL "")
set(FASTCV_HAL_LIBRARIES "fastcv_hal" CACHE INTERNAL "")
set(FASTCV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/include" CACHE INTERNAL "")
set(FASTCV_HAL_HEADERS
"${CMAKE_CURRENT_SOURCE_DIR}/include/fastcv_hal_core.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/include/fastcv_hal_imgproc.hpp"
CACHE INTERNAL "")
file(GLOB FASTCV_HAL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")
add_library(fastcv_hal STATIC ${FASTCV_HAL_FILES})
target_include_directories(fastcv_hal PRIVATE
${CMAKE_SOURCE_DIR}/modules/core/include
${CMAKE_SOURCE_DIR}/modules/imgproc/include
${FASTCV_HAL_INCLUDE_DIRS} ${FastCV_INCLUDE_PATH})
target_link_libraries(fastcv_hal PUBLIC ${FASTCV_LIBRARY})
set_target_properties(fastcv_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
if(NOT BUILD_SHARED_LIBS)
ocv_install_target(fastcv_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
endif()
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(fastcv_hal PROPERTIES FOLDER "3rdparty")
endif()
else()
message(STATUS "FastCV is not available, disabling related HAL")
endif(HAVE_FASTCV)

43
3rdparty/fastcv/fastcv.cmake vendored Normal file
View File

@ -0,0 +1,43 @@
function(download_fastcv root_dir)
# Commit SHA in the opencv_3rdparty repo
set(FASTCV_COMMIT "dc5d58018f3af915a8d209386d2c58c0501c0f2c")
# Define actual FastCV versions
if(ANDROID)
if(AARCH64)
message(STATUS "Download FastCV for Android aarch64")
set(FCV_PACKAGE_NAME "fastcv_android_aarch64_2024_12_11.tgz")
set(FCV_PACKAGE_HASH "9dac41e86597305f846212dae31a4a88")
else()
message(STATUS "Download FastCV for Android armv7")
set(FCV_PACKAGE_NAME "fastcv_android_arm32_2024_12_11.tgz")
set(FCV_PACKAGE_HASH "fe2d30334180b17e3031eee92aac43b6")
endif()
elseif(UNIX AND NOT APPLE AND NOT IOS AND NOT XROS)
if(AARCH64)
set(FCV_PACKAGE_NAME "fastcv_linux_aarch64_2024_12_11.tgz")
set(FCV_PACKAGE_HASH "7b33ad833e6f15ab6d4ec64fa3c17acd")
else()
message("FastCV: fastcv lib for 32-bit Linux is not supported for now!")
endif()
endif(ANDROID)
# Download Package
set(OPENCV_FASTCV_URL "https://raw.githubusercontent.com/opencv/opencv_3rdparty/${FASTCV_COMMIT}/fastcv/")
ocv_download( FILENAME ${FCV_PACKAGE_NAME}
HASH ${FCV_PACKAGE_HASH}
URL ${OPENCV_FASTCV_URL}
DESTINATION_DIR ${root_dir}
ID FASTCV
STATUS res
UNPACK
RELATIVE_URL)
if(res)
set(HAVE_FASTCV TRUE CACHE BOOL "FastCV status")
else()
message(WARNING "FastCV: package download failed!")
endif()
endfunction()

View File

@ -0,0 +1,222 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef OPENCV_FASTCV_HAL_CORE_HPP_INCLUDED
#define OPENCV_FASTCV_HAL_CORE_HPP_INCLUDED
#include <opencv2/core/base.hpp>
#undef cv_hal_lut
#define cv_hal_lut fastcv_hal_lut
#undef cv_hal_normHammingDiff8u
#define cv_hal_normHammingDiff8u fastcv_hal_normHammingDiff8u
#undef cv_hal_mul8u16u
#define cv_hal_mul8u16u fastcv_hal_mul8u16u
#undef cv_hal_sub8u32f
#define cv_hal_sub8u32f fastcv_hal_sub8u32f
#undef cv_hal_transpose2d
#define cv_hal_transpose2d fastcv_hal_transpose2d
#undef cv_hal_meanStdDev
#define cv_hal_meanStdDev fastcv_hal_meanStdDev
#undef cv_hal_flip
#define cv_hal_flip fastcv_hal_flip
#undef cv_hal_rotate90
#define cv_hal_rotate90 fastcv_hal_rotate
#undef cv_hal_addWeighted8u
#define cv_hal_addWeighted8u fastcv_hal_addWeighted8u
#undef cv_hal_mul8u
#define cv_hal_mul8u fastcv_hal_mul8u
#undef cv_hal_mul16s
#define cv_hal_mul16s fastcv_hal_mul16s
#undef cv_hal_mul32f
#define cv_hal_mul32f fastcv_hal_mul32f
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief look-up table transform of an array.
/// @param src_data Source image data
/// @param src_step Source image step
/// @param src_type Source image type
/// @param lut_data Pointer to lookup table
/// @param lut_channel_size Size of each channel in bytes
/// @param lut_channels Number of channels in lookup table
/// @param dst_data Destination data
/// @param dst_step Destination step
/// @param width Width of images
/// @param height Height of images
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_lut(
const uchar* src_data,
size_t src_step,
size_t src_type,
const uchar* lut_data,
size_t lut_channel_size,
size_t lut_channels,
uchar* dst_data,
size_t dst_step,
int width,
int height);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief Hamming distance between two vectors
/// @param a pointer to first vector data
/// @param b pointer to second vector data
/// @param n length of vectors
/// @param cellSize how many bits of the vectors will be added and treated as a single bit, can be 1 (standard Hamming distance), 2 or 4
/// @param result pointer to result output
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_mul8u16u(
const uchar * src1_data,
size_t src1_step,
const uchar * src2_data,
size_t src2_step,
ushort * dst_data,
size_t dst_step,
int width,
int height,
double scale);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_sub8u32f(
const uchar *src1_data,
size_t src1_step,
const uchar *src2_data,
size_t src2_step,
float *dst_data,
size_t dst_step,
int width,
int height);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_transpose2d(
const uchar* src_data,
size_t src_step,
uchar* dst_data,
size_t dst_step,
int src_width,
int src_height,
int element_size);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_meanStdDev(
const uchar * src_data,
size_t src_step,
int width,
int height,
int src_type,
double * mean_val,
double * stddev_val,
uchar * mask,
size_t mask_step);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief Flips a 2D array around vertical, horizontal, or both axes
/// @param src_type source and destination image type
/// @param src_data source image data
/// @param src_step source image step
/// @param src_width source and destination image width
/// @param src_height source and destination image height
/// @param dst_data destination image data
/// @param dst_step destination image step
/// @param flip_mode 0 flips around x-axis, 1 around y-axis, -1 both
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_flip(
int src_type,
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step,
int flip_mode);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief Rotates a 2D array in multiples of 90 degrees.
/// @param src_type source and destination image type
/// @param src_data source image data
/// @param src_step source image step
/// @param src_width source image width
/// @If angle has value [180] it is also destination image width
/// If angle has values [90, 270] it is also destination image height
/// @param src_height source and destination image height (destination image width for angles [90, 270])
/// If angle has value [180] it is also destination image height
/// If angle has values [90, 270] it is also destination image width
/// @param dst_data destination image data
/// @param dst_step destination image step
/// @param angle clockwise angle for rotation in degrees from set [90, 180, 270]
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_rotate(
int src_type,
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step,
int angle);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief weighted sum of two arrays using formula: dst[i] = a * src1[i] + b * src2[i]
/// @param src1_data first source image data
/// @param src1_step first source image step
/// @param src2_data second source image data
/// @param src2_step second source image step
/// @param dst_data destination image data
/// @param dst_step destination image step
/// @param width width of the images
/// @param height height of the images
/// @param scalars numbers a, b, and c
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_addWeighted8u(
const uchar* src1_data,
size_t src1_step,
const uchar* src2_data,
size_t src2_step,
uchar* dst_data,
size_t dst_step,
int width,
int height,
const double scalars[3]);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_mul8u(
const uchar *src1_data,
size_t src1_step,
const uchar *src2_data,
size_t src2_step,
uchar *dst_data,
size_t dst_step,
int width,
int height,
double scale);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_mul16s(
const short *src1_data,
size_t src1_step,
const short *src2_data,
size_t src2_step,
short *dst_data,
size_t dst_step,
int width,
int height,
double scale);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_mul32f(
const float *src1_data,
size_t src1_step,
const float *src2_data,
size_t src2_step,
float *dst_data,
size_t dst_step,
int width,
int height,
double scale);
#endif

View File

@ -0,0 +1,268 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef OPENCV_FASTCV_HAL_IMGPROC_HPP_INCLUDED
#define OPENCV_FASTCV_HAL_IMGPROC_HPP_INCLUDED
#include <opencv2/core/base.hpp>
#undef cv_hal_medianBlur
#define cv_hal_medianBlur fastcv_hal_medianBlur
#undef cv_hal_sobel
#define cv_hal_sobel fastcv_hal_sobel
#undef cv_hal_boxFilter
#define cv_hal_boxFilter fastcv_hal_boxFilter
#undef cv_hal_adaptiveThreshold
#define cv_hal_adaptiveThreshold fastcv_hal_adaptiveThreshold
#undef cv_hal_gaussianBlurBinomial
#define cv_hal_gaussianBlurBinomial fastcv_hal_gaussianBlurBinomial
#undef cv_hal_warpPerspective
#define cv_hal_warpPerspective fastcv_hal_warpPerspective
#undef cv_hal_pyrdown
#define cv_hal_pyrdown fastcv_hal_pyrdown
#undef cv_hal_cvtBGRtoHSV
#define cv_hal_cvtBGRtoHSV fastcv_hal_cvtBGRtoHSV
#undef cv_hal_cvtBGRtoYUVApprox
#define cv_hal_cvtBGRtoYUVApprox fastcv_hal_cvtBGRtoYUVApprox
#undef cv_hal_canny
#define cv_hal_canny fastcv_hal_canny
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief Calculate medianBlur filter
/// @param src_data Source image data
/// @param src_step Source image step
/// @param dst_data Destination image data
/// @param dst_step Destination image step
/// @param width Source image width
/// @param height Source image height
/// @param depth Depths of source and destination image
/// @param cn Number of channels
/// @param ksize Size of kernel
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_medianBlur(
const uchar* src_data,
size_t src_step,
uchar* dst_data,
size_t dst_step,
int width,
int height,
int depth,
int cn,
int ksize);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief Computes Sobel derivatives
///
/// @param src_data Source image data
/// @param src_step Source image step
/// @param dst_data Destination image data
/// @param dst_step Destination image step
/// @param width Source image width
/// @param height Source image height
/// @param src_depth Depth of source image
/// @param dst_depth Depths of destination image
/// @param cn Number of channels
/// @param margin_left Left margins for source image
/// @param margin_top Top margins for source image
/// @param margin_right Right margins for source image
/// @param margin_bottom Bottom margins for source image
/// @param dx orders of the derivative x
/// @param dy orders of the derivative y
/// @param ksize Size of kernel
/// @param scale Scale factor for the computed derivative values
/// @param delta Delta value that is added to the results prior to storing them in dst
/// @param border_type Border type
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_sobel(
const uchar* src_data,
size_t src_step,
uchar* dst_data,
size_t dst_step,
int width,
int height,
int src_depth,
int dst_depth,
int cn,
int margin_left,
int margin_top,
int margin_right,
int margin_bottom,
int dx,
int dy,
int ksize,
double scale,
double delta,
int border_type);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_boxFilter(
const uchar* src_data,
size_t src_step,
uchar* dst_data,
size_t dst_step,
int width,
int height,
int src_depth,
int dst_depth,
int cn,
int margin_left,
int margin_top,
int margin_right,
int margin_bottom,
size_t ksize_width,
size_t ksize_height,
int anchor_x,
int anchor_y,
bool normalize,
int border_type);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_adaptiveThreshold(
const uchar* src_data,
size_t src_step,
uchar* dst_data,
size_t dst_step,
int width,
int height,
double maxValue,
int adaptiveMethod,
int thresholdType,
int blockSize,
double C);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief Blurs an image using a Gaussian filter.
/// @param src_data Source image data
/// @param src_step Source image step
/// @param dst_data Destination image data
/// @param dst_step Destination image step
/// @param width Source image width
/// @param height Source image height
/// @param depth Depth of source and destination image
/// @param cn Number of channels
/// @param margin_left Left margins for source image
/// @param margin_top Top margins for source image
/// @param margin_right Right margins for source image
/// @param margin_bottom Bottom margins for source image
/// @param ksize Kernel size
/// @param border_type Border type
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_gaussianBlurBinomial(
const uchar* src_data,
size_t src_step,
uchar* dst_data,
size_t dst_step,
int width,
int height,
int depth,
int cn,
size_t margin_left,
size_t margin_top,
size_t margin_right,
size_t margin_bottom,
size_t ksize,
int border_type);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief Applies a perspective transformation to an image.
///
/// @param src_type Source and destination image type
/// @param src_data Source image data
/// @param src_step Source image step
/// @param src_width Source image width
/// @param src_height Source image height
/// @param dst_data Destination image data
/// @param dst_step Destination image step
/// @param dst_width Destination image width
/// @param dst_height Destination image height
/// @param M 3x3 matrix with transform coefficients
/// @param interpolation Interpolation mode (CV_HAL_INTER_NEAREST, ...)
/// @param border_type Border processing mode (CV_HAL_BORDER_REFLECT, ...)
/// @param border_value Values to use for CV_HAL_BORDER_CONSTANT mode
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_warpPerspective(
int src_type,
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step,
int dst_width,
int dst_height,
const double M[9],
int interpolation,
int border_type,
const double border_value[4]);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_pyrdown(
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step,
int dst_width,
int dst_height,
int depth,
int cn,
int border_type);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_cvtBGRtoHSV(
const uchar * src_data,
size_t src_step,
uchar * dst_data,
size_t dst_step,
int width,
int height,
int depth,
int scn,
bool swapBlue,
bool isFullRange,
bool isHSV);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_cvtBGRtoYUVApprox(
const uchar * src_data,
size_t src_step,
uchar * dst_data,
size_t dst_step,
int width,
int height,
int depth,
int scn,
bool swapBlue,
bool isCbCr);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// @brief Canny edge detector
/// @param src_data Source image data
/// @param src_step Source image step
/// @param dst_data Destination image data
/// @param dst_step Destination image step
/// @param width Source image width
/// @param height Source image height
/// @param cn Number of channels
/// @param lowThreshold low hresholds value
/// @param highThreshold high thresholds value
/// @param ksize Kernel size for Sobel operator.
/// @param L2gradient Flag, indicating use of L2 or L1 norma.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int fastcv_hal_canny(
const uchar* src_data,
size_t src_step,
uchar* dst_data,
size_t dst_step,
int width,
int height,
int cn,
double lowThreshold,
double highThreshold,
int ksize,
bool L2gradient);
#endif

View File

@ -0,0 +1,84 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef OPENCV_FASTCV_HAL_UTILS_HPP_INCLUDED
#define OPENCV_FASTCV_HAL_UTILS_HPP_INCLUDED
#include "fastcv.h"
#include <opencv2/core/utils/logger.hpp>
#define INITIALIZATION_CHECK \
{ \
if (!FastCvContext::getContext().isInitialized) \
{ \
return CV_HAL_ERROR_UNKNOWN; \
} \
}
#define CV_HAL_RETURN(status, func) \
{ \
if( status == FASTCV_SUCCESS ) \
{ \
CV_LOG_DEBUG(NULL, "FastCV HAL for "<<#func<<" run successfully!"); \
return CV_HAL_ERROR_OK; \
} \
else if(status == FASTCV_EBADPARAM || status == FASTCV_EUNALIGNPARAM || \
status == FASTCV_EUNSUPPORTED || status == FASTCV_EHWQDSP || \
status == FASTCV_EHWGPU) \
{ \
CV_LOG_DEBUG(NULL, "FastCV status:"<<getFastCVErrorString(status) \
<<", Switching to default OpenCV solution!"); \
return CV_HAL_ERROR_NOT_IMPLEMENTED; \
} \
else \
{ \
CV_LOG_ERROR(NULL,"FastCV error:"<<getFastCVErrorString(status)); \
return CV_HAL_ERROR_UNKNOWN; \
} \
}
#define CV_HAL_RETURN_NOT_IMPLEMENTED(reason) \
{ \
CV_LOG_DEBUG(NULL,"Switching to default OpenCV\nInfo: "<<reason); \
return CV_HAL_ERROR_NOT_IMPLEMENTED; \
}
#define FCV_KernelSize_SHIFT 3
#define FCV_MAKETYPE(ksize,depth) ((ksize<<FCV_KernelSize_SHIFT) + depth)
#define FCV_CMP_EQ(val1,val2) (fabs(val1 - val2) < FLT_EPSILON)
const char* getFastCVErrorString(int status);
const char* borderToString(int border);
const char* interpolationToString(int interpolation);
struct FastCvContext
{
public:
// initialize at first call
// Defines a static local variable context. Variable is created only once.
static FastCvContext& getContext()
{
static FastCvContext context;
return context;
}
FastCvContext()
{
if (fcvSetOperationMode(FASTCV_OP_CPU_PERFORMANCE) != 0)
{
CV_LOG_WARNING(NULL, "Failed to switch FastCV operation mode");
isInitialized = false;
}
else
{
CV_LOG_INFO(NULL, "FastCV Operation Mode Switched");
isInitialized = true;
}
}
bool isInitialized;
};
#endif

574
3rdparty/fastcv/src/fastcv_hal_core.cpp vendored Normal file
View File

@ -0,0 +1,574 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
#include "fastcv_hal_core.hpp"
#include "fastcv_hal_utils.hpp"
#include <opencv2/core/core.hpp>
#include <opencv2/core/base.hpp>
class ParallelTableLookup : public cv::ParallelLoopBody
{
public:
ParallelTableLookup(const uchar* src_data_, int width_, size_t src_step_, const uchar* lut_data_, uchar* dst_data_, size_t dst_step_) :
cv::ParallelLoopBody(), src_data(src_data_), width(width_), src_step(src_step_), lut_data(lut_data_), dst_data(dst_data_), dst_step(dst_step_)
{
}
virtual void operator()(const cv::Range& range) const CV_OVERRIDE
{
fcvStatus status = FASTCV_SUCCESS;
for (int y = range.start; y < range.end; y++) {
status = fcvTableLookupu8((uint8_t*)src_data + y * src_step, width, 1, src_step, (uint8_t*)lut_data, (uint8_t*)dst_data + y * dst_step, dst_step);
if(status != FASTCV_SUCCESS)
CV_LOG_ERROR(NULL,"FastCV error:"<<getFastCVErrorString(status));
}
}
private:
const uchar* src_data;
int width;
size_t src_step;
const uchar* lut_data;
uchar* dst_data;
size_t dst_step;
};
int fastcv_hal_lut(
const uchar* src_data,
size_t src_step,
size_t src_type,
const uchar* lut_data,
size_t lut_channel_size,
size_t lut_channels,
uchar* dst_data,
size_t dst_step,
int width,
int height)
{
if((width*height)<=(320*240))
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution!");
INITIALIZATION_CHECK;
fcvStatus status;
if (src_type == CV_8UC1 && lut_channels == 1 && lut_channel_size == 1)
{
cv::parallel_for_(cv::Range(0, height),
ParallelTableLookup(src_data, width, src_step, lut_data, dst_data, dst_step));
status = FASTCV_SUCCESS;
CV_HAL_RETURN(status, hal_lut);
}
else
{
CV_HAL_RETURN_NOT_IMPLEMENTED("Multi-channel input is not supported");
}
}
int fastcv_hal_normHammingDiff8u(
const uchar* a,
const uchar* b,
int n,
int cellSize,
int* result)
{
fcvStatus status;
if (cellSize != 1)
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("NORM_HAMMING2 cellSize:%d is not supported", cellSize));
INITIALIZATION_CHECK;
uint32_t dist = 0;
dist = fcvHammingDistanceu8((uint8_t*)a, (uint8_t*)b, n);
*result = dist;
status = FASTCV_SUCCESS;
CV_HAL_RETURN(status, hal_normHammingDiff8u);
}
int fastcv_hal_mul8u16u(
const uchar* src1_data,
size_t src1_step,
const uchar* src2_data,
size_t src2_step,
ushort* dst_data,
size_t dst_step,
int width,
int height,
double scale)
{
if(scale != 1.0)
CV_HAL_RETURN_NOT_IMPLEMENTED("Scale factor not supported");
INITIALIZATION_CHECK;
fcvStatus status = FASTCV_SUCCESS;
if (src1_step < (size_t)width && src2_step < (size_t)width)
{
src1_step = width*sizeof(uchar);
src2_step = width*sizeof(uchar);
dst_step = width*sizeof(ushort);
}
status = fcvElementMultiplyu8u16_v2(src1_data, width, height, src1_step,
src2_data, src2_step, dst_data, dst_step);
CV_HAL_RETURN(status,hal_multiply);
}
int fastcv_hal_sub8u32f(
const uchar* src1_data,
size_t src1_step,
const uchar* src2_data,
size_t src2_step,
float* dst_data,
size_t dst_step,
int width,
int height)
{
INITIALIZATION_CHECK;
fcvStatus status = FASTCV_SUCCESS;
if (src1_step < (size_t)width && src2_step < (size_t)width)
{
src1_step = width*sizeof(uchar);
src2_step = width*sizeof(uchar);
dst_step = width*sizeof(float);
}
status = fcvImageDiffu8f32_v2(src1_data, src2_data, width, height, src1_step,
src2_step, dst_data, dst_step);
CV_HAL_RETURN(status,hal_subtract);
}
int fastcv_hal_transpose2d(
const uchar* src_data,
size_t src_step,
uchar* dst_data,
size_t dst_step,
int src_width,
int src_height,
int element_size)
{
INITIALIZATION_CHECK;
if (src_data == dst_data)
CV_HAL_RETURN_NOT_IMPLEMENTED("In-place not supported");
fcvStatus status = FASTCV_SUCCESS;
switch (element_size)
{
case 1:
status = fcvTransposeu8_v2(src_data, src_width, src_height, src_step,
dst_data, dst_step);
break;
case 2:
status = fcvTransposeu16_v2((const uint16_t*)src_data, src_width, src_height,
src_step, (uint16_t*)dst_data, dst_step);
break;
case 4:
status = fcvTransposef32_v2((const float32_t*)src_data, src_width, src_height,
src_step, (float32_t*)dst_data, dst_step);
break;
default:
CV_HAL_RETURN_NOT_IMPLEMENTED("srcType not supported");
}
CV_HAL_RETURN(status,hal_transpose);
}
int fastcv_hal_meanStdDev(
const uchar* src_data,
size_t src_step,
int width,
int height,
int src_type,
double* mean_val,
double* stddev_val,
uchar* mask,
size_t mask_step)
{
INITIALIZATION_CHECK;
CV_UNUSED(mask_step);
if(src_type != CV_8UC1)
{
CV_HAL_RETURN_NOT_IMPLEMENTED("src type not supported");
}
else if(mask != nullptr)
{
CV_HAL_RETURN_NOT_IMPLEMENTED("mask not supported");
}
else if(mean_val == nullptr && stddev_val == nullptr)
{
CV_HAL_RETURN_NOT_IMPLEMENTED("null ptr for mean and stddev");
}
float32_t mean, variance;
fcvStatus status = fcvImageIntensityStats_v2(src_data, src_step, 0, 0, width, height,
&mean, &variance, FASTCV_BIASED_VARIANCE_ESTIMATOR);
if(mean_val != nullptr)
*mean_val = mean;
if(stddev_val != nullptr)
*stddev_val = std::sqrt(variance);
CV_HAL_RETURN(status,hal_meanStdDev);
}
int fastcv_hal_flip(
int src_type,
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step,
int flip_mode)
{
INITIALIZATION_CHECK;
if(src_type!=CV_8UC1 && src_type!=CV_16UC1 && src_type!=CV_8UC3)
CV_HAL_RETURN_NOT_IMPLEMENTED("Data type is not supported, Switching to default OpenCV solution!");
if((src_width*src_height)<=(640*480))
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution!");
fcvStatus status = FASTCV_SUCCESS;;
fcvFlipDir dir;
switch (flip_mode)
{
//Flip around X-Axis: Vertical Flip or FLIP_ROWS
case 0:
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution due to low perf!");
dir = FASTCV_FLIP_VERT;
break;
//Flip around Y-Axis: Horizontal Flip or FLIP_COLS
case 1:
dir = FASTCV_FLIP_HORIZ;
break;
//Flip around both X and Y-Axis or FLIP_BOTH
case -1:
dir = FASTCV_FLIP_BOTH;
break;
default:
CV_HAL_RETURN_NOT_IMPLEMENTED("Invalid flip_mode, Switching to default OpenCV solution!");
}
if(src_type==CV_8UC1)
fcvFlipu8(src_data, src_width, src_height, src_step, dst_data, dst_step, dir);
else if(src_type==CV_16UC1)
fcvFlipu16((uint16_t*)src_data, src_width, src_height, src_step, (uint16_t*)dst_data, dst_step, dir);
else if(src_type==CV_8UC3)
status = fcvFlipRGB888u8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data, dst_step, dir);
else
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Data type:%d is not supported, Switching to default OpenCV solution!", src_type));
CV_HAL_RETURN(status, hal_flip);
}
int fastcv_hal_rotate(
int src_type,
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step,
int angle)
{
if((src_width*src_height)<(120*80))
CV_HAL_RETURN_NOT_IMPLEMENTED("Switching to default OpenCV solution for lower resolution!");
fcvStatus status;
fcvRotateDegree degree;
if (src_type != CV_8UC1 && src_type != CV_8UC2)
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("src_type:%d is not supported", src_type));
INITIALIZATION_CHECK;
switch (angle)
{
case 90:
degree = FASTCV_ROTATE_90;
break;
case 180:
degree = FASTCV_ROTATE_180;
break;
case 270:
degree = FASTCV_ROTATE_270;
break;
default:
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("Rotation angle:%d is not supported", angle));
}
switch(src_type)
{
case CV_8UC1:
status = fcvRotateImageu8(src_data, src_width, src_height, src_step, dst_data, dst_step, degree);
break;
case CV_8UC2:
status = fcvRotateImageInterleavedu8((uint8_t*)src_data, src_width, src_height, src_step, (uint8_t*)dst_data,
dst_step, degree);
break;
default:
CV_HAL_RETURN_NOT_IMPLEMENTED(cv::format("src_type:%d is not supported", src_type));
}
CV_HAL_RETURN(status, hal_rotate);
}
int fastcv_hal_addWeighted8u(
const uchar* src1_data,
size_t src1_step,
const uchar* src2_data,
size_t src2_step,
uchar* dst_data,
size_t dst_step,
int width,
int height,
const double scalars[3])
{
if( (scalars[0] < -128.0f) || (scalars[0] >= 128.0f) ||
(scalars[1] < -128.0f) || (scalars[1] >= 128.0f) ||
(scalars[2] < -(1<<23))|| (scalars[2] >= 1<<23))
CV_HAL_RETURN_NOT_IMPLEMENTED(
cv::format("Alpha:%f,Beta:%f,Gamma:%f is not supported because it's too large or too small\n",
scalars[0],scalars[1],scalars[2]));
INITIALIZATION_CHECK;
fcvStatus status = FASTCV_SUCCESS;
if (height == 1)
{
src1_step = width*sizeof(uchar);
src2_step = width*sizeof(uchar);
dst_step = width*sizeof(uchar);
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
int rangeWidth = range.end - range.start;
const uint8_t *src1 = src1_data + range.start;
const uint8_t *src2 = src2_data + range.start;
uint8_t *dst = dst_data + range.start;
fcvAddWeightedu8_v2(src1, rangeWidth, height, src1_step, src2, src2_step,
scalars[0], scalars[1], scalars[2], dst, dst_step);
});
}
else
{
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
int rangeHeight = range.end - range.start;
const uint8_t *src1 = src1_data + range.start * src1_step;
const uint8_t *src2 = src2_data + range.start * src2_step;
uint8_t *dst = dst_data + range.start * dst_step;
fcvAddWeightedu8_v2(src1, width, rangeHeight, src1_step, src2, src2_step,
scalars[0], scalars[1], scalars[2], dst, dst_step);
});
}
CV_HAL_RETURN(status, hal_addWeighted8u_v2);
}
int fastcv_hal_mul8u(
const uchar *src1_data,
size_t src1_step,
const uchar *src2_data,
size_t src2_step,
uchar *dst_data,
size_t dst_step,
int width,
int height,
double scale)
{
int8_t sF;
if(FCV_CMP_EQ(scale,1.0)) { sF = 0; }
else if(scale > 1.0)
{
if(FCV_CMP_EQ(scale,2.0)) { sF = -1; }
else if(FCV_CMP_EQ(scale,4.0)) { sF = -2; }
else if(FCV_CMP_EQ(scale,8.0)) { sF = -3; }
else if(FCV_CMP_EQ(scale,16.0)) { sF = -4; }
else if(FCV_CMP_EQ(scale,32.0)) { sF = -5; }
else if(FCV_CMP_EQ(scale,64.0)) { sF = -6; }
else if(FCV_CMP_EQ(scale,128.0)) { sF = -7; }
else if(FCV_CMP_EQ(scale,256.0)) { sF = -8; }
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
}
else if(scale > 0 && scale < 1.0)
{
if(FCV_CMP_EQ(scale,1/2.0)) { sF = 1; }
else if(FCV_CMP_EQ(scale,1/4.0)) { sF = 2; }
else if(FCV_CMP_EQ(scale,1/8.0)) { sF = 3; }
else if(FCV_CMP_EQ(scale,1/16.0)) { sF = 4; }
else if(FCV_CMP_EQ(scale,1/32.0)) { sF = 5; }
else if(FCV_CMP_EQ(scale,1/64.0)) { sF = 6; }
else if(FCV_CMP_EQ(scale,1/128.0)) { sF = 7; }
else if(FCV_CMP_EQ(scale,1/256.0)) { sF = 8; }
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
}
else
CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
INITIALIZATION_CHECK;
int nStripes = cv::getNumThreads();
if(height == 1)
{
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
int rangeWidth = range.end - range.start;
const uchar* yS1 = src1_data + static_cast<size_t>(range.start);
const uchar* yS2 = src2_data + static_cast<size_t>(range.start);
uchar* yD = dst_data + static_cast<size_t>(range.start);
fcvElementMultiplyu8(yS1, rangeWidth, 1, 0, yS2, 0, sF,
FASTCV_CONVERT_POLICY_SATURATE, yD, 0);
}, nStripes);
}
else
{
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
int rangeHeight = range.end - range.start;
const uchar* yS1 = src1_data + static_cast<size_t>(range.start)*src1_step;
const uchar* yS2 = src2_data + static_cast<size_t>(range.start)*src2_step;
uchar* yD = dst_data + static_cast<size_t>(range.start)*dst_step;
fcvElementMultiplyu8(yS1, width, rangeHeight, src1_step, yS2, src2_step,
sF, FASTCV_CONVERT_POLICY_SATURATE, yD, dst_step);
}, nStripes);
}
fcvStatus status = FASTCV_SUCCESS;
CV_HAL_RETURN(status, hal_mul8u);
}
int fastcv_hal_mul16s(
const short *src1_data,
size_t src1_step,
const short *src2_data,
size_t src2_step,
short *dst_data,
size_t dst_step,
int width,
int height,
double scale)
{
int8_t sF;
if(FCV_CMP_EQ(scale,1.0)) { sF = 0; }
else if(scale > 1.0)
{
if(FCV_CMP_EQ(scale,2.0)) { sF = -1; }
else if(FCV_CMP_EQ(scale,4.0)) { sF = -2; }
else if(FCV_CMP_EQ(scale,8.0)) { sF = -3; }
else if(FCV_CMP_EQ(scale,16.0)) { sF = -4; }
else if(FCV_CMP_EQ(scale,32.0)) { sF = -5; }
else if(FCV_CMP_EQ(scale,64.0)) { sF = -6; }
else if(FCV_CMP_EQ(scale,128.0)) { sF = -7; }
else if(FCV_CMP_EQ(scale,256.0)) { sF = -8; }
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
}
else if(scale > 0 && scale < 1.0)
{
if(FCV_CMP_EQ(scale,1/2.0)) { sF = 1; }
else if(FCV_CMP_EQ(scale,1/4.0)) { sF = 2; }
else if(FCV_CMP_EQ(scale,1/8.0)) { sF = 3; }
else if(FCV_CMP_EQ(scale,1/16.0)) { sF = 4; }
else if(FCV_CMP_EQ(scale,1/32.0)) { sF = 5; }
else if(FCV_CMP_EQ(scale,1/64.0)) { sF = 6; }
else if(FCV_CMP_EQ(scale,1/128.0)) { sF = 7; }
else if(FCV_CMP_EQ(scale,1/256.0)) { sF = 8; }
else CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
}
else
CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
INITIALIZATION_CHECK;
int nStripes = cv::getNumThreads();
if(height == 1)
{
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
int rangeWidth = range.end - range.start;
const short* yS1 = src1_data + static_cast<size_t>(range.start);
const short* yS2 = src2_data + static_cast<size_t>(range.start);
short* yD = dst_data + static_cast<size_t>(range.start);
fcvElementMultiplys16(yS1, rangeWidth, 1, 0, yS2, 0, sF,
FASTCV_CONVERT_POLICY_SATURATE, yD, 0);
}, nStripes);
}
else
{
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
int rangeHeight = range.end - range.start;
const short* yS1 = src1_data + static_cast<size_t>(range.start) * (src1_step/sizeof(short));
const short* yS2 = src2_data + static_cast<size_t>(range.start) * (src2_step/sizeof(short));
short* yD = dst_data + static_cast<size_t>(range.start) * (dst_step/sizeof(short));
fcvElementMultiplys16(yS1, width, rangeHeight, src1_step, yS2, src2_step,
sF, FASTCV_CONVERT_POLICY_SATURATE, yD, dst_step);
}, nStripes);
}
fcvStatus status = FASTCV_SUCCESS;
CV_HAL_RETURN(status, hal_mul16s);
}
int fastcv_hal_mul32f(
const float *src1_data,
size_t src1_step,
const float *src2_data,
size_t src2_step,
float *dst_data,
size_t dst_step,
int width,
int height,
double scale)
{
if(!FCV_CMP_EQ(scale,1.0))
CV_HAL_RETURN_NOT_IMPLEMENTED("scale factor not supported");
INITIALIZATION_CHECK;
int nStripes = cv::getNumThreads();
if(height == 1)
{
cv::parallel_for_(cv::Range(0, width), [&](const cv::Range &range){
int rangeWidth = range.end - range.start;
const float* yS1 = src1_data + static_cast<size_t>(range.start);
const float* yS2 = src2_data + static_cast<size_t>(range.start);
float* yD = dst_data + static_cast<size_t>(range.start);
fcvElementMultiplyf32(yS1, rangeWidth, 1, 0, yS2, 0, yD, 0);
}, nStripes);
}
else
{
cv::parallel_for_(cv::Range(0, height), [&](const cv::Range &range){
int rangeHeight = range.end - range.start;
const float* yS1 = src1_data + static_cast<size_t>(range.start) * (src1_step/sizeof(float));
const float* yS2 = src2_data + static_cast<size_t>(range.start) * (src2_step/sizeof(float));
float* yD = dst_data + static_cast<size_t>(range.start) * (dst_step/sizeof(float));
fcvElementMultiplyf32(yS1, width, rangeHeight, src1_step,
yS2, src2_step, yD, dst_step);
}, nStripes);
}
fcvStatus status = FASTCV_SUCCESS;
CV_HAL_RETURN(status, hal_mul32f);
}

1050
3rdparty/fastcv/src/fastcv_hal_imgproc.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,56 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
#include "fastcv_hal_utils.hpp"
const char* getFastCVErrorString(int status)
{
switch(status)
{
case FASTCV_SUCCESS: return "Successful";
case FASTCV_EFAIL: return "General failure";
case FASTCV_EUNALIGNPARAM: return "Unaligned pointer parameter";
case FASTCV_EBADPARAM: return "Bad parameters";
case FASTCV_EINVALSTATE: return "Called at invalid state";
case FASTCV_ENORES: return "Insufficient resources, memory, thread, etc";
case FASTCV_EUNSUPPORTED: return "Unsupported feature";
case FASTCV_EHWQDSP: return "Hardware QDSP failed to respond";
case FASTCV_EHWGPU: return "Hardware GPU failed to respond";
default: return "Unknown FastCV Error";
}
}
const char* borderToString(int border)
{
switch (border)
{
case 0: return "BORDER_CONSTANT";
case 1: return "BORDER_REPLICATE";
case 2: return "BORDER_REFLECT";
case 3: return "BORDER_WRAP";
case 4: return "BORDER_REFLECT_101";
case 5: return "BORDER_TRANSPARENT";
default: return "Unknown border type";
}
}
const char* interpolationToString(int interpolation)
{
switch (interpolation)
{
case 0: return "INTER_NEAREST";
case 1: return "INTER_LINEAR";
case 2: return "INTER_CUBIC";
case 3: return "INTER_AREA";
case 4: return "INTER_LANCZOS4";
case 5: return "INTER_LINEAR_EXACT";
case 6: return "INTER_NEAREST_EXACT";
case 7: return "INTER_MAX";
case 8: return "WARP_FILL_OUTLIERS";
case 16: return "WARP_INVERSE_MAP";
case 32: return "WARP_RELATIVE_MAP";
default: return "Unknown interpolation type";
}
}

View File

@ -1,8 +1,8 @@
# Binaries branch name: ffmpeg/4.x_20240522
# Binaries were created for OpenCV: 8393885a39dac1e650bf5d0aaff84c04ad8bcdd3
ocv_update(FFMPEG_BINARIES_COMMIT "394dca6ceb3085c979415e6385996b6570e94153")
ocv_update(FFMPEG_FILE_HASH_BIN32 "bdfbd1efb295f3e54c07d2cb7a843bf9")
ocv_update(FFMPEG_FILE_HASH_BIN64 "bfef029900f788480a363d6dc05c4f0e")
# Binaries branch name: ffmpeg/4.x_20241226
# Binaries were created for OpenCV: 09892c9d1706f40342bda0bc404580f63492d9f8
ocv_update(FFMPEG_BINARIES_COMMIT "d63d7c154c57242bf2283be61166be2bd30ec47e")
ocv_update(FFMPEG_FILE_HASH_BIN32 "642b94d032a8292b07550126934173f6")
ocv_update(FFMPEG_FILE_HASH_BIN64 "a8c3560c8f20e1ae465bef81580fa92c")
ocv_update(FFMPEG_FILE_HASH_CMAKE "8862c87496e2e8c375965e1277dee1c7")
function(download_win_ffmpeg script_var)

View File

@ -19,4 +19,9 @@
#include "version/hal_rvv_071.hpp"
#endif
#endif
#if defined(__riscv_v) && __riscv_v == 1000000
#include "hal_rvv_1p0/merge.hpp" // core
#include "hal_rvv_1p0/mean.hpp" // core
#endif
#endif

228
3rdparty/hal_rvv/hal_rvv_1p0/mean.hpp vendored Normal file
View File

@ -0,0 +1,228 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
#define OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_meanStdDev
#define cv_hal_meanStdDev cv::cv_hal_rvv::meanStdDev
inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, int height,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
inline int meanStdDev(const uchar* src_data, size_t src_step, int width, int height,
int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
switch (src_type)
{
case CV_8UC1:
return meanStdDev_8UC1(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
case CV_8UC4:
return meanStdDev_8UC4(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
case CV_32FC1:
return meanStdDev_32FC1(src_data, src_step, width, height, mean_val, stddev_val, mask, mask_step);
default:
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
}
inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
int nz = 0;
int vlmax = __riscv_vsetvlmax_e64m8();
vuint64m8_t vec_sum = __riscv_vmv_v_x_u64m8(0, vlmax);
vuint64m8_t vec_sqsum = __riscv_vmv_v_x_u64m8(0, vlmax);
if (mask) {
for (int i = 0; i < height; ++i) {
const uchar* src_row = src_data + i * src_step;
const uchar* mask_row = mask + i * mask_step;
int j = 0, vl;
for ( ; j < width; j += vl) {
vl = __riscv_vsetvl_e8m1(width - j);
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
auto vmask_u8 = __riscv_vle8_v_u8m1(mask_row+j, vl);
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
auto vmask = __riscv_vmseq_vx_u8m1_b8(vmask_u8, 1, vl);
vec_sum = __riscv_vwaddu_wv_u64m8_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
nz += __riscv_vcpop_m_b8(vmask, vl);
}
}
} else {
for (int i = 0; i < height; i++) {
const uchar* src_row = src_data + i * src_step;
int j = 0, vl;
for ( ; j < width; j += vl) {
vl = __riscv_vsetvl_e8m1(width - j);
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
vec_sum = __riscv_vwaddu_wv_u64m8_tu(vec_sum, vec_sum, vec_pixel, vl);
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
}
}
nz = height * width;
}
if (nz == 0) {
if (mean_val) *mean_val = 0.0;
if (stddev_val) *stddev_val = 0.0;
return CV_HAL_ERROR_OK;
}
auto zero = __riscv_vmv_s_x_u64m1(0, vlmax);
auto vec_red = __riscv_vmv_v_x_u64m1(0, vlmax);
auto vec_reddev = __riscv_vmv_v_x_u64m1(0, vlmax);
vec_red = __riscv_vredsum(vec_sum, zero, vlmax);
vec_reddev = __riscv_vredsum(vec_sqsum, zero, vlmax);
double sum = __riscv_vmv_x(vec_red);
double mean = sum / nz;
if (mean_val) {
*mean_val = mean;
}
if (stddev_val) {
double sqsum = __riscv_vmv_x(vec_reddev);
double variance = std::max((sqsum / nz) - (mean * mean), 0.0);
double stddev = std::sqrt(variance);
*stddev_val = stddev;
}
return CV_HAL_ERROR_OK;
}
inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, int height,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
int nz = 0;
int vlmax = __riscv_vsetvlmax_e64m8();
vuint64m8_t vec_sum = __riscv_vmv_v_x_u64m8(0, vlmax);
vuint64m8_t vec_sqsum = __riscv_vmv_v_x_u64m8(0, vlmax);
if (mask) {
for (int i = 0; i < height; ++i) {
const uchar* src_row = src_data + i * src_step;
const uchar* mask_row = mask + i * mask_step;
int j = 0, jm = 0, vl, vlm;
for ( ; j < width*4; j += vl, jm += vlm) {
vl = __riscv_vsetvl_e8m1(width*4 - j);
vlm = __riscv_vsetvl_e8mf4(width - jm);
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
auto vmask_u8mf4 = __riscv_vle8_v_u8mf4(mask_row + jm, vlm);
auto vmask_u32 = __riscv_vzext_vf4(vmask_u8mf4, vlm);
// 0 -> 0000; 1 -> 1111
vmask_u32 = __riscv_vmul(vmask_u32, 0b00000001000000010000000100000001, vlm);
auto vmask_u8 = __riscv_vreinterpret_u8m1(vmask_u32);
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
auto vmask = __riscv_vmseq_vx_u8m1_b8(vmask_u8, 1, vl);
vec_sum = __riscv_vwaddu_wv_u64m8_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
nz += __riscv_vcpop_m_b8(vmask, vl);
}
}
nz /= 4;
} else {
for (int i = 0; i < height; i++) {
const uchar* src_row = src_data + i * src_step;
int j = 0, vl;
for ( ; j < width*4; j += vl) {
vl = __riscv_vsetvl_e8m1(width*4 - j);
auto vec_pixel_u8 = __riscv_vle8_v_u8m1(src_row + j, vl);
auto vec_pixel = __riscv_vzext_vf4(vec_pixel_u8, vl);
vec_sum = __riscv_vwaddu_wv_u64m8_tu(vec_sum, vec_sum, vec_pixel, vl);
vec_sqsum = __riscv_vwmaccu_vv_u64m8_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
}
}
nz = height * width;
}
if (nz == 0) {
if (mean_val) *mean_val = 0.0;
if (stddev_val) *stddev_val = 0.0;
return CV_HAL_ERROR_OK;
}
uint64_t s[256], sq[256], sum[4] = {0}, sqsum[4] = {0};
__riscv_vse64(s, vec_sum, vlmax);
__riscv_vse64(sq, vec_sqsum, vlmax);
for (int i = 0; i < vlmax; ++i)
{
sum[i % 4] += s[i];
sqsum[i % 4] += sq[i];
}
if (mean_val) {
mean_val[0] = (double)sum[0] / nz;
mean_val[1] = (double)sum[1] / nz;
mean_val[2] = (double)sum[2] / nz;
mean_val[3] = (double)sum[3] / nz;
}
if (stddev_val) {
stddev_val[0] = std::sqrt(std::max(((double)sqsum[0] / nz) - (mean_val[0] * mean_val[0]), 0.0));
stddev_val[1] = std::sqrt(std::max(((double)sqsum[1] / nz) - (mean_val[1] * mean_val[1]), 0.0));
stddev_val[2] = std::sqrt(std::max(((double)sqsum[2] / nz) - (mean_val[2] * mean_val[2]), 0.0));
stddev_val[3] = std::sqrt(std::max(((double)sqsum[3] / nz) - (mean_val[3] * mean_val[3]), 0.0));
}
return CV_HAL_ERROR_OK;
}
inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
int nz = 0;
int vlmax = __riscv_vsetvlmax_e64m4();
vfloat64m4_t vec_sum = __riscv_vfmv_v_f_f64m4(0, vlmax);
vfloat64m4_t vec_sqsum = __riscv_vfmv_v_f_f64m4(0, vlmax);
src_step /= sizeof(float);
if (mask) {
for (int i = 0; i < height; ++i) {
const float* src_row0 = reinterpret_cast<const float*>(src_data) + i * src_step;
const uchar* mask_row = mask + i * mask_step;
int j = 0, vl;
for ( ; j < width; j += vl) {
vl = __riscv_vsetvl_e32m2(width - j);
auto vec_pixel = __riscv_vle32_v_f32m2(src_row0 + j, vl);
auto vmask_u8 = __riscv_vle8_v_u8mf2(mask_row + j, vl);
auto vmask_u32 = __riscv_vzext_vf4(vmask_u8, vl);
auto vmask = __riscv_vmseq_vx_u32m2_b16(vmask_u32, 1, vl);
vec_sum = __riscv_vfwadd_wv_f64m4_tumu(vmask, vec_sum, vec_sum, vec_pixel, vl);
vec_sqsum = __riscv_vfwmacc_vv_f64m4_tumu(vmask, vec_sqsum, vec_pixel, vec_pixel, vl);
nz += __riscv_vcpop_m_b16(vmask, vl);
}
}
} else {
for (int i = 0; i < height; i++) {
const float* src_row0 = reinterpret_cast<const float*>(src_data) + i * src_step;
int j = 0, vl;
for ( ; j < width; j += vl) {
vl = __riscv_vsetvl_e32m2(width - j);
auto vec_pixel = __riscv_vle32_v_f32m2(src_row0 + j, vl);
vec_sum = __riscv_vfwadd_wv_f64m4_tu(vec_sum, vec_sum, vec_pixel, vl);
vec_sqsum = __riscv_vfwmacc_vv_f64m4_tu(vec_sqsum, vec_pixel, vec_pixel, vl);
}
}
nz = height * width;
}
if (nz == 0) {
if (mean_val) *mean_val = 0.0;
if (stddev_val) *stddev_val = 0.0;
return CV_HAL_ERROR_OK;
}
auto zero = __riscv_vfmv_v_f_f64m1(0, vlmax);
auto vec_red = __riscv_vfmv_v_f_f64m1(0, vlmax);
auto vec_reddev = __riscv_vfmv_v_f_f64m1(0, vlmax);
vec_red = __riscv_vfredusum(vec_sum, zero, vlmax);
vec_reddev = __riscv_vfredusum(vec_sqsum, zero, vlmax);
double sum = __riscv_vfmv_f(vec_red);
double mean = sum / nz;
if (mean_val) {
*mean_val = mean;
}
if (stddev_val) {
double sqsum = __riscv_vfmv_f(vec_reddev);
double variance = std::max((sqsum / nz) - (mean * mean), 0.0);
double stddev = std::sqrt(variance);
*stddev_val = stddev;
}
return CV_HAL_ERROR_OK;
}
}}
#endif

397
3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp vendored Normal file
View File

@ -0,0 +1,397 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_merge8u
#define cv_hal_merge8u cv::cv_hal_rvv::merge8u
#undef cv_hal_merge16u
#define cv_hal_merge16u cv::cv_hal_rvv::merge16u
#undef cv_hal_merge32s
#define cv_hal_merge32s cv::cv_hal_rvv::merge32s
#undef cv_hal_merge64s
#define cv_hal_merge64s cv::cv_hal_rvv::merge64s
#if defined __GNUC__
__attribute__((optimize("no-tree-vectorize")))
#endif
inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
int k = cn % 4 ? cn % 4 : 4;
int i = 0;
int vl = __riscv_vsetvlmax_e8m1();
if( k == 1 )
{
const uchar* src0 = src[0];
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++)
dst[i*cn] = src0[i];
}
else if( k == 2 )
{
const uchar *src0 = src[0], *src1 = src[1];
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
}
}
else if( k == 3 )
{
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2];
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
dst[i*cn+2] = src2[i];
}
}
else
{
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 3, sizeof(uchar)*cn, d, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
dst[i*cn+2] = src2[i];
dst[i*cn+3] = src3[i];
}
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; k < cn; k += 4 )
{
const uchar *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
i = 0;
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
__riscv_vsse8_v_u8m1(dst + k+i*cn, sizeof(uchar)*cn, a, vl);
__riscv_vsse8_v_u8m1(dst + k+i*cn + 1, sizeof(uchar)*cn, b, vl);
__riscv_vsse8_v_u8m1(dst + k+i*cn + 2, sizeof(uchar)*cn, c, vl);
__riscv_vsse8_v_u8m1(dst + k+i*cn + 3, sizeof(uchar)*cn, d, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[k+i*cn] = src0[i];
dst[k+i*cn+1] = src1[i];
dst[k+i*cn+2] = src2[i];
dst[k+i*cn+3] = src3[i];
}
}
return CV_HAL_ERROR_OK;
}
#if defined __GNUC__
__attribute__((optimize("no-tree-vectorize")))
#endif
inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
int k = cn % 4 ? cn % 4 : 4;
int i = 0;
int vl = __riscv_vsetvlmax_e16m1();
if( k == 1 )
{
const ushort* src0 = src[0];
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++)
dst[i*cn] = src0[i];
}
else if( k == 2 )
{
const ushort *src0 = src[0], *src1 = src[1];
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
}
}
else if( k == 3 )
{
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2];
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
dst[i*cn+2] = src2[i];
}
}
else
{
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 3, sizeof(ushort)*cn, d, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
dst[i*cn+2] = src2[i];
dst[i*cn+3] = src3[i];
}
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; k < cn; k += 4 )
{
const uint16_t *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
i = 0;
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
__riscv_vsse16_v_u16m1(dst + k+i*cn, sizeof(ushort)*cn, a, vl);
__riscv_vsse16_v_u16m1(dst + k+i*cn + 1, sizeof(ushort)*cn, b, vl);
__riscv_vsse16_v_u16m1(dst + k+i*cn + 2, sizeof(ushort)*cn, c, vl);
__riscv_vsse16_v_u16m1(dst + k+i*cn + 3, sizeof(ushort)*cn, d, vl);
}
for( ; i < len; i++ )
{
dst[k+i*cn] = src0[i];
dst[k+i*cn+1] = src1[i];
dst[k+i*cn+2] = src2[i];
dst[k+i*cn+3] = src3[i];
}
}
return CV_HAL_ERROR_OK;
}
#if defined __GNUC__
__attribute__((optimize("no-tree-vectorize")))
#endif
inline int merge32s(const int** src, int* dst, int len, int cn ) {
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
{
const int* src0 = src[0];
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( i = j = 0; i < len; i++, j += cn )
dst[j] = src0[i];
}
else if( k == 2 )
{
const int *src0 = src[0], *src1 = src[1];
i = j = 0;
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
}
}
else if( k == 3 )
{
const int *src0 = src[0], *src1 = src[1], *src2 = src[2];
i = j = 0;
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
dst[j+2] = src2[i];
}
}
else
{
const int *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
i = j = 0;
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; k < cn; k += 4 )
{
const int *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
for( i = 0, j = k; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
return CV_HAL_ERROR_OK;
}
#if defined __GNUC__
__attribute__((optimize("no-tree-vectorize")))
#endif
inline int merge64s(const int64** src, int64* dst, int len, int cn ) {
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
{
const int64* src0 = src[0];
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( i = j = 0; i < len; i++, j += cn )
dst[j] = src0[i];
}
else if( k == 2 )
{
const int64 *src0 = src[0], *src1 = src[1];
i = j = 0;
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
}
}
else if( k == 3 )
{
const int64 *src0 = src[0], *src1 = src[1], *src2 = src[2];
i = j = 0;
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
dst[j+2] = src2[i];
}
}
else
{
const int64 *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
i = j = 0;
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; k < cn; k += 4 )
{
const int64 *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
for( i = 0, j = k; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
return CV_HAL_ERROR_OK;
}
}}
#endif

View File

@ -1,23 +1,11 @@
project(kleidicv_hal)
set(KLEIDICV_SOURCE_PATH "" CACHE PATH "Directory containing KleidiCV sources")
ocv_update(KLEIDICV_SRC_COMMIT "0.1.0")
ocv_update(KLEIDICV_SRC_HASH "9388f28cf2fbe3338197b2b57d491468")
if(KLEIDICV_SOURCE_PATH)
set(THE_ROOT "${KLEIDICV_SOURCE_PATH}")
else()
ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
HASH ${KLEIDICV_SRC_HASH}
URL
"${OPENCV_KLEIDICV_URL}"
"$ENV{OPENCV_KLEIDICV_URL}"
"https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
DESTINATION_DIR "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/"
ID KLEIDICV
STATUS res
UNPACK RELATIVE_URL)
set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}")
if(HAVE_KLEIDICV)
option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some CLang versions in NDK
include("${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
# HACK to suppress adapters/opencv/kleidicv_hal.cpp:343:12: warning: unused function 'from_opencv' [-Wunused-function]
target_compile_options( kleidicv_hal PRIVATE
$<TARGET_PROPERTY:kleidicv,COMPILE_OPTIONS>
"-Wno-old-style-cast" "-Wno-unused-function"
)
endif()
include("${THE_ROOT}/adapters/opencv/CMakeLists.txt")

21
3rdparty/kleidicv/kleidicv.cmake vendored Normal file
View File

@ -0,0 +1,21 @@
function(download_kleidicv root_var)
set(${root_var} "" PARENT_SCOPE)
ocv_update(KLEIDICV_SRC_COMMIT "0.3.0")
ocv_update(KLEIDICV_SRC_HASH "51a77b0185c2bac2a968a2163869b1ed")
set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv")
ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
HASH ${KLEIDICV_SRC_HASH}
URL
"${OPENCV_KLEIDICV_URL}"
"$ENV{OPENCV_KLEIDICV_URL}"
"https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
DESTINATION_DIR ${THE_ROOT}
ID KLEIDICV
STATUS res
UNPACK RELATIVE_URL)
if(res)
set(${root_var} "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}" PARENT_SCOPE)
endif()
endfunction()

View File

@ -5,6 +5,8 @@
#ifndef OPENCV_NDSRVP_IMGPROC_HPP
#define OPENCV_NDSRVP_IMGPROC_HPP
struct cvhalFilter2D;
namespace cv {
namespace ndsrvp {
@ -71,6 +73,52 @@ int threshold(const uchar* src_data, size_t src_step,
#undef cv_hal_threshold
#define cv_hal_threshold (cv::ndsrvp::threshold)
// ################ filter ################
int filterInit(cvhalFilter2D **context,
uchar *kernel_data, size_t kernel_step,
int kernel_type, int kernel_width,
int kernel_height, int max_width, int max_height,
int src_type, int dst_type, int borderType,
double delta, int anchor_x, int anchor_y,
bool allowSubmatrix, bool allowInplace);
#undef cv_hal_filterInit
#define cv_hal_filterInit (cv::ndsrvp::filterInit)
int filter(cvhalFilter2D *context,
const uchar *src_data, size_t src_step,
uchar *dst_data, size_t dst_step,
int width, int height,
int full_width, int full_height,
int offset_x, int offset_y);
#undef cv_hal_filter
#define cv_hal_filter (cv::ndsrvp::filter)
int filterFree(cvhalFilter2D *context);
#undef cv_hal_filterFree
#define cv_hal_filterFree (cv::ndsrvp::filterFree)
// ################ medianBlur ################
int medianBlur(const uchar* src_data, size_t src_step,
uchar* dst_data, size_t dst_step,
int width, int height, int depth, int cn, int ksize);
#undef cv_hal_medianBlur
#define cv_hal_medianBlur (cv::ndsrvp::medianBlur)
// ################ bilateralFilter ################
int bilateralFilter(const uchar* src_data, size_t src_step,
uchar* dst_data, size_t dst_step, int width, int height, int depth,
int cn, int d, double sigma_color, double sigma_space, int border_type);
#undef cv_hal_bilateralFilter
#define cv_hal_bilateralFilter (cv::ndsrvp::bilateralFilter)
} // namespace ndsrvp
} // namespace cv

270
3rdparty/ndsrvp/src/bilateralFilter.cpp vendored Normal file
View File

@ -0,0 +1,270 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
static void bilateralFilterProcess(uchar* dst_data, size_t dst_step, uchar* pad_data, size_t pad_step,
int width, int height, int cn, int radius, int maxk,
int* space_ofs, float *space_weight, float *color_weight)
{
int i, j, k;
for( i = 0; i < height; i++ )
{
const uchar* sptr = pad_data + (i + radius) * pad_step + radius * cn;
uchar* dptr = dst_data + i * dst_step;
if( cn == 1 )
{
std::vector<float> buf(width + width, 0.0);
float *sum = &buf[0];
float *wsum = sum + width;
k = 0;
for(; k <= maxk-4; k+=4)
{
const uchar* ksptr0 = sptr + space_ofs[k];
const uchar* ksptr1 = sptr + space_ofs[k+1];
const uchar* ksptr2 = sptr + space_ofs[k+2];
const uchar* ksptr3 = sptr + space_ofs[k+3];
j = 0;
for (; j < width; j++)
{
int rval = sptr[j];
int val = ksptr0[j];
float w = space_weight[k] * color_weight[std::abs(val - rval)];
wsum[j] += w;
sum[j] += val * w;
val = ksptr1[j];
w = space_weight[k+1] * color_weight[std::abs(val - rval)];
wsum[j] += w;
sum[j] += val * w;
val = ksptr2[j];
w = space_weight[k+2] * color_weight[std::abs(val - rval)];
wsum[j] += w;
sum[j] += val * w;
val = ksptr3[j];
w = space_weight[k+3] * color_weight[std::abs(val - rval)];
wsum[j] += w;
sum[j] += val * w;
}
}
for(; k < maxk; k++)
{
const uchar* ksptr = sptr + space_ofs[k];
j = 0;
for (; j < width; j++)
{
int val = ksptr[j];
float w = space_weight[k] * color_weight[std::abs(val - sptr[j])];
wsum[j] += w;
sum[j] += val * w;
}
}
j = 0;
for (; j < width; j++)
{
// overflow is not possible here => there is no need to use cv::saturate_cast
ndsrvp_assert(fabs(wsum[j]) > 0);
dptr[j] = (uchar)(sum[j] / wsum[j] + 0.5);
}
}
else
{
ndsrvp_assert( cn == 3 );
std::vector<float> buf(width * 3 + width);
float *sum_b = &buf[0];
float *sum_g = sum_b + width;
float *sum_r = sum_g + width;
float *wsum = sum_r + width;
k = 0;
for(; k <= maxk-4; k+=4)
{
const uchar* ksptr0 = sptr + space_ofs[k];
const uchar* ksptr1 = sptr + space_ofs[k+1];
const uchar* ksptr2 = sptr + space_ofs[k+2];
const uchar* ksptr3 = sptr + space_ofs[k+3];
const uchar* rsptr = sptr;
j = 0;
for(; j < width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
{
int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
float w = space_weight[k] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
wsum[j] += w;
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
wsum[j] += w;
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
wsum[j] += w;
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
wsum[j] += w;
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
}
}
for(; k < maxk; k++)
{
const uchar* ksptr = sptr + space_ofs[k];
const uchar* rsptr = sptr;
j = 0;
for(; j < width; j++, ksptr += 3, rsptr += 3)
{
int b = ksptr[0], g = ksptr[1], r = ksptr[2];
float w = space_weight[k] * color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])];
wsum[j] += w;
sum_b[j] += b * w; sum_g[j] += g * w; sum_r[j] += r * w;
}
}
j = 0;
for(; j < width; j++)
{
ndsrvp_assert(fabs(wsum[j]) > 0);
wsum[j] = 1.f / wsum[j];
*(dptr++) = (uchar)(sum_b[j] * wsum[j] + 0.5);
*(dptr++) = (uchar)(sum_g[j] * wsum[j] + 0.5);
*(dptr++) = (uchar)(sum_r[j] * wsum[j] + 0.5);
}
}
}
}
int bilateralFilter(const uchar* src_data, size_t src_step,
uchar* dst_data, size_t dst_step, int width, int height, int depth,
int cn, int d, double sigma_color, double sigma_space, int border_type)
{
if( depth != CV_8U || !(cn == 1 || cn == 3) || src_data == dst_data)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
int i, j, maxk, radius;
if( sigma_color <= 0 )
sigma_color = 1;
if( sigma_space <= 0 )
sigma_space = 1;
double gauss_color_coeff = -0.5/(sigma_color * sigma_color);
double gauss_space_coeff = -0.5/(sigma_space * sigma_space);
if( d <= 0 )
radius = (int)(sigma_space * 1.5 + 0.5);
else
radius = d / 2;
radius = MAX(radius, 1);
d = radius * 2 + 1;
// no enough submatrix info
// fetch original image data
const uchar *ogn_data = src_data;
int ogn_step = src_step;
// ROI fully used in the computation
int cal_width = width + d - 1;
int cal_height = height + d - 1;
int cal_x = 0 - radius; // negative if left border exceeded
int cal_y = 0 - radius; // negative if top border exceeded
// calculate source border
std::vector<uchar> padding;
padding.resize(cal_width * cal_height * cn);
uchar* pad_data = &padding[0];
int pad_step = cal_width * cn;
uchar* pad_ptr;
const uchar* ogn_ptr;
std::vector<uchar> vec_zeros(cn, 0);
for(i = 0; i < cal_height; i++)
{
int y = borderInterpolate(i + cal_y, height, border_type);
if(y < 0) {
memset(pad_data + i * pad_step, 0, cn * cal_width);
continue;
}
// left border
j = 0;
for(; j + cal_x < 0; j++)
{
int x = borderInterpolate(j + cal_x, width, border_type);
if(x < 0) // border constant return value -1
ogn_ptr = &vec_zeros[0];
else
ogn_ptr = ogn_data + y * ogn_step + x * cn;
pad_ptr = pad_data + i * pad_step + j * cn;
memcpy(pad_ptr, ogn_ptr, cn);
}
// center
int rborder = MIN(cal_width, width - cal_x);
ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cn;
pad_ptr = pad_data + i * pad_step + j * cn;
memcpy(pad_ptr, ogn_ptr, cn * (rborder - j));
// right border
j = rborder;
for(; j < cal_width; j++)
{
int x = borderInterpolate(j + cal_x, width, border_type);
if(x < 0) // border constant return value -1
ogn_ptr = &vec_zeros[0];
else
ogn_ptr = ogn_data + y * ogn_step + x * cn;
pad_ptr = pad_data + i * pad_step + j * cn;
memcpy(pad_ptr, ogn_ptr, cn);
}
}
std::vector<float> _color_weight(cn * 256);
std::vector<float> _space_weight(d * d);
std::vector<int> _space_ofs(d * d);
float* color_weight = &_color_weight[0];
float* space_weight = &_space_weight[0];
int* space_ofs = &_space_ofs[0];
// initialize color-related bilateral filter coefficients
for( i = 0; i < 256 * cn; i++ )
color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
// initialize space-related bilateral filter coefficients
for( i = -radius, maxk = 0; i <= radius; i++ )
{
j = -radius;
for( ; j <= radius; j++ )
{
double r = std::sqrt((double)i * i + (double)j * j);
if( r > radius )
continue;
space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
space_ofs[maxk++] = (int)(i * pad_step + j * cn);
}
}
bilateralFilterProcess(dst_data, dst_step, pad_data, pad_step, width, height, cn, radius, maxk, space_ofs, space_weight, color_weight);
return CV_HAL_ERROR_OK;
}
} // namespace ndsrvp
} // namespace cv

View File

@ -73,6 +73,40 @@ int borderInterpolate(int p, int len, int borderType)
return p;
}
int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType)
{
int16x4_t vzero = (int16x4_t){0, 0, 0, 0};
int16x4_t vone = (int16x4_t){1, 1, 1, 1};
int16x4_t vlen = (int16x4_t){len, len, len, len};
if(borderType == CV_HAL_BORDER_REPLICATE)
vp = (int16x4_t)__nds__bpick(0, __nds__bpick((long)(vlen - 1), (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
else if(borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101)
{
int16x4_t vdelta = (borderType == CV_HAL_BORDER_REFLECT_101) ? vone : vzero;
if(len == 1)
return vzero;
do
{
int16x4_t vneg = -vp - 1 + vdelta;
int16x4_t vpos = vlen - 1 - (vp - vlen) - vdelta;
vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
}
while( (long)(vp >= vlen) || (long)(vp < 0) );
}
else if(borderType == CV_HAL_BORDER_WRAP)
{
ndsrvp_assert(len > 0);
int16x4_t vneg = vp - ((vp - vlen + 1) / vlen) * vlen;
int16x4_t vpos = vp % vlen;
vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
}
else if(borderType == CV_HAL_BORDER_CONSTANT)
vp = (int16x4_t)__nds__bpick((long)-vone, (long)vp, (long)(vp < 0 || vp >= vlen));
else
ndsrvp_error(Error::StsBadArg, "borderInterpolate_vector(): Unknown/unsupported border type");
return vp;
}
} // namespace ndsrvp
} // namespace cv

View File

@ -14,6 +14,7 @@
#include <iostream>
#include <string>
#include <array>
#include <vector>
#include <climits>
#include <algorithm>
@ -26,16 +27,26 @@ namespace ndsrvp {
void* fastMalloc(size_t size);
void fastFree(void* ptr);
int borderInterpolate(int p, int len, int borderType);
int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType);
#ifndef MAX
# define MAX(a,b) ((a) < (b) ? (b) : (a))
#endif
#ifndef MIN
# define MIN(a,b) ((a) > (b) ? (b) : (a))
#endif
#define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT)
#define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)
#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
#define CV_MALLOC_ALIGN 64
inline size_t getElemSize(int type) { return (size_t)CV_ELEM_SIZE(type); }
// error codes
enum Error{
@ -69,6 +80,135 @@ inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b)
return (int32x2_t)__nds__bpick((long)a, __nds__bpick((long)(b - 1), (long)x, (long)(x < b)), (long)(x >= a));
}
// expand
/*
[0] [1] [2] [3] [4] [5] [6] [7]
810 [ 0 ] [ 1 ] [ 4 ] [ 5 ]
832 [ 2 ] [ 3 ] [ 6 ] [ 7 ]
bb [ 0 ] [ 1 ] [ 2 ] [ 3 ]
tt [ 4 ] [ 5 ] [ 6 ] [ 7 ]
*/
inline void ndsrvp_u8_u16_expand8(const unsigned long vs, ushort* dst)
{
unsigned long vs810 = __nds__zunpkd810(vs);
unsigned long vs832 = __nds__zunpkd832(vs);
*(unsigned long*)dst = __nds__pkbb32(vs832, vs810);
*(unsigned long*)(dst + 4) = __nds__pktt32(vs832, vs810);
}
/*
[0] [1] [2] [3] [4] [5] [6] [7]
820 [ 0 ] [ 2 ] [ 4 ] [ 6 ]
831 [ 1 ] [ 3 ] [ 5 ] [ 7 ]
bb [ 0 ] [ 2 ] [ 1 ] [ 3 ]
tt [ 4 ] [ 6 ] [ 5 ] [ 7 ]
*/
inline void ndsrvp_u8_u16_eswap8(const unsigned long vs, ushort* dst)
{
unsigned long vs820 = __nds__zunpkd820(vs);
unsigned long vs831 = __nds__zunpkd831(vs);
*(unsigned long*)dst = __nds__pkbb32(vs831, vs820);
*(unsigned long*)(dst + 4) = __nds__pktt32(vs831, vs820);
}
/*
[0] [1] [2] [3] [4] [5] [6] [7]
820 [ 0 ] [ 2 ] [ 4 ] [ 6 ]
831 [ 1 ] [ 3 ] [ 5 ] [ 7 ]
bb [ 0 ] [ 2 ] [ 1 ] [ 3 ]
tt [ 4 ] [ 6 ] [ 5 ] [ 7 ]
bbbb[ 0 ] [ 1 ]
bbtt[ 2 ] [ 3 ]
ttbb[ 4 ] [ 5 ]
tttt[ 6 ] [ 7 ]
*/
inline void ndsrvp_u8_u32_expand8(const unsigned long vs, uint* dst)
{
unsigned long vs820 = __nds__zunpkd820(vs);
unsigned long vs831 = __nds__zunpkd831(vs);
unsigned long vsbb = __nds__pkbb32(vs831, vs820);
unsigned long vstt = __nds__pktt32(vs831, vs820);
*(unsigned long*)dst = __nds__pkbb16(0, vsbb);
*(unsigned long*)(dst + 2) = __nds__pktt16(0, vsbb);
*(unsigned long*)(dst + 4) = __nds__pkbb16(0, vstt);
*(unsigned long*)(dst + 6) = __nds__pktt16(0, vstt);
}
// float replacement
inline void ndsrvp_f32_add8(const float* a, const float* b, float* c)
{
c[0] = a[0] + b[0];
c[1] = a[1] + b[1];
c[2] = a[2] + b[2];
c[3] = a[3] + b[3];
c[4] = a[4] + b[4];
c[5] = a[5] + b[5];
c[6] = a[6] + b[6];
c[7] = a[7] + b[7];
}
/*
[1] [8] [23]
[24] [8]
*/
inline void ndsrvp_f32_u8_mul8(const float* a, const unsigned long b, float* c) // experimental, not bit exact
{
const int mask_frac = 0x007FFFFF;
const int mask_sign = 0x7FFFFFFF;
const int mask_lead = 0x40000000;
const int ofs_exp = 23;
uint32x2_t va01 = *(uint32x2_t*)a;
uint32x2_t va23 = *(uint32x2_t*)(a + 2);
uint32x2_t va45 = *(uint32x2_t*)(a + 4);
uint32x2_t va67 = *(uint32x2_t*)(a + 6);
uint32x2_t vaexp01 = va01 >> ofs_exp;
uint32x2_t vaexp23 = va23 >> ofs_exp;
uint32x2_t vaexp45 = va45 >> ofs_exp;
uint32x2_t vaexp67 = va67 >> ofs_exp;
uint32x2_t vafrac01 = ((va01 << 7) & mask_sign) | mask_lead;
uint32x2_t vafrac23 = ((va23 << 7) & mask_sign) | mask_lead;
uint32x2_t vafrac45 = ((va45 << 7) & mask_sign) | mask_lead;
uint32x2_t vafrac67 = ((va67 << 7) & mask_sign) | mask_lead;
int16x4_t vb[2]; // fake signed for signed multiply
ndsrvp_u8_u16_eswap8(b, (ushort*)vb);
vafrac01 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac01, (unsigned long)vb[0]);
vafrac23 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac23, (unsigned long)vb[0]);
vafrac45 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac45, (unsigned long)vb[1]);
vafrac67 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac67, (unsigned long)vb[1]);
uint32x2_t vaclz01 = __nds__v_clz32(vafrac01) - 8;
uint32x2_t vaclz23 = __nds__v_clz32(vafrac23) - 8;
uint32x2_t vaclz45 = __nds__v_clz32(vafrac45) - 8;
uint32x2_t vaclz67 = __nds__v_clz32(vafrac67) - 8;
vaexp01 += 8 - vaclz01;
vaexp23 += 8 - vaclz23;
vaexp45 += 8 - vaclz45;
vaexp67 += 8 - vaclz67;
vafrac01 <<= vaclz01;
vafrac23 <<= vaclz23;
vafrac45 <<= vaclz45;
vafrac67 <<= vaclz67;
*(uint32x2_t*)c = (vaexp01 << ofs_exp) | (vafrac01 & mask_frac);
*(uint32x2_t*)(c + 2) = (vaexp23 << ofs_exp) | (vafrac23 & mask_frac);
*(uint32x2_t*)(c + 4) = (vaexp45 << ofs_exp) | (vafrac45 & mask_frac);
*(uint32x2_t*)(c + 6) = (vaexp67 << ofs_exp) | (vafrac67 & mask_frac);
}
// saturate
template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
@ -94,6 +234,26 @@ template<> inline short saturate_cast<short>(double v) { return saturate_cas
template<> inline int saturate_cast<int>(float v) { return (int)lrintf(v); }
template<> inline int saturate_cast<int>(double v) { return (int)lrint(v); }
inline double cast_ptr_to_double(const uchar* v, int depth) {
switch (depth) {
case CV_8U: return (double)*(uchar*)v;
case CV_8S: return (double)*(char*)v;
case CV_16U: return (double)*(ushort*)v;
case CV_16S: return (double)*(short*)v;
case CV_32S: return (double)*(int*)v;
case CV_32F: return (double)*(float*)v;
case CV_64F: return (double)*(double*)v;
case CV_16F: return (double)*(float*)v;
default: return 0;
}
}
template <typename _Tp>
inline _Tp data_at(const uchar* data, int step, int y, int x, int cn)
{
return ((_Tp*)(data + y * step))[x * cn];
}
// align
inline long align(size_t v, int n)

321
3rdparty/ndsrvp/src/filter.cpp vendored Normal file
View File

@ -0,0 +1,321 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
class FilterData
{
public:
FilterData(uchar *_kernel_data, size_t _kernel_step, int _kernel_type, int _src_type, int _dst_type, int _borderType,
int _kernel_width, int _kernel_height, int _max_width, int _max_height, double _delta, int _anchor_x, int _anchor_y)
: kernel_data(_kernel_data), kernel_step(_kernel_step), kernel_type(_kernel_type), src_type(_src_type), dst_type(_dst_type), borderType(_borderType),
kernel_width(_kernel_width), kernel_height(_kernel_height), max_width(_max_width), max_height(_max_height), delta(_delta), anchor_x(_anchor_x), anchor_y(_anchor_y)
{
}
uchar *kernel_data;
size_t kernel_step; // bytes between rows(height)
int kernel_type, src_type, dst_type, borderType;
int kernel_width, kernel_height;
int max_width, max_height;
double delta;
int anchor_x, anchor_y;
std::vector<uchar> coords;
std::vector<float> coeffs;
int nz;
std::vector<uchar> padding;
};
static int countNonZero(const FilterData* ctx)
{
int i, j, nz = 0;
const uchar* ker_row = ctx->kernel_data;
for( i = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
{
for( j = 0; j < ctx->kernel_width; j++ )
{
if( ((float*)ker_row)[j] != 0.0 )
nz++;
}
}
return nz;
}
static void preprocess2DKernel(FilterData* ctx)
{
int i, j, k, nz = countNonZero(ctx), ktype = ctx->kernel_type;
if(nz == 0)
nz = 1; // (0, 0) == 0 by default
ndsrvp_assert( ktype == CV_32F );
ctx->coords.resize(nz * 2);
ctx->coeffs.resize(nz);
const uchar* ker_row = ctx->kernel_data;
for( i = k = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step )
{
for( j = 0; j < ctx->kernel_width; j++ )
{
float val = ((float*)ker_row)[j];
if( val == 0.0 )
continue;
ctx->coords[k * 2] = j;
ctx->coords[k * 2 + 1] = i;
ctx->coeffs[k++] = val;
}
}
ctx->nz = k;
}
int filterInit(cvhalFilter2D **context,
uchar *kernel_data, size_t kernel_step,
int kernel_type, int kernel_width,
int kernel_height, int max_width, int max_height,
int src_type, int dst_type, int borderType,
double delta, int anchor_x, int anchor_y,
bool allowSubmatrix, bool allowInplace)
{
int sdepth = CV_MAT_DEPTH(src_type), ddepth = CV_MAT_DEPTH(dst_type);
int cn = CV_MAT_CN(src_type), kdepth = kernel_type;
(void)allowSubmatrix;
(void)allowInplace;
if(delta - (int)delta != 0.0)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if(kdepth != CV_32F || (sdepth != CV_8U && sdepth != CV_16U) || ddepth != sdepth)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
FilterData *ctx = new FilterData(kernel_data, kernel_step, kernel_type, src_type, dst_type, borderType,
kernel_width, kernel_height, max_width, max_height, delta, anchor_x, anchor_y);
*context = (cvhalFilter2D*)ctx;
ndsrvp_assert(cn == CV_MAT_CN(dst_type) && ddepth >= sdepth);
preprocess2DKernel(ctx);
return CV_HAL_ERROR_OK;
}
int filter(cvhalFilter2D *context,
const uchar *src_data, size_t src_step,
uchar *dst_data, size_t dst_step,
int width, int height,
int full_width, int full_height,
int offset_x, int offset_y)
{
FilterData *ctx = (FilterData*)context;
int cn = CV_MAT_CN(ctx->src_type);
int cnes = CV_ELEM_SIZE(ctx->src_type);
int ddepth = CV_MAT_DEPTH(ctx->dst_type);
float delta_sat = (uchar)(ctx->delta);
if(ddepth == CV_8U)
delta_sat = (float)saturate_cast<uchar>(ctx->delta);
else if(ddepth == CV_16U)
delta_sat = (float)saturate_cast<ushort>(ctx->delta);
// fetch original image data
const uchar *ogn_data = src_data - offset_y * src_step - offset_x * cnes;
int ogn_step = src_step;
// ROI fully used in the computation
int cal_width = width + ctx->kernel_width - 1;
int cal_height = height + ctx->kernel_height - 1;
int cal_x = offset_x - ctx->anchor_x; // negative if left border exceeded
int cal_y = offset_y - ctx->anchor_y; // negative if top border exceeded
// calculate source border
ctx->padding.resize(cal_width * cal_height * cnes);
uchar* pad_data = &ctx->padding[0];
int pad_step = cal_width * cnes;
uchar* pad_ptr;
const uchar* ogn_ptr;
std::vector<uchar> vec_zeros(cnes, 0);
for(int i = 0; i < cal_height; i++)
{
int y = borderInterpolate(i + cal_y, full_height, ctx->borderType);
if(y < 0) {
memset(pad_data + i * pad_step, 0, cnes * cal_width);
continue;
}
// left border
int j = 0;
int16x4_t vj = {0, 1, 2, 3};
vj += saturate_cast<short>(cal_x);
for(; j + cal_x < -4; j += 4, vj += 4)
{
int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
for(int k = 0; k < 4; k++) {
if(vx[k] < 0) // border constant return value -1
ogn_ptr = &vec_zeros[0];
else
ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
memcpy(pad_ptr, ogn_ptr, cnes);
}
}
for(; j + cal_x < 0; j++)
{
int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
if(x < 0) // border constant return value -1
ogn_ptr = &vec_zeros[0];
else
ogn_ptr = ogn_data + y * ogn_step + x * cnes;
pad_ptr = pad_data + i * pad_step + j * cnes;
memcpy(pad_ptr, ogn_ptr, cnes);
}
// center
int rborder = MIN(cal_width, full_width - cal_x);
ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cnes;
pad_ptr = pad_data + i * pad_step + j * cnes;
memcpy(pad_ptr, ogn_ptr, cnes * (rborder - j));
// right border
j = rborder;
vj = (int16x4_t){0, 1, 2, 3} + saturate_cast<short>(cal_x + rborder);
for(; j <= cal_width - 4; j += 4, vj += 4)
{
int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType);
for(int k = 0; k < 4; k++) {
if(vx[k] < 0) // border constant return value -1
ogn_ptr = &vec_zeros[0];
else
ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes;
pad_ptr = pad_data + i * pad_step + (j + k) * cnes;
memcpy(pad_ptr, ogn_ptr, cnes);
}
}
for(; j < cal_width; j++)
{
int x = borderInterpolate(j + cal_x, full_width, ctx->borderType);
if(x < 0) // border constant return value -1
ogn_ptr = &vec_zeros[0];
else
ogn_ptr = ogn_data + y * ogn_step + x * cnes;
pad_ptr = pad_data + i * pad_step + j * cnes;
memcpy(pad_ptr, ogn_ptr, cnes);
}
}
// prepare the pointers
int i, k, count, nz = ctx->nz;
const uchar* ker_pts = &ctx->coords[0];
const float* ker_cfs = &ctx->coeffs[0];
if( ddepth == CV_8U )
{
std::vector<uchar*> src_ptrarr;
src_ptrarr.resize(nz);
uchar** src_ptrs = &src_ptrarr[0];
uchar* dst_row = dst_data;
uchar* pad_row = pad_data;
for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
{
for( k = 0; k < nz; k++ )
src_ptrs[k] = (uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes;
i = 0;
for( ; i <= width * cnes - 8; i += 8 )
{
float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat};
for( k = 0; k < nz; k++ ) {
float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
// experimental code
// ndsrvp_f32_u8_mul8(vker_cfs, *(unsigned long*)(src_ptrs[k] + i), vker_cfs);
// ndsrvp_f32_add8(vs0, vker_cfs, vs0);
vs0[0] += vker_cfs[0] * src_ptrs[k][i];
vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
vs0[4] += vker_cfs[4] * src_ptrs[k][i + 4];
vs0[5] += vker_cfs[5] * src_ptrs[k][i + 5];
vs0[6] += vker_cfs[6] * src_ptrs[k][i + 6];
vs0[7] += vker_cfs[7] * src_ptrs[k][i + 7];
}
dst_row[i] = saturate_cast<uchar>(vs0[0]);
dst_row[i + 1] = saturate_cast<uchar>(vs0[1]);
dst_row[i + 2] = saturate_cast<uchar>(vs0[2]);
dst_row[i + 3] = saturate_cast<uchar>(vs0[3]);
dst_row[i + 4] = saturate_cast<uchar>(vs0[4]);
dst_row[i + 5] = saturate_cast<uchar>(vs0[5]);
dst_row[i + 6] = saturate_cast<uchar>(vs0[6]);
dst_row[i + 7] = saturate_cast<uchar>(vs0[7]);
}
for( ; i < width * cnes; i++ )
{
float s0 = delta_sat;
for( k = 0; k < nz; k++ ) {
s0 += ker_cfs[k] * src_ptrs[k][i];
}
dst_row[i] = saturate_cast<uchar>(s0);
}
}
}
else if( ddepth == CV_16U )
{
std::vector<ushort*> src_ptrarr;
src_ptrarr.resize(nz);
ushort** src_ptrs = &src_ptrarr[0];
uchar* dst_row = dst_data;
uchar* pad_row = pad_data;
for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step )
{
for( k = 0; k < nz; k++ )
src_ptrs[k] = (ushort*)((uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes);
i = 0;
for( ; i <= width * cn - 4; i += 4 )
{
float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat};
for( k = 0; k < nz; k++ ) {
float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]};
vs0[0] += vker_cfs[0] * src_ptrs[k][i];
vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1];
vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2];
vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3];
}
ushort* dst_row_ptr = (ushort*)dst_row;
dst_row_ptr[i] = saturate_cast<ushort>(vs0[0]);
dst_row_ptr[i + 1] = saturate_cast<ushort>(vs0[1]);
dst_row_ptr[i + 2] = saturate_cast<ushort>(vs0[2]);
dst_row_ptr[i + 3] = saturate_cast<ushort>(vs0[3]);
}
for( ; i < width * cn; i++ )
{
float s0 = delta_sat;
for( k = 0; k < nz; k++ ) {
s0 += ker_cfs[k] * src_ptrs[k][i];
}
((ushort*)dst_row)[i] = saturate_cast<ushort>(s0);
}
}
}
return CV_HAL_ERROR_OK;
}
int filterFree(cvhalFilter2D *context) {
FilterData *ctx = (FilterData*)context;
delete ctx;
return CV_HAL_ERROR_OK;
}
} // namespace ndsrvp
} // namespace cv

300
3rdparty/ndsrvp/src/medianBlur.cpp vendored Normal file
View File

@ -0,0 +1,300 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
struct operators_minmax_t {
inline void vector(uint8x8_t & a, uint8x8_t & b) const {
uint8x8_t t = a;
a = __nds__v_umin8(a, b);
b = __nds__v_umax8(t, b);
}
inline void scalar(uchar & a, uchar & b) const {
uchar t = a;
a = __nds__umin8(a, b);
b = __nds__umax8(t, b);
}
inline void vector(int8x8_t & a, int8x8_t & b) const {
int8x8_t t = a;
a = __nds__v_smin8(a, b);
b = __nds__v_smax8(t, b);
}
inline void scalar(schar & a, schar & b) const {
schar t = a;
a = __nds__smin8(a, b);
b = __nds__smax8(t, b);
}
inline void vector(uint16x4_t & a, uint16x4_t & b) const {
uint16x4_t t = a;
a = __nds__v_umin16(a, b);
b = __nds__v_umax16(t, b);
}
inline void scalar(ushort & a, ushort & b) const {
ushort t = a;
a = __nds__umin16(a, b);
b = __nds__umax16(t, b);
}
inline void vector(int16x4_t & a, int16x4_t & b) const {
int16x4_t t = a;
a = __nds__v_smin16(a, b);
b = __nds__v_smax16(t, b);
}
inline void scalar(short & a, short & b) const {
short t = a;
a = __nds__smin16(a, b);
b = __nds__smax16(t, b);
}
};
template<typename T, typename WT, typename VT> // type, widen type, vector type
static void
medianBlur_SortNet( const uchar* src_data, size_t src_step,
uchar* dst_data, size_t dst_step,
int width, int height, int cn, int ksize )
{
const T* src = (T*)src_data;
T* dst = (T*)dst_data;
int sstep = (int)(src_step / sizeof(T));
int dstep = (int)(dst_step / sizeof(T));
int i, j, k;
operators_minmax_t op;
if( ksize == 3 )
{
if( width == 1 || height == 1 )
{
int len = width + height - 1;
int sdelta = height == 1 ? cn : sstep;
int sdelta0 = height == 1 ? 0 : sstep - cn;
int ddelta = height == 1 ? cn : dstep;
for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
for( j = 0; j < cn; j++, src++ )
{
T p0 = src[i > 0 ? -sdelta : 0];
T p1 = src[0];
T p2 = src[i < len - 1 ? sdelta : 0];
op.scalar(p0, p1); op.scalar(p1, p2); op.scalar(p0, p1);
dst[j] = (T)p1;
}
return;
}
width *= cn;
for( i = 0; i < height; i++, dst += dstep )
{
const T* row0 = src + std::max(i - 1, 0)*sstep;
const T* row1 = src + i*sstep;
const T* row2 = src + std::min(i + 1, height-1)*sstep;
int limit = cn;
for(j = 0;; )
{
for( ; j < limit; j++ )
{
int j0 = j >= cn ? j - cn : j;
int j2 = j < width - cn ? j + cn : j;
T p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
T p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
T p6 = row2[j0], p7 = row2[j], p8 = row2[j2];
op.scalar(p1, p2); op.scalar(p4, p5); op.scalar(p7, p8); op.scalar(p0, p1);
op.scalar(p3, p4); op.scalar(p6, p7); op.scalar(p1, p2); op.scalar(p4, p5);
op.scalar(p7, p8); op.scalar(p0, p3); op.scalar(p5, p8); op.scalar(p4, p7);
op.scalar(p3, p6); op.scalar(p1, p4); op.scalar(p2, p5); op.scalar(p4, p7);
op.scalar(p4, p2); op.scalar(p6, p4); op.scalar(p4, p2);
dst[j] = (T)p4;
}
if( limit == width )
break;
int nlanes = 8 / sizeof(T);
for( ; (cn % nlanes == 0) && (j <= width - nlanes - cn); j += nlanes ) // alignment
{
VT p0 = *(VT*)(row0+j-cn), p1 = *(VT*)(row0+j), p2 = *(VT*)(row0+j+cn);
VT p3 = *(VT*)(row1+j-cn), p4 = *(VT*)(row1+j), p5 = *(VT*)(row1+j+cn);
VT p6 = *(VT*)(row2+j-cn), p7 = *(VT*)(row2+j), p8 = *(VT*)(row2+j+cn);
op.vector(p1, p2); op.vector(p4, p5); op.vector(p7, p8); op.vector(p0, p1);
op.vector(p3, p4); op.vector(p6, p7); op.vector(p1, p2); op.vector(p4, p5);
op.vector(p7, p8); op.vector(p0, p3); op.vector(p5, p8); op.vector(p4, p7);
op.vector(p3, p6); op.vector(p1, p4); op.vector(p2, p5); op.vector(p4, p7);
op.vector(p4, p2); op.vector(p6, p4); op.vector(p4, p2);
*(VT*)(dst+j) = p4;
}
limit = width;
}
}
}
else if( ksize == 5 )
{
if( width == 1 || height == 1 )
{
int len = width + height - 1;
int sdelta = height == 1 ? cn : sstep;
int sdelta0 = height == 1 ? 0 : sstep - cn;
int ddelta = height == 1 ? cn : dstep;
for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
for( j = 0; j < cn; j++, src++ )
{
int i1 = i > 0 ? -sdelta : 0;
int i0 = i > 1 ? -sdelta*2 : i1;
int i3 = i < len-1 ? sdelta : 0;
int i4 = i < len-2 ? sdelta*2 : i3;
T p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];
op.scalar(p0, p1); op.scalar(p3, p4); op.scalar(p2, p3); op.scalar(p3, p4); op.scalar(p0, p2);
op.scalar(p2, p4); op.scalar(p1, p3); op.scalar(p1, p2);
dst[j] = (T)p2;
}
return;
}
width *= cn;
for( i = 0; i < height; i++, dst += dstep )
{
const T* row[5];
row[0] = src + std::max(i - 2, 0)*sstep;
row[1] = src + std::max(i - 1, 0)*sstep;
row[2] = src + i*sstep;
row[3] = src + std::min(i + 1, height-1)*sstep;
row[4] = src + std::min(i + 2, height-1)*sstep;
int limit = cn*2;
for(j = 0;; )
{
for( ; j < limit; j++ )
{
T p[25];
int j1 = j >= cn ? j - cn : j;
int j0 = j >= cn*2 ? j - cn*2 : j1;
int j3 = j < width - cn ? j + cn : j;
int j4 = j < width - cn*2 ? j + cn*2 : j3;
for( k = 0; k < 5; k++ )
{
const T* rowk = row[k];
p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
p[k*5+4] = rowk[j4];
}
op.scalar(p[1], p[2]); op.scalar(p[0], p[1]); op.scalar(p[1], p[2]); op.scalar(p[4], p[5]); op.scalar(p[3], p[4]);
op.scalar(p[4], p[5]); op.scalar(p[0], p[3]); op.scalar(p[2], p[5]); op.scalar(p[2], p[3]); op.scalar(p[1], p[4]);
op.scalar(p[1], p[2]); op.scalar(p[3], p[4]); op.scalar(p[7], p[8]); op.scalar(p[6], p[7]); op.scalar(p[7], p[8]);
op.scalar(p[10], p[11]); op.scalar(p[9], p[10]); op.scalar(p[10], p[11]); op.scalar(p[6], p[9]); op.scalar(p[8], p[11]);
op.scalar(p[8], p[9]); op.scalar(p[7], p[10]); op.scalar(p[7], p[8]); op.scalar(p[9], p[10]); op.scalar(p[0], p[6]);
op.scalar(p[4], p[10]); op.scalar(p[4], p[6]); op.scalar(p[2], p[8]); op.scalar(p[2], p[4]); op.scalar(p[6], p[8]);
op.scalar(p[1], p[7]); op.scalar(p[5], p[11]); op.scalar(p[5], p[7]); op.scalar(p[3], p[9]); op.scalar(p[3], p[5]);
op.scalar(p[7], p[9]); op.scalar(p[1], p[2]); op.scalar(p[3], p[4]); op.scalar(p[5], p[6]); op.scalar(p[7], p[8]);
op.scalar(p[9], p[10]); op.scalar(p[13], p[14]); op.scalar(p[12], p[13]); op.scalar(p[13], p[14]); op.scalar(p[16], p[17]);
op.scalar(p[15], p[16]); op.scalar(p[16], p[17]); op.scalar(p[12], p[15]); op.scalar(p[14], p[17]); op.scalar(p[14], p[15]);
op.scalar(p[13], p[16]); op.scalar(p[13], p[14]); op.scalar(p[15], p[16]); op.scalar(p[19], p[20]); op.scalar(p[18], p[19]);
op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[21], p[23]); op.scalar(p[22], p[24]);
op.scalar(p[22], p[23]); op.scalar(p[18], p[21]); op.scalar(p[20], p[23]); op.scalar(p[20], p[21]); op.scalar(p[19], p[22]);
op.scalar(p[22], p[24]); op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[12], p[18]);
op.scalar(p[16], p[22]); op.scalar(p[16], p[18]); op.scalar(p[14], p[20]); op.scalar(p[20], p[24]); op.scalar(p[14], p[16]);
op.scalar(p[18], p[20]); op.scalar(p[22], p[24]); op.scalar(p[13], p[19]); op.scalar(p[17], p[23]); op.scalar(p[17], p[19]);
op.scalar(p[15], p[21]); op.scalar(p[15], p[17]); op.scalar(p[19], p[21]); op.scalar(p[13], p[14]); op.scalar(p[15], p[16]);
op.scalar(p[17], p[18]); op.scalar(p[19], p[20]); op.scalar(p[21], p[22]); op.scalar(p[23], p[24]); op.scalar(p[0], p[12]);
op.scalar(p[8], p[20]); op.scalar(p[8], p[12]); op.scalar(p[4], p[16]); op.scalar(p[16], p[24]); op.scalar(p[12], p[16]);
op.scalar(p[2], p[14]); op.scalar(p[10], p[22]); op.scalar(p[10], p[14]); op.scalar(p[6], p[18]); op.scalar(p[6], p[10]);
op.scalar(p[10], p[12]); op.scalar(p[1], p[13]); op.scalar(p[9], p[21]); op.scalar(p[9], p[13]); op.scalar(p[5], p[17]);
op.scalar(p[13], p[17]); op.scalar(p[3], p[15]); op.scalar(p[11], p[23]); op.scalar(p[11], p[15]); op.scalar(p[7], p[19]);
op.scalar(p[7], p[11]); op.scalar(p[11], p[13]); op.scalar(p[11], p[12]);
dst[j] = (T)p[12];
}
if( limit == width )
break;
int nlanes = 8 / sizeof(T);
for( ; (cn % nlanes == 0) && (j <= width - nlanes - cn*2); j += nlanes )
{
VT p0 = *(VT*)(row[0]+j-cn*2), p5 = *(VT*)(row[1]+j-cn*2), p10 = *(VT*)(row[2]+j-cn*2), p15 = *(VT*)(row[3]+j-cn*2), p20 = *(VT*)(row[4]+j-cn*2);
VT p1 = *(VT*)(row[0]+j-cn*1), p6 = *(VT*)(row[1]+j-cn*1), p11 = *(VT*)(row[2]+j-cn*1), p16 = *(VT*)(row[3]+j-cn*1), p21 = *(VT*)(row[4]+j-cn*1);
VT p2 = *(VT*)(row[0]+j-cn*0), p7 = *(VT*)(row[1]+j-cn*0), p12 = *(VT*)(row[2]+j-cn*0), p17 = *(VT*)(row[3]+j-cn*0), p22 = *(VT*)(row[4]+j-cn*0);
VT p3 = *(VT*)(row[0]+j+cn*1), p8 = *(VT*)(row[1]+j+cn*1), p13 = *(VT*)(row[2]+j+cn*1), p18 = *(VT*)(row[3]+j+cn*1), p23 = *(VT*)(row[4]+j+cn*1);
VT p4 = *(VT*)(row[0]+j+cn*2), p9 = *(VT*)(row[1]+j+cn*2), p14 = *(VT*)(row[2]+j+cn*2), p19 = *(VT*)(row[3]+j+cn*2), p24 = *(VT*)(row[4]+j+cn*2);
op.vector(p1, p2); op.vector(p0, p1); op.vector(p1, p2); op.vector(p4, p5); op.vector(p3, p4);
op.vector(p4, p5); op.vector(p0, p3); op.vector(p2, p5); op.vector(p2, p3); op.vector(p1, p4);
op.vector(p1, p2); op.vector(p3, p4); op.vector(p7, p8); op.vector(p6, p7); op.vector(p7, p8);
op.vector(p10, p11); op.vector(p9, p10); op.vector(p10, p11); op.vector(p6, p9); op.vector(p8, p11);
op.vector(p8, p9); op.vector(p7, p10); op.vector(p7, p8); op.vector(p9, p10); op.vector(p0, p6);
op.vector(p4, p10); op.vector(p4, p6); op.vector(p2, p8); op.vector(p2, p4); op.vector(p6, p8);
op.vector(p1, p7); op.vector(p5, p11); op.vector(p5, p7); op.vector(p3, p9); op.vector(p3, p5);
op.vector(p7, p9); op.vector(p1, p2); op.vector(p3, p4); op.vector(p5, p6); op.vector(p7, p8);
op.vector(p9, p10); op.vector(p13, p14); op.vector(p12, p13); op.vector(p13, p14); op.vector(p16, p17);
op.vector(p15, p16); op.vector(p16, p17); op.vector(p12, p15); op.vector(p14, p17); op.vector(p14, p15);
op.vector(p13, p16); op.vector(p13, p14); op.vector(p15, p16); op.vector(p19, p20); op.vector(p18, p19);
op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p21, p23); op.vector(p22, p24);
op.vector(p22, p23); op.vector(p18, p21); op.vector(p20, p23); op.vector(p20, p21); op.vector(p19, p22);
op.vector(p22, p24); op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p12, p18);
op.vector(p16, p22); op.vector(p16, p18); op.vector(p14, p20); op.vector(p20, p24); op.vector(p14, p16);
op.vector(p18, p20); op.vector(p22, p24); op.vector(p13, p19); op.vector(p17, p23); op.vector(p17, p19);
op.vector(p15, p21); op.vector(p15, p17); op.vector(p19, p21); op.vector(p13, p14); op.vector(p15, p16);
op.vector(p17, p18); op.vector(p19, p20); op.vector(p21, p22); op.vector(p23, p24); op.vector(p0, p12);
op.vector(p8, p20); op.vector(p8, p12); op.vector(p4, p16); op.vector(p16, p24); op.vector(p12, p16);
op.vector(p2, p14); op.vector(p10, p22); op.vector(p10, p14); op.vector(p6, p18); op.vector(p6, p10);
op.vector(p10, p12); op.vector(p1, p13); op.vector(p9, p21); op.vector(p9, p13); op.vector(p5, p17);
op.vector(p13, p17); op.vector(p3, p15); op.vector(p11, p23); op.vector(p11, p15); op.vector(p7, p19);
op.vector(p7, p11); op.vector(p11, p13); op.vector(p11, p12);
*(VT*)(dst+j) = p12;
}
limit = width;
}
}
}
}
int medianBlur(const uchar* src_data, size_t src_step,
uchar* dst_data, size_t dst_step,
int width, int height, int depth, int cn, int ksize)
{
bool useSortNet = ((ksize == 3) || (ksize == 5 && ( depth > CV_8U || cn == 2 || cn > 4 )));
if( useSortNet )
{
uchar* src_data_rep;
if( dst_data == src_data ) {
std::vector<uchar> src_data_copy(src_step * height);
memcpy(src_data_copy.data(), src_data, src_step * height);
src_data_rep = &src_data_copy[0];
}
else {
src_data_rep = (uchar*)src_data;
}
if( depth == CV_8U )
medianBlur_SortNet<uchar, int, uint8x8_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
else if( depth == CV_8S )
medianBlur_SortNet<schar, int, int8x8_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
else if( depth == CV_16U )
medianBlur_SortNet<ushort, int, uint16x4_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
else if( depth == CV_16S )
medianBlur_SortNet<short, int, int16x4_t>( src_data_rep, src_step, dst_data, dst_step, width, height, cn, ksize );
else
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return CV_HAL_ERROR_OK;
}
else return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // namespace ndsrvp
} // namespace cv

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
(C) 1995-2013 Jean-loup Gailly and Mark Adler
(C) 1995-2024 Jean-loup Gailly and Mark Adler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages

View File

@ -21,7 +21,6 @@ Features
* Support for CPU intrinsics when available
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
@ -95,20 +94,21 @@ make test
Build Options
-------------
| CMake | configure | Description | Default |
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
| WITH_GTEST | | Build gtest_zlib | ON |
| WITH_FUZZERS | | Build test/fuzz | OFF |
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
| CMake | configure | Description | Default |
|:---------------------------|:-------------------------|:------------------------------------------------------------------------------------|---------|
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
| WITH_RUNTIME_CPU_DETECTION | | Compiles with runtime CPU detection | ON |
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
| WITH_GTEST | | Build gtest_zlib | ON |
| WITH_FUZZERS | | Build test/fuzz | OFF |
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
Install

View File

@ -7,70 +7,24 @@
#include "functable.h"
#include "adler32_p.h"
/* ========================================================================= */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
unsigned n;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
#ifdef UNROLL_MORE
n = NMAX / 16; /* NMAX is divisible by 16 */
#else
n = NMAX / 8; /* NMAX is divisible by 8 */
#endif
do {
#ifdef UNROLL_MORE
DO16(adler, sum2, buf); /* 16 sums unrolled */
buf += 16;
#else
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
buf += 8;
#endif
} while (--n);
adler %= BASE;
sum2 %= BASE;
}
/* do remaining bytes (less than NMAX, still just one modulo) */
return adler32_len_64(adler, buf, len, sum2);
}
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
return functable.adler32(adler, buf, len);
return FUNCTABLE_CALL(adler32)(adler, buf, len);
}
#endif
/* ========================================================================= */
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
return functable.adler32(adler, buf, len);
return FUNCTABLE_CALL(adler32)(adler, buf, len);
}
#endif

View File

@ -1,11 +0,0 @@
/* adler32_fold.h -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

View File

@ -1,2 +0,0 @@
# ignore Makefiles; they're all automatically generated
Makefile

View File

@ -25,7 +25,6 @@ all: \
crc32_acle.o crc32_acle.lo \
slide_hash_neon.o slide_hash_neon.lo \
slide_hash_armv6.o slide_hash_armv6.lo \
insert_string_acle.o insert_string_acle.lo
adler32_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
@ -69,12 +68,6 @@ slide_hash_armv6.o:
slide_hash_armv6.lo:
$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
insert_string_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
insert_string_acle.lo:
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~

View File

@ -7,8 +7,8 @@
*/
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "zbuild.h"
#include "adler32_p.h"
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
static const uint16_t ALIGNED_(16) taps[64] = {

View File

@ -1,4 +1,4 @@
#include "../../zbuild.h"
#include "zbuild.h"
#include "arm_features.h"
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
@ -11,6 +11,11 @@
# ifndef ID_AA64ISAR0_CRC32_VAL
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
# endif
#elif defined(__OpenBSD__) && defined(__aarch64__)
# include <machine/armreg.h>
# include <machine/cpu.h>
# include <sys/sysctl.h>
# include <sys/types.h>
#elif defined(__APPLE__)
# if !defined(_DARWIN_C_SOURCE)
# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
@ -30,6 +35,16 @@ static int arm_has_crc32() {
#elif defined(__FreeBSD__) && defined(__aarch64__)
return getenv("QEMU_EMULATING") == NULL
&& ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
#elif defined(__OpenBSD__) && defined(__aarch64__)
int hascrc32 = 0;
int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
uint64_t isar0 = 0;
size_t len = sizeof(isar0);
if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE)
hascrc32 = 1;
}
return hascrc32;
#elif defined(__APPLE__)
int hascrc32;
size_t size = sizeof(hascrc32);

View File

@ -2,8 +2,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ARM_H_
#define ARM_H_
#ifndef ARM_FEATURES_H_
#define ARM_FEATURES_H_
struct arm_cpu_features {
int has_simd;
@ -13,4 +13,4 @@ struct arm_cpu_features {
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
#endif /* ARM_H_ */
#endif /* ARM_FEATURES_H_ */

View File

@ -0,0 +1,65 @@
/* arm_functions.h -- ARM implementations for arch-specific functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ARM_FUNCTIONS_H_
#define ARM_FUNCTIONS_H_
#ifdef ARM_NEON
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_neon(void);
uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZLL
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
# endif
void slide_hash_neon(deflate_state *s);
void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef ARM_ACLE
uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
#endif
#ifdef ARM_SIMD
void slide_hash_armv6(deflate_state *s);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// ARM - SIMD
# if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD)
# undef native_slide_hash
# define native_slide_hash slide_hash_armv6
# endif
// ARM - NEON
# if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON
# undef native_adler32
# define native_adler32 adler32_neon
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_neon
# undef native_chunksize
# define native_chunksize chunksize_neon
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_neon
# undef native_slide_hash
# define native_slide_hash slide_hash_neon
# ifdef HAVE_BUILTIN_CTZLL
# undef native_compare256
# define native_compare256 compare256_neon
# undef native_longest_match
# define native_longest_match longest_match_neon
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_neon
# endif
# endif
// ARM - ACLE
# if defined(ARM_ACLE) && defined(__ARM_ACLE) && defined(__ARM_FEATURE_CRC32)
# undef native_crc32
# define native_crc32 crc32_acle
# endif
#endif
#endif /* ARM_FUNCTIONS_H_ */

View File

@ -4,8 +4,8 @@
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../generic/chunk_permute_table.h"
#include "zbuild.h"
#include "arch/generic/chunk_permute_table.h"
typedef uint8x16_t chunk_t;

View File

@ -3,8 +3,9 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)

View File

@ -7,7 +7,7 @@
#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "zbuild.h"
Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
Z_REGISTER uint32_t c;

View File

@ -1,24 +0,0 @@
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
#define HASH_CALC(s, h, val) \
h = __crc32w(0, val)
#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0
#define UPDATE_HASH Z_TARGET_CRC update_hash_acle
#define INSERT_STRING Z_TARGET_CRC insert_string_acle
#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle
#include "../../insert_string_tpl.h"
#endif

View File

@ -25,6 +25,13 @@
out.val[3] = vqsubq_u16(a.val[3], b); \
} while (0)
# if defined(__clang__) && defined(__arm__) && defined(__ANDROID__)
/* Clang for 32-bit Android has too strict alignment requirement (:256) for x4 NEON intrinsics */
# undef ARM_NEON_HASLD4
# undef vld1q_u16_x4
# undef vld1q_u8_x4
# undef vst1q_u16_x4
# endif
# ifndef ARM_NEON_HASLD4

View File

@ -5,8 +5,8 @@
#if defined(ARM_SIMD)
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {

View File

@ -10,8 +10,8 @@
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {

View File

@ -1,5 +1,6 @@
# Makefile for zlib
# Makefile for zlib-ng
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# Copyright (C) 2024 Hans Kristian Rosbach
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
@ -11,12 +12,62 @@ SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all:
all: \
adler32_c.o adler32_c.lo \
adler32_fold_c.o adler32_fold_c.lo \
chunkset_c.o chunkset_c.lo \
compare256_c.o compare256_c.lo \
crc32_braid_c.o crc32_braid_c.lo \
crc32_fold_c.o crc32_fold_c.lo \
slide_hash_c.o slide_hash_c.lo
adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
adler32_c.lo: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~ \
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov

View File

@ -0,0 +1,54 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011, 2016 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "functable.h"
#include "adler32_p.h"
/* ========================================================================= */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
unsigned n;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
#ifdef UNROLL_MORE
n = NMAX / 16; /* NMAX is divisible by 16 */
#else
n = NMAX / 8; /* NMAX is divisible by 8 */
#endif
do {
#ifdef UNROLL_MORE
DO16(adler, sum2, buf); /* 16 sums unrolled */
buf += 16;
#else
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
buf += 8;
#endif
} while (--n);
adler %= BASE;
sum2 %= BASE;
}
/* do remaining bytes (less than NMAX, still just one modulo) */
return adler32_len_64(adler, buf, len, sum2);
}

View File

@ -5,12 +5,11 @@
#include "zbuild.h"
#include "functable.h"
#include "adler32_fold.h"
#include <limits.h>
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
adler = functable.adler32(adler, src, len);
adler = FUNCTABLE_CALL(adler32)(adler, src, len);
memcpy(dst, src, len);
return adler;
}

View File

@ -5,6 +5,7 @@
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
/* ALIGNED, byte comparison */

View File

@ -8,43 +8,9 @@
*/
#include "zbuild.h"
#include "zutil.h"
#include "functable.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
/* ========================================================================= */
const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
return (const uint32_t *)crc_table;
}
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
if (buf == NULL) return 0;
return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
if (buf == NULL) return 0;
return functable.crc32(crc, buf, len);
}
#endif
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
return PREFIX(crc32_z)(crc, buf, len);
}
#endif
/* ========================================================================= */
/*
A CRC of a message is computed on N braids of words in the message, where
each word consists of W bytes (4 or 8). If N is 3, for example, then three
@ -66,24 +32,6 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t
level. Your mileage may vary.
*/
/* ========================================================================= */
#if BYTE_ORDER == LITTLE_ENDIAN
# define ZSWAPWORD(word) (word)
# define BRAID_TABLE crc_braid_table
#elif BYTE_ORDER == BIG_ENDIAN
# if W == 8
# define ZSWAPWORD(word) ZSWAP64(word)
# elif W == 4
# define ZSWAPWORD(word) ZSWAP32(word)
# endif
# define BRAID_TABLE crc_braid_big_table
#else
# error "No endian defined"
#endif
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
/* ========================================================================= */
#ifdef W
/*
@ -112,7 +60,7 @@ static z_word_t crc_word(z_word_t data) {
/* ========================================================================= */
Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
Z_REGISTER uint32_t c;
uint32_t c;
/* Pre-condition the CRC */
c = (~crc) & 0xffffffff;

View File

@ -3,11 +3,9 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil.h"
#include "functable.h"
#include "crc32_fold.h"
#include <limits.h>
#include "crc32.h"
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
crc->value = CRC32_INITIAL_VALUE;
@ -15,7 +13,7 @@ Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
}
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
crc->value = functable.crc32(crc->value, src, len);
crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
memcpy(dst, src, len);
}
@ -25,7 +23,7 @@ Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, ui
* same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The
* init_crc is an unused argument in this context */
Z_UNUSED(init_crc);
crc->value = functable.crc32(crc->value, src, len);
crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
}
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {

View File

@ -0,0 +1,106 @@
/* generic_functions.h -- generic C implementations for arch-specific functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef GENERIC_FUNCTIONS_H_
#define GENERIC_FUNCTIONS_H_
#include "zendian.h"
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_c(void);
uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
# endif
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
# endif
#endif
typedef void (*slide_hash_func)(deflate_state *s);
void slide_hash_c(deflate_state *s);
uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
# ifdef HAVE_BUILTIN_CTZ
uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
# endif
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
# endif
# endif
uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
# ifdef UNALIGNED64_OK
uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
# endif
# endif
// Select generic implementation for longest_match, longest_match_slow, longest_match_slow functions.
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
# define longest_match_generic longest_match_unaligned_64
# define longest_match_slow_generic longest_match_slow_unaligned_64
# define compare256_generic compare256_unaligned_64
# elif defined(HAVE_BUILTIN_CTZ)
# define longest_match_generic longest_match_unaligned_32
# define longest_match_slow_generic longest_match_slow_unaligned_32
# define compare256_generic compare256_unaligned_32
# else
# define longest_match_generic longest_match_unaligned_16
# define longest_match_slow_generic longest_match_slow_unaligned_16
# define compare256_generic compare256_unaligned_16
# endif
#else
# define longest_match_generic longest_match_c
# define longest_match_slow_generic longest_match_slow_c
# define compare256_generic compare256_c
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Generic code
# define native_adler32 adler32_c
# define native_adler32_fold_copy adler32_fold_copy_c
# define native_chunkmemset_safe chunkmemset_safe_c
# define native_chunksize chunksize_c
# define native_crc32 PREFIX(crc32_braid)
# define native_crc32_fold crc32_fold_c
# define native_crc32_fold_copy crc32_fold_copy_c
# define native_crc32_fold_final crc32_fold_final_c
# define native_crc32_fold_reset crc32_fold_reset_c
# define native_inflate_fast inflate_fast_c
# define native_slide_hash slide_hash_c
# define native_longest_match longest_match_generic
# define native_longest_match_slow longest_match_slow_generic
# define native_compare256 compare256_generic
#endif
#endif

View File

@ -1,6 +1,6 @@
/* slide_hash.c -- slide hash table C implementation
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/

View File

@ -4,7 +4,7 @@
#ifdef POWER8_VSX
#include <altivec.h>
#include "../../zbuild.h"
#include "zbuild.h"
typedef vector unsigned char chunk_t;

View File

@ -5,8 +5,10 @@
#ifdef POWER9
#include <altivec.h>
#include "../../zbuild.h"
#include "../../zendian.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "zendian.h"
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */

View File

@ -1,16 +1,19 @@
/* power_features.c - POWER feature check
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
* Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifdef POWER_NEED_AUXVEC_H
# include <linux/auxvec.h>
#endif
#ifdef __FreeBSD__
# include <machine/cpu.h>
#endif
#include "../../zbuild.h"
#include "zbuild.h"
#include "power_features.h"
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {

View File

@ -4,8 +4,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_H_
#define POWER_H_
#ifndef POWER_FEATURES_H_
#define POWER_FEATURES_H_
struct power_cpu_features {
int has_altivec;
@ -15,4 +15,4 @@ struct power_cpu_features {
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
#endif /* POWER_H_ */
#endif /* POWER_FEATURES_H_ */

View File

@ -0,0 +1,67 @@
/* power_functions.h -- POWER implementations for arch-specific functions.
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_FUNCTIONS_H_
#define POWER_FUNCTIONS_H_
#ifdef PPC_VMX
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
void slide_hash_vmx(deflate_state *s);
#endif
#ifdef POWER8_VSX
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_power8(void);
uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
void slide_hash_power8(deflate_state *s);
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef POWER9
uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Power - VMX
# if defined(PPC_VMX) && defined(__ALTIVEC__)
# undef native_adler32
# define native_adler32 adler32_vmx
# undef native_slide_hash
# define native_slide_hash slide_hash_vmx
# endif
// Power8 - VSX
# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
# undef native_adler32
# define native_adler32 adler32_power8
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_power8
# undef native_chunksize
# define native_chunksize chunksize_power8
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_power8
# undef native_slide_hash
# define native_slide_hash slide_hash_power8
# endif
# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
# undef native_crc32
# define native_crc32 crc32_power8
# endif
// Power9
# if defined(POWER9) && defined(_ARCH_PWR9)
# undef native_compare256
# define native_compare256 compare256_power9
# undef native_longest_match
# define native_longest_match longest_match_power9
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_power9
# endif
#endif
#endif /* POWER_FUNCTIONS_H_ */

View File

@ -9,8 +9,8 @@
#include <riscv_vector.h>
#include <stdint.h>
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "zbuild.h"
#include "adler32_p.h"
static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
/* split Adler-32 into component sums */

View File

@ -6,7 +6,9 @@
#ifdef RISCV_RVV
#include "../../zbuild.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
#include <riscv_vector.h>

View File

@ -1,10 +1,13 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/auxv.h>
#include <sys/utsname.h>
#include "../../zbuild.h"
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
# include <sys/auxv.h>
#endif
#include "zbuild.h"
#include "riscv_features.h"
#define ISA_V_HWCAP (1 << ('v' - 'a'))
@ -33,7 +36,11 @@ void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *fea
}
void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
unsigned long hw_cap = getauxval(AT_HWCAP);
#else
unsigned long hw_cap = 0;
#endif
features->has_rvv = hw_cap & ISA_V_HWCAP;
}

View File

@ -6,8 +6,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef RISCV_H_
#define RISCV_H_
#ifndef RISCV_FEATURES_H_
#define RISCV_FEATURES_H_
struct riscv_cpu_features {
int has_rvv;
@ -15,4 +15,4 @@ struct riscv_cpu_features {
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
#endif /* RISCV_H_ */
#endif /* RISCV_FEATURES_H_ */

View File

@ -0,0 +1,49 @@
/* riscv_functions.h -- RISCV implementations for arch-specific functions.
*
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* Contributed by Alex Chiang <alex.chiang@sifive.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef RISCV_FUNCTIONS_H_
#define RISCV_FUNCTIONS_H_
#ifdef RISCV_RVV
uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_rvv(void);
uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
void slide_hash_rvv(deflate_state *s);
void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// RISCV - RVV
# if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__)
# undef native_adler32
# define native_adler32 adler32_rvv
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_rvv
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_rvv
# undef native_chunksize
# define native_chunksize chunksize_rvv
# undef native_compare256
# define native_compare256 compare256_rvv
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_rvv
# undef native_longest_match
# define native_longest_match longest_match_rvv
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_rvv
# undef native_slide_hash
# define native_slide_hash slide_hash_rvv
# endif
#endif
#endif /* RISCV_FUNCTIONS_H_ */

View File

@ -8,18 +8,16 @@
#include <riscv_vector.h>
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
size_t vl;
while (entries > 0) {
vl = __riscv_vsetvl_e16m4(entries);
vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
__riscv_vse16_v_u16m4(table, v_tab, vl);
vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl);
__riscv_vse16_v_u16m4(table, v_diff, vl);
table += vl, entries -= vl;
}
}

48
3rdparty/zlib-ng/arch/s390/Makefile.in vendored Normal file
View File

@ -0,0 +1,48 @@
# Makefile for zlib-ng
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
VGFMAFLAG=
NOLTOFLAG=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
s390_features.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
s390_features.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
dfltcc_deflate.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
dfltcc_deflate.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
dfltcc_inflate.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
dfltcc_inflate.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
crc32-vx.o:
$(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
crc32-vx.lo:
$(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov
distclean: clean
rm -f Makefile

277
3rdparty/zlib-ng/arch/s390/README.md vendored Normal file
View File

@ -0,0 +1,277 @@
# Introduction
This directory contains SystemZ deflate hardware acceleration support.
It can be enabled using the following build commands:
$ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
$ make
or
$ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
$ make
When built like this, zlib-ng would compress using hardware on level 1,
and using software on all other levels. Decompression will always happen
in hardware. In order to enable hardware compression for levels 1-6
(i.e. to make it used by default) one could add
`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
SystemZ deflate hardware acceleration is available on [IBM z15](
https://www.ibm.com/products/z15) and newer machines under the name [
"Integrated Accelerator for zEnterprise Data Compression"](
https://www.ibm.com/support/z-content-solutions/compression/). The
programming interface to it is a machine instruction called DEFLATE
CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
the code and the rest of this document refer to this feature simply as
"DFLTCC".
# Performance
Performance figures are published [here](
https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
). The compression speed-up can be as high as 110x and the decompression
speed-up can be as high as 15x.
# Limitations
Two DFLTCC compression calls with identical inputs are not guaranteed to
produce identical outputs. Therefore care should be taken when using
hardware compression when reproducible results are desired. In
particular, zlib-ng-specific `zng_deflateSetParams` call allows setting
`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
particular stream.
DFLTCC does not support every single zlib-ng feature, in particular:
* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
* `inflateMark()`
* `inflatePrime()`
* `inflateSyncPoint()`
When used, these functions will either switch to software, or, in case
this is not possible, gracefully fail.
# Code structure
All SystemZ-specific code lives in `arch/s390` directory and is
integrated with the rest of zlib-ng using hook macros.
## Hook macros
DFLTCC takes as arguments a parameter block, an input buffer, an output
buffer, and a window. Parameter blocks are stored alongside zlib states;
buffers are forwarded from the caller; and window - which must be
4k-aligned and is always 64k large, is managed using the `PAD_WINDOW()`,
`WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW` and `DEFLATE_ADJUST_WINDOW_SIZE()`
and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.
Software and hardware window formats do not match, therefore,
`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
and `inflateGetDictionary()` need special handling, which is triggered using
`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
`INFLATE_RESET_KEEP_HOOK()` macros.
`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
calls gracefully fail.
`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
software compression mid-stream using `deflateParams()`. Switching
normally entails flushing the current block, which might not be possible
in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
in order to detect and gracefully handle such situations.
The algorithm implemented in hardware has different compression ratio
than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
return the correct results for the hardware implementation.
Actual compression and decompression are handled by `DEFLATE_HOOK()` and
`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
window on its own, calling `updatewindow()` is suppressed using
`INFLATE_NEED_UPDATEWINDOW()` macro.
In addition to compression, DFLTCC computes CRC-32 and Adler-32
checksums, therefore, whenever it's used, software checksumming is
suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
macros.
While software always produces reproducible compression results, this
is not the case for DFLTCC. Therefore, zlib-ng users are given the
ability to specify whether or not reproducible compression results
are required. While it is always possible to specify this setting
before the compression begins, it is not always possible to do so in
the middle of a deflate stream - the exact conditions for that are
determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
## SystemZ-specific code
When zlib-ng is built with DFLTCC, the hooks described above are
converted to calls to functions, which are implemented in
`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
categories:
* Base DFLTCC support, e.g. wrapping the machine instruction - `dfltcc()`.
* Translating between software and hardware data formats, e.g.
`dfltcc_deflate_set_dictionary()`.
* Translating between software and hardware state machines, e.g.
`dfltcc_deflate()` and `dfltcc_inflate()`.
The functions from the first two categories are fairly simple, however,
various quirks in both software and hardware state machines make the
functions from the third category quite complicated.
### `dfltcc_deflate()` function
This function is called by `deflate()` and has the following
responsibilities:
* Checking whether DFLTCC can be used with the current stream. If this
is not the case, then it returns `0`, making `deflate()` use some
other function in order to compress in software. Otherwise it returns
`1`.
* Block management and Huffman table generation. DFLTCC ends blocks only
when explicitly instructed to do so by the software. Furthermore,
whether to use fixed or dynamic Huffman tables must also be determined
by the software. Since looking at data in order to gather statistics
would negate performance benefits, the following approach is used: the
first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
dynamic blocks.
* Writing EOBS. Block Closing Control bit in the parameter block
instructs DFLTCC to write EOBS, however, certain conditions need to be
met: input data length must be non-zero or Continuation Flag must be
set. To put this in simpler terms, DFLTCC will silently refuse to
write EOBS if this is the only thing that it is asked to do. Since the
code has to be able to emit EOBS in software anyway, in order to avoid
tricky corner cases Block Closing Control is never used. Whether to
write EOBS is instead controlled by `soft_bcc` variable.
* Triggering block post-processing. Depending on flush mode, `deflate()`
must perform various additional actions when a block or a stream ends.
`dfltcc_deflate()` informs `deflate()` about this using
`block_state *result` parameter.
* Converting software state fields into hardware parameter block fields,
and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
and Sub-Byte Boundary. Certain fields cannot be translated and must
persist untouched in the parameter block between calls, for example,
Continuation Flag or Continuation State Buffer.
* Handling flush modes and low-memory situations. These aspects are
quite intertwined and pervasive. The general idea here is that the
code must not do anything in software - whether explicitly by e.g.
calling `send_eobs()`, or implicitly - by returning to `deflate()`
with certain return and `*result` values, when Continuation Flag is
set.
* Ending streams. When a new block is started and flush mode is
`Z_FINISH`, Block Header Final parameter block bit is used to mark
this block as final. However, sometimes an empty final block is
needed, and, unfortunately, just like with EOBS, DFLTCC will silently
refuse to do this. The general idea of DFLTCC implementation is to
rely as much as possible on the existing code. Here in order to do
this, the code pretends that it does not support DFLTCC, which makes
`deflate()` call a software compression function, which writes an
empty final block. Whether this is required is controlled by
`need_empty_block` variable.
* Error handling. This is simply converting
Operation-Ending-Supplemental Code to string. Errors can only happen
due to things like memory corruption, and therefore they don't affect
the `deflate()` return code.
### `dfltcc_inflate()` function
This function is called by `inflate()` from the `TYPEDO` state (that is,
when all the metadata is parsed and the stream is positioned at the type
bits of deflate block header) and it's responsible for the following:
* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
Unfortunately, there is no way to ask DFLTCC to stop decompressing on
block or tree boundary.
* `inflate()` decompression loop management. This is controlled using
the return value, which can be either `DFLTCC_INFLATE_BREAK` or
`DFLTCC_INFLATE_CONTINUE`.
* Converting software state fields into hardware parameter block fields,
and vice versa. For example, `whave` and History Length or `wnext` and
History Offset.
* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
and is controlled by `last` state field.
* Error handling. Like deflate, error handling comprises
Operation-Ending-Supplemental Code to string conversion. Unlike
deflate, errors may happen due to bad inputs, therefore they are
propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
# Testing
Given complexity of DFLTCC machine instruction, it is not clear whether
QEMU TCG will ever support it. At the time of writing, one has to have
access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
DFLTCC is a non-privileged instruction, neither special VM/LPAR
configuration nor root are required.
zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
testing. There is no official IBM Z GitHub Actions runner, so we build
one inspired by `anup-kodlekere/gaplib`.
Future updates to actions-runner might need an updated patch. The .net
version number patch has been separated into a separate file to avoid a
need for constantly changing the patch.
## Configuring the builder.
### Install prerequisites.
```
sudo dnf install podman
```
### Add actions-runner service.
```
sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
sudo systemctl daemon-reload
```
### Create a config file, needs github personal access token.
```
# Create file /etc/actions-runner
repo=<owner>/<name>
access_token=<ghp_***>
```
Access token should have the repo scope, consult
https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
for details.
### Autostart actions-runner.
```
$ sudo systemctl enable --now actions-runner
```
## Rebuilding the container
In order to update the `gaplib-actions-runner` podman container, e.g. to get the
latest OS security fixes, follow these steps:
```
# Stop actions-runner service
sudo systemctl stop actions-runner
# Delete old container
sudo podman container rm gaplib-actions-runner
# Delete old image
sudo podman image rm localhost/zlib-ng/actions-runner
# Build image
sudo podman build --squash -f Dockerfile.zlib-ng --tag zlib-ng/actions-runner --build-arg .
# Build container
sudo podman create --name=gaplib-actions-runner --env-file=/etc/actions-runner --init --interactive --volume=actions-runner-temp:/home/actions-runner zlib-ng/actions-runner
# Start actions-runner service
sudo systemctl start actions-runner
```

222
3rdparty/zlib-ng/arch/s390/crc32-vx.c vendored Normal file
View File

@ -0,0 +1,222 @@
/*
* Hardware-accelerated CRC-32 variants for Linux on z Systems
*
* Use the z/Architecture Vector Extension Facility to accelerate the
* computing of bitreflected CRC-32 checksums.
*
* This CRC-32 implementation algorithm is bitreflected and processes
* the least-significant bit first (Little-Endian).
*
* This code was originally written by Hendrik Brueckner
* <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
* relicensed under the zlib license.
*/
#include "zbuild.h"
#include "arch_functions.h"
#include <vecintrin.h>
typedef unsigned char uv16qi __attribute__((vector_size(16)));
typedef unsigned int uv4si __attribute__((vector_size(16)));
typedef unsigned long long uv2di __attribute__((vector_size(16)));
static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
/*
* The CRC-32 constant block contains reduction constants to fold and
* process particular chunks of the input data stream in parallel.
*
* For the CRC-32 variants, the constants are precomputed according to
* these definitions:
*
* R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
* R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
* R3 = [(x128+32 mod P'(x) << 32)]' << 1
* R4 = [(x128-32 mod P'(x) << 32)]' << 1
* R5 = [(x64 mod P'(x) << 32)]' << 1
* R6 = [(x32 mod P'(x) << 32)]' << 1
*
* The bitreflected Barret reduction constant, u', is defined as
* the bit reversal of floor(x**64 / P(x)).
*
* where P(x) is the polynomial in the normal domain and the P'(x) is the
* polynomial in the reversed (bitreflected) domain.
*
* CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
*
* P(x) = 0x04C11DB7
* P'(x) = 0xEDB88320
*/
const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
const uv2di r5 = {0, 0x163CD6124}; /* R5 */
const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
/*
* Load the initial CRC value.
*
* The CRC value is loaded into the rightmost word of the
* vector register and is later XORed with the LSB portion
* of the loaded input data.
*/
uv2di v0 = {0, 0};
v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
/* Load a 64-byte data chunk and XOR with CRC */
uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
v1 ^= v0;
buf += 64;
len -= 64;
while (len >= 64) {
/* Load the next 64-byte data chunk */
uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
/*
* Perform a GF(2) multiplication of the doublewords in V1 with
* the R1 and R2 reduction constants in V0. The intermediate result
* is then folded (accumulated) with the next data chunk in PART1 and
* stored in V1. Repeat this step for the register contents
* in V2, V3, and V4 respectively.
*/
v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
buf += 64;
len -= 64;
}
/*
* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
* and R4 and accumulating the next 128-bit chunk until a single 128-bit
* value remains.
*/
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
while (len >= 16) {
/* Load next data chunk */
v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
/* Fold next data chunk */
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
buf += 16;
len -= 16;
}
/*
* Set up a vector register for byte shifts. The shift value must
* be loaded in bits 1-4 in byte element 7 of a vector register.
* Shift by 8 bytes: 0x40
* Shift by 4 bytes: 0x20
*/
uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
v9 = vec_insert((unsigned char)0x40, v9, 7);
/*
* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
* to move R4 into the rightmost doubleword and set the leftmost
* doubleword to 0x1.
*/
v0 = vec_srb(r4r3, (uv2di)v9);
v0[0] = 1;
/*
* Compute GF(2) product of V1 and V0. The rightmost doubleword
* of V1 is multiplied with R4. The leftmost doubleword of V1 is
* multiplied by 0x1 and is then XORed with rightmost product.
* Implicitly, the intermediate leftmost product becomes padded
*/
v1 = (uv2di)vec_gfmsum_128(v0, v1);
/*
* Now do the final 32-bit fold by multiplying the rightmost word
* in V1 with R5 and XOR the result with the remaining bits in V1.
*
* To achieve this by a single VGFMAG, right shift V1 by a word
* and store the result in V2 which is then accumulated. Use the
* vector unpack instruction to load the rightmost half of the
* doubleword into the rightmost doubleword element of V1; the other
* half is loaded in the leftmost doubleword.
* The vector register with CONST_R5 contains the R5 constant in the
* rightmost doubleword and the leftmost doubleword is zero to ignore
* the leftmost product of V1.
*/
v9 = vec_insert((unsigned char)0x20, v9, 7);
v2 = vec_srb(v1, (uv2di)v9);
v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
/*
* Apply a Barret reduction to compute the final 32-bit CRC value.
*
* The input values to the Barret reduction are the degree-63 polynomial
* in V1 (R(x)), degree-32 generator polynomial, and the reduction
* constant u. The Barret reduction result is the CRC value of R(x) mod
* P(x).
*
* The Barret reduction algorithm is defined as:
*
* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
* 3. C(x) = R(x) XOR T2(x) mod x^32
*
* Note: The leftmost doubleword of vector register containing
* CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
* is zero and does not contribute to the final result.
*/
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
v2 = vec_unpackl((uv4si)v1);
v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
/*
* Compute the GF(2) product of the CRC polynomial with T1(x) in
* V2 and XOR the intermediate result, T2(x), with the value in V1.
* The final result is stored in word element 2 of V2.
*/
v2 = vec_unpackl((uv4si)v2);
v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
return ((uv4si)v2)[2];
}
#define VX_MIN_LEN 64
#define VX_ALIGNMENT 16L
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
size_t prealign, aligned, remaining;
if (len < VX_MIN_LEN + VX_ALIGN_MASK)
return PREFIX(crc32_braid)(crc, buf, len);
if ((uintptr_t)buf & VX_ALIGN_MASK) {
prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
len -= prealign;
crc = PREFIX(crc32_braid)(crc, buf, prealign);
buf += prealign;
}
aligned = len & ~VX_ALIGN_MASK;
remaining = len & VX_ALIGN_MASK;
crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
if (remaining)
crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);
return crc;
}

View File

@ -0,0 +1,119 @@
#ifndef DFLTCC_COMMON_H
#define DFLTCC_COMMON_H
#include "zutil.h"
/*
Parameter Block for Query Available Functions.
*/
struct dfltcc_qaf_param {
char fns[16];
char reserved1[8];
char fmts[2];
char reserved2[6];
} ALIGNED_(8);
/*
Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
*/
struct dfltcc_param_v0 {
uint16_t pbvn; /* Parameter-Block-Version Number */
uint8_t mvn; /* Model-Version Number */
uint8_t ribm; /* Reserved for IBM use */
uint32_t reserved32 : 31;
uint32_t cf : 1; /* Continuation Flag */
uint8_t reserved64[8];
uint32_t nt : 1; /* New Task */
uint32_t reserved129 : 1;
uint32_t cvt : 1; /* Check Value Type */
uint32_t reserved131 : 1;
uint32_t htt : 1; /* Huffman-Table Type */
uint32_t bcf : 1; /* Block-Continuation Flag */
uint32_t bcc : 1; /* Block Closing Control */
uint32_t bhf : 1; /* Block Header Final */
uint32_t reserved136 : 1;
uint32_t reserved137 : 1;
uint32_t dhtgc : 1; /* DHT Generation Control */
uint32_t reserved139 : 5;
uint32_t reserved144 : 5;
uint32_t sbb : 3; /* Sub-Byte Boundary */
uint8_t oesc; /* Operation-Ending-Supplemental Code */
uint32_t reserved160 : 12;
uint32_t ifs : 4; /* Incomplete-Function Status */
uint16_t ifl; /* Incomplete-Function Length */
uint8_t reserved192[8];
uint8_t reserved256[8];
uint8_t reserved320[4];
uint16_t hl; /* History Length */
uint32_t reserved368 : 1;
uint16_t ho : 15; /* History Offset */
uint32_t cv; /* Check Value */
uint32_t eobs : 15; /* End-of-block Symbol */
uint32_t reserved431: 1;
uint8_t eobl : 4; /* End-of-block Length */
uint32_t reserved436 : 12;
uint32_t reserved448 : 4;
uint16_t cdhtl : 12; /* Compressed-Dynamic-Huffman Table
Length */
uint8_t reserved464[6];
uint8_t cdht[288]; /* Compressed-Dynamic-Huffman Table */
uint8_t reserved[24];
uint8_t ribm2[8]; /* Reserved for IBM use */
uint8_t csb[1152]; /* Continuation-State Buffer */
} ALIGNED_(8);
/*
Extension of inflate_state and deflate_state.
*/
struct dfltcc_state {
struct dfltcc_param_v0 param; /* Parameter block. */
struct dfltcc_qaf_param af; /* Available functions. */
char msg[64]; /* Buffer for strm->msg */
};
typedef struct {
struct dfltcc_state common;
uint16_t level_mask; /* Levels on which to use DFLTCC */
uint32_t block_size; /* New block each X bytes */
size_t block_threshold; /* New block after total_in > X */
uint32_t dht_threshold; /* New block only if avail_in >= X */
} arch_deflate_state;
typedef struct {
struct dfltcc_state common;
} arch_inflate_state;
/*
History buffer size.
*/
#define HB_BITS 15
#define HB_SIZE (1 << HB_BITS)
/*
Sizes of deflate block parts.
*/
#define DFLTCC_BLOCK_HEADER_BITS 3
#define DFLTCC_HLITS_COUNT_BITS 5
#define DFLTCC_HDISTS_COUNT_BITS 5
#define DFLTCC_HCLENS_COUNT_BITS 4
#define DFLTCC_MAX_HCLENS 19
#define DFLTCC_HCLEN_BITS 3
#define DFLTCC_MAX_HLITS 286
#define DFLTCC_MAX_HDISTS 30
#define DFLTCC_MAX_HLIT_HDIST_BITS 7
#define DFLTCC_MAX_SYMBOL_BITS 16
#define DFLTCC_MAX_EOBS_BITS 15
#define DFLTCC_MAX_PADDING_BITS 7
#define DEFLATE_BOUND_COMPLEN(source_len) \
((DFLTCC_BLOCK_HEADER_BITS + \
DFLTCC_HLITS_COUNT_BITS + \
DFLTCC_HDISTS_COUNT_BITS + \
DFLTCC_HCLENS_COUNT_BITS + \
DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
(DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
(source_len) * DFLTCC_MAX_SYMBOL_BITS + \
DFLTCC_MAX_EOBS_BITS + \
DFLTCC_MAX_PADDING_BITS) >> 3)
#endif

View File

@ -0,0 +1,383 @@
/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
/*
Use the following commands to build zlib-ng with DFLTCC compression support:
$ ./configure --with-dfltcc-deflate
or
$ cmake -DWITH_DFLTCC_DEFLATE=1 .
and then
$ make
*/
#include "zbuild.h"
#include "deflate.h"
#include "trees_emit.h"
#include "dfltcc_deflate.h"
#include "dfltcc_detail.h"
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
arch_deflate_state *dfltcc_state = &state->arch;
dfltcc_reset_state(&dfltcc_state->common);
/* Initialize tuning parameters */
dfltcc_state->level_mask = DFLTCC_LEVEL_MASK;
dfltcc_state->block_size = DFLTCC_BLOCK_SIZE;
dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
}
static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
int reproducible) {
deflate_state *state = (deflate_state *)strm->state;
arch_deflate_state *dfltcc_state = &state->arch;
/* Unsupported compression settings */
if ((dfltcc_state->level_mask & (1 << level)) == 0)
return 0;
if (window_bits != HB_BITS)
return 0;
if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY)
return 0;
if (reproducible)
return 0;
/* Unsupported hardware */
if (!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_GDHT) ||
!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_CMPR) ||
!is_bit_set(dfltcc_state->common.af.fmts, DFLTCC_FMT0))
return 0;
return 1;
}
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
return dfltcc_can_deflate_with_params(strm, state->level, state->w_bits, state->strategy, state->reproducible);
}
static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
size_t avail_in = strm->avail_in;
dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL);
}
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
size_t avail_in = strm->avail_in;
size_t avail_out = strm->avail_out;
dfltcc_cc cc;
cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
param, &strm->next_out, &avail_out,
&strm->next_in, &avail_in, state->window);
strm->total_in += (strm->avail_in - avail_in);
strm->total_out += (strm->avail_out - avail_out);
strm->avail_in = avail_in;
strm->avail_out = avail_out;
return cc;
}
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
deflate_state *state = (deflate_state *)strm->state;
send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
PREFIX(flush_pending)(strm);
if (state->pending != 0) {
/* The remaining data is located in pending_out[0:pending]. If someone
* calls put_byte() - this might happen in deflate() - the byte will be
* placed into pending_buf[pending], which is incorrect. Move the
* remaining data to the beginning of pending_buf so that put_byte() is
* usable again.
*/
memmove(state->pending_buf, state->pending_out, state->pending);
state->pending_out = state->pending_buf;
}
#ifdef ZLIB_DEBUG
state->compressed_len += param->eobl;
#endif
}
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
deflate_state *state = (deflate_state *)strm->state;
arch_deflate_state *dfltcc_state = &state->arch;
struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
uInt masked_avail_in;
dfltcc_cc cc;
int need_empty_block;
int soft_bcc;
int no_flush;
if (!PREFIX(dfltcc_can_deflate)(strm)) {
/* Clear history. */
if (flush == Z_FULL_FLUSH)
param->hl = 0;
return 0;
}
again:
masked_avail_in = 0;
soft_bcc = 0;
no_flush = flush == Z_NO_FLUSH;
/* No input data. Return, except when Continuation Flag is set, which means
* that DFLTCC has buffered some output in the parameter block and needs to
* be called again in order to flush it.
*/
if (strm->avail_in == 0 && !param->cf) {
/* A block is still open, and the hardware does not support closing
* blocks without adding data. Thus, close it manually.
*/
if (!no_flush && param->bcf) {
send_eobs(strm, param);
param->bcf = 0;
}
/* Let one of deflate_* functions write a trailing empty block. */
if (flush == Z_FINISH)
return 0;
/* Clear history. */
if (flush == Z_FULL_FLUSH)
param->hl = 0;
/* Trigger block post-processing if necessary. */
*result = no_flush ? need_more : block_done;
return 1;
}
/* There is an open non-BFINAL block, we are not going to close it just
* yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
* more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
* DHT in order to adapt to a possibly changed input data distribution.
*/
if (param->bcf && no_flush &&
strm->total_in > dfltcc_state->block_threshold &&
strm->avail_in >= dfltcc_state->dht_threshold) {
if (param->cf) {
/* We need to flush the DFLTCC buffer before writing the
* End-of-block Symbol. Mask the input data and proceed as usual.
*/
masked_avail_in += strm->avail_in;
strm->avail_in = 0;
no_flush = 0;
} else {
/* DFLTCC buffer is empty, so we can manually write the
* End-of-block Symbol right away.
*/
send_eobs(strm, param);
param->bcf = 0;
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
}
}
/* No space for compressed data. If we proceed, dfltcc_cmpr() will return
* DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
* set BCF=1, which is wrong. Avoid complications and return early.
*/
if (strm->avail_out == 0) {
*result = need_more;
return 1;
}
/* The caller gave us too much data. Pass only one block worth of
* uncompressed data to DFLTCC and mask the rest, so that on the next
* iteration we start a new block.
*/
if (no_flush && strm->avail_in > dfltcc_state->block_size) {
masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
strm->avail_in = dfltcc_state->block_size;
}
/* When we have an open non-BFINAL deflate block and caller indicates that
* the stream is ending, we need to close an open deflate block and open a
* BFINAL one.
*/
need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
/* Translate stream to parameter block */
param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
if (!no_flush)
/* We need to close a block. Always do this in software - when there is
* no input data, the hardware will not honor BCC. */
soft_bcc = 1;
if (flush == Z_FINISH && !param->bcf)
/* We are about to open a BFINAL block, set Block Header Final bit
* until the stream ends.
*/
param->bhf = 1;
/* DFLTCC-CMPR will write to next_out, so make sure that buffers with
* higher precedence are empty.
*/
Assert(state->pending == 0, "There must be no pending bytes");
Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
param->sbb = (unsigned int)state->bi_valid;
if (param->sbb > 0)
*strm->next_out = (unsigned char)state->bi_buf;
/* Honor history and check value */
param->nt = 0;
if (state->wrap == 1)
param->cv = strm->adler;
else if (state->wrap == 2)
param->cv = ZSWAP32(state->crc_fold.value);
/* When opening a block, choose a Huffman-Table Type */
if (!param->bcf) {
if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
param->htt = HTT_FIXED;
else {
param->htt = HTT_DYNAMIC;
dfltcc_gdht(strm);
}
}
/* Deflate */
do {
cc = dfltcc_cmpr(strm);
if (strm->avail_in < 4096 && masked_avail_in > 0)
/* We are about to call DFLTCC with a small input buffer, which is
* inefficient. Since there is masked data, there will be at least
* one more DFLTCC call, so skip the current one and make the next
* one handle more data.
*/
break;
} while (cc == DFLTCC_CC_AGAIN);
/* Translate parameter block to stream */
strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
state->bi_valid = param->sbb;
if (state->bi_valid == 0)
state->bi_buf = 0; /* Avoid accessing next_out */
else
state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
if (state->wrap == 1)
strm->adler = param->cv;
else if (state->wrap == 2)
state->crc_fold.value = ZSWAP32(param->cv);
/* Unmask the input data */
strm->avail_in += masked_avail_in;
masked_avail_in = 0;
/* If we encounter an error, it means there is a bug in DFLTCC call */
Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
/* Update Block-Continuation Flag. It will be used to check whether to call
* GDHT the next time.
*/
if (cc == DFLTCC_CC_OK) {
if (soft_bcc) {
send_eobs(strm, param);
param->bcf = 0;
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
} else
param->bcf = 1;
if (flush == Z_FINISH) {
if (need_empty_block)
/* Make the current deflate() call also close the stream */
return 0;
else {
bi_windup(state);
*result = finish_done;
}
} else {
if (flush == Z_FULL_FLUSH)
param->hl = 0; /* Clear history */
*result = flush == Z_NO_FLUSH ? need_more : block_done;
}
} else {
param->bcf = 1;
*result = need_more;
}
if (strm->avail_in != 0 && strm->avail_out != 0)
goto again; /* deflate() must use all input or all output */
return 1;
}
/*
Switching between hardware and software compression.
DFLTCC does not support all zlib settings, e.g. generation of non-compressed
blocks or alternative window sizes. When such settings are applied on the
fly with deflateParams, we need to convert between hardware and software
window formats.
*/
static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
return strm->total_in > 0 || param->nt == 0 || param->hl > 0;
}
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
deflate_state *state = (deflate_state *)strm->state;
int could_deflate = PREFIX(dfltcc_can_deflate)(strm);
int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible);
if (can_deflate == could_deflate)
/* We continue to work in the same mode - no changes needed */
return Z_OK;
if (!dfltcc_was_deflate_used(strm))
/* DFLTCC was not used yet - no changes needed */
return Z_OK;
/* For now, do not convert between window formats - simply get rid of the old data instead */
*flush = Z_FULL_FLUSH;
return Z_OK;
}
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
/* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might
* close the block without resetting the compression state. Detect this
* situation and return that deflation is not done.
*/
if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
return 0;
/* Return that deflation is not done if DFLTCC is used and either it
* buffered some data (Continuation Flag is set), or has not written EOBS
* yet (Block-Continuation Flag is set).
*/
return !PREFIX(dfltcc_can_deflate)(strm) || (!param->cf && !param->bcf);
}
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
deflate_state *state = (deflate_state *)strm->state;
return reproducible != state->reproducible && !dfltcc_was_deflate_used(strm);
}
/*
Preloading history.
*/
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
append_history(param, state->window, dictionary, dict_length);
state->strstart = 1; /* Add FDICT to zlib header */
state->block_start = state->strstart; /* Make deflate_stored happy */
return Z_OK;
}
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
if (dictionary)
get_history(param, state->window, dictionary);
if (dict_length)
*dict_length = param->hl;
return Z_OK;
}

View File

@ -0,0 +1,58 @@
#ifndef DFLTCC_DEFLATE_H
#define DFLTCC_DEFLATE_H
#include "deflate.h"
#include "dfltcc_common.h"
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length);
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_deflate)((strm))) \
return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_deflate)((strm))) \
return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
do { \
int err; \
\
err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
if (err == Z_STREAM_ERROR) \
return err; \
} while (0)
#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
do { \
if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
(complen) = DEFLATE_BOUND_COMPLEN(source_len); \
} while (0)
#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
#endif

View File

@ -0,0 +1,275 @@
#include "zbuild.h"
#include <stdio.h>
#ifdef HAVE_SYS_SDT_H
#include <sys/sdt.h>
#endif
/*
Tuning parameters.
*/
#ifndef DFLTCC_LEVEL_MASK
#define DFLTCC_LEVEL_MASK 0x2
#endif
#ifndef DFLTCC_BLOCK_SIZE
#define DFLTCC_BLOCK_SIZE 1048576
#endif
#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
#endif
#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
#endif
#ifndef DFLTCC_RIBM
#define DFLTCC_RIBM 0
#endif
#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
#define DFLTCC_SIZEOF_QAF 32
static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
static inline int is_bit_set(const char *bits, int n) {
return bits[n / 8] & (1 << (7 - (n % 8)));
}
static inline void clear_bit(char *bits, int n) {
bits[n / 8] &= ~(1 << (7 - (n % 8)));
}
#define DFLTCC_FACILITY 151
static inline int is_dfltcc_enabled(void) {
uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
Z_REGISTER uint8_t r0 __asm__("r0");
memset(facilities, 0, sizeof(facilities));
r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
/* STFLE is supported since z9-109 and only in z/Architecture mode. When
* compiling with -m31, gcc defaults to ESA mode, however, since the kernel
* is 64-bit, it's always z/Architecture mode at runtime.
*/
__asm__ volatile(
#ifndef __clang__
".machinemode push\n"
".machinemode zarch\n"
#endif
"stfle %[facilities]\n"
#ifndef __clang__
".machinemode pop\n"
#endif
: [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
}
#define DFLTCC_FMT0 0
#define CVT_CRC32 0
#define CVT_ADLER32 1
#define HTT_FIXED 0
#define HTT_DYNAMIC 1
#define DFLTCC_SIZEOF_GDHT_V0 384
#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
static inline z_const char *oesc_msg(char *buf, int oesc) {
if (oesc == 0x00)
return NULL; /* Successful completion */
else {
sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
return buf;
}
}
/*
C wrapper for the DEFLATE CONVERSION CALL instruction.
*/
typedef enum {
DFLTCC_CC_OK = 0,
DFLTCC_CC_OP1_TOO_SHORT = 1,
DFLTCC_CC_OP2_TOO_SHORT = 2,
DFLTCC_CC_OP2_CORRUPT = 2,
DFLTCC_CC_AGAIN = 3,
} dfltcc_cc;
#define DFLTCC_QAF 0
#define DFLTCC_GDHT 1
#define DFLTCC_CMPR 2
#define DFLTCC_XPND 4
#define HBT_CIRCULAR (1 << 7)
#define DFLTCC_FN_MASK ((1 << 7) - 1)
/* Return lengths of high (starting at param->ho) and low (starting at 0) fragments of the circular history buffer. */
static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) {
*hl_high = MIN(param->hl, HB_SIZE - param->ho);
*hl_low = param->hl - *hl_high;
}
/* Notify instrumentation about an upcoming read/write access to the circular history buffer. */
static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) {
size_t hl_high, hl_low;
get_history_lengths(param, &hl_high, &hl_low);
instrument_read_write(hist + param->ho, hl_high);
instrument_read_write(hist, hl_low);
}
/* Notify MSan about a completed write to the circular history buffer. */
static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) {
size_t hl_high, hl_low;
get_history_lengths(param, &hl_high, &hl_low);
__msan_unpoison(hist + param->ho, hl_high);
__msan_unpoison(hist, hl_low);
}
static inline dfltcc_cc dfltcc(int fn, void *param,
unsigned char **op1, size_t *len1,
z_const unsigned char **op2, size_t *len2, void *hist) {
unsigned char *t2 = op1 ? *op1 : NULL;
unsigned char *orig_t2 = t2;
size_t t3 = len1 ? *len1 : 0;
z_const unsigned char *t4 = op2 ? *op2 : NULL;
size_t t5 = len2 ? *len2 : 0;
Z_REGISTER int r0 __asm__("r0");
Z_REGISTER void *r1 __asm__("r1");
Z_REGISTER unsigned char *r2 __asm__("r2");
Z_REGISTER size_t r3 __asm__("r3");
Z_REGISTER z_const unsigned char *r4 __asm__("r4");
Z_REGISTER size_t r5 __asm__("r5");
int cc;
/* Insert pre-instrumentation for DFLTCC. */
switch (fn & DFLTCC_FN_MASK) {
case DFLTCC_QAF:
instrument_write(param, DFLTCC_SIZEOF_QAF);
break;
case DFLTCC_GDHT:
instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0);
instrument_read(t4, t5);
break;
case DFLTCC_CMPR:
case DFLTCC_XPND:
instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
instrument_read(t4, t5);
instrument_write(t2, t3);
instrument_read_write_hist(param, hist);
break;
}
r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5;
__asm__ volatile(
#ifdef HAVE_SYS_SDT_H
STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
#endif
".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
#ifdef HAVE_SYS_SDT_H
STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
#endif
"ipm %[cc]\n"
: [r2] "+r" (r2)
, [r3] "+r" (r3)
, [r4] "+r" (r4)
, [r5] "+r" (r5)
, [cc] "=r" (cc)
: [r0] "r" (r0)
, [r1] "r" (r1)
, [hist] "r" (hist)
#ifdef HAVE_SYS_SDT_H
, STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
#endif
: "cc", "memory");
t2 = r2; t3 = r3; t4 = r4; t5 = r5;
/* Insert post-instrumentation for DFLTCC. */
switch (fn & DFLTCC_FN_MASK) {
case DFLTCC_QAF:
__msan_unpoison(param, DFLTCC_SIZEOF_QAF);
break;
case DFLTCC_GDHT:
__msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
break;
case DFLTCC_CMPR:
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
__msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
msan_unpoison_hist(param, hist);
break;
case DFLTCC_XPND:
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
__msan_unpoison(orig_t2, t2 - orig_t2);
msan_unpoison_hist(param, hist);
break;
}
if (op1)
*op1 = t2;
if (len1)
*len1 = t3;
if (op2)
*op2 = t4;
if (len2)
*len2 = t5;
return (cc >> 28) & 3;
}
#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
/* Initialize available functions */
if (is_dfltcc_enabled()) {
dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL);
memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af));
} else
memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
/* Initialize parameter block */
memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
dfltcc_state->param.nt = 1;
dfltcc_state->param.ribm = DFLTCC_RIBM;
}
static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
memcpy(dst, src, ALIGN_UP(size, 8) + extension_size);
}
static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
const unsigned char *buf, uInt count) {
size_t offset;
size_t n;
/* Do not use more than 32K */
if (count > HB_SIZE) {
buf += count - HB_SIZE;
count = HB_SIZE;
}
offset = (param->ho + param->hl) % HB_SIZE;
if (offset + count <= HB_SIZE)
/* Circular history buffer does not wrap - copy one chunk */
memcpy(history + offset, buf, count);
else {
/* Circular history buffer wraps - copy two chunks */
n = HB_SIZE - offset;
memcpy(history + offset, buf, n);
memcpy(history, buf + n, count - n);
}
n = param->hl + count;
if (n <= HB_SIZE)
/* All history fits into buffer - no need to discard anything */
param->hl = n;
else {
/* History does not fit into buffer - discard extra bytes */
param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE;
param->hl = HB_SIZE;
}
}
static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
unsigned char *buf) {
size_t hl_high, hl_low;
get_history_lengths(param, &hl_high, &hl_low);
memcpy(buf, history + param->ho, hl_high);
memcpy(buf + hl_high, history, hl_low);
}

View File

@ -0,0 +1,191 @@
/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */
/*
Use the following commands to build zlib-ng with DFLTCC decompression support:
$ ./configure --with-dfltcc-inflate
or
$ cmake -DWITH_DFLTCC_INFLATE=1 .
and then
$ make
*/
#include "zbuild.h"
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "dfltcc_inflate.h"
#include "dfltcc_detail.h"
void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
dfltcc_reset_state(&state->arch.common);
}
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = &state->arch.common;
/* Unsupported hardware */
return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
}
static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
size_t avail_in = strm->avail_in;
size_t avail_out = strm->avail_out;
dfltcc_cc cc;
cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
param, &strm->next_out, &avail_out,
&strm->next_in, &avail_in, state->window);
strm->avail_in = avail_in;
strm->avail_out = avail_out;
return cc;
}
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = &state->arch.common;
struct dfltcc_param_v0 *param = &dfltcc_state->param;
dfltcc_cc cc;
if (flush == Z_BLOCK || flush == Z_TREES) {
/* DFLTCC does not support stopping on block boundaries */
if (PREFIX(dfltcc_inflate_disable)(strm)) {
*ret = Z_STREAM_ERROR;
return DFLTCC_INFLATE_BREAK;
} else
return DFLTCC_INFLATE_SOFTWARE;
}
if (state->last) {
if (state->bits != 0) {
strm->next_in++;
strm->avail_in--;
state->bits = 0;
}
state->mode = CHECK;
return DFLTCC_INFLATE_CONTINUE;
}
if (strm->avail_in == 0 && !param->cf)
return DFLTCC_INFLATE_BREAK;
/* if window not in use yet, initialize */
if (state->wsize == 0)
state->wsize = 1U << state->wbits;
/* Translate stream to parameter block */
param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
param->sbb = state->bits;
if (param->hl)
param->nt = 0; /* Honor history for the first block */
if (state->wrap & 4)
param->cv = state->flags ? ZSWAP32(state->check) : state->check;
/* Inflate */
do {
cc = dfltcc_xpnd(strm);
} while (cc == DFLTCC_CC_AGAIN);
/* Translate parameter block to stream */
strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
state->last = cc == DFLTCC_CC_OK;
state->bits = param->sbb;
if (state->wrap & 4)
strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
/* Report an error if stream is corrupted */
state->mode = BAD;
return DFLTCC_INFLATE_CONTINUE;
}
state->mode = TYPEDO;
/* Break if operands are exhausted, otherwise continue looping */
return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
}
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
return !state->arch.common.param.nt;
}
/*
Rotates a circular buffer.
The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
*/
static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
unsigned char *p = pivot;
unsigned char tmp;
while (p != start) {
tmp = *start;
*start = *p;
*p = tmp;
start++;
p++;
if (p == end)
p = pivot;
else if (start == pivot)
pivot = p;
}
}
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = &state->arch.common;
struct dfltcc_param_v0 *param = &dfltcc_state->param;
if (!PREFIX(dfltcc_can_inflate)(strm))
return 0;
if (PREFIX(dfltcc_was_inflate_used)(strm))
/* DFLTCC has already decompressed some data. Since there is not
* enough information to resume decompression in software, the call
* must fail.
*/
return 1;
/* DFLTCC was not used yet - decompress in software */
memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
/* Convert the window from the hardware to the software format */
rotate(state->window, state->window + param->ho, state->window + HB_SIZE);
state->whave = state->wnext = MIN(param->hl, state->wsize);
return 0;
}
/*
Preloading history.
*/
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
/* if window not in use yet, initialize */
if (state->wsize == 0)
state->wsize = 1U << state->wbits;
append_history(param, state->window, dictionary, dict_length);
state->havedict = 1;
return Z_OK;
}
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
unsigned char *dictionary, uInt *dict_length) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
if (dictionary && state->window)
get_history(param, state->window, dictionary);
if (dict_length)
*dict_length = param->hl;
return Z_OK;
}

View File

@ -0,0 +1,67 @@
#ifndef DFLTCC_INFLATE_H
#define DFLTCC_INFLATE_H
#include "dfltcc_common.h"
void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
typedef enum {
DFLTCC_INFLATE_CONTINUE,
DFLTCC_INFLATE_BREAK,
DFLTCC_INFLATE_SOFTWARE,
} dfltcc_inflate_action;
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length);
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
unsigned char *dictionary, uInt* dict_length);
#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)
#define INFLATE_PRIME_HOOK(strm, bits, value) \
do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)
#define INFLATE_TYPEDO_HOOK(strm, flush) \
if (PREFIX(dfltcc_can_inflate)((strm))) { \
dfltcc_inflate_action action; \
\
RESTORE(); \
action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
LOAD(); \
if (action == DFLTCC_INFLATE_CONTINUE) \
break; \
else if (action == DFLTCC_INFLATE_BREAK) \
goto inf_leave; \
}
#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
#define INFLATE_MARK_HOOK(strm) \
do { \
if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
} while (0)
#define INFLATE_SYNC_POINT_HOOK(strm) \
do { \
if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
} while (0)
#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_inflate)((strm))) \
return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_inflate)((strm))) \
return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
#endif

View File

@ -0,0 +1,14 @@
#include "zbuild.h"
#include "s390_features.h"
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifndef HWCAP_S390_VXRS
#define HWCAP_S390_VXRS HWCAP_S390_VX
#endif
void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
}

View File

@ -0,0 +1,14 @@
/* s390_features.h -- check for s390 features.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef S390_FEATURES_H_
#define S390_FEATURES_H_
struct s390_cpu_features {
int has_vx;
};
void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
#endif

View File

@ -0,0 +1,20 @@
/* s390_functions.h -- s390 implementations for arch-specific functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef S390_FUNCTIONS_H_
#define S390_FUNCTIONS_H_
#ifdef S390_CRC32_VX
uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
# if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
# undef native_crc32
# define native_crc32 = crc32_s390_vx
# endif
#endif
#endif

View File

@ -0,0 +1,47 @@
# Self-Hosted IBM Z Github Actions Runner.
FROM almalinux:9
RUN dnf update -y -q && \
dnf install -y -q --enablerepo=crb wget git which sudo jq \
cmake make automake autoconf m4 libtool ninja-build python3-pip \
gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \
glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel
RUN dnf install -y -q dotnet-sdk-6.0 && \
echo "Using SDK - `dotnet --version`"
COPY runner-s390x.patch /tmp/runner.patch
COPY runner-global.json /tmp/global.json
RUN cd /tmp && \
git clone -q https://github.com/actions/runner && \
cd runner && \
git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -b build && \
git apply /tmp/runner.patch && \
cp -f /tmp/global.json src/global.json
RUN cd /tmp/runner/src && \
./dev.sh layout && \
./dev.sh package && \
rm -rf /root/.dotnet /root/.nuget
RUN useradd -c "Action Runner" -m actions-runner && \
usermod -L actions-runner
RUN tar -xf /tmp/runner/_package/*.tar.gz -C /home/actions-runner && \
chown -R actions-runner:actions-runner /home/actions-runner
#VOLUME /home/actions-runner
RUN rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \
dnf clean all
USER actions-runner
# Scripts.
COPY fs/ /
WORKDIR /home/actions-runner
ENTRYPOINT ["/usr/bin/entrypoint"]
CMD ["/usr/bin/actions-runner"]

View File

@ -0,0 +1,18 @@
[Unit]
Description=Podman container: Gaplib Github Actions Runner
Wants=network-online.target
After=network-online.target
StartLimitIntervalSec=1
RequiresMountsFor=/run/user/1001/containers
[Service]
Environment=PODMAN_SYSTEMD_UNIT=%n
Restart=always
TimeoutStopSec=61
ExecStart=/usr/bin/podman start gaplib-actions-runner
ExecStop=/usr/bin/podman stop -t 1 gaplib-actions-runner
ExecStopPost=/usr/bin/podman stop -t 1 gaplib-actions-runner
Type=forking
[Install]
WantedBy=default.target

View File

@ -0,0 +1,5 @@
{
"sdk": {
"version": "6.0.421"
}
}

View File

@ -0,0 +1,243 @@
diff --git a/src/Directory.Build.props b/src/Directory.Build.props
index 9db5fac..f02e235 100644
--- a/src/Directory.Build.props
+++ b/src/Directory.Build.props
@@ -44,6 +44,9 @@
<PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-arm64'">
<DefineConstants>$(DefineConstants);ARM64</DefineConstants>
</PropertyGroup>
+ <PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-s390x'">
+ <DefineConstants>$(DefineConstants);S390X</DefineConstants>
+ </PropertyGroup>
<!-- Set TRACE/DEBUG vars -->
<PropertyGroup>
diff --git a/src/Misc/externals.sh b/src/Misc/externals.sh
index 383221e..1555f67 100755
--- a/src/Misc/externals.sh
+++ b/src/Misc/externals.sh
@@ -189,3 +189,8 @@ if [[ "$PACKAGERUNTIME" == "linux-arm" ]]; then
acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-armv7l.tar.gz" node16 fix_nested_dir
acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-armv7l.tar.gz" node20 fix_nested_dir
fi
+
+if [[ "$PACKAGERUNTIME" == "linux-s390x" ]]; then
+ acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-s390x.tar.gz" node16 fix_nested_dir
+ acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-s390x.tar.gz" node20 fix_nested_dir
+fi
diff --git a/src/Misc/layoutroot/config.sh b/src/Misc/layoutroot/config.sh
index 14cc6ba..9b5b8e6 100755
--- a/src/Misc/layoutroot/config.sh
+++ b/src/Misc/layoutroot/config.sh
@@ -20,25 +20,29 @@ then
message="Execute sudo ./bin/installdependencies.sh to install any missing Dotnet Core 6.0 dependencies."
- ldd ./bin/libcoreclr.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
- fi
+ ARCH=`uname -m`
+ if [ "${ARCH}" != "s390x" -a "${ARCH}" != "ppc64le" ]
+ then
+ ldd ./bin/libcoreclr.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi
- ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
- fi
+ ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi
- ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
+ ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi
fi
if ! [ -x "$(command -v ldconfig)" ]; then
diff --git a/src/Runner.Common/Constants.cs b/src/Runner.Common/Constants.cs
index 177e3c9..9545981 100644
--- a/src/Runner.Common/Constants.cs
+++ b/src/Runner.Common/Constants.cs
@@ -58,7 +58,8 @@ namespace GitHub.Runner.Common
X86,
X64,
Arm,
- Arm64
+ Arm64,
+ S390x
}
public static class Runner
@@ -81,6 +82,8 @@ namespace GitHub.Runner.Common
public static readonly Architecture PlatformArchitecture = Architecture.Arm;
#elif ARM64
public static readonly Architecture PlatformArchitecture = Architecture.Arm64;
+#elif S390X
+ public static readonly Architecture PlatformArchitecture = Architecture.S390x;
#else
public static readonly Architecture PlatformArchitecture = Architecture.X64;
#endif
diff --git a/src/Runner.Common/Util/VarUtil.cs b/src/Runner.Common/Util/VarUtil.cs
index 97273a1..2a34430 100644
--- a/src/Runner.Common/Util/VarUtil.cs
+++ b/src/Runner.Common/Util/VarUtil.cs
@@ -53,6 +53,8 @@ namespace GitHub.Runner.Common.Util
return "ARM";
case Constants.Architecture.Arm64:
return "ARM64";
+ case Constants.Architecture.S390x:
+ return "S390X";
default:
throw new NotSupportedException(); // Should never reach here.
}
diff --git a/src/Test/L0/ConstantGenerationL0.cs b/src/Test/L0/ConstantGenerationL0.cs
index 2042485..a9d8b46 100644
--- a/src/Test/L0/ConstantGenerationL0.cs
+++ b/src/Test/L0/ConstantGenerationL0.cs
@@ -20,6 +20,7 @@ namespace GitHub.Runner.Common.Tests
"linux-x64",
"linux-arm",
"linux-arm64",
+ "linux-s390x",
"osx-x64",
"osx-arm64"
};
diff --git a/src/Test/L0/Listener/SelfUpdaterL0.cs b/src/Test/L0/Listener/SelfUpdaterL0.cs
index 26ba65e..6791df3 100644
--- a/src/Test/L0/Listener/SelfUpdaterL0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterL0.cs
@@ -1,4 +1,4 @@
-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
using System;
using System.Collections.Generic;
using System.IO;
@@ -16,6 +16,7 @@ using Xunit;
namespace GitHub.Runner.Common.Tests.Listener
{
+#if !S390X // Self-update is not currently supported on S390X
public sealed class SelfUpdaterL0
{
private Mock<IRunnerServer> _runnerServer;
@@ -291,5 +292,6 @@ namespace GitHub.Runner.Common.Tests.Listener
}
}
}
+#endif
}
#endif
diff --git a/src/Test/L0/Listener/SelfUpdaterV2L0.cs b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
index 5115a6b..dd8d198 100644
--- a/src/Test/L0/Listener/SelfUpdaterV2L0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
@@ -1,4 +1,4 @@
-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
using System;
using System.Collections.Generic;
using System.IO;
diff --git a/src/Test/L0/Worker/StepHostL0.cs b/src/Test/L0/Worker/StepHostL0.cs
index f6b5889..26f8e21 100644
--- a/src/Test/L0/Worker/StepHostL0.cs
+++ b/src/Test/L0/Worker/StepHostL0.cs
@@ -31,7 +31,7 @@ namespace GitHub.Runner.Common.Tests.Worker
return hc;
}
-#if OS_LINUX
+#if OS_LINUX && !S390X
[Fact]
[Trait("Level", "L0")]
[Trait("Category", "Worker")]
diff --git a/src/dev.sh b/src/dev.sh
index fa637d1..8c66f37 100755
--- a/src/dev.sh
+++ b/src/dev.sh
@@ -54,6 +54,7 @@ elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
case $CPU_NAME in
armv7l) RUNTIME_ID="linux-arm";;
aarch64) RUNTIME_ID="linux-arm64";;
+ s390x) RUNTIME_ID="linux-s390x";;
esac
fi
elif [[ "$CURRENT_PLATFORM" == 'darwin' ]]; then
@@ -80,7 +81,7 @@ if [[ "$CURRENT_PLATFORM" == 'windows' ]]; then
exit 1
fi
elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
- if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') ]]; then
+ if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') && ("$RUNTIME_ID" != 'linux-s390x') ]]; then
echo "Failed: Can't build $RUNTIME_ID package $CURRENT_PLATFORM" >&2
exit 1
fi
@@ -199,7 +200,8 @@ function package ()
popd > /dev/null
}
-if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet") ]]; then
+if [[ "${RUNTIME_ID}" != "linux-s390x" && ((! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet")) ]]; then
+
# Download dotnet SDK to ../_dotnetsdk directory
heading "Ensure Dotnet SDK"
@@ -224,8 +226,10 @@ if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTN
echo "${DOTNETSDK_VERSION}" > "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}"
fi
-echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
-export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+if [[ -d "${DOTNETSDK_INSTALLDIR}" ]]; then
+ echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
+ export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+fi
heading "Dotnet SDK Version"
dotnet --version
diff --git a/src/dir.proj b/src/dir.proj
index 056a312..8370922 100644
--- a/src/dir.proj
+++ b/src/dir.proj
@@ -41,8 +41,18 @@
</ItemGroup>
<Target Name="Build" DependsOnTargets="GenerateConstant">
- <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" />
- <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);RuntimeIdentifier=$(PackageRuntime);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
+ <PropertyGroup>
+ <!-- Normally we want to publish a self-contained app for $(PackageRuntime) -->
+ <PublishRuntimeIdentifier>RuntimeIdentifier=$(PackageRuntime)</PublishRuntimeIdentifier>
+ <!-- However, on s390x there are no apphost or runtime packages on nuget.org, so self-contained publishing is not supported.
+ Perform a non-self-contained publish using the current runtime identifier (normally something like rhel.8-s390x) instead.
+ In addition, when not using an explicit runtime identifier, the SDK will copy runtime assets from dependent packages;
+ as this would confuse the expected layout, disable that behavior as well. -->
+ <PublishRuntimeIdentifier Condition="'$(PackageRuntime)' == 'linux-s390x'">SelfContained=false;CopyLocalRuntimeTargetAssets=false</PublishRuntimeIdentifier>
+ </PropertyGroup>
+
+ <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" Properties="$(PublishRuntimeIdentifier)" />
+ <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);$(PublishRuntimeIdentifier);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
<Exec Command="%22$(DesktopMSBuild)%22 Runner.Service/Windows/RunnerService.csproj /p:Configuration=$(BUILDCONFIG) /p:PackageRuntime=$(PackageRuntime) /p:OutputPath=%22$(MSBuildProjectDirectory)/../_layout/bin%22" ConsoleToMSBuild="true" Condition="'$(PackageRuntime)' == 'win-x64' Or '$(PackageRuntime)' == 'win-x86' Or '$(PackageRuntime)' == 'win-arm64'" />
</Target>

View File

@ -35,7 +35,6 @@ all: \
chunkset_ssse3.o chunkset_ssse3.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
insert_string_sse42.o insert_string_sse42.lo \
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
slide_hash_avx2.o slide_hash_avx2.lo \
@ -77,12 +76,6 @@ compare256_sse2.o:
compare256_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
insert_string_sse42.o:
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
insert_string_sse42.lo:
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
crc32_pclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
@ -90,10 +83,10 @@ crc32_pclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
crc32_vpclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
crc32_vpclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
slide_hash_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

View File

@ -9,24 +9,15 @@
#ifdef X86_AVX2
#include "../../zbuild.h"
#include "zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "../../adler32_p.h"
#include "adler32_p.h"
#include "adler32_avx2_p.h"
#include "x86_intrins.h"
#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
@ -44,9 +35,9 @@ rem_peel:
}
} else if (len < 32) {
if (COPY) {
return copy_sub32(adler, dst, src, len);
return adler32_fold_copy_sse42(adler, dst, src, len);
} else {
return sub32(adler, src, len);
return adler32_ssse3(adler, src, len);
}
}

View File

@ -8,10 +8,9 @@
#ifdef X86_AVX512
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "../../cpu_features.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
@ -33,13 +32,7 @@ rem_peel:
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
}
#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}
__m512i vbuf, vs1_0, vs3;

View File

@ -9,11 +9,10 @@
#ifdef X86_AVX512VNNI
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
@ -28,20 +27,10 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size
rem_peel:
if (len < 32)
#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
if (len < 64)
#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
@ -135,11 +124,7 @@ rem_peel_copy:
__m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
_mm256_mask_storeu_epi8(dst, storemask, copy_vec);
#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}
const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,

View File

@ -6,9 +6,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"
#include <immintrin.h>

View File

@ -6,8 +6,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"
#ifdef X86_SSSE3

View File

@ -4,10 +4,7 @@
#include "zbuild.h"
/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
* code size by sharing the chunkcopy functions, which will certainly compile
* to identical machine code */
#if defined(X86_SSSE3) && defined(X86_SSE2)
#if defined(X86_SSSE3)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
@ -19,8 +16,6 @@ typedef __m128i chunk_t;
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL
static const lut_rem_pair perm_idx_lut[13] = {
{0, 1}, /* 3 */
@ -83,14 +78,11 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
return ret_vec;
}
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
#define CHUNKSIZE chunksize_ssse3
#define CHUNKMEMSET chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#define CHUNKCOPY chunkcopy_ssse3
#define CHUNKUNROLL chunkunroll_ssse3
#include "chunkset_tpl.h"

View File

@ -3,8 +3,9 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)

View File

@ -3,8 +3,9 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

View File

@ -26,27 +26,26 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i xmm_crc_part = _mm_setzero_si128();
#ifdef COPY
char ALIGNED_(16) partial_buf[16] = { 0 };
#else
#ifndef COPY
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
int32_t first = init_crc != 0;
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
* bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
* by definition can be up to 15 bytes + one full vector load. */
assert(len >= 31 || first == 0);
/* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
* for the aligning load that occurs. If there's an initial CRC, to carry it forward through
* the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
* up to 15 bytes + one full vector load. */
assert(len >= 16 || first == 0);
#endif
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
if (len < 16) {
#ifdef COPY
if (len == 0)
return;
memcpy(partial_buf, src, len);
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
#ifdef COPY
memcpy(dst, partial_buf, len);
#endif
goto partial;
@ -63,9 +62,23 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
if (algn_diff < 4 && init_crc != 0) {
xmm_t0 = xmm_crc_part;
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
if (len >= 32) {
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
} else {
memcpy(partial_buf, src + 16, len - 16);
xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
src += 16;
len -= 16;
#ifdef COPY
dst -= algn_diff;
#endif
goto partial;
}
src += 16;
len -= 16;
}

View File

@ -17,7 +17,7 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include <immintrin.h>
#include <wmmintrin.h>
@ -26,8 +26,9 @@
# include <immintrin.h>
#endif
#include "../../crc32_fold.h"
#include "../../crc32_braid_p.h"
#include "crc32.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "x86_intrins.h"
#include <assert.h>
@ -350,11 +351,22 @@ Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
return crc->value;
}
static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) {
uint32_t c = (~crc) & 0xffffffff;
while (len) {
len--;
DO1;
}
return c ^ 0xffffffff;
}
Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
/* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
* these short lengths might also prove to be effective */
if (len < 64)
return PREFIX(crc32_braid)(crc32, buf, len);
/* For lens smaller than ~12, crc32_small method is faster.
* But there are also minimum requirements for the pclmul functions due to alignment */
if (len < 16)
return crc32_small(crc32, buf, len);
crc32_fold ALIGNED_(16) crc_state;
CRC32_FOLD_RESET(&crc_state);

View File

@ -3,7 +3,7 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
#ifdef X86_VPCLMULQDQ_CRC
#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy

View File

@ -1,24 +0,0 @@
/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifdef X86_SSE42
#include "../../zbuild.h"
#include <nmmintrin.h>
#include "../../deflate.h"
#define HASH_CALC(s, h, val)\
h = _mm_crc32_u32(h, val)
#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0
#define UPDATE_HASH update_hash_sse42
#define INSERT_STRING insert_string_sse42
#define QUICK_INSERT_STRING quick_insert_string_sse42
#include "../../insert_string_tpl.h"
#endif

View File

@ -9,8 +9,8 @@
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
#include <immintrin.h>

View File

@ -8,8 +8,8 @@
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
#include <immintrin.h>
#include <assert.h>

View File

@ -7,7 +7,7 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include "x86_features.h"
#ifdef _MSC_VER
@ -15,6 +15,13 @@
#else
// Newer versions of GCC and clang come with cpuid.h
# include <cpuid.h>
# ifdef X86_HAVE_XSAVE_INTRIN
# if __GNUC__ == 8
# include <xsaveintrin.h>
# else
# include <immintrin.h>
# endif
# endif
#endif
#include <string.h>
@ -29,6 +36,7 @@ static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx,
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}
@ -43,12 +51,13 @@ static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx,
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}
static inline uint64_t xgetbv(unsigned int xcr) {
#ifdef _MSC_VER
#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
return _xgetbv(xcr);
#else
uint32_t eax, edx;
@ -90,7 +99,16 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
// check AVX512 bits if the OS supports saving ZMM registers
if (features->has_os_save_zmm) {
features->has_avx512 = ebx & 0x00010000;
features->has_avx512f = ebx & 0x00010000;
if (features->has_avx512f) {
// According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
// AVX512(DQ,BW,VL).
features->has_avx512dq = ebx & 0x00020000;
features->has_avx512bw = ebx & 0x40000000;
features->has_avx512vl = ebx & 0x80000000;
}
features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
&& features->has_avx512vl;
features->has_avx512vnni = ecx & 0x800;
}
}

View File

@ -1,14 +1,18 @@
/* x86_features.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_
struct x86_cpu_features {
int has_avx2;
int has_avx512;
int has_avx512f;
int has_avx512dq;
int has_avx512bw;
int has_avx512vl;
int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
int has_avx512vnni;
int has_sse2;
int has_ssse3;
@ -21,4 +25,4 @@ struct x86_cpu_features {
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
#endif /* CPU_H_ */
#endif /* X86_FEATURES_H_ */

View File

@ -0,0 +1,172 @@
/* x86_functions.h -- x86 implementations for arch-specific functions.
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef X86_FUNCTIONS_H_
#define X86_FUNCTIONS_H_
#ifdef X86_SSE2
uint32_t chunksize_sse2(void);
uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
void slide_hash_sse2(deflate_state *s);
# endif
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX2
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx2(void);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
void slide_hash_avx2(deflate_state *s);
# endif
void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_PCLMULQDQ_CRC
uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef X86_VPCLMULQDQ_CRC
uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// X86 - SSE2
# if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_sse2
# undef native_chunksize
# define native_chunksize chunksize_sse2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_sse2
# undef native_slide_hash
# define native_slide_hash slide_hash_sse2
# ifdef HAVE_BUILTIN_CTZ
# undef native_compare256
# define native_compare256 compare256_sse2
# undef native_longest_match
# define native_longest_match longest_match_sse2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_sse2
# endif
#endif
// X86 - SSSE3
# if defined(X86_SSSE3) && defined(__SSSE3__)
# undef native_adler32
# define native_adler32 adler32_ssse3
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_ssse3
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_ssse3
# endif
// X86 - SSE4.2
# if defined(X86_SSE42) && defined(__SSE4_2__)
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_sse42
# endif
// X86 - PCLMUL
#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__)
# undef native_crc32
# define native_crc32 crc32_pclmulqdq
# undef native_crc32_fold
# define native_crc32_fold crc32_fold_pclmulqdq
# undef native_crc32_fold_copy
# define native_crc32_fold_copy crc32_fold_pclmulqdq_copy
# undef native_crc32_fold_final
# define native_crc32_fold_final crc32_fold_pclmulqdq_final
# undef native_crc32_fold_reset
# define native_crc32_fold_reset crc32_fold_pclmulqdq_reset
#endif
// X86 - AVX
# if defined(X86_AVX2) && defined(__AVX2__)
# undef native_adler32
# define native_adler32 adler32_avx2
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx2
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx2
# undef native_chunksize
# define native_chunksize chunksize_avx2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx2
# undef native_slide_hash
# define native_slide_hash slide_hash_avx2
# ifdef HAVE_BUILTIN_CTZ
# undef native_compare256
# define native_compare256 compare256_avx2
# undef native_longest_match
# define native_longest_match longest_match_avx2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_avx2
# endif
# endif
// X86 - AVX512 (F,DQ,BW,Vl)
# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
# undef native_adler32
# define native_adler32 adler32_avx512
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx512
// X86 - AVX512 (VNNI)
# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
# undef native_adler32
# define native_adler32 adler32_avx512_vnni
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx512_vnni
# endif
// X86 - VPCLMULQDQ
# if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
# undef native_crc32
# define native_crc32 crc32_vpclmulqdq
# undef native_crc32_fold
# define native_crc32_fold crc32_fold_vpclmulqdq
# undef native_crc32_fold_copy
# define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy
# undef native_crc32_fold_final
# define native_crc32_fold_final crc32_fold_vpclmulqdq_final
# undef native_crc32_fold_reset
# define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset
# endif
# endif
#endif
#endif /* X86_FUNCTIONS_H_ */

View File

@ -7,7 +7,7 @@
#ifdef __AVX2__
#include <immintrin.h>
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
|| (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
__m128i r;
@ -29,7 +29,7 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
/* GCC <9 is missing some AVX512 intrinsics.
*/
#ifdef __AVX512F__
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>
#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \

29
3rdparty/zlib-ng/arch_functions.h vendored Normal file
View File

@ -0,0 +1,29 @@
/* arch_functions.h -- Arch-specific function prototypes.
* Copyright (C) 2017 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CPU_FUNCTIONS_H_
#define CPU_FUNCTIONS_H_
#include "zbuild.h"
#include "zutil.h"
#include "crc32.h"
#include "deflate.h"
#include "fallback_builtins.h"
#include "arch/generic/generic_functions.h"
#if defined(X86_FEATURES)
# include "arch/x86/x86_functions.h"
#elif defined(ARM_FEATURES)
# include "arch/arm/arm_functions.h"
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
# include "arch/power/power_functions.h"
#elif defined(S390_FEATURES)
# include "arch/s390/s390_functions.h"
#elif defined(RISCV_FEATURES)
# include "arch/riscv/riscv_functions.h"
#endif
#endif

View File

@ -5,7 +5,7 @@
#include "zbuild.h"
#include <stdlib.h>
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif
@ -25,7 +25,7 @@ Z_INTERNAL uint32_t CHUNKSIZE(void) {
without iteration, which will hopefully make the branch prediction more
reliable. */
#ifndef HAVE_CHUNKCOPY
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
chunk_t chunk;
int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
@ -54,7 +54,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
least 258 bytes of output space available (258 being the maximum length
output from a single token; see inflate_fast()'s assumptions below). */
#ifndef HAVE_CHUNKUNROLL
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
unsigned char const *from = out - *dist;
chunk_t chunk;
while (*dist < *len && *dist < sizeof(chunk_t)) {
@ -98,7 +98,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
/* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
if (len <= 16) {
return chunkmemset_ssse3(out, dist, len);
}

115
3rdparty/zlib-ng/cmake/detect-arch.c vendored Normal file
View File

@ -0,0 +1,115 @@
// archdetect.c -- Detect compiler architecture and raise preprocessor error
// containing a simple arch identifier.
// Copyright (C) 2019 Hans Kristian Rosbach
// Licensed under the Zlib license, see LICENSE.md for details
// x86_64
#if defined(__x86_64__) || defined(_M_X64)
#error archfound x86_64
// x86
#elif defined(__i386) || defined(_M_IX86)
#error archfound i686
// ARM
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#error archfound aarch64
#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__)
#error archfound armv8
#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
#error archfound armv7
#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__)
#error archfound armv6
#elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
#error archfound armv5
#elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__)
#error archfound armv4
#elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__)
#error archfound armv3
#elif defined(__ARM_ARCH_2__)
#error archfound armv2
#endif
// PowerPC
#elif defined(__powerpc__) || defined(_ppc__) || defined(__PPC__)
#if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#error archfound powerpc64le
#else
#error archfound powerpc64
#endif
#else
#error archfound powerpc
#endif
// --------------- Less common architectures alphabetically below ---------------
// ALPHA
#elif defined(__alpha__) || defined(__alpha)
#error archfound alpha
// Blackfin
#elif defined(__BFIN__)
#error archfound blackfin
// Itanium
#elif defined(__ia64) || defined(_M_IA64)
#error archfound ia64
// MIPS
#elif defined(__mips__) || defined(__mips)
#error archfound mips
// Motorola 68000-series
#elif defined(__m68k__)
#error archfound m68k
// SuperH
#elif defined(__sh__)
#error archfound sh
// SPARC
#elif defined(__sparc__) || defined(__sparc)
#if defined(__sparcv9) || defined(__sparc_v9__)
#error archfound sparc9
#elif defined(__sparcv8) || defined(__sparc_v8__)
#error archfound sparc8
#endif
// SystemZ
#elif defined(__370__)
#error archfound s370
#elif defined(__s390__)
#error archfound s390
#elif defined(__s390x) || defined(__zarch__)
#error archfound s390x
// PARISC
#elif defined(__hppa__)
#error archfound parisc
// RS-6000
#elif defined(__THW_RS6000)
#error archfound rs6000
// RISC-V
#elif defined(__riscv)
#if __riscv_xlen == 64
#error archfound riscv64
#elif __riscv_xlen == 32
#error archfound riscv32
#endif
// LOONGARCH
#elif defined(__loongarch_lp64)
#error archfound loongarch64
// Emscripten (WebAssembly)
#elif defined(__EMSCRIPTEN__)
#error archfound wasm32
// return 'unrecognized' if we do not know what architecture this is
#else
#error archfound unrecognized
#endif

Some files were not shown because too many files have changed in this diff Show More