mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 14:13:15 +08:00
Merge pull request #24782 from FantasqueX:4.x-zlib-ng
Add zlib-ng as an alternative zlib implementation
This commit is contained in:
commit
6bf758ecc4
8
3rdparty/readme.txt
vendored
8
3rdparty/readme.txt
vendored
@ -49,6 +49,14 @@ zlib General purpose LZ77 compression library
|
||||
Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler.
|
||||
See zlib home page http://www.zlib.net
|
||||
for details and links to the source code
|
||||
|
||||
zlib-ng zlib data compression library for the next generation systems
|
||||
(C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
See zlib-ng official GitHub repository
|
||||
https://github.com/zlib-ng/zlib-ng.git
|
||||
for details and links to source code
|
||||
|
||||
WITH_ZLIB_NG CMake option must be ON to use zlib-ng as the zlib implementation.
|
||||
------------------------------------------------------------------------------------
|
||||
jasper JasPer is a collection of software
|
||||
(i.e., a library and application programs) for the coding
|
||||
|
796
3rdparty/zlib-ng/CMakeLists.txt
vendored
Normal file
796
3rdparty/zlib-ng/CMakeLists.txt
vendored
Normal file
@ -0,0 +1,796 @@
|
||||
project(${ZLIB_LIBRARY} LANGUAGES C)
|
||||
|
||||
if("c_std_11" IN_LIST CMAKE_C_COMPILE_FEATURES)
|
||||
set(CMAKE_C_STANDARD 11) # The C standard whose features are requested to build this target
|
||||
else()
|
||||
set(CMAKE_C_STANDARD 99)
|
||||
endif()
|
||||
set(CMAKE_C_STANDARD_REQUIRED ON) # Boolean describing whether the value of C_STANDARD is a requirement
|
||||
set(CMAKE_C_EXTENSIONS OFF) # Boolean specifying whether compiler specific extensions are requested
|
||||
|
||||
include(CheckTypeSize)
|
||||
include(CheckSymbolExists)
|
||||
include(CheckFunctionExists)
|
||||
include(CheckIncludeFile)
|
||||
include(CheckCSourceCompiles)
|
||||
include(CheckCSourceRuns)
|
||||
include(CheckCCompilerFlag)
|
||||
include(CMakeDependentOption)
|
||||
|
||||
# Translate the per-architecture switches (X86/X86_64, ARM/AARCH64,
# PPC64/PPC64LE, RISCV) into the BASEARCH_*_FOUND flags tested by the rest of
# this file. Those switches are not set in this file; they are expected to be
# provided by the including project.
if(X86 OR X86_64)
    set(BASEARCH_X86_FOUND TRUE)
endif()

if(ARM OR AARCH64)
    set(BASEARCH_ARM_FOUND TRUE)
endif()

if(PPC64 OR PPC64LE)
    set(BASEARCH_PPC_FOUND TRUE)
endif()

if(RISCV)
    set(BASEARCH_RISCV_FOUND TRUE)
endif()
|
||||
|
||||
include(cmake/detect-intrinsics.cmake)
|
||||
include(cmake/fallback-macros.cmake)
|
||||
|
||||
set(ZLIB_SYMBOL_PREFIX "")
|
||||
|
||||
# Switch on every optional SIMD code path of the detected base architecture.
# The compiler probes further down in this file set a WITH_* flag back to OFF
# when the matching intrinsics turn out to be unavailable.
if(BASEARCH_X86_FOUND)
    foreach(_x86_feature AVX2 AVX512 AVX512VNNI SSE2 SSSE3 SSE42 PCLMULQDQ VPCLMULQDQ)
        set(WITH_${_x86_feature} ON)
    endforeach()
endif()

if(BASEARCH_ARM_FOUND)
    set(WITH_ACLE ON)
    set(WITH_NEON ON)
    # The ARMv6 path is only enabled when ARM is set; it stays off otherwise.
    set(WITH_ARMV6 OFF)
    if(ARM)
        set(WITH_ARMV6 ON)
    endif()
endif()

if(BASEARCH_PPC_FOUND)
    foreach(_ppc_feature ALTIVEC POWER8 POWER9)
        set(WITH_${_ppc_feature} ON)
    endforeach()
endif()

if(BASEARCH_RISCV_FOUND)
    set(WITH_RVV ON)
endif()
|
||||
|
||||
|
||||
add_definitions(-DZLIB_COMPAT)
|
||||
|
||||
add_definitions(-DWITH_GZFILEOP)
|
||||
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "^Intel")
|
||||
set(WARNFLAGS_DISABLE)
|
||||
elseif(MSVC)
|
||||
# Minimum supported MSVC version is 1800 = Visual Studio 12.0/2013
|
||||
# See also https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html
|
||||
if(MSVC_VERSION VERSION_LESS 1800)
|
||||
message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).")
|
||||
endif()
|
||||
# TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination
|
||||
# (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should
|
||||
# avoid mistakes.
|
||||
# /Oi ?
|
||||
set(WARNFLAGS_DISABLE)
|
||||
if(BASEARCH_ARM_FOUND)
|
||||
add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE)
|
||||
if(NOT "${ARCH}" MATCHES "aarch64")
|
||||
set(NEONFLAG "/arch:VFPv4")
|
||||
endif()
|
||||
endif()
|
||||
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
|
||||
set(WARNFLAGS_DISABLE)
|
||||
# Check whether -fno-lto is available
|
||||
set(CMAKE_REQUIRED_FLAGS "-fno-lto")
|
||||
check_c_source_compiles(
|
||||
"int main() { return 0; }"
|
||||
FNO_LTO_AVAILABLE FAIL_REGEX "not supported")
|
||||
set(CMAKE_REQUIRED_FLAGS)
|
||||
if(FNO_LTO_AVAILABLE)
|
||||
set(ZNOLTOFLAG "-fno-lto")
|
||||
endif()
|
||||
if(BASEARCH_ARM_FOUND)
|
||||
if(ARM AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi")
|
||||
# Auto-detect support for ARM floating point ABI
|
||||
check_include_file(features.h HAVE_FEATURES_H)
|
||||
if(HAVE_FEATURES_H)
|
||||
set(CMAKE_REQUIRED_FLAGS -mfloat-abi=softfp)
|
||||
check_c_source_compiles(
|
||||
"#include <features.h>
|
||||
int main() { return 0; }"
|
||||
HAVE_FLOATABI_SOFTFP)
|
||||
if(HAVE_FLOATABI_SOFTFP)
|
||||
set(FLOATABI -mfloat-abi=softfp)
|
||||
else()
|
||||
set(CMAKE_REQUIRED_FLAGS -mfloat-abi=hard)
|
||||
check_c_source_compiles(
|
||||
"#include <features.h>
|
||||
int main() { return 0; }"
|
||||
HAVE_FLOATABI_HARD)
|
||||
if(HAVE_FLOATABI_HARD)
|
||||
set(FLOATABI -mfloat-abi=hard)
|
||||
endif()
|
||||
endif()
|
||||
set(CMAKE_REQUIRED_FLAGS)
|
||||
endif()
|
||||
if(FLOATABI)
|
||||
message(STATUS "${ZLIB_LIBRARY} ARM floating point arch: ${FLOATABI}")
|
||||
add_compile_options(${FLOATABI})
|
||||
else()
|
||||
message(STATUS "${ZLIB_LIBRARY} ARM floating point arch not auto-detected")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if(FNO_LTO_AVAILABLE)
|
||||
set(NOLTOFLAG ${ZNOLTOFLAG})
|
||||
endif()
|
||||
if(MINGW)
|
||||
# Add `-Wno-pedantic-ms-format` only if the toolchain supports it
|
||||
check_c_compiler_flag(-Wno-pedantic-ms-format HAVE_NO_PEDANTIC_MS_FORMAT)
|
||||
if(HAVE_NO_PEDANTIC_MS_FORMAT)
|
||||
list(APPEND WARNFLAGS_DISABLE -Wno-pedantic-ms-format)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Force disable LTO
|
||||
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF)
|
||||
|
||||
# Apply warning compiler flags
|
||||
add_compile_options(${WARNFLAGS_DISABLE})
|
||||
|
||||
# Replace optimization level 3 added by default with level 2
|
||||
if(NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3")
|
||||
string(REGEX REPLACE "([\\/\\-]O)3" "\\12"
|
||||
CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Check for standard/system includes
|
||||
#
|
||||
check_include_file(arm_acle.h HAVE_ARM_ACLE_H)
|
||||
if(HAVE_ARM_ACLE_H)
|
||||
add_definitions(-DHAVE_ARM_ACLE_H)
|
||||
endif()
|
||||
check_include_file(sys/auxv.h HAVE_SYS_AUXV_H)
|
||||
if(HAVE_SYS_AUXV_H)
|
||||
add_definitions(-DHAVE_SYS_AUXV_H)
|
||||
endif()
|
||||
check_include_file(sys/sdt.h HAVE_SYS_SDT_H)
|
||||
if(HAVE_SYS_SDT_H)
|
||||
add_definitions(-DHAVE_SYS_SDT_H)
|
||||
endif()
|
||||
check_include_file(unistd.h HAVE_UNISTD_H)
|
||||
|
||||
#
|
||||
# Check to see if we have large file support
|
||||
#
|
||||
set(CMAKE_REQUIRED_DEFINITIONS -D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64)
|
||||
check_type_size(off64_t OFF64_T)
|
||||
if(HAVE_OFF64_T)
|
||||
add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64)
|
||||
else()
|
||||
check_type_size(_off64_t _OFF64_T)
|
||||
if(HAVE__OFF64_T)
|
||||
add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64)
|
||||
else()
|
||||
check_type_size(__off64_t __OFF64_T)
|
||||
endif()
|
||||
endif()
|
||||
set(CMAKE_REQUIRED_DEFINITIONS) # clear variable
|
||||
|
||||
#
|
||||
# Check for fseeko and other optional functions
|
||||
#
|
||||
check_function_exists(fseeko HAVE_FSEEKO)
|
||||
if(NOT HAVE_FSEEKO)
|
||||
add_definitions(-DNO_FSEEKO)
|
||||
endif()
|
||||
|
||||
check_function_exists(strerror HAVE_STRERROR)
|
||||
if(NOT HAVE_STRERROR)
|
||||
add_definitions(-DNO_STRERROR)
|
||||
endif()
|
||||
|
||||
set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112L)
|
||||
check_symbol_exists(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
|
||||
if(HAVE_POSIX_MEMALIGN)
|
||||
add_definitions(-DHAVE_POSIX_MEMALIGN)
|
||||
endif()
|
||||
set(CMAKE_REQUIRED_DEFINITIONS)
|
||||
|
||||
set(CMAKE_REQUIRED_DEFINITIONS -D_ISOC11_SOURCE=1)
|
||||
check_symbol_exists(aligned_alloc stdlib.h HAVE_ALIGNED_ALLOC)
|
||||
if(HAVE_ALIGNED_ALLOC)
|
||||
add_definitions(-DHAVE_ALIGNED_ALLOC)
|
||||
endif()
|
||||
set(CMAKE_REQUIRED_DEFINITIONS)
|
||||
|
||||
#
|
||||
# Check if we can hide zlib internal symbols that are linked between separate source files using hidden
|
||||
#
|
||||
check_c_source_compiles(
|
||||
"#define Z_INTERNAL __attribute__((visibility (\"hidden\")))
|
||||
int Z_INTERNAL foo;
|
||||
int main() {
|
||||
return 0;
|
||||
}"
|
||||
HAVE_ATTRIBUTE_VISIBILITY_HIDDEN FAIL_REGEX "visibility")
|
||||
if(HAVE_ATTRIBUTE_VISIBILITY_HIDDEN)
|
||||
add_definitions(-DHAVE_VISIBILITY_HIDDEN)
|
||||
endif()
|
||||
|
||||
#
|
||||
# Check if we can hide zlib internal symbols that are linked between separate source files using internal
|
||||
#
|
||||
check_c_source_compiles(
|
||||
"#define Z_INTERNAL __attribute__((visibility (\"internal\")))
|
||||
int Z_INTERNAL foo;
|
||||
int main() {
|
||||
return 0;
|
||||
}"
|
||||
HAVE_ATTRIBUTE_VISIBILITY_INTERNAL FAIL_REGEX "visibility")
|
||||
if(HAVE_ATTRIBUTE_VISIBILITY_INTERNAL)
|
||||
add_definitions(-DHAVE_VISIBILITY_INTERNAL)
|
||||
endif()
|
||||
|
||||
#
|
||||
# Check for __attribute__((aligned(x))) support in the compiler
|
||||
#
|
||||
check_c_source_compiles(
|
||||
"int main(void) {
|
||||
__attribute__((aligned(8))) int test = 0;
|
||||
(void)test;
|
||||
return 0;
|
||||
}"
|
||||
HAVE_ATTRIBUTE_ALIGNED FAIL_REGEX "aligned")
|
||||
if(HAVE_ATTRIBUTE_ALIGNED)
|
||||
add_definitions(-DHAVE_ATTRIBUTE_ALIGNED)
|
||||
endif()
|
||||
|
||||
#
|
||||
# check for __builtin_ctz() support in the compiler
|
||||
#
|
||||
check_c_source_compiles(
|
||||
"int main(void) {
|
||||
unsigned int zero = 0;
|
||||
long test = __builtin_ctz(zero);
|
||||
(void)test;
|
||||
return 0;
|
||||
}"
|
||||
HAVE_BUILTIN_CTZ
|
||||
)
|
||||
if(HAVE_BUILTIN_CTZ)
|
||||
add_definitions(-DHAVE_BUILTIN_CTZ)
|
||||
endif()
|
||||
|
||||
#
|
||||
# check for __builtin_ctzll() support in the compiler
|
||||
#
|
||||
check_c_source_compiles(
|
||||
"int main(void) {
|
||||
unsigned int zero = 0;
|
||||
long test = __builtin_ctzll(zero);
|
||||
(void)test;
|
||||
return 0;
|
||||
}"
|
||||
HAVE_BUILTIN_CTZLL
|
||||
)
|
||||
if(HAVE_BUILTIN_CTZLL)
|
||||
add_definitions(-DHAVE_BUILTIN_CTZLL)
|
||||
endif()
|
||||
|
||||
#
|
||||
# check for ptrdiff_t support
|
||||
#
|
||||
check_c_source_compiles(
|
||||
"#include <stddef.h>
|
||||
int main() {
|
||||
ptrdiff_t *a;
|
||||
(void)a;
|
||||
return 0;
|
||||
}"
|
||||
HAVE_PTRDIFF_T
|
||||
)
|
||||
# <stddef.h> did not provide ptrdiff_t: record that a replacement typedef is
# needed (NEED_PTRDIFF_T) and choose a fixed-width type, PTRDIFF_TYPE, whose
# size equals the size of a data pointer.
if(NOT HAVE_PTRDIFF_T)
    set(NEED_PTRDIFF_T 1)

    check_type_size("void *" SIZEOF_DATA_PTR)
    message(STATUS "sizeof(void *) is ${SIZEOF_DATA_PTR} bytes")

    # Compare the complete, quoted value. An unquoted expansion leaves the
    # condition without a left operand when the probe yields an empty result,
    # and MATCHES performs a regex search, so "4" would also accept values
    # such as "14" or "64".
    if("${SIZEOF_DATA_PTR}" STREQUAL "4")
        set(PTRDIFF_TYPE "uint32_t")
    elseif("${SIZEOF_DATA_PTR}" STREQUAL "8")
        set(PTRDIFF_TYPE "uint64_t")
    else()
        message(FATAL_ERROR "sizeof(void *) is neither 32 nor 64 bit")
    endif()
endif()
|
||||
|
||||
if(MSVC)
|
||||
add_definitions(-D_CRT_SECURE_NO_DEPRECATE)
|
||||
add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE)
|
||||
endif()
|
||||
|
||||
set(ZLIB_ARCH_SRCS)
|
||||
set(ZLIB_ARCH_HDRS)
|
||||
set(ARCHDIR "arch/generic")
|
||||
if(BASEARCH_X86_FOUND)
|
||||
set(ARCHDIR "arch/x86")
|
||||
endif()
|
||||
if(BASEARCH_ARM_FOUND)
|
||||
set(ARCHDIR "arch/arm")
|
||||
endif()
|
||||
if(BASEARCH_PPC_FOUND)
|
||||
set(ARCHDIR "arch/power")
|
||||
endif()
|
||||
if(BASEARCH_RISCV_FOUND)
|
||||
set(ARCHDIR "arch/riscv")
|
||||
endif()
|
||||
|
||||
if(NOT CV_DISABLE_OPTIMIZATION)
|
||||
if(BASEARCH_ARM_FOUND)
|
||||
add_definitions(-DARM_FEATURES)
|
||||
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
|
||||
if("${ARCH}" MATCHES "aarch64")
|
||||
check_c_source_compiles(
|
||||
"#include <sys/auxv.h>
|
||||
int main() {
|
||||
return (getauxval(AT_HWCAP) & HWCAP_CRC32);
|
||||
}"
|
||||
ARM_AUXV_HAS_CRC32
|
||||
)
|
||||
if(ARM_AUXV_HAS_CRC32)
|
||||
add_definitions(-DARM_AUXV_HAS_CRC32)
|
||||
else()
|
||||
message(STATUS "HWCAP_CRC32 not present in sys/auxv.h; cannot detect support at runtime.")
|
||||
endif()
|
||||
else()
|
||||
check_c_source_compiles(
|
||||
"#include <sys/auxv.h>
|
||||
int main() {
|
||||
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32);
|
||||
}"
|
||||
ARM_AUXV_HAS_CRC32
|
||||
)
|
||||
if(ARM_AUXV_HAS_CRC32)
|
||||
add_definitions(-DARM_AUXV_HAS_CRC32)
|
||||
else()
|
||||
check_c_source_compiles(
|
||||
"#include <sys/auxv.h>
|
||||
#include <asm/hwcap.h>
|
||||
int main() {
|
||||
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32);
|
||||
}"
|
||||
ARM_HWCAP_HAS_CRC32
|
||||
)
|
||||
if(ARM_HWCAP_HAS_CRC32)
|
||||
add_definitions(-DARM_AUXV_HAS_CRC32 -DARM_ASM_HWCAP)
|
||||
else()
|
||||
message(STATUS "HWCAP2_CRC32 not present in sys/auxv.h; cannot detect support at runtime.")
|
||||
endif()
|
||||
endif()
|
||||
# Find out whether <sys/auxv.h> exposes a NEON capability bit that can be
# tested with getauxval(AT_HWCAP). Two spellings are probed in turn:
# HWCAP_ARM_NEON first, HWCAP_NEON as the fallback. Either one results in
# -DARM_AUXV_HAS_NEON.
check_c_source_compiles(
    "#include <sys/auxv.h>
    int main() {
        return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON);
    }"
    ARM_AUXV_HAS_NEON
)
if(ARM_AUXV_HAS_NEON)
    add_definitions(-DARM_AUXV_HAS_NEON)
else()
    # The fallback probe must use its own result variable:
    # check_c_source_compiles() stores a failed result in the cache and skips
    # any later check that names an already-defined variable, so reusing
    # ARM_AUXV_HAS_NEON here would mean this second probe never runs.
    check_c_source_compiles(
        "#include <sys/auxv.h>
        int main() {
            return (getauxval(AT_HWCAP) & HWCAP_NEON);
        }"
        ARM_AUXV_HAS_HWCAP_NEON
    )
    if(ARM_AUXV_HAS_HWCAP_NEON)
        add_definitions(-DARM_AUXV_HAS_NEON)
    else()
        message(STATUS "Neither HWCAP_ARM_NEON nor HWCAP_NEON present in sys/auxv.h; cannot detect support at runtime.")
    endif()
endif()
|
||||
endif()
|
||||
endif()
|
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_features.h)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/arm_features.c)
|
||||
if(WITH_ACLE)
|
||||
check_acle_compiler_flag()
|
||||
if(HAVE_ACLE_FLAG)
|
||||
add_definitions(-DARM_ACLE)
|
||||
set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
|
||||
set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}")
|
||||
list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS})
|
||||
else()
|
||||
set(WITH_ACLE OFF)
|
||||
endif()
|
||||
else()
|
||||
set(WITH_ACLE OFF)
|
||||
endif()
|
||||
if(WITH_NEON)
|
||||
check_neon_compiler_flag()
|
||||
if(NEON_AVAILABLE)
|
||||
add_definitions(-DARM_NEON)
|
||||
set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c
|
||||
${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS})
|
||||
set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}")
|
||||
if(MSVC)
|
||||
add_definitions(-D__ARM_NEON__)
|
||||
endif()
|
||||
check_neon_ld4_intrinsics()
|
||||
if(NEON_HAS_LD4)
|
||||
add_definitions(-DARM_NEON_HASLD4)
|
||||
endif()
|
||||
else()
|
||||
set(WITH_NEON OFF)
|
||||
endif()
|
||||
endif()
|
||||
if(WITH_ARMV6)
|
||||
check_armv6_compiler_flag()
|
||||
if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN)
|
||||
add_definitions(-DARM_SIMD)
|
||||
set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c)
|
||||
set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}")
|
||||
list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS})
|
||||
if(HAVE_ARMV6_INTRIN)
|
||||
add_definitions(-DARM_SIMD_INTRIN)
|
||||
endif()
|
||||
else()
|
||||
set(WITH_ARMV6 OFF)
|
||||
endif()
|
||||
else()
|
||||
set(WITH_ARMV6 OFF)
|
||||
endif()
|
||||
endif()
|
||||
if(BASEARCH_PPC_FOUND)
|
||||
# Common arch detection code
|
||||
if(WITH_ALTIVEC)
|
||||
check_ppc_intrinsics()
|
||||
endif()
|
||||
if(WITH_POWER8)
|
||||
check_power8_intrinsics()
|
||||
endif()
|
||||
if(WITH_POWER9)
|
||||
check_power9_intrinsics()
|
||||
endif()
|
||||
if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN)
|
||||
add_definitions(-DPOWER_FEATURES)
|
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c)
|
||||
endif()
|
||||
# VMX specific options and files
|
||||
if(WITH_ALTIVEC)
|
||||
if(HAVE_VMX)
|
||||
add_definitions(-DPPC_FEATURES)
|
||||
if(HAVE_ALTIVEC)
|
||||
add_definitions(-DPPC_VMX)
|
||||
set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS})
|
||||
set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}")
|
||||
else()
|
||||
set(WITH_ALTIVEC OFF)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
# Power8 specific options and files
|
||||
if(WITH_POWER8)
|
||||
if(HAVE_POWER8_INTRIN)
|
||||
add_definitions(-DPOWER8_VSX)
|
||||
set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c)
|
||||
if("${ARCH}" MATCHES "powerpc64(le)?")
|
||||
add_definitions(-DPOWER8_VSX_CRC32)
|
||||
list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c)
|
||||
endif()
|
||||
list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
|
||||
set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
|
||||
else()
|
||||
set(WITH_POWER8 OFF)
|
||||
endif()
|
||||
endif()
|
||||
# Power9 specific options and files
|
||||
if(WITH_POWER9)
|
||||
if(HAVE_POWER9_INTRIN)
|
||||
add_definitions(-DPOWER9)
|
||||
set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS})
|
||||
set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}")
|
||||
else()
|
||||
set(WITH_POWER9 OFF)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if(BASEARCH_RISCV_FOUND)
|
||||
if(WITH_RVV)
|
||||
check_rvv_intrinsics()
|
||||
if(HAVE_RVV_INTRIN)
|
||||
add_definitions(-DRISCV_FEATURES)
|
||||
add_definitions(-DRISCV_RVV)
|
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c)
|
||||
# FIXME: we will not set compile flags for riscv_features.c when
|
||||
# the kernels update hwcap or hwprobe for riscv
|
||||
set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS})
|
||||
set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}")
|
||||
else()
|
||||
set(WITH_RVV OFF)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if(BASEARCH_X86_FOUND)
|
||||
add_definitions(-DX86_FEATURES)
|
||||
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c)
|
||||
if(MSVC)
|
||||
list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
|
||||
endif()
|
||||
if(WITH_AVX2)
|
||||
check_avx2_intrinsics()
|
||||
if(HAVE_AVX2_INTRIN)
|
||||
add_definitions(-DX86_AVX2)
|
||||
set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c)
|
||||
list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c)
|
||||
list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c)
|
||||
list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS})
|
||||
set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}")
|
||||
else()
|
||||
set(WITH_AVX2 OFF)
|
||||
endif()
|
||||
endif()
|
||||
# AVX512 code path: probe the compiler and, when the intrinsics are usable,
# add the AVX512 adler32 source and its private header and compile that source
# with AVX512FLAG and NOLTOFLAG. check_avx512_intrinsics() is not defined in
# this file (presumably it comes from cmake/detect-intrinsics.cmake).
if(WITH_AVX512)
    check_avx512_intrinsics()
    if(HAVE_AVX512_INTRIN)
        add_definitions(-DX86_AVX512)
        # set(), not list(APPEND): AVX512_SRCS is never initialised in this
        # file, so appending would keep whatever value an enclosing scope
        # happens to hold. This matches the sibling feature blocks.
        set(AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
        list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
        list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
        if(HAVE_MASK_INTRIN)
            add_definitions(-DX86_MASK_INTRIN)
        endif()
        set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
    else()
        # Intrinsics unavailable: record that the feature is off.
        set(WITH_AVX512 OFF)
    endif()
endif()
|
||||
# AVX512-VNNI code path: probe the compiler and, when the intrinsics are
# usable, add the VNNI adler32 source and compile it with AVX512VNNIFLAG and
# NOLTOFLAG. check_avx512vnni_intrinsics() is not defined in this file
# (presumably it comes from cmake/detect-intrinsics.cmake).
if(WITH_AVX512VNNI)
    check_avx512vnni_intrinsics()
    if(HAVE_AVX512VNNI_INTRIN)
        add_definitions(-DX86_AVX512VNNI)
        # set(), not list(APPEND): AVX512VNNI_SRCS is never initialised in
        # this file, so appending would keep whatever value an enclosing scope
        # happens to hold. This matches the sibling feature blocks.
        set(AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c)
        list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS})
        set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}")
    else()
        # Intrinsics unavailable: record that the feature is off.
        set(WITH_AVX512VNNI OFF)
    endif()
endif()
|
||||
if(WITH_SSE42)
|
||||
check_sse42_intrinsics()
|
||||
if(HAVE_SSE42_INTRIN)
|
||||
add_definitions(-DX86_SSE42)
|
||||
set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
|
||||
set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
|
||||
else()
|
||||
set(WITH_SSE42 OFF)
|
||||
endif()
|
||||
endif()
|
||||
if(WITH_SSE2)
|
||||
check_sse2_intrinsics()
|
||||
if(HAVE_SSE2_INTRIN)
|
||||
add_definitions(-DX86_SSE2)
|
||||
set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
|
||||
# When ARCH does not match "x86_64", compile the SSE2 sources with SSE2FLAG and
# NOLTOFLAG and define X86_NOCHECK_SSE2.
# NOTE(review): ${ARCH} is expanded unquoted here, whereas the other ARCH tests
# in this file quote it ("${ARCH}"). ARCH is not set in the visible part of
# this file; if it is empty at this point the MATCHES test has no left operand
# of its own. Confirm ARCH is always non-empty here before relying on this
# branch.
if(NOT ${ARCH} MATCHES "x86_64")
|
||||
set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}")
|
||||
add_definitions(-DX86_NOCHECK_SSE2)
|
||||
endif()
|
||||
else()
|
||||
set(WITH_SSE2 OFF)
|
||||
endif()
|
||||
endif()
|
||||
if(WITH_SSSE3)
|
||||
check_ssse3_intrinsics()
|
||||
if(HAVE_SSSE3_INTRIN)
|
||||
add_definitions(-DX86_SSSE3)
|
||||
set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS})
|
||||
set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}")
|
||||
else()
|
||||
set(WITH_SSSE3 OFF)
|
||||
endif()
|
||||
endif()
|
||||
if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE42)
|
||||
check_pclmulqdq_intrinsics()
|
||||
if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN)
|
||||
add_definitions(-DX86_PCLMULQDQ_CRC)
|
||||
set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
|
||||
set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
|
||||
|
||||
if(WITH_VPCLMULQDQ AND WITH_AVX512)
|
||||
check_vpclmulqdq_intrinsics()
|
||||
if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN)
|
||||
add_definitions(-DX86_VPCLMULQDQ_CRC)
|
||||
set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c)
|
||||
list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
|
||||
set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
|
||||
else()
|
||||
set(WITH_VPCLMULQDQ OFF)
|
||||
endif()
|
||||
else()
|
||||
set(WITH_VPCLMULQDQ OFF)
|
||||
endif()
|
||||
else()
|
||||
set(WITH_PCLMULQDQ OFF)
|
||||
set(WITH_VPCLMULQDQ OFF)
|
||||
endif()
|
||||
else()
|
||||
set(WITH_PCLMULQDQ OFF)
|
||||
set(WITH_VPCLMULQDQ OFF)
|
||||
endif()
|
||||
check_xsave_intrinsics()
|
||||
if(HAVE_XSAVE_INTRIN)
|
||||
set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#============================================================================
|
||||
# zconf.h
|
||||
#============================================================================
|
||||
|
||||
# generate_cmakein(<input> <output>)
#
# Rewrites <input> into <output>, one line at a time, so that the result can be
# fed to configure_file(... @ONLY):
#   * a line starting at "#ifdef HAVE_UNISTD_H" becomes @ZCONF_UNISTD_LINE@
#   * a line starting at "#ifdef NEED_PTRDIFF_T" becomes @ZCONF_PTRDIFF_LINE@
#   * when NEED_PTRDIFF_T is set, "typedef PTRDIFF_TYPE" becomes
#     "typedef @PTRDIFF_TYPE@"
# Any existing <output> is removed first. Being a macro, it leaves _lines and
# _line set in the caller's scope.
macro(generate_cmakein input output)
    # Paths are quoted so that a path containing ';' is passed as a single
    # argument instead of being split into a list.
    file(REMOVE "${output}")
    file(STRINGS "${input}" _lines)
    foreach(_line IN LISTS _lines)
        string(REGEX REPLACE "#ifdef HAVE_UNISTD_H.*" "@ZCONF_UNISTD_LINE@" _line "${_line}")
        string(REGEX REPLACE "#ifdef NEED_PTRDIFF_T.*" "@ZCONF_PTRDIFF_LINE@" _line "${_line}")
        if(NEED_PTRDIFF_T)
            string(REGEX REPLACE "typedef PTRDIFF_TYPE" "typedef @PTRDIFF_TYPE@" _line "${_line}")
        endif()
        file(APPEND "${output}" "${_line}\n")
    endforeach()
endmacro()
|
||||
|
||||
generate_cmakein( ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.in ${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein )
|
||||
|
||||
#============================================================================
|
||||
# zlib
|
||||
#============================================================================
|
||||
|
||||
set(ZLIB_PUBLIC_HDRS
|
||||
${CMAKE_CURRENT_BINARY_DIR}/zconf.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib.h
|
||||
)
|
||||
set(ZLIB_PRIVATE_HDRS
|
||||
adler32_p.h
|
||||
chunkset_tpl.h
|
||||
compare256_rle.h
|
||||
cpu_features.h
|
||||
crc32_braid_p.h
|
||||
crc32_braid_comb_p.h
|
||||
crc32_braid_tbl.h
|
||||
crc32_fold.h
|
||||
deflate.h
|
||||
deflate_p.h
|
||||
functable.h
|
||||
inffast_tpl.h
|
||||
inffixed_tbl.h
|
||||
inflate.h
|
||||
inflate_p.h
|
||||
inftrees.h
|
||||
insert_string_tpl.h
|
||||
match_tpl.h
|
||||
trees.h
|
||||
trees_emit.h
|
||||
trees_tbl.h
|
||||
zbuild.h
|
||||
zendian.h
|
||||
zutil.h
|
||||
)
|
||||
set(ZLIB_SRCS
|
||||
adler32.c
|
||||
adler32_fold.c
|
||||
chunkset.c
|
||||
compare256.c
|
||||
compress.c
|
||||
cpu_features.c
|
||||
crc32_braid.c
|
||||
crc32_braid_comb.c
|
||||
crc32_fold.c
|
||||
deflate.c
|
||||
deflate_fast.c
|
||||
deflate_huff.c
|
||||
deflate_medium.c
|
||||
deflate_quick.c
|
||||
deflate_rle.c
|
||||
deflate_slow.c
|
||||
deflate_stored.c
|
||||
functable.c
|
||||
infback.c
|
||||
inflate.c
|
||||
inftrees.c
|
||||
insert_string.c
|
||||
insert_string_roll.c
|
||||
slide_hash.c
|
||||
trees.c
|
||||
uncompr.c
|
||||
zutil.c
|
||||
)
|
||||
|
||||
set(ZLIB_GZFILE_PRIVATE_HDRS
|
||||
gzguts.h
|
||||
)
|
||||
set(ZLIB_GZFILE_SRCS
|
||||
gzlib.c
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gzread.c
|
||||
gzwrite.c
|
||||
)
|
||||
|
||||
set(ZLIB_ALL_SRCS ${ZLIB_SRCS} ${ZLIB_ARCH_HDRS} ${ZLIB_ARCH_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS})
|
||||
list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS})
|
||||
|
||||
# Build the bundled zlib-ng sources as a static library. The target is named
# ${ZLIB_LIBRARY}, the same name used by project() at the top of this file and
# by the set_target_properties()/install calls below, so that all of them refer
# to one target even if ZLIB_LIBRARY is not literally "zlib".
add_library(${ZLIB_LIBRARY} STATIC ${ZLIB_ALL_SRCS})

# Headers are found in the binary dir (generated zconf.h, zlib.h, ...) and in
# the source dir while building; installed consumers use the install include
# directory instead.
target_include_directories(${ZLIB_LIBRARY} PUBLIC
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR};${CMAKE_CURRENT_SOURCE_DIR}>"
    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
|
||||
|
||||
# Text substituted for @ZCONF_UNISTD_LINE@ when zconf.h is configured:
# "#if 1" when unistd.h was found, "#if 0" otherwise.
set(ZCONF_UNISTD_LINE "#if 0 /* was set to #if 0 by configure/cmake/etc */")
if(HAVE_UNISTD_H)
    set(ZCONF_UNISTD_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */")
endif()

# Text substituted for @ZCONF_PTRDIFF_LINE@: force the ptrdiff_t fallback on
# when NEED_PTRDIFF_T is set, otherwise keep the original #ifdef line.
set(ZCONF_PTRDIFF_LINE "#ifdef NEED_PTRDIFF_T /* may be set to #if 1 by configure/cmake/etc */")
if(NEED_PTRDIFF_T)
    set(ZCONF_PTRDIFF_LINE "#if 1 /* was set to #if 1 by configure/cmake/etc */")
endif()
|
||||
|
||||
# Generate the headers and sources that live in the binary directory.
# zconf.h is configured from the template written by generate_cmakein();
# zlib.h and gzread.c are configured straight from their .in files. @ONLY
# restricts substitution to @VAR@ references.
configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein
    ${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib.h.in
    ${CMAKE_CURRENT_BINARY_DIR}/zlib.h @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gzread.c.in
    ${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY)

# Copy the empty name-mangling header verbatim. The output name is spelled out
# literally because ZLIB_PUBLIC_HDRS lists exactly "zlib_name_mangling.h";
# deriving it from ${SUFFIX}, which this file never sets, would produce a
# different file name whenever an enclosing scope defines SUFFIX.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling.h.empty
    ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling.h COPYONLY)
|
||||
|
||||
ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes
|
||||
-Wundef
|
||||
-Wmissing-declarations
|
||||
)
|
||||
|
||||
set_target_properties(${ZLIB_LIBRARY} PROPERTIES
|
||||
OUTPUT_NAME ${ZLIB_LIBRARY}
|
||||
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
|
||||
COMPILE_PDB_NAME ${ZLIB_LIBRARY}
|
||||
COMPILE_PDB_NAME_DEBUG "${ZLIB_LIBRARY}${OPENCV_DEBUG_POSTFIX}"
|
||||
ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}
|
||||
)
|
||||
|
||||
if(ENABLE_SOLUTION_FOLDERS)
|
||||
set_target_properties(${ZLIB_LIBRARY} PROPERTIES FOLDER "3rdparty")
|
||||
endif()
|
||||
|
||||
if(NOT BUILD_SHARED_LIBS)
|
||||
ocv_install_target(${ZLIB_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
|
||||
endif()
|
||||
|
||||
ocv_install_3rdparty_licenses(${ZLIB_LIBRARY} LICENSE.md)
|
19
3rdparty/zlib-ng/LICENSE.md
vendored
Normal file
19
3rdparty/zlib-ng/LICENSE.md
vendored
Normal file
@ -0,0 +1,19 @@
|
||||
(C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not
|
||||
claim that you wrote the original software. If you use this software
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
|
||||
3. This notice may not be removed or altered from any source distribution.
|
229
3rdparty/zlib-ng/README.md
vendored
Normal file
229
3rdparty/zlib-ng/README.md
vendored
Normal file
@ -0,0 +1,229 @@
|
||||
| CI | Stable | Develop |
|
||||
|:---|:-------|:--------|
|
||||
| GitHub Actions | [![Stable CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Astable) <br> [![Stable Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Astable) <br> [![Stable NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Astable) | [![Develop CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Adevelop) <br> [![Develop Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Adevelop) <br> [![Develop NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Adevelop) |
|
||||
| CodeFactor | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/stable)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/stable) | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/develop)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/develop) |
|
||||
| OSS-Fuzz | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) |
|
||||
| Codecov | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/stable/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/stable) | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/develop/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/develop) |
|
||||
|
||||
## zlib-ng
|
||||
*zlib data compression library for the next generation systems*
|
||||
|
||||
Maintained by Hans Kristian Rosbach
|
||||
aka Dead2 (zlib-ng àt circlestorm dót org)
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
* Zlib compatible API with support for dual-linking
|
||||
* Modernized native API based on zlib API for ease of porting
|
||||
* Modern C11 syntax and a clean code layout
|
||||
* Deflate medium and quick algorithms based on Intel’s zlib fork
|
||||
* Support for CPU intrinsics when available
|
||||
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
|
||||
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
|
||||
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
|
||||
* Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
|
||||
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
|
||||
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
|
||||
* Support for hardware-accelerated deflate using IBM Z DFLTCC
|
||||
* Unaligned memory read/writes and large bit buffer improvements
|
||||
* Includes improvements from Cloudflare and Intel forks
|
||||
* Configure, CMake, and NMake build system support
|
||||
* Comprehensive set of CMake unit tests
|
||||
* Code sanitizers, fuzzing, and coverage
|
||||
* GitHub Actions continuous integration on Windows, macOS, and Linux
|
||||
* Emulated CI for ARM, AARCH64, PPC, PPC64, RISCV, SPARC64, S390x using qemu
|
||||
|
||||
|
||||
History
|
||||
-------
|
||||
|
||||
The motivation for this fork was seeing several 3rd party contributions with new optimizations not getting
|
||||
implemented into the official zlib repository.
|
||||
|
||||
Mark Adler has been maintaining zlib for a very long time, and he has done a great job and hopefully he will continue
|
||||
for a long time yet. The idea of zlib-ng is not to replace zlib, but to co-exist as a drop-in replacement with a
|
||||
lower threshold for code change.
|
||||
|
||||
zlib has a long history and is incredibly portable, even supporting many systems that predate the Internet.<br>
|
||||
That is great, but it can complicate further development and maintainability. The zlib code contains many workarounds
|
||||
for really old compilers or to accommodate systems with limitations such as operating in a 16-bit environment.
|
||||
|
||||
Many of these workarounds are only maintenance burdens, some of them are pretty huge code-wise. With many workarounds
|
||||
cluttered throughout the code, it makes it harder for new programmers with an idea/interest for zlib to contribute.
|
||||
|
||||
I decided to make a fork, merge all the Intel optimizations, some of the Cloudflare optimizations, plus a couple other
|
||||
smaller patches. Then started cleaning out workarounds, various dead code, all contrib and example code.<br>
|
||||
The result is a better performing and easier to maintain zlib-ng.
|
||||
|
||||
A lot of improvements have gone into zlib-ng since its start, and numerous people and companies have contributed both
|
||||
small and big improvements, or valuable testing.
|
||||
|
||||
|
||||
Build
|
||||
-----
|
||||
<sup>Please read LICENSE.md, it is very simple and very liberal.</sup>
|
||||
|
||||
There are two ways to build zlib-ng:
|
||||
|
||||
### Cmake
|
||||
|
||||
To build zlib-ng using CMake, the cross-platform build system generator:
|
||||
|
||||
```
|
||||
cmake .
|
||||
cmake --build . --config Release
|
||||
ctest --verbose -C Release
|
||||
```
|
||||
|
||||
Alternatively, you can use the cmake configuration GUI tool ccmake:
|
||||
|
||||
```
|
||||
ccmake .
|
||||
```
|
||||
|
||||
### Configure
|
||||
|
||||
To build zlib-ng using the bash configure script:
|
||||
|
||||
```
|
||||
./configure
|
||||
make
|
||||
make test
|
||||
```
|
||||
|
||||
Build Options
|
||||
-------------
|
||||
|
||||
| CMake | configure | Description | Default |
|
||||
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
|
||||
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
|
||||
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
|
||||
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
|
||||
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
|
||||
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
|
||||
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
|
||||
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
|
||||
| WITH_GTEST | | Build gtest_zlib | ON |
|
||||
| WITH_FUZZERS | | Build test/fuzz | OFF |
|
||||
| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
|
||||
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
|
||||
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
|
||||
|
||||
|
||||
Install
|
||||
-------
|
||||
|
||||
WARNING: We do not recommend manually installing unless you really know what you are doing, because this can
|
||||
potentially override the system default zlib library, and any incompatibility or wrong configuration of zlib-ng
|
||||
can make the whole system unusable, requiring recovery or reinstall.
|
||||
If you still want a manual install, we recommend using the /opt/ path prefix.
|
||||
|
||||
For Linux distros, an alternative way to use zlib-ng (if compiled in zlib-compat mode) instead of zlib, is through
|
||||
the use of the _LD_PRELOAD_ environment variable. If the program is dynamically linked with zlib, then the program
|
||||
will temporarily attempt to use zlib-ng instead, without risking system-wide instability.
|
||||
|
||||
```
|
||||
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program
|
||||
```
|
||||
|
||||
### Cmake
|
||||
|
||||
To install zlib-ng system-wide using cmake:
|
||||
|
||||
```sh or powershell
|
||||
cmake --build . --target install
|
||||
```
|
||||
|
||||
### Configure
|
||||
|
||||
To install zlib-ng system-wide using the configure script:
|
||||
|
||||
```sh
|
||||
make install
|
||||
```
|
||||
|
||||
### CPack
|
||||
|
||||
After building with cmake, an installation package can be created using cpack. By default a tgz package is created,
|
||||
but you can append `-G <format>` to each command to generate alternative package types (TGZ, ZIP, RPM, DEB). To easily
|
||||
create a rpm or deb package, you would use `-G RPM` or `-G DEB` respectively.
|
||||
|
||||
```sh or powershell
|
||||
cd build
|
||||
cpack --config CPackConfig.cmake
|
||||
cpack --config CPackSourceConfig.cmake
|
||||
```
|
||||
|
||||
### Vcpkg
|
||||
|
||||
Alternatively, you can build and install zlib-ng using the [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager:
|
||||
|
||||
```sh or powershell
|
||||
git clone https://github.com/Microsoft/vcpkg.git
|
||||
cd vcpkg
|
||||
./bootstrap-vcpkg.sh # "./bootstrap-vcpkg.bat" for powershell
|
||||
./vcpkg integrate install
|
||||
./vcpkg install zlib-ng
|
||||
```
|
||||
|
||||
The zlib-ng port in vcpkg is kept up to date by Microsoft team members and community contributors.
|
||||
If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
|
||||
|
||||
Contributing
|
||||
------------
|
||||
|
||||
Zlib-ng is aiming to be open to contributions, and we would be delighted to receive pull requests on github.
|
||||
Help with testing and reviewing pull requests etc. is also very much appreciated.
|
||||
|
||||
Please check the Wiki for more info: [Contributing](https://github.com/zlib-ng/zlib-ng/wiki/Contributing)
|
||||
|
||||
Acknowledgments
|
||||
----------------
|
||||
|
||||
Thanks go out to all the people and companies who have taken the time to contribute
|
||||
code reviews, testing and/or patches. Zlib-ng would not have been nearly as good without you.
|
||||
|
||||
The deflate format used by zlib was defined by Phil Katz.<br>
|
||||
The deflate and zlib specifications were written by L. Peter Deutsch.
|
||||
|
||||
zlib was originally created by Jean-loup Gailly (compression) and Mark Adler (decompression).
|
||||
|
||||
|
||||
Advanced Build Options
|
||||
----------------------
|
||||
|
||||
| CMake | configure | Description | Default |
|
||||
|:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------|
|
||||
| FORCE_SSE2 | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) |
|
||||
| WITH_AVX2 | | Build with AVX2 intrinsics | ON |
|
||||
| WITH_AVX512 | | Build with AVX512 intrinsics | ON |
|
||||
| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON |
|
||||
| WITH_SSE2 | | Build with SSE2 intrinsics | ON |
|
||||
| WITH_SSSE3 | | Build with SSSE3 intrinsics | ON |
|
||||
| WITH_SSE42 | | Build with SSE42 intrinsics | ON |
|
||||
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
|
||||
| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
|
||||
| WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON |
|
||||
| WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
|
||||
| WITH_ARMV6 | --without-armv6 | Build with ARMv6 intrinsics | ON |
|
||||
| WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON |
|
||||
| WITH_POWER8 | --without-power8 | Build with POWER8 optimisations | ON |
|
||||
| WITH_RVV | | Build with RVV intrinsics | ON |
|
||||
| WITH_CRC32_VX | --without-crc32-vx | Build with vectorized CRC32 on IBM Z | ON |
|
||||
| WITH_DFLTCC_DEFLATE | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z | OFF |
|
||||
| WITH_DFLTCC_INFLATE | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z | OFF |
|
||||
| WITH_UNALIGNED | --without-unaligned | Allow optimizations that use unaligned reads if safe on current arch| ON |
|
||||
| WITH_INFLATE_STRICT | | Build with strict inflate distance checking | OFF |
|
||||
| WITH_INFLATE_ALLOW_INVALID_DIST | | Build with zero fill for inflate invalid distances | OFF |
|
||||
| INSTALL_UTILS | | Copy minigzip and minideflate during install | OFF |
|
||||
| ZLIBNG_ENABLE_TESTS | | Test zlib-ng specific API | ON |
|
||||
|
||||
|
||||
Related Projects
|
||||
----------------
|
||||
|
||||
* Fork of the popular minizip https://github.com/zlib-ng/minizip-ng
|
||||
* Python tool to benchmark minigzip/minideflate https://github.com/zlib-ng/deflatebench
|
||||
* Python tool to benchmark pigz https://github.com/zlib-ng/pigzbench
|
||||
* 3rd party patches for zlib-ng compatibility https://github.com/zlib-ng/patches
|
115
3rdparty/zlib-ng/adler32.c
vendored
Normal file
115
3rdparty/zlib-ng/adler32.c
vendored
Normal file
@ -0,0 +1,115 @@
|
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "functable.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Portable C implementation of the Adler-32 update.
 *
 * adler: current checksum (low 16 bits = byte sum, high 16 bits = sum of sums)
 * buf:   input bytes; NULL (with len != 1) requests the initial checksum value
 * len:   number of bytes in buf
 *
 * Returns the updated checksum with both halves reduced modulo BASE.
 */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t sum2 = (adler >> 16) & 0xffff;   /* high half: sum of sums */
    unsigned chunks;

    adler &= 0xffff;                          /* low half: byte sum */

    /* single-byte updates take the cheapest path */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* NULL buffer means "give me the initial value"; tested after the
     * len == 1 case so that path stays as fast as possible */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* short inputs: simple byte loop */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* consume whole NMAX-byte blocks; each needs only one modulo per sum */
    for (; len >= NMAX; len -= NMAX) {
#ifdef UNROLL_MORE
        /* NMAX is divisible by 16 */
        for (chunks = NMAX / 16; chunks != 0; --chunks) {
            DO16(adler, sum2, buf);           /* 16 sums unrolled */
            buf += 16;
        }
#else
        /* NMAX is divisible by 8 */
        for (chunks = NMAX / 8; chunks != 0; --chunks) {
            DO8(adler, sum2, buf, 0);         /* 8 sums unrolled */
            buf += 8;
        }
#endif
        adler %= BASE;
        sum2 %= BASE;
    }

    /* fewer than NMAX bytes remain: still just one modulo */
    return adler32_len_64(adler, buf, len, sum2);
}
|
||||
|
||||
/* Public Adler-32 entry point taking a size_t length.
 * Both variants forward to the implementation stored in functable.adler32.
 * The ZLIB_COMPAT build keeps unsigned long in the signature and casts to/from
 * uint32_t around the call; the native build uses uint32_t throughout. */
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
|
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
|
||||
return functable.adler32(adler, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Public Adler-32 entry point taking a 32-bit length (unsigned int in the
 * ZLIB_COMPAT build, uint32_t otherwise).
 * Both variants forward to the implementation stored in functable.adler32;
 * the ZLIB_COMPAT variant casts the unsigned long checksum to/from uint32_t. */
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
|
||||
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
|
||||
return functable.adler32(adler, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Combine two Adler-32 checksums.
 *
 * adler1: checksum of the first byte sequence
 * adler2: checksum of the second byte sequence
 * len2:   length in bytes of the second sequence
 *
 * Returns the checksum of the concatenation, or 0xffffffff (not a valid
 * Adler-32 value) when len2 is negative, as a debugging aid.
 */
static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
    const unsigned long twice_base = (unsigned long)BASE << 1;
    uint32_t lo;
    uint32_t hi;
    unsigned rem;

    if (len2 < 0) {
        return 0xffffffff;
    }

    /* the derivation of this formula is left as an exercise for the reader */
    rem = (unsigned)(len2 % BASE);    /* len2 is known to be >= 0 here */
    lo = adler1 & 0xffff;
    hi = (rem * lo) % BASE;
    lo += (adler2 & 0xffff) + BASE - 1;
    hi += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;

    /* bring both halves back below BASE using conditional subtraction only */
    if (lo >= BASE) {
        lo -= BASE;
    }
    if (lo >= BASE) {
        lo -= BASE;
    }
    if (hi >= twice_base) {
        hi -= twice_base;
    }
    if (hi >= BASE) {
        hi -= BASE;
    }
    return lo | (hi << 16);
}
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Public wrappers around adler32_combine_().
 * ZLIB_COMPAT build: two entry points, PREFIX(adler32_combine) taking a z_off_t
 * length and PREFIX4(adler32_combine) taking a z_off64_t length; both narrow the
 * unsigned long checksums to uint32_t before combining.
 * Native build: only the PREFIX4 variant, using uint32_t checksums directly.
 * NOTE(review): PREFIX/PREFIX4 are defined outside this file, so the exported
 * symbol names are not visible here. */
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off_t len2) {
|
||||
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
|
||||
}
|
||||
|
||||
unsigned long Z_EXPORT PREFIX4(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off64_t len2) {
|
||||
return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX4(adler32_combine)(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
|
||||
return adler32_combine_(adler1, adler2, len2);
|
||||
}
|
||||
#endif
|
16
3rdparty/zlib-ng/adler32_fold.c
vendored
Normal file
16
3rdparty/zlib-ng/adler32_fold.c
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
/* adler32_fold.c -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "functable.h"
|
||||
#include "adler32_fold.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
/* Generic "checksum and copy": updates the Adler-32 `adler` over the `len`
 * bytes at `src` through functable.adler32, then copies those bytes to `dst`.
 * Returns the updated checksum. */
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    const uint32_t updated = functable.adler32(adler, src, len);

    memcpy(dst, src, len);
    return updated;
}
|
11
3rdparty/zlib-ng/adler32_fold.h
vendored
Normal file
11
3rdparty/zlib-ng/adler32_fold.h
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
/* adler32_fold.h -- adler32 folding interface
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_FOLD_H_
|
||||
#define ADLER32_FOLD_H_
|
||||
|
||||
/* Updates the Adler-32 `adler` over `len` bytes of `src`, copies them to `dst`,
 * and returns the updated checksum. */
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
|
||||
#endif
|
70
3rdparty/zlib-ng/adler32_p.h
vendored
Normal file
70
3rdparty/zlib-ng/adler32_p.h
vendored
Normal file
@ -0,0 +1,70 @@
|
||||
/* adler32_p.h -- Private inline functions and macros shared with
|
||||
* different computation of the Adler-32 checksum
|
||||
* of a data stream.
|
||||
* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_P_H
|
||||
#define ADLER32_P_H
|
||||
|
||||
#define BASE 65521U /* largest prime smaller than 65536 */
|
||||
#define NMAX 5552
|
||||
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
|
||||
|
||||
/* DOn(sum1, sum2, buf, i) folds the n consecutive bytes buf[i] .. buf[i+n-1] into
 * the running sums: sum1 accumulates the bytes and sum2 accumulates sum1 after
 * every byte. Each DOn expands to two DO(n/2) invocations. DO16 takes no offset
 * argument and always starts at buf[0]. */
#define DO1(sum1, sum2, buf, i) {(sum1) += buf[(i)]; (sum2) += (sum1);}
|
||||
#define DO2(sum1, sum2, buf, i) {DO1(sum1, sum2, buf, i); DO1(sum1, sum2, buf, i+1);}
|
||||
#define DO4(sum1, sum2, buf, i) {DO2(sum1, sum2, buf, i); DO2(sum1, sum2, buf, i+2);}
|
||||
#define DO8(sum1, sum2, buf, i) {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);}
|
||||
#define DO16(sum1, sum2, buf) {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);}
|
||||
|
||||
static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) {
|
||||
adler += buf[0];
|
||||
adler %= BASE;
|
||||
sum2 += adler;
|
||||
sum2 %= BASE;
|
||||
return adler | (sum2 << 16);
|
||||
}
|
||||
|
||||
static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
|
||||
while (len) {
|
||||
--len;
|
||||
adler += *buf++;
|
||||
sum2 += adler;
|
||||
}
|
||||
adler %= BASE;
|
||||
sum2 %= BASE; /* only added so many BASE's */
|
||||
/* return recombined sums */
|
||||
return adler | (sum2 << 16);
|
||||
}
|
||||
|
||||
static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
|
||||
while (len--) {
|
||||
*dst = *buf++;
|
||||
adler += *dst++;
|
||||
sum2 += adler;
|
||||
}
|
||||
adler %= BASE;
|
||||
sum2 %= BASE; /* only added so many BASE's */
|
||||
/* return recombined sums */
|
||||
return adler | (sum2 << 16);
|
||||
}
|
||||
|
||||
/* Adler-32 update for a run shorter than NMAX bytes: unrolled steps of 16 bytes
 * (UNROLL_MORE) or 8 bytes (default) without any intermediate modulo, then the
 * byte-wise helper finishes the remainder and performs the single reduction. */
static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
#ifdef UNROLL_MORE
    for (; len >= 16; len -= 16, buf += 16) {
        DO16(adler, sum2, buf);
    }
#else
    for (; len >= 8; len -= 8, buf += 8) {
        DO8(adler, sum2, buf, 0);
    }
#endif
    /* Process tail (len < 16). */
    return adler32_len_16(adler, buf, len, sum2);
}
|
||||
|
||||
#endif /* ADLER32_P_H */
|
2
3rdparty/zlib-ng/arch/.gitignore
vendored
Normal file
2
3rdparty/zlib-ng/arch/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
# ignore Makefiles; they're all automatically generated
|
||||
Makefile
|
85
3rdparty/zlib-ng/arch/arm/Makefile.in
vendored
Normal file
85
3rdparty/zlib-ng/arch/arm/Makefile.in
vendored
Normal file
@ -0,0 +1,85 @@
|
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
# Toolchain settings, left empty in this template.
# NOTE(review): presumably filled in when the Makefile is generated from this
# Makefile.in - confirm against the configure script.
# CFLAGS is used for the .o objects and SFLAGS for the .lo objects below.
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
# Extra per-feature compiler flags added to the ACLE / NEON / ARMv6 sources,
# plus NOLTOFLAG which is added to every feature-specific source below.
ACLEFLAG=
|
||||
NEONFLAG=
|
||||
ARMV6FLAG=
|
||||
NOLTOFLAG=
|
||||
|
||||
# Source locations: SRCDIR holds this directory's .c files.
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
# Default target: build both object flavours (.o and .lo) of every ARM source.
all: \
|
||||
adler32_neon.o adler32_neon.lo \
|
||||
arm_features.o arm_features.lo \
|
||||
chunkset_neon.o chunkset_neon.lo \
|
||||
compare256_neon.o compare256_neon.lo \
|
||||
crc32_acle.o crc32_acle.lo \
|
||||
slide_hash_neon.o slide_hash_neon.lo \
|
||||
slide_hash_armv6.o slide_hash_armv6.lo \
|
||||
insert_string_acle.o insert_string_acle.lo
|
||||
|
||||
# Compile rules. Every source is compiled twice: the .o object with $(CFLAGS)
# and the .lo object with $(SFLAGS). *_neon sources add $(NEONFLAG), *_acle
# sources add $(ACLEFLAG), slide_hash_armv6 adds $(ARMV6FLAG); all of those also
# get $(NOLTOFLAG). arm_features is built with the plain flags only.
adler32_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
adler32_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
arm_features.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
|
||||
|
||||
arm_features.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
|
||||
|
||||
chunkset_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
chunkset_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
compare256_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
|
||||
|
||||
compare256_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
|
||||
|
||||
crc32_acle.o:
|
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
crc32_acle.lo:
|
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
slide_hash_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
|
||||
|
||||
slide_hash_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
|
||||
|
||||
slide_hash_armv6.o:
|
||||
$(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
|
||||
|
||||
slide_hash_armv6.lo:
|
||||
$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
|
||||
|
||||
insert_string_acle.o:
|
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
insert_string_acle.lo:
|
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
# Cleanup targets. mostlyclean is an alias for clean; clean removes objects,
# editor backups, the objs directory and gcov coverage data; distclean
# additionally removes the generated Makefile itself.
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean
|
||||
rm -f Makefile
|
35
3rdparty/zlib-ng/arch/arm/acle_intrins.h
vendored
Normal file
35
3rdparty/zlib-ng/arch/arm/acle_intrins.h
vendored
Normal file
@ -0,0 +1,35 @@
|
||||
#ifndef ARM_ACLE_INTRINS_H
|
||||
#define ARM_ACLE_INTRINS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#ifdef _MSC_VER
|
||||
# include <intrin.h>
|
||||
#elif defined(HAVE_ARM_ACLE_H)
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
|
||||
/* Z_TARGET_CRC: function attribute used with the ACLE CRC code. On AArch64 it
 * expands to Z_TARGET("+crc"); on other ARM targets it expands to nothing.
 * Only defined when ARM_ACLE is set.
 * NOTE(review): Z_TARGET is defined outside this header - presumably a
 * per-function target attribute wrapper; confirm in zbuild.h. */
#ifdef ARM_ACLE
|
||||
#if defined(__aarch64__)
|
||||
# define Z_TARGET_CRC Z_TARGET("+crc")
|
||||
#else
|
||||
# define Z_TARGET_CRC
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* ARMv6 SIMD support: provide the uint16x2_t type and __uqsub16 when the
 * toolchain does not supply them.
 *  - MSVC: map __uqsub16 onto the _arm_uqsub16 intrinsic.
 *  - otherwise, when ARM_SIMD_INTRIN is not defined: implement __uqsub16 with
 *    inline assembly issuing the uqsub16 instruction.
 * uint16x2_t is a plain uint32_t here (two 16-bit lanes in one 32-bit word). */
#ifdef ARM_SIMD
|
||||
#ifdef _MSC_VER
|
||||
typedef uint32_t uint16x2_t;
|
||||
|
||||
#define __uqsub16 _arm_uqsub16
|
||||
#elif !defined(ARM_SIMD_INTRIN)
|
||||
typedef uint32_t uint16x2_t;
|
||||
|
||||
/* Returns the result of the uqsub16 instruction applied to __a and __b. */
static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) {
|
||||
uint16x2_t __c;
|
||||
__asm__ __volatile__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
|
||||
return __c;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // include guard ARM_ACLE_INTRINS_H
|
215
3rdparty/zlib-ng/arch/arm/adler32_neon.c
vendored
Normal file
215
3rdparty/zlib-ng/arch/arm/adler32_neon.c
vendored
Normal file
@ -0,0 +1,215 @@
|
||||
/* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* Copyright (C) 2017 ARM Holdings Inc.
|
||||
* Authors:
|
||||
* Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
|
||||
/* NEON accumulation step of Adler-32.
 *
 * s:   two-element state; s[0] is the running byte sum and s[1] the running
 *      sum of sums. Both are read on entry and overwritten on exit.
 * buf: input bytes.
 * len: number of 16-byte vectors to consume (NOT a byte count). The main loop
 *      handles four vectors (64 bytes) per iteration, len >> 2 times; the
 *      remainder loop handles the last len & 3 vectors one at a time.
 *
 * No modulo reduction is performed here.
 */
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
|
||||
/* Position weights for one 64-byte chunk: the first byte counts 64 times
 * towards the sum of sums, the last byte once. */
static const uint16_t ALIGNED_(16) taps[64] = {
|
||||
64, 63, 62, 61, 60, 59, 58, 57,
|
||||
56, 55, 54, 53, 52, 51, 50, 49,
|
||||
48, 47, 46, 45, 44, 43, 42, 41,
|
||||
40, 39, 38, 37, 36, 35, 34, 33,
|
||||
32, 31, 30, 29, 28, 27, 26, 25,
|
||||
24, 23, 22, 21, 20, 19, 18, 17,
|
||||
16, 15, 14, 13, 12, 11, 10, 9,
|
||||
8, 7, 6, 5, 4, 3, 2, 1 };
|
||||
|
||||
uint32x4_t adacc = vdupq_n_u32(0);
|
||||
uint32x4_t s2acc = vdupq_n_u32(0);
|
||||
uint32x4_t s2acc_0 = vdupq_n_u32(0);
|
||||
uint32x4_t s2acc_1 = vdupq_n_u32(0);
|
||||
uint32x4_t s2acc_2 = vdupq_n_u32(0);
|
||||
|
||||
/* Seed lane 0 of the accumulators with the incoming state. */
adacc = vsetq_lane_u32(s[0], adacc, 0);
|
||||
s2acc = vsetq_lane_u32(s[1], s2acc, 0);
|
||||
|
||||
/* s3acc collects the byte-sum accumulator as it was before each chunk;
 * it is scaled by the chunk size after the loop. */
uint32x4_t s3acc = vdupq_n_u32(0);
|
||||
uint32x4_t adacc_prev = adacc;
|
||||
|
||||
uint16x8_t s2_0, s2_1, s2_2, s2_3;
|
||||
s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
|
||||
|
||||
uint16x8_t s2_4, s2_5, s2_6, s2_7;
|
||||
s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
|
||||
|
||||
size_t num_iter = len >> 2;
|
||||
int rem = len & 3;
|
||||
|
||||
for (size_t i = 0; i < num_iter; ++i) {
|
||||
uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
|
||||
|
||||
/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
|
||||
* bit instruction, we'll have to make do summing to 16 bits first */
|
||||
uint16x8x2_t hsum, hsum_fold;
|
||||
hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
|
||||
hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
|
||||
|
||||
hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
|
||||
hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
|
||||
|
||||
adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
|
||||
s3acc = vaddq_u32(s3acc, adacc_prev);
|
||||
adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
|
||||
|
||||
/* If we do straight widening additions to the 16 bit values, we don't incur
|
||||
* the usual penalties of a pairwise add. We can defer the multiplications
|
||||
* until the very end. These will not overflow because we are incurring at
|
||||
* most 408 loop iterations (NMAX / 64), and a given lane is only going to be
|
||||
* summed into once. This means for the maximum input size, the largest value
|
||||
* we will see is 255 * 102 = 26010, safely under uint16 max */
|
||||
/* NOTE(review): the 408 / 102 figures above do not match NMAX / 64 for
 * NMAX = 5552 (about 86 iterations); the no-overflow conclusion appears to
 * still hold, but the numbers look stale - confirm. */
s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
|
||||
s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
|
||||
s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
|
||||
s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
|
||||
s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
|
||||
s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
|
||||
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
|
||||
s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
|
||||
|
||||
adacc_prev = adacc;
|
||||
buf += 64;
|
||||
}
|
||||
|
||||
/* Multiply by 64, the number of bytes consumed per main-loop iteration. */
s3acc = vshlq_n_u32(s3acc, 6);
|
||||
|
||||
if (rem) {
|
||||
uint32x4_t s3acc_0 = vdupq_n_u32(0);
|
||||
/* Remaining single 16-byte vectors; they share the s2_6 / s2_7
 * accumulators, which are weighted by the last 16 taps (16..1). */
while (rem--) {
|
||||
uint8x16_t d0 = vld1q_u8(buf);
|
||||
uint16x8_t adler;
|
||||
adler = vpaddlq_u8(d0);
|
||||
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
|
||||
s2_7 = vaddw_high_u8(s2_7, d0);
|
||||
adacc = vpadalq_u16(adacc, adler);
|
||||
s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
|
||||
adacc_prev = adacc;
|
||||
buf += 16;
|
||||
}
|
||||
|
||||
/* Multiply by 16, the number of bytes consumed per remainder iteration. */
s3acc_0 = vshlq_n_u32(s3acc_0, 4);
|
||||
s3acc = vaddq_u32(s3acc_0, s3acc);
|
||||
}
|
||||
|
||||
uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
|
||||
uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
|
||||
|
||||
/* Deferred multiplications: weight each 16-bit column sum by its tap and
 * spread the products over four 32-bit accumulators. */
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
|
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
|
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
|
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
|
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
|
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
|
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
|
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
|
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
|
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
|
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
|
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
|
||||
|
||||
s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
|
||||
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
|
||||
s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
|
||||
s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
|
||||
|
||||
/* Fold the four partial sum-of-sums accumulators into s2acc. */
s2acc = vaddq_u32(s2acc_0, s2acc);
|
||||
s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
|
||||
s2acc = vaddq_u32(s2acc, s2acc_2);
|
||||
|
||||
/* Horizontal reduction: lane 0 of `as` becomes the byte sum, lane 1 the
 * sum of sums. */
uint32x2_t adacc2, s2acc2, as;
|
||||
s2acc = vaddq_u32(s2acc, s3acc);
|
||||
adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
|
||||
s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
|
||||
as = vpadd_u32(adacc2, s2acc2);
|
||||
s[0] = vget_lane_u32(as, 0);
|
||||
s[1] = vget_lane_u32(as, 1);
|
||||
}
|
||||
|
||||
/* Scalar Adler-32 update used for the bytes the vector loop does not cover.
 *
 * pair: two-element state; pair[0] is the running byte sum, pair[1] the
 *       running sum of the byte sums. Updated in place.
 * buf:  input bytes.
 * len:  number of bytes to fold in; 0 leaves pair untouched.
 *
 * No modulo reduction is applied here.
 */
static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
    /* The counter has the same type as `len`: an unsigned int counter would
     * wrap around before reaching a len above UINT_MAX and never terminate. */
    size_t i;

    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}
|
||||
|
||||
/* NEON implementation of Adler-32.
 *
 * adler: checksum so far (low 16 bits = byte sum, high 16 bits = sum of sums).
 * buf:   input bytes; NULL returns the initial value 1.
 * len:   number of bytes in buf.
 *
 * Returns the updated checksum, (sum2 << 16) | sum1, both reduced modulo BASE.
 */
Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (len == 1)
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (buf == NULL)
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16)
        return adler32_len_16(adler, buf, len, sum2);

    uint32_t pair[2];
    int n = NMAX;
    /* Byte offset into buf. Kept as size_t so it cannot wrap or truncate
     * when len does not fit in 32 bits. */
    size_t done = 0;

    /* Split Adler-32 into component sums, it can be supplied by
     * the caller sites (e.g. in a PNG file).
     */
    pair[0] = adler;
    pair[1] = sum2;

    /* If memory is not SIMD aligned, do scalar sums to an aligned
     * offset, provided that doing so doesn't completely eliminate
     * SIMD operation. Aligned loads are still faster on ARM, even
     * though there's no explicit aligned load instruction */
    unsigned int align_offset = ((uintptr_t)buf & 15);
    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;

    if (align_offset && len >= (16 + align_adj)) {
        NEON_handle_tail(pair, buf, align_adj);
        /* The scalar bytes count against the first block's NMAX budget. */
        n -= align_adj;
        done += align_adj;
    } else {
        /* If here, we failed the len criteria test, it wouldn't be
         * worthwhile to do scalar aligning sums */
        align_adj = 0;
    }

    while (done < len) {
        /* Compare in size_t first; the result is at most NMAX, so the
         * narrowing to int below is always lossless. */
        size_t remaining = len - done;
        size_t budget = (size_t)((done == align_adj) ? n : NMAX);
        n = (int)MIN(remaining, budget);

        if (n < 16)
            break;

        NEON_accum32(pair, buf + done, n >> 4);
        pair[0] %= BASE;
        pair[1] %= BASE;

        /* Only whole 16-byte groups were consumed by the vector code. */
        int actual_nsums = (n >> 4) << 4;
        done += (size_t)actual_nsums;
    }

    /* Handle the tail elements. */
    if (done < len) {
        NEON_handle_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
|
||||
|
||||
#endif
|
100
3rdparty/zlib-ng/arch/arm/arm_features.c
vendored
Normal file
100
3rdparty/zlib-ng/arch/arm/arm_features.c
vendored
Normal file
@ -0,0 +1,100 @@
|
||||
#include "../../zbuild.h"
|
||||
#include "arm_features.h"
|
||||
|
||||
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
|
||||
# include <sys/auxv.h>
|
||||
# ifdef ARM_ASM_HWCAP
|
||||
# include <asm/hwcap.h>
|
||||
# endif
|
||||
#elif defined(__FreeBSD__) && defined(__aarch64__)
|
||||
# include <machine/armreg.h>
|
||||
# ifndef ID_AA64ISAR0_CRC32_VAL
|
||||
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
|
||||
# endif
|
||||
#elif defined(__APPLE__)
|
||||
# if !defined(_DARWIN_C_SOURCE)
|
||||
# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
|
||||
# endif
|
||||
# include <sys/sysctl.h>
|
||||
#elif defined(_WIN32)
|
||||
# include <windows.h>
|
||||
#endif
|
||||
|
||||
/* Runtime probe for the ARMv8 CRC32 instructions.
 *
 * Returns non-zero when the platform-specific query reports the feature,
 * and 0 when it does not or when no query is available for this platform
 * (unless ARM_NOCHECK_ACLE forces 1).
 */
static int arm_has_crc32(void) {
#if defined(__linux__) && defined(ARM_AUXV_HAS_CRC32)
#  ifdef HWCAP_CRC32
    return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
#  endif
#elif defined(__FreeBSD__) && defined(__aarch64__)
    /* The ID register is not consulted when QEMU_EMULATING is set in the environment. */
    return getenv("QEMU_EMULATING") == NULL
      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
#elif defined(__APPLE__)
    int hascrc32;
    size_t size = sizeof(hascrc32);
    /* hascrc32 is only read when sysctlbyname() succeeded. */
    return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
      && hascrc32 == 1;
#elif defined(_WIN32)
    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
#elif defined(ARM_NOCHECK_ACLE)
    return 1;
#else
    return 0;
#endif
}
|
||||
|
||||
/* AArch64 has neon. */
|
||||
#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
|
||||
/* Runtime probe for NEON on 32-bit ARM.
 *
 * Returns 1 when the platform query reports NEON. When no platform branch
 * returns, the result falls back to 1 if ARM_NOCHECK_NEON is defined and
 * 0 otherwise.
 */
static inline int arm_has_neon(void) {
#if defined(__linux__) && defined(ARM_AUXV_HAS_NEON)
#  ifdef HWCAP_ARM_NEON
    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0 ? 1 : 0;
#  else
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
#  endif
#elif defined(__APPLE__)
    int hasneon;
    size_t size = sizeof(hasneon);
    /* hasneon is only read when sysctlbyname() succeeded. */
    return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
      && hasneon == 1;
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
    return 1; /* Always supported */
#  endif
#endif

    /* Reached only when no branch above produced a return statement. */
#if defined(ARM_NOCHECK_NEON)
    return 1;
#else
    return 0;
#endif
}
|
||||
#endif
|
||||
|
||||
/* AArch64 does not have ARMv6 SIMD. */
|
||||
#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
|
||||
/* Runtime probe for the ARMv6 SIMD instructions on 32-bit ARM.
 *
 * On Linux the AT_PLATFORM string is compared against the prefixes "v6l",
 * "v7l" and "v8l". Returns 1 on a match, 0 otherwise; without auxv support
 * the result is 1 only when ARM_NOCHECK_SIMD is defined.
 */
static inline int arm_has_simd(void) {
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
    const char *platform = (const char *)getauxval(AT_PLATFORM);

    /* getauxval() yields 0 when the entry is missing; strncmp() must not
     * be handed a null pointer. */
    if (platform == NULL)
        return 0;

    return strncmp(platform, "v6l", 3) == 0
        || strncmp(platform, "v7l", 3) == 0
        || strncmp(platform, "v8l", 3) == 0;
#elif defined(ARM_NOCHECK_SIMD)
    return 1;
#else
    return 0;
#endif
}
|
||||
#endif
|
||||
|
||||
/* Fill *features with the SIMD, NEON and CRC32 capabilities of this CPU. */
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
    /* CRC32 is probed the same way on 32-bit and 64-bit targets. */
    features->has_crc32 = arm_has_crc32();

#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
    /* 32-bit ARM: both extensions are optional, so ask at runtime. */
    features->has_neon = arm_has_neon();
    features->has_simd = arm_has_simd();
#else
    /* AArch64: NEON is always available, ARMv6 SIMD never is. */
    features->has_neon = 1;
    features->has_simd = 0;
#endif
}
|
16
3rdparty/zlib-ng/arch/arm/arm_features.h
vendored
Normal file
16
3rdparty/zlib-ng/arch/arm/arm_features.h
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
/* arm_features.h -- check for ARM features.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ARM_H_
|
||||
#define ARM_H_
|
||||
|
||||
struct arm_cpu_features {
|
||||
int has_simd;
|
||||
int has_neon;
|
||||
int has_crc32;
|
||||
};
|
||||
|
||||
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
|
||||
|
||||
#endif /* ARM_H_ */
|
99
3rdparty/zlib-ng/arch/arm/chunkset_neon.c
vendored
Normal file
99
3rdparty/zlib-ng/arch/arm/chunkset_neon.c
vendored
Normal file
@ -0,0 +1,99 @@
|
||||
/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
|
||||
typedef uint8x16_t chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 16
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
#define HAVE_CHUNK_MAG
|
||||
|
||||
static const lut_rem_pair perm_idx_lut[13] = {
|
||||
{0, 1}, /* 3 */
|
||||
{0, 0}, /* don't care */
|
||||
{1 * 32, 1}, /* 5 */
|
||||
{2 * 32, 4}, /* 6 */
|
||||
{3 * 32, 2}, /* 7 */
|
||||
{0 * 32, 0}, /* don't care */
|
||||
{4 * 32, 7}, /* 9 */
|
||||
{5 * 32, 6}, /* 10 */
|
||||
{6 * 32, 5}, /* 11 */
|
||||
{7 * 32, 4}, /* 12 */
|
||||
{8 * 32, 3}, /* 13 */
|
||||
{9 * 32, 2}, /* 14 */
|
||||
{10 * 32, 1},/* 15 */
|
||||
};
|
||||
|
||||
/* Fill *chunk by repeating the 2-byte pattern found at `from`. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t pattern;
    uint16x8_t splat;

    /* memcpy rather than a cast, so `from` needs no particular alignment. */
    memcpy(&pattern, from, sizeof(pattern));
    splat = vdupq_n_u16(pattern);
    *chunk = vreinterpretq_u8_u16(splat);
}
|
||||
|
||||
/* Fill *chunk by repeating the 4-byte pattern found at `from`. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t pattern;
    uint32x4_t splat;

    /* memcpy rather than a cast, so `from` needs no particular alignment. */
    memcpy(&pattern, from, sizeof(pattern));
    splat = vdupq_n_u32(pattern);
    *chunk = vreinterpretq_u8_u32(splat);
}
|
||||
|
||||
/* Fill *chunk by repeating the 8-byte pattern found at `from`. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t pattern;
    uint64x2_t splat;

    /* memcpy rather than a cast, so `from` needs no particular alignment. */
    memcpy(&pattern, from, sizeof(pattern));
    splat = vdupq_n_u64(pattern);
    *chunk = vreinterpretq_u8_u64(splat);
}
|
||||
|
||||
#define CHUNKSIZE chunksize_neon
|
||||
#define CHUNKCOPY chunkcopy_neon
|
||||
#define CHUNKUNROLL chunkunroll_neon
|
||||
#define CHUNKMEMSET chunkmemset_neon
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
|
||||
|
||||
/* Read one 16-byte chunk from `s` into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const chunk_t loaded = vld1q_u8(s);
    *chunk = loaded;
}
|
||||
|
||||
/* Write the 16-byte chunk *chunk to `out`. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const chunk_t value = *chunk;
    vst1q_u8(out, value);
}
|
||||
|
||||
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
*chunk_rem = lut_rem.remval;
|
||||
|
||||
/* See note in chunkset_ssse3.c for why this is ok */
|
||||
__msan_unpoison(buf + dist, 16 - dist);
|
||||
|
||||
/* This version of table is only available on aarch64 */
|
||||
#if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
|
||||
uint8x16_t ret_vec = vld1q_u8(buf);
|
||||
|
||||
uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
|
||||
return vqtbl1q_u8(ret_vec, perm_vec);
|
||||
#else
|
||||
uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
|
||||
perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
|
||||
perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
|
||||
a = vld1_u8(buf);
|
||||
b = vld1_u8(buf + 8);
|
||||
ret0 = vtbl1_u8(a, perm_vec0);
|
||||
uint8x8x2_t ab = {{a, b}};
|
||||
ret1 = vtbl2_u8(ab, perm_vec1);
|
||||
return vcombine_u8(ret0, ret1);
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_neon
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
59
3rdparty/zlib-ng/arch/arm/compare256_neon.c
vendored
Normal file
59
3rdparty/zlib-ng/arch/arm/compare256_neon.c
vendored
Normal file
@ -0,0 +1,59 @@
|
||||
/* compare256_neon.c - NEON version of compare256
|
||||
* Copyright (C) 2022 Nathan Moinvaziri
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
|
||||
#include "neon_intrins.h"
|
||||
|
||||
static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
uint8x16_t a, b, cmp;
|
||||
uint64_t lane;
|
||||
|
||||
a = vld1q_u8(src0);
|
||||
b = vld1q_u8(src1);
|
||||
|
||||
cmp = veorq_u8(a, b);
|
||||
|
||||
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
|
||||
if (lane) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
|
||||
return len + match_byte;
|
||||
}
|
||||
len += 8;
|
||||
lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
|
||||
if (lane) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
|
||||
return len + match_byte;
|
||||
}
|
||||
len += 8;
|
||||
|
||||
src0 += 16, src1 += 16;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
/* Exported entry point: forwards to the inlinable compare256_neon_static(). */
Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t match_len = compare256_neon_static(src0, src1);

    return match_len;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_neon
|
||||
#define COMPARE256 compare256_neon_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_neon
|
||||
#define COMPARE256 compare256_neon_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
78
3rdparty/zlib-ng/arch/arm/crc32_acle.c
vendored
Normal file
78
3rdparty/zlib-ng/arch/arm/crc32_acle.c
vendored
Normal file
@ -0,0 +1,78 @@
|
||||
/* crc32_acle.c -- compute the CRC-32 of a data stream
|
||||
* Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
|
||||
* Copyright (C) 2016 Yang Zhang
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
#include "acle_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
|
||||
Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
|
||||
Z_REGISTER uint32_t c;
|
||||
Z_REGISTER const uint16_t *buf2;
|
||||
Z_REGISTER const uint32_t *buf4;
|
||||
Z_REGISTER const uint64_t *buf8;
|
||||
|
||||
c = ~crc;
|
||||
|
||||
if (UNLIKELY(len == 1)) {
|
||||
c = __crc32b(c, *buf);
|
||||
c = ~c;
|
||||
return c;
|
||||
}
|
||||
|
||||
if ((ptrdiff_t)buf & (sizeof(uint64_t) - 1)) {
|
||||
if (len && ((ptrdiff_t)buf & 1)) {
|
||||
c = __crc32b(c, *buf++);
|
||||
len--;
|
||||
}
|
||||
|
||||
if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
|
||||
buf2 = (const uint16_t *) buf;
|
||||
c = __crc32h(c, *buf2++);
|
||||
len -= sizeof(uint16_t);
|
||||
buf4 = (const uint32_t *) buf2;
|
||||
} else {
|
||||
buf4 = (const uint32_t *) buf;
|
||||
}
|
||||
|
||||
if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
|
||||
c = __crc32w(c, *buf4++);
|
||||
len -= sizeof(uint32_t);
|
||||
}
|
||||
|
||||
buf8 = (const uint64_t *) buf4;
|
||||
} else {
|
||||
buf8 = (const uint64_t *) buf;
|
||||
}
|
||||
|
||||
while (len >= sizeof(uint64_t)) {
|
||||
c = __crc32d(c, *buf8++);
|
||||
len -= sizeof(uint64_t);
|
||||
}
|
||||
|
||||
if (len >= sizeof(uint32_t)) {
|
||||
buf4 = (const uint32_t *) buf8;
|
||||
c = __crc32w(c, *buf4++);
|
||||
len -= sizeof(uint32_t);
|
||||
buf2 = (const uint16_t *) buf4;
|
||||
} else {
|
||||
buf2 = (const uint16_t *) buf8;
|
||||
}
|
||||
|
||||
if (len >= sizeof(uint16_t)) {
|
||||
c = __crc32h(c, *buf2++);
|
||||
len -= sizeof(uint16_t);
|
||||
}
|
||||
|
||||
buf = (const unsigned char *) buf2;
|
||||
if (len) {
|
||||
c = __crc32b(c, *buf);
|
||||
}
|
||||
|
||||
c = ~c;
|
||||
return c;
|
||||
}
|
||||
#endif
|
24
3rdparty/zlib-ng/arch/arm/insert_string_acle.c
vendored
Normal file
24
3rdparty/zlib-ng/arch/arm/insert_string_acle.c
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef ARM_ACLE
|
||||
#include "acle_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#define HASH_CALC(s, h, val) \
|
||||
h = __crc32w(0, val)
|
||||
|
||||
#define HASH_CALC_VAR h
|
||||
#define HASH_CALC_VAR_INIT uint32_t h = 0
|
||||
|
||||
#define UPDATE_HASH Z_TARGET_CRC update_hash_acle
|
||||
#define INSERT_STRING Z_TARGET_CRC insert_string_acle
|
||||
#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle
|
||||
|
||||
#include "../../insert_string_tpl.h"
|
||||
#endif
|
58
3rdparty/zlib-ng/arch/arm/neon_intrins.h
vendored
Normal file
58
3rdparty/zlib-ng/arch/arm/neon_intrins.h
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
#ifndef ARM_NEON_INTRINS_H
|
||||
#define ARM_NEON_INTRINS_H
|
||||
|
||||
#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
|
||||
/* arm64_neon.h is MSVC specific */
|
||||
# include <arm64_neon.h>
|
||||
#else
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
|
||||
/* Compatibility shim for the _high family of functions */
|
||||
#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
|
||||
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
|
||||
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
|
||||
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
|
||||
#endif
|
||||
|
||||
#ifdef ARM_NEON
|
||||
|
||||
#define vqsubq_u16_x4_x1(out, a, b) do { \
|
||||
out.val[0] = vqsubq_u16(a.val[0], b); \
|
||||
out.val[1] = vqsubq_u16(a.val[1], b); \
|
||||
out.val[2] = vqsubq_u16(a.val[2], b); \
|
||||
out.val[3] = vqsubq_u16(a.val[3], b); \
|
||||
} while (0)
|
||||
|
||||
|
||||
# ifndef ARM_NEON_HASLD4
|
||||
|
||||
static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
|
||||
uint16x8x4_t ret = (uint16x8x4_t) {{
|
||||
vld1q_u16(a),
|
||||
vld1q_u16(a+8),
|
||||
vld1q_u16(a+16),
|
||||
vld1q_u16(a+24)}};
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
|
||||
uint8x16x4_t ret = (uint8x16x4_t) {{
|
||||
vld1q_u8(a),
|
||||
vld1q_u8(a+16),
|
||||
vld1q_u8(a+32),
|
||||
vld1q_u8(a+48)}};
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Fallback for vst1q_u16_x4: store the four 8-lane vectors of `a` to 32
 * consecutive uint16_t slots starting at `p`. */
static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
    int part;

    for (part = 0; part < 4; ++part)
        vst1q_u16(p + 8 * part, a.val[part]);
}
|
||||
# endif // HASLD4 check
|
||||
#endif
|
||||
|
||||
#endif // include guard ARM_NEON_INTRINS_H
|
47
3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
vendored
Normal file
47
3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
vendored
Normal file
@ -0,0 +1,47 @@
|
||||
/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
|
||||
* Copyright (C) 2023 Cameron Cawley
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#if defined(ARM_SIMD)
|
||||
#include "acle_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
/* SIMD version of hash_chain rebase */
|
||||
/* Subtract wsize, with unsigned saturation, from every entry of a hash table.
 *
 * table:   first entry; must hold `entries` Pos values.
 * entries: number of Pos values; the byte size must be a multiple of
 *          4 * sizeof(uint16x2_t), which the Assert below checks.
 * wsize:   amount to subtract from each entry.
 *
 * Each __uqsub16 handles two 16-bit entries; four are issued per iteration.
 */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    Z_REGISTER uint16x2_t v;
    uint16x2_t p0, p1, p2, p3;
    Z_REGISTER size_t n;

    size_t size = entries*sizeof(table[0]);
    Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");

    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    /* Duplicate wsize into both 16-bit halves. Widen to uint32_t before
     * shifting: in promoted int, wsize << 16 is undefined for wsize >= 0x8000. */
    v = (uint32_t)wsize | ((uint32_t)wsize << 16);

    n = size / (sizeof(uint16x2_t) * 4);
    do {
        p0 = *((const uint16x2_t *)(table));
        p1 = *((const uint16x2_t *)(table+2));
        p2 = *((const uint16x2_t *)(table+4));
        p3 = *((const uint16x2_t *)(table+6));
        p0 = __uqsub16(p0, v);
        p1 = __uqsub16(p1, v);
        p2 = __uqsub16(p2, v);
        p3 = __uqsub16(p3, v);
        *((uint16x2_t *)(table)) = p0;
        *((uint16x2_t *)(table+2)) = p1;
        *((uint16x2_t *)(table+4)) = p2;
        *((uint16x2_t *)(table+6)) = p3;
        table += 8;
    } while (--n);
}
|
||||
|
||||
/* Rebase both hash tables of `s` (head and prev) by the window size. */
Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
    const unsigned int window = s->w_size;

    /* head holds HASH_SIZE entries, prev holds one entry per window byte. */
    slide_hash_chain(s->head, HASH_SIZE, window);
    slide_hash_chain(s->prev, window, window);
}
|
||||
#endif
|
46
3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
vendored
Normal file
46
3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
vendored
Normal file
@ -0,0 +1,46 @@
|
||||
/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
|
||||
* Copyright (C) 2017-2020 Mika T. Lindqvist
|
||||
*
|
||||
* Authors:
|
||||
* Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* Jun He <jun.he@arm.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef ARM_NEON
|
||||
#include "neon_intrins.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
/* SIMD version of hash_chain rebase */
|
||||
/* Subtract wsize, with unsigned saturation, from every entry of a hash table.
 *
 * table:   first entry; must hold `entries` Pos values.
 * entries: number of Pos values; the byte size must be a multiple of
 *          8 * sizeof(uint16x8_t), i.e. the 64 entries handled per iteration.
 * wsize:   amount to subtract from each entry.
 */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    Z_REGISTER uint16x8_t v;
    uint16x8x4_t p0, p1;
    Z_REGISTER size_t n;

    size_t size = entries*sizeof(table[0]);
    /* Parenthesised so the check matches the divisor used for n below;
     * without the parentheses `%` binds before `*` and only 16-byte
     * multiples were verified. */
    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");

    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    v = vdupq_n_u16(wsize);

    n = size / (sizeof(uint16x8_t) * 8);
    do {
        p0 = vld1q_u16_x4(table);
        p1 = vld1q_u16_x4(table+32);
        vqsubq_u16_x4_x1(p0, p0, v);
        vqsubq_u16_x4_x1(p1, p1, v);
        vst1q_u16_x4(table, p0);
        vst1q_u16_x4(table+32, p1);
        table += 64;
    } while (--n);
}
|
||||
|
||||
/* Rebase both hash tables of `s` (head and prev) by the window size. */
Z_INTERNAL void slide_hash_neon(deflate_state *s) {
    const unsigned int window = s->w_size;

    /* head holds HASH_SIZE entries, prev holds one entry per window byte. */
    slide_hash_chain(s->head, HASH_SIZE, window);
    slide_hash_chain(s->prev, window, window);
}
|
||||
#endif
|
24
3rdparty/zlib-ng/arch/generic/Makefile.in
vendored
Normal file
24
3rdparty/zlib-ng/arch/generic/Makefile.in
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all:
|
||||
|
||||
|
||||
mostlyclean: clean
|
||||
# Remove build products. Each rm is a separate recipe line; a trailing
# backslash here would fold the next command into this one as arguments.
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean
|
||||
rm -f Makefile
|
53
3rdparty/zlib-ng/arch/generic/chunk_permute_table.h
vendored
Normal file
53
3rdparty/zlib-ng/arch/generic/chunk_permute_table.h
vendored
Normal file
@ -0,0 +1,53 @@
|
||||
/* chunk_permute_table.h - shared AVX/SSSE3/NEON permutation table for use with chunkmemset family of functions.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef CHUNK_PERMUTE_TABLE_H_
|
||||
#define CHUNK_PERMUTE_TABLE_H_
|
||||
|
||||
#include "zbuild.h"
|
||||
|
||||
/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
|
||||
static const ALIGNED_(32) uint8_t permute_table[26*32] = {
|
||||
0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
|
||||
0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
|
||||
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
|
||||
0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
|
||||
|
||||
/* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
|
||||
* beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
|
||||
* blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
|
||||
* we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
|
||||
* this is what we're dealt.
|
||||
*/
|
||||
|
||||
16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
|
||||
16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
|
||||
16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
|
||||
16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
|
||||
16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
|
||||
16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
|
||||
16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
|
||||
};
|
||||
|
||||
typedef struct lut_rem_pair_s {
|
||||
uint16_t idx;
|
||||
uint16_t remval;
|
||||
} lut_rem_pair;
|
||||
|
||||
#endif
|
93
3rdparty/zlib-ng/arch/power/Makefile.in
vendored
Normal file
93
3rdparty/zlib-ng/arch/power/Makefile.in
vendored
Normal file
@ -0,0 +1,93 @@
|
||||
# Makefile for POWER-specific files
|
||||
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
P8FLAGS=-mcpu=power8
|
||||
P9FLAGS=-mcpu=power9
|
||||
PPCFLAGS=-maltivec
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: power_features.o \
|
||||
power_features.lo \
|
||||
adler32_power8.o \
|
||||
adler32_power8.lo \
|
||||
adler32_vmx.o \
|
||||
adler32_vmx.lo \
|
||||
chunkset_power8.o \
|
||||
chunkset_power8.lo \
|
||||
compare256_power9.o \
|
||||
compare256_power9.lo \
|
||||
crc32_power8.o \
|
||||
crc32_power8.lo \
|
||||
slide_hash_power8.o \
|
||||
slide_hash_power8.lo \
|
||||
slide_hash_vmx.o \
|
||||
slide_hash_vmx.lo
|
||||
|
||||
power_features.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
|
||||
|
||||
power_features.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
|
||||
|
||||
adler32_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
adler32_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
adler32_vmx.o:
|
||||
$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
|
||||
|
||||
adler32_vmx.lo:
|
||||
$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
|
||||
|
||||
chunkset_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
|
||||
|
||||
chunkset_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
|
||||
|
||||
compare256_power9.o:
|
||||
$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
|
||||
|
||||
compare256_power9.lo:
|
||||
$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
|
||||
|
||||
crc32_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
|
||||
|
||||
crc32_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
|
||||
|
||||
slide_hash_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
slide_hash_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
slide_hash_vmx.o:
|
||||
$(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
|
||||
|
||||
slide_hash_vmx.lo:
|
||||
$(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean
|
||||
rm -f Makefile
|
153
3rdparty/zlib-ng/arch/power/adler32_power8.c
vendored
Normal file
153
3rdparty/zlib-ng/arch/power/adler32_power8.c
vendored
Normal file
@ -0,0 +1,153 @@
|
||||
/* Adler32 for POWER8 using VSX instructions.
|
||||
* Copyright (C) 2020 IBM Corporation
|
||||
* Author: Rogerio Alves <rcardoso@linux.ibm.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
* Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
|
||||
* instructions.
|
||||
*
|
||||
* If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means
|
||||
* iteration n) is the initial value of adler - at start _0 is 1 unless
|
||||
* adler initial value is different than 1. So s1_1 = s1_0 + c[0] after
|
||||
* the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on.
|
||||
* Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on
|
||||
* after iteration N.
|
||||
*
|
||||
* Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
|
||||
* N-1*c[1] + ... + c[N]
|
||||
*
|
||||
* In a more general way:
|
||||
*
|
||||
* s1_N = s1_0 + sum(i=1 to N)c[i]
|
||||
* s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
|
||||
*
|
||||
* Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
|
||||
* can process N bytes at a time we can do this at once.
|
||||
*
|
||||
* Since VSX can support 16-byte vector instructions, we can process
|
||||
* 16 bytes at a time; using N = 16 we have:
|
||||
*
|
||||
* s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
|
||||
* s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
|
||||
*
|
||||
* After the first iteration we calculate the adler32 checksum for 16 bytes.
|
||||
*
|
||||
* For more background about adler32 please check the RFC:
|
||||
* https://www.ietf.org/rfc/rfc1950.txt
|
||||
*/
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
|
||||
#include <altivec.h>
|
||||
#include "zbuild.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
/* Vector across sum unsigned int (saturate). */
|
||||
/* Sum the four 32-bit lanes of __a; every lane of the result holds the total.
 * The incoming value of __b is never read.
 * NOTE(review): the sums use vec_add; the "saturate" wording in the original
 * comment could not be confirmed from this file.
 */
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
    (void)__b;

    /* Rotate by 8 bytes and add: each lane now holds the sum of two lanes. */
    const vector unsigned int rotated_half = vec_sld(__a, __a, 8);
    const vector unsigned int pair_sums = vec_add(rotated_half, __a);

    /* Rotate by 4 bytes and add: each lane now holds the sum of all four. */
    const vector unsigned int rotated_word = vec_sld(pair_sums, pair_sums, 4);

    return vec_add(rotated_word, pair_sums);
}
|
||||
|
||||
/* Adler-32 of buf[0..len) continued from the running checksum `adler`,
 * using 16-byte VSX loads for inputs of 64 bytes or more.
 *
 * adler: previous checksum (low 16 bits = s1, high 16 bits = s2).
 * buf:   input bytes; NULL yields the initial checksum value 1.
 * len:   number of bytes in buf.
 *
 * Returns whatever adler32_len_1 / adler32_len_64 / adler32_len_16 return for
 * the respective path; the final combination of s1 and s2 is done by them.
 */
Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t lo = adler & 0xffff;
    uint32_t hi = (adler >> 16) & 0xffff;

    /* Single-byte updates are handled first so that path stays cheap. */
    if (UNLIKELY(len == 1))
        return adler32_len_1(lo, buf, hi);

    /* NULL buffer: hand back the initial Adler-32 value. This check is
     * deliberately placed after the len == 1 fast path. */
    if (UNLIKELY(buf == NULL))
        return 1;

    /* Below 64 bytes the scalar helper is faster than the VSX code. */
    if (len < 64)
        return adler32_len_64(lo, buf, len, hi);

    /* From here on len >= 64: use POWER VSX instructions. */
    const vector unsigned int zero_vec = { 0 };
    /* Weights 16..1 so that vec_msum yields sum(i=1 to 16) buf[i]*(16-i+1). */
    const vector unsigned char weights = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
                                          6, 5, 4, 3, 2, 1};
    const vector unsigned char shift4 = vec_splat_u8(4);
    /* Keeps element 0 and clears elements 1..3. */
    const vector unsigned int lane0_mask = {0xffffffff, 0x0, 0x0, 0x0};
    vector unsigned int acc1 = { 0 };
    vector unsigned int acc2 = { 0 };
    vector unsigned int acc1_hist = { 0 };
    vector unsigned int lane_sum, lane_wsum;
    vector unsigned char bytes;
    int vecs_left;

    acc1[0] = lo;
    acc2[0] = hi;

    /* Consume the input in blocks of NMAX bytes while at least NMAX remain. */
    for (; len >= NMAX; len -= NMAX) {
        for (vecs_left = NMAX / 16; vecs_left != 0; --vecs_left) {
            bytes = vec_xl(0, (unsigned char *) buf);
            lane_sum = vec_sum4s(bytes, zero_vec); /* sum(i=1 to 16) buf[i]. */
            /* sum(i=1 to 16) buf[i]*(16-i+1). */
            lane_wsum = vec_msum(bytes, weights, zero_vec);
            /* Record acc1 as it was BEFORE this vector is added. */
            acc1_hist = vec_add(acc1_hist, acc1);
            /* Accumulate the sums. */
            acc1 = vec_add(lane_sum, acc1);
            acc2 = vec_add(lane_wsum, acc2);

            buf += 16;
        }

        /* Once per NMAX block: fold the lanes and reduce. */
        acc1 = vec_sumsu(acc1, lane_sum);
        acc1_hist = vec_sll(acc1_hist, shift4); /* 16*acc1_hist. */
        acc2 = vec_add(acc1_hist, acc2);
        acc2 = vec_sumsu(acc2, lane_wsum);

        /* acc1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
        acc1[0] = acc1[0] % BASE;
        /* acc2[0] = s2_i + 16*s1_save +
           sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
        acc2[0] = acc2[0] % BASE;

        /* Restart the next block with only element 0 populated. */
        acc1 = vec_and(acc1, lane0_mask);
        acc2 = vec_and(acc2, lane0_mask);
        acc1_hist = zero_vec;
    }

    /* Fewer than NMAX bytes are left, so a single modulo at the end suffices. */
    if (len >= 16) {
        for (; len >= 16; len -= 16) {
            bytes = vec_xl(0, (unsigned char *) buf);

            lane_sum = vec_sum4s(bytes, zero_vec); /* sum(i=1 to 16) buf[i]. */
            /* sum(i=1 to 16) buf[i]*(16-i+1). */
            lane_wsum = vec_msum(bytes, weights, zero_vec);
            /* Record acc1 as it was BEFORE this vector is added. */
            acc1_hist = vec_add(acc1_hist, acc1);
            /* Accumulate the sums. */
            acc1 = vec_add(lane_sum, acc1);
            acc2 = vec_add(lane_wsum, acc2);

            buf += 16;
        }

        /* The remainder is always shorter than NMAX, so fold only once. */
        acc1 = vec_sumsu(acc1, lane_sum);
        acc1_hist = vec_sll(acc1_hist, shift4); /* 16*acc1_hist. */
        acc2 = vec_add(acc1_hist, acc2);
        acc2 = vec_sumsu(acc2, lane_wsum);
    }

    /* Copy the result back to the scalar sums (mod 65521). */
    lo = acc1[0] % BASE;
    hi = acc2[0] % BASE;

    /* Process the tail (len < 16). */
    return adler32_len_16(lo, buf, len, hi);
}
|
||||
|
||||
#endif /* POWER8_VSX */
|
186
3rdparty/zlib-ng/arch/power/adler32_vmx.c
vendored
Normal file
186
3rdparty/zlib-ng/arch/power/adler32_vmx.c
vendored
Normal file
@ -0,0 +1,186 @@
|
||||
/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef PPC_VMX
|
||||
#include <altivec.h>
|
||||
#include "zbuild.h"
|
||||
#include "zendian.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
#define vmx_zero() (vec_splat_u32(0))
|
||||
|
||||
/* Scalar Adler-32 update for bytes handled outside the vector path.
 *
 * pair: pair[0] is the running byte sum, pair[1] the running sum of pair[0];
 *       both are updated in place and are NOT reduced modulo BASE here.
 * buf:  bytes to fold in.
 * len:  number of bytes in buf.
 *
 * The index is a size_t so that it has the same width as len; a narrower
 * index could wrap before reaching len and never end the loop.
 */
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
    size_t i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}
|
||||
|
||||
/* Vector accumulation step of the VMX Adler-32.
 *
 * s:   running sums; 16 bytes are loaded from s with vec_ld, and on return
 *      s[0] and s[1] are overwritten with the updated (unreduced) sums.
 * buf: input bytes, read with vec_ld in 16-byte units.
 * len: number of 16-byte vectors to consume (NOT a byte count): the main loop
 *      runs len / 4 times over 64 bytes, the remainder loop len & 3 times
 *      over 16 bytes.
 */
static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
    /* Multiplier taps for the four 16-byte quarters of a 64-byte block. */
    const vector unsigned char taps_64 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
    const vector unsigned char taps_48 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
    const vector unsigned char taps_32 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
    const vector unsigned char taps_16 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    /* One load of the state followed by a permute and a shift was measured by
     * the original author to be just barely faster than two indexed insertions
     * into zeroed vectors. The permute keeps bytes 0-3 and fills the rest with
     * copies of byte 8. */
    const vector unsigned char keep_lane0 = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    /* Every byte is 8 << 2 = 32; used as the shift amount for vec_sro/vec_slo. */
    const vector unsigned char lane_shift = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
    vector unsigned int a_sum, b_sum;
    vector unsigned int state = vec_ld(0, s);

    a_sum = vec_perm(state, state, keep_lane0);
#if BYTE_ORDER == LITTLE_ENDIAN
    b_sum = vec_sro(state, lane_shift);
#else
    b_sum = vec_slo(state, lane_shift);
#endif

    vector unsigned int zeros = vmx_zero();
    vector unsigned int carry = zeros;
    vector unsigned int carry_alt = zeros;
    vector unsigned int a_before = a_sum;
    vector unsigned int a_alt_before = zeros;

    vector unsigned int b_sum_1 = zeros;
    vector unsigned int b_sum_2 = zeros;
    vector unsigned int b_sum_3 = zeros;

    /* A second running byte sum, kept separately so the two chains of
     * vec_sum4s calls do not depend on each other. */
    vector unsigned int a_alt = zeros;

    int blocks64 = len / 4;
    int spare_vecs = len & 3;

    while (blocks64-- > 0) {
        vector unsigned char q0 = vec_ld(0, buf);
        vector unsigned char q1 = vec_ld(16, buf);
        vector unsigned char q2 = vec_ld(32, buf);
        vector unsigned char q3 = vec_ld(48, buf);

        /* Core operation; everything after it is the same step unrolled
         * across the remaining quarters. */
        a_sum = vec_sum4s(q0, a_sum);
        carry = vec_add(carry, a_before);
        carry_alt = vec_add(carry_alt, a_alt_before);
        b_sum = vec_msum(taps_64, q0, b_sum);

        /* Dependent sums interleaved with independent ones. */
        a_alt = vec_sum4s(q1, a_alt);
        b_sum_1 = vec_msum(taps_48, q1, b_sum_1);
        a_sum = vec_sum4s(q2, a_sum);
        b_sum_2 = vec_msum(taps_32, q2, b_sum_2);
        b_sum_3 = vec_msum(taps_16, q3, b_sum_3);
        a_alt = vec_sum4s(q3, a_alt);

        a_before = a_sum;
        a_alt_before = a_alt;
        buf += 64;
    }

    a_sum = vec_add(a_sum, a_alt);
    /* The byte sums seen before each 64-byte block count 64 times: << 6. */
    carry = vec_sl(vec_add(carry, carry_alt), vec_splat_u32(6));

    if (spare_vecs) {
        /* In the 16-byte remainder loop each earlier byte sum counts 16
         * times instead: << 4. */
        a_before = vec_add(a_alt_before, a_before);
        a_before = vec_sl(a_before, vec_splat_u32(4));
        for (; spare_vecs != 0; --spare_vecs) {
            vector unsigned char q0 = vec_ld(0, buf);
            a_sum = vec_sum4s(q0, a_sum);
            carry = vec_add(carry, a_before);
            b_sum = vec_msum(taps_16, q0, b_sum);
            a_before = vec_sl(a_sum, vec_splat_u32(4));
            buf += 16;
        }
    }

    /* Merge the independent second sums. */
    b_sum = vec_add(b_sum, b_sum_1);
    b_sum_3 = vec_add(b_sum_2, b_sum_3);
    b_sum = vec_add(b_sum, b_sum_3);

    b_sum = vec_add(b_sum, carry);

    /* Horizontal reduction: rotate by 8 then by 4 bytes, adding each time. */
    a_sum = vec_add(a_sum, vec_sld(a_sum, a_sum, 8));
    b_sum = vec_add(b_sum, vec_sld(b_sum, b_sum, 8));
    a_sum = vec_add(a_sum, vec_sld(a_sum, a_sum, 4));
    b_sum = vec_add(b_sum, vec_sld(b_sum, b_sum, 4));

    vec_ste(a_sum, 0, s);
    vec_ste(b_sum, 0, s+1);
}
|
||||
|
||||
/* Adler-32 of buf[0..len) continued from the running checksum `adler`,
 * using VMX (AltiVec) loads for the 16-byte aligned middle of the buffer.
 *
 * adler: previous checksum (low 16 bits = first sum, high 16 bits = second).
 * buf:   input bytes; NULL yields the initial checksum value 1.
 * len:   number of bytes in buf.
 *
 * Returns (second_sum << 16) | first_sum, or the result of the scalar
 * adler32_len_1 / adler32_len_16 helpers for inputs shorter than 16 bytes.
 *
 * All offsets are size_t and the loop advances by the number of bytes that
 * were actually handed to vmx_accum32, so the pointer passed to it is always
 * 16-byte aligned and inputs of 2 GiB and more are processed completely by
 * the vector path instead of falling through to the unreduced scalar tail.
 */
Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t sum2;
    uint32_t pair[16] ALIGNED_(16);
    /* vmx_accum32 loads 16 bytes starting at pair[0]; everything after the
     * two live sums must read as zero. */
    memset(&pair[2], 0, sizeof(pair) - 2 * sizeof(pair[0]));
    /* Byte budget for the current round before a modulo is required. */
    size_t budget = NMAX;
    /* Number of input bytes consumed so far. */
    size_t done = 0;

    /* Split Adler-32 into component sums, it can be supplied by
     * the caller sites (e.g. in a PNG file).
     */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;
    pair[0] = adler;
    pair[1] = sum2;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* Scalar head: consume bytes until buf + done is 16-byte aligned. */
    if ((uintptr_t)buf & 0xf) {
        size_t al = 16 - ((uintptr_t)buf & 0xf);
        if (al > len) {
            al = len;
        }
        vmx_handle_head_or_tail(pair, buf, al);

        done += al;
        /* Rather than rebasing, we can reduce the max sums for the
         * first round only */
        budget -= al;
    }

    /* Vector rounds: at most `budget` bytes each, whole 16-byte vectors only. */
    while (len - done >= 16) {
        size_t remaining = len - done;
        size_t vecs = (remaining < budget ? remaining : budget) / 16;

        vmx_accum32(pair, buf + done, vecs);
        pair[0] %= BASE;
        pair[1] %= BASE;

        done += vecs * 16;
        /* Only the first round is shortened by the scalar head. */
        budget = NMAX;
    }

    /* Handle the tail elements (fewer than 16 bytes at this point). */
    if (done < len) {
        vmx_handle_head_or_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}
|
||||
#endif
|
55
3rdparty/zlib-ng/arch/power/chunkset_power8.c
vendored
Normal file
55
3rdparty/zlib-ng/arch/power/chunkset_power8.c
vendored
Normal file
@ -0,0 +1,55 @@
|
||||
/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
#include <altivec.h>
|
||||
#include "../../zbuild.h"
|
||||
|
||||
typedef vector unsigned char chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 16
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
/* Fill *chunk with repetitions of the 2-byte pattern found at `from`.
 * The pattern is fetched with memcpy, so `from` needs no particular alignment. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    uint16_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = (vector unsigned char)vec_splats(pattern);
}
|
||||
|
||||
/* Fill *chunk with repetitions of the 4-byte pattern found at `from`.
 * The pattern is fetched with memcpy, so `from` needs no particular alignment. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint32_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = (vector unsigned char)vec_splats(pattern);
}
|
||||
|
||||
/* Fill *chunk with repetitions of the 8-byte pattern found at `from`.
 * The pattern is fetched with memcpy, so `from` needs no particular alignment.
 * The value is passed to vec_splats as unsigned long long. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    uint64_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = (vector unsigned char)vec_splats((unsigned long long)pattern);
}
|
||||
|
||||
/* Load one chunk from `s` into *chunk using vec_xl. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const chunk_t loaded = vec_xl(0, s);

    *chunk = loaded;
}
|
||||
|
||||
/* Store *chunk to `out` using vec_xst. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const chunk_t value = *chunk;

    vec_xst(value, 0, out);
}
|
||||
|
||||
#define CHUNKSIZE chunksize_power8
|
||||
#define CHUNKCOPY chunkcopy_power8
|
||||
#define CHUNKUNROLL chunkunroll_power8
|
||||
#define CHUNKMEMSET chunkmemset_power8
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_power8
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
64
3rdparty/zlib-ng/arch/power/compare256_power9.c
vendored
Normal file
64
3rdparty/zlib-ng/arch/power/compare256_power9.c
vendored
Normal file
@ -0,0 +1,64 @@
|
||||
/* compare256_power9.c - Power9 version of compare256
|
||||
* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef POWER9
|
||||
#include <altivec.h>
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zendian.h"
|
||||
|
||||
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
|
||||
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
|
||||
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12)
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
|
||||
#else
|
||||
# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
|
||||
#endif
|
||||
#else
|
||||
# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc)
|
||||
#endif
|
||||
|
||||
/* Length of the common prefix of src0 and src1, examined 16 bytes at a time
 * and capped at 256.
 *
 * Returns the number of leading bytes that are equal in both buffers, or 256
 * when no difference is found within the first 256 bytes.
 */
static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t matched;

    for (matched = 0; matched < 256; matched += 16) {
        uint32_t run;
        vector unsigned char lhs = *((vector unsigned char *)(src0 + matched));
        vector unsigned char rhs = *((vector unsigned char *)(src1 + matched));

        /* vec_cmpne makes each byte all ones where the inputs differ and
         * all zeroes where they match. */
        vector unsigned char diff = (vector unsigned char)vec_cmpne(lhs, rhs);

        /* Matching bytes are zero in diff, so the count of consecutive bytes
         * whose LSB is 0 equals the length of the match in this vector. */
        zng_vec_vctzlsbb(diff, run);
        if (run != 16)
            return matched + run;
    }

    return 256;
}
|
||||
|
||||
/* Externally visible entry point; forwards to compare256_power9_static. */
Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t match_len = compare256_power9_static(src0, src1);

    return match_len;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_power9
|
||||
#define COMPARE256 compare256_power9_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_power9
|
||||
#define COMPARE256 compare256_power9_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
1123
3rdparty/zlib-ng/arch/power/crc32_constants.h
vendored
Normal file
1123
3rdparty/zlib-ng/arch/power/crc32_constants.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
589
3rdparty/zlib-ng/arch/power/crc32_power8.c
vendored
Normal file
589
3rdparty/zlib-ng/arch/power/crc32_power8.c
vendored
Normal file
@ -0,0 +1,589 @@
|
||||
/* crc32 for POWER8 using VSX instructions
|
||||
* Copyright (C) 2021 IBM Corporation
|
||||
*
|
||||
* Author: Rogerio Alves <rogealve@br.ibm.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
* Calculate the checksum of data that is 16 byte aligned and a multiple of
|
||||
* 16 bytes.
|
||||
*
|
||||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
|
||||
* chunks in order to mask the latency of the vpmsum instructions. If we
|
||||
* have more than 32 kB of data to checksum we repeat this step multiple
|
||||
* times, passing in the previous 1024 bits.
|
||||
*
|
||||
* The next step is to reduce the 1024 bits to 64 bits. This step adds
|
||||
* 32 bits of 0s to the end - this matches what a CRC does. We just
|
||||
* calculate constants that land the data in this 32 bits.
|
||||
*
|
||||
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
|
||||
* for n = CRC using POWER8 instructions. We use x = 32.
|
||||
*
|
||||
* http://en.wikipedia.org/wiki/Barrett_reduction
|
||||
*
|
||||
* This code uses gcc vector builtins instead of using assembly directly.
|
||||
*/
|
||||
|
||||
#include <altivec.h>
|
||||
#include "zendian.h"
|
||||
#include "zbuild.h"
|
||||
|
||||
#include "crc32_constants.h"
|
||||
#include "crc32_braid_tbl.h"
|
||||
|
||||
#if defined (__clang__)
|
||||
#include "fallback_builtins.h"
|
||||
#endif
|
||||
|
||||
#define MAX_SIZE 32768
|
||||
#define VMX_ALIGN 16
|
||||
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
|
||||
|
||||
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
|
||||
while (len--)
|
||||
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
|
||||
return crc;
|
||||
}
|
||||
|
||||
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
|
||||
|
||||
/* CRC-32 of p[0.._len) continued from `crc`.
 *
 * crc:  previous CRC value; it is inverted on entry and again on exit.
 * p:    input bytes; a NULL pointer returns 0.
 * _len: number of bytes at p.
 *
 * Inputs shorter than VMX_ALIGN + VMX_ALIGN_MASK bytes go entirely through
 * crc32_align. Otherwise the bytes before the first VMX_ALIGN boundary and the
 * bytes after the last whole VMX_ALIGN unit go through crc32_align, and the
 * aligned middle goes through __crc32_vpmsum.
 */
Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
    unsigned long len = (unsigned long) _len;

    if (p == (const unsigned char *) 0x0)
        return 0;

    crc ^= 0xffffffff;

    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
        /* Too short to be worth aligning. */
        crc = crc32_align(crc, p, len);
    } else {
        unsigned int prealign;
        unsigned int tail;

        /* Head: bytes up to the next VMX_ALIGN boundary. */
        if ((unsigned long)p & VMX_ALIGN_MASK) {
            prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
            crc = crc32_align(crc, p, prealign);
            len -= prealign;
            p += prealign;
        }

        /* Middle: the largest multiple of VMX_ALIGN bytes. */
        crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);

        /* Tail: whatever is left after the last whole VMX_ALIGN unit. */
        tail = len & VMX_ALIGN_MASK;
        if (tail) {
            p += len & ~VMX_ALIGN_MASK;
            crc = crc32_align(crc, p, tail);
        }
    }

    crc ^= 0xffffffff;

    return crc;
}
|
||||
|
||||
/* When we have a load-store in a single-dispatch group and address overlap
|
||||
* such that forward is not allowed (load-hit-store) the group must be flushed.
|
||||
* A group ending NOP prevents the flush.
|
||||
*/
|
||||
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
|
||||
|
||||
#if BYTE_ORDER == BIG_ENDIAN
|
||||
#define BYTESWAP_DATA
|
||||
#endif
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
/* Byte reverse permute constant LE. */
|
||||
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
|
||||
#else
|
||||
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
|
||||
#endif
|
||||
#else
|
||||
#define VEC_PERM(vr, va, vb, vc)
|
||||
#endif
|
||||
|
||||
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
|
||||
|
||||
const __vector unsigned long long vzero = {0,0};
|
||||
const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
|
||||
|
||||
const __vector unsigned long long vmask_32bit =
|
||||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
|
||||
|
||||
const __vector unsigned long long vmask_64bit =
|
||||
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
|
||||
|
||||
__vector unsigned long long vcrc;
|
||||
|
||||
__vector unsigned long long vconst1, vconst2;
|
||||
|
||||
/* vdata0-vdata7 will contain our data (p). */
|
||||
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
|
||||
|
||||
/* v0-v7 will contain our checksums */
|
||||
__vector unsigned long long v0 = {0,0};
|
||||
__vector unsigned long long v1 = {0,0};
|
||||
__vector unsigned long long v2 = {0,0};
|
||||
__vector unsigned long long v3 = {0,0};
|
||||
__vector unsigned long long v4 = {0,0};
|
||||
__vector unsigned long long v5 = {0,0};
|
||||
__vector unsigned long long v6 = {0,0};
|
||||
__vector unsigned long long v7 = {0,0};
|
||||
|
||||
|
||||
/* Vector auxiliary variables. */
|
||||
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
|
||||
|
||||
unsigned int offset; /* Constant table offset. */
|
||||
|
||||
unsigned long i; /* Counter. */
|
||||
unsigned long chunks;
|
||||
|
||||
unsigned long block_size;
|
||||
int next_block = 0;
|
||||
|
||||
/* Align by 128 bits. The last 128 bit block will be processed at end. */
|
||||
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
|
||||
|
||||
vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
|
||||
|
||||
/* Short version. */
|
||||
if (len < 256) {
|
||||
/* Calculate where in the constant table we need to start. */
|
||||
offset = 256 - len;
|
||||
|
||||
vconst1 = vec_ld(offset, vcrc_short_const);
|
||||
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
|
||||
|
||||
/* xor initial value */
|
||||
vdata0 = vec_xor(vdata0, vcrc);
|
||||
|
||||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
|
||||
v0 = vec_xor(v0, vdata0);
|
||||
|
||||
for (i = 16; i < len; i += 16) {
|
||||
vconst1 = vec_ld(offset + i, vcrc_short_const);
|
||||
vdata0 = vec_ld(i, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
|
||||
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
|
||||
v0 = vec_xor(v0, vdata0);
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Load initial values. */
|
||||
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
|
||||
|
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
|
||||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
|
||||
|
||||
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
|
||||
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
|
||||
|
||||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
|
||||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
|
||||
|
||||
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
|
||||
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
|
||||
|
||||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
|
||||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
|
||||
|
||||
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
|
||||
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
|
||||
|
||||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
|
||||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
|
||||
|
||||
/* xor in initial value */
|
||||
vdata0 = vec_xor(vdata0, vcrc);
|
||||
|
||||
p = (char *)p + 128;
|
||||
|
||||
do {
|
||||
/* Checksum in blocks of MAX_SIZE. */
|
||||
block_size = length;
|
||||
if (block_size > MAX_SIZE) {
|
||||
block_size = MAX_SIZE;
|
||||
}
|
||||
|
||||
length = length - block_size;
|
||||
|
||||
/*
|
||||
* Work out the offset into the constants table to start at. Each
|
||||
* constant is 16 bytes, and it is used against 128 bytes of input
|
||||
* data - 128 / 16 = 8
|
||||
*/
|
||||
offset = (MAX_SIZE/8) - (block_size/8);
|
||||
/* We reduce our final 128 bytes in a separate step */
|
||||
chunks = (block_size/128)-1;
|
||||
|
||||
vconst1 = vec_ld(offset, vcrc_const);
|
||||
|
||||
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
|
||||
(__vector unsigned long long)vconst1);
|
||||
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
|
||||
(__vector unsigned long long)vconst1);
|
||||
|
||||
if (chunks > 1) {
|
||||
offset += 16;
|
||||
vconst2 = vec_ld(offset, vcrc_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
|
||||
|
||||
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
|
||||
|
||||
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
|
||||
|
||||
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
|
||||
|
||||
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
|
||||
|
||||
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
|
||||
|
||||
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
|
||||
|
||||
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
|
||||
|
||||
p = (char *)p + 128;
|
||||
|
||||
/*
|
||||
* main loop. Each iteration calculates the CRC for a 128-byte
|
||||
* block.
|
||||
*/
|
||||
for (i = 0; i < chunks-2; i++) {
|
||||
vconst1 = vec_ld(offset, vcrc_const);
|
||||
offset += 16;
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v0 = vec_xor(v0, va0);
|
||||
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
|
||||
(__vector unsigned long long)vconst2);
|
||||
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v1 = vec_xor(v1, va1);
|
||||
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
|
||||
(__vector unsigned long long)vconst2);
|
||||
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v2 = vec_xor(v2, va2);
|
||||
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
|
||||
vdata2, (__vector unsigned long long)vconst2);
|
||||
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v3 = vec_xor(v3, va3);
|
||||
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
|
||||
(__vector unsigned long long)vconst2);
|
||||
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
|
||||
|
||||
vconst2 = vec_ld(offset, vcrc_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v4 = vec_xor(v4, va4);
|
||||
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
|
||||
(__vector unsigned long long)vconst1);
|
||||
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v5 = vec_xor(v5, va5);
|
||||
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
|
||||
(__vector unsigned long long)vconst1);
|
||||
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v6 = vec_xor(v6, va6);
|
||||
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
|
||||
(__vector unsigned long long)vconst1);
|
||||
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v7 = vec_xor(v7, va7);
|
||||
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
|
||||
(__vector unsigned long long)vconst1);
|
||||
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
|
||||
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
|
||||
|
||||
p = (char *)p + 128;
|
||||
}
|
||||
|
||||
/* First cool down */
|
||||
vconst1 = vec_ld(offset, vcrc_const);
|
||||
offset += 16;
|
||||
|
||||
v0 = vec_xor(v0, va0);
|
||||
va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v1 = vec_xor(v1, va1);
|
||||
va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v2 = vec_xor(v2, va2);
|
||||
va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v3 = vec_xor(v3, va3);
|
||||
va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v4 = vec_xor(v4, va4);
|
||||
va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v5 = vec_xor(v5, va5);
|
||||
va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v6 = vec_xor(v6, va6);
|
||||
va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
|
||||
(__vector unsigned long long)vconst1);
|
||||
GROUP_ENDING_NOP;
|
||||
|
||||
v7 = vec_xor(v7, va7);
|
||||
va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
|
||||
(__vector unsigned long long)vconst1);
|
||||
}/* else */
|
||||
|
||||
/* Second cool down. */
|
||||
v0 = vec_xor(v0, va0);
|
||||
v1 = vec_xor(v1, va1);
|
||||
v2 = vec_xor(v2, va2);
|
||||
v3 = vec_xor(v3, va3);
|
||||
v4 = vec_xor(v4, va4);
|
||||
v5 = vec_xor(v5, va5);
|
||||
v6 = vec_xor(v6, va6);
|
||||
v7 = vec_xor(v7, va7);
|
||||
|
||||
/*
|
||||
* vpmsumd produces a 96 bit result in the least significant bits
|
||||
* of the register. Since we are bit reflected we have to shift it
|
||||
* left 32 bits so it occupies the least significant bits in the
|
||||
* bit reflected domain.
|
||||
*/
|
||||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
|
||||
/* xor with the last 1024 bits. */
|
||||
va0 = vec_ld(0, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va0, va0, va0, vperm_const);
|
||||
|
||||
va1 = vec_ld(16, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va1, va1, va1, vperm_const);
|
||||
|
||||
va2 = vec_ld(32, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va2, va2, va2, vperm_const);
|
||||
|
||||
va3 = vec_ld(48, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va3, va3, va3, vperm_const);
|
||||
|
||||
va4 = vec_ld(64, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va4, va4, va4, vperm_const);
|
||||
|
||||
va5 = vec_ld(80, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va5, va5, va5, vperm_const);
|
||||
|
||||
va6 = vec_ld(96, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va6, va6, va6, vperm_const);
|
||||
|
||||
va7 = vec_ld(112, (__vector unsigned long long*) p);
|
||||
VEC_PERM(va7, va7, va7, vperm_const);
|
||||
|
||||
p = (char *)p + 128;
|
||||
|
||||
vdata0 = vec_xor(v0, va0);
|
||||
vdata1 = vec_xor(v1, va1);
|
||||
vdata2 = vec_xor(v2, va2);
|
||||
vdata3 = vec_xor(v3, va3);
|
||||
vdata4 = vec_xor(v4, va4);
|
||||
vdata5 = vec_xor(v5, va5);
|
||||
vdata6 = vec_xor(v6, va6);
|
||||
vdata7 = vec_xor(v7, va7);
|
||||
|
||||
/* Check if we have more blocks to process */
|
||||
next_block = 0;
|
||||
if (length != 0) {
|
||||
next_block = 1;
|
||||
|
||||
/* zero v0-v7 */
|
||||
v0 = vec_xor(v0, v0);
|
||||
v1 = vec_xor(v1, v1);
|
||||
v2 = vec_xor(v2, v2);
|
||||
v3 = vec_xor(v3, v3);
|
||||
v4 = vec_xor(v4, v4);
|
||||
v5 = vec_xor(v5, v5);
|
||||
v6 = vec_xor(v6, v6);
|
||||
v7 = vec_xor(v7, v7);
|
||||
}
|
||||
length = length + 128;
|
||||
|
||||
} while (next_block);
|
||||
|
||||
/* Calculate how many bytes we have left. */
|
||||
length = (len & 127);
|
||||
|
||||
/* Calculate where in (short) constant table we need to start. */
|
||||
offset = 128 - length;
|
||||
|
||||
v0 = vec_ld(offset, vcrc_short_const);
|
||||
v1 = vec_ld(offset + 16, vcrc_short_const);
|
||||
v2 = vec_ld(offset + 32, vcrc_short_const);
|
||||
v3 = vec_ld(offset + 48, vcrc_short_const);
|
||||
v4 = vec_ld(offset + 64, vcrc_short_const);
|
||||
v5 = vec_ld(offset + 80, vcrc_short_const);
|
||||
v6 = vec_ld(offset + 96, vcrc_short_const);
|
||||
v7 = vec_ld(offset + 112, vcrc_short_const);
|
||||
|
||||
offset += 128;
|
||||
|
||||
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata0, (__vector unsigned int)v0);
|
||||
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata1, (__vector unsigned int)v1);
|
||||
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata2, (__vector unsigned int)v2);
|
||||
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata3, (__vector unsigned int)v3);
|
||||
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata4, (__vector unsigned int)v4);
|
||||
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata5, (__vector unsigned int)v5);
|
||||
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata6, (__vector unsigned int)v6);
|
||||
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata7, (__vector unsigned int)v7);
|
||||
|
||||
/* Now reduce the tail (0-112 bytes). */
|
||||
for (i = 0; i < length; i+=16) {
|
||||
vdata0 = vec_ld(i,(__vector unsigned long long*)p);
|
||||
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
|
||||
va0 = vec_ld(offset + i,vcrc_short_const);
|
||||
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
|
||||
(__vector unsigned int)vdata0, (__vector unsigned int)va0);
|
||||
v0 = vec_xor(v0, va0);
|
||||
}
|
||||
|
||||
/* xor all parallel chunks together. */
|
||||
v0 = vec_xor(v0, v1);
|
||||
v2 = vec_xor(v2, v3);
|
||||
v4 = vec_xor(v4, v5);
|
||||
v6 = vec_xor(v6, v7);
|
||||
|
||||
v0 = vec_xor(v0, v2);
|
||||
v4 = vec_xor(v4, v6);
|
||||
|
||||
v0 = vec_xor(v0, v4);
|
||||
}
|
||||
|
||||
/* Barrett Reduction */
|
||||
vconst1 = vec_ld(0, v_Barrett_const);
|
||||
vconst2 = vec_ld(16, v_Barrett_const);
|
||||
|
||||
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
|
||||
(__vector unsigned char)v0, 8);
|
||||
v0 = vec_xor(v1,v0);
|
||||
|
||||
/* shift left one bit */
|
||||
__vector unsigned char vsht_splat = vec_splat_u8 (1);
|
||||
v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
|
||||
|
||||
v0 = vec_and(v0, vmask_64bit);
|
||||
|
||||
/*
|
||||
* The reflected version of Barrett reduction. Instead of bit
|
||||
* reflecting our data (which is expensive to do), we bit reflect our
|
||||
* constants and our algorithm, which means the intermediate data in
|
||||
* our vector registers goes from 0-63 instead of 63-0. We can reflect
|
||||
* the algorithm because we don't carry in mod 2 arithmetic.
|
||||
*/
|
||||
|
||||
/* bottom 32 bits of a */
|
||||
v1 = vec_and(v0, vmask_32bit);
|
||||
|
||||
/* ma */
|
||||
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
|
||||
(__vector unsigned long long)vconst1);
|
||||
|
||||
/* bottom 32bits of ma */
|
||||
v1 = vec_and(v1, vmask_32bit);
|
||||
/* qn */
|
||||
v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
|
||||
(__vector unsigned long long)vconst2);
|
||||
/* a - qn, subtraction is xor in GF(2) */
|
||||
v0 = vec_xor (v0, v1);
|
||||
|
||||
/*
|
||||
* Since we are bit reflected, the result (ie the low 32 bits) is in
|
||||
* the high 32 bits. We just need to shift it left 4 bytes
|
||||
* V0 [ 0 1 X 3 ]
|
||||
* V0 [ 0 X 2 3 ]
|
||||
*/
|
||||
|
||||
/* shift result into top 64 bits of */
|
||||
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
|
||||
(__vector unsigned char)vzero, 4);
|
||||
|
||||
#if BYTE_ORDER == BIG_ENDIAN
|
||||
return v0[0];
|
||||
#else
|
||||
return v0[1];
|
||||
#endif
|
||||
}
|
31
3rdparty/zlib-ng/arch/power/fallback_builtins.h
vendored
Normal file
31
3rdparty/zlib-ng/arch/power/fallback_builtins.h
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
/* Helper functions to work around issues with clang builtins
|
||||
* Copyright (C) 2021 IBM Corporation
|
||||
*
|
||||
* Authors:
|
||||
* Daniel Black <daniel@linux.vnet.ibm.com>
|
||||
* Rogerio Alves <rogealve@br.ibm.com>
|
||||
* Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef POWER_BUILTINS_H
|
||||
#define POWER_BUILTINS_H
|
||||
|
||||
/*
|
||||
* These stubs fix clang incompatibilities with GCC builtins.
|
||||
*/
|
||||
|
||||
#ifndef __builtin_crypto_vpmsumw
|
||||
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
|
||||
#endif
|
||||
#ifndef __builtin_crypto_vpmsumd
|
||||
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
|
||||
#endif
|
||||
|
||||
static inline __vector unsigned long long __attribute__((overloadable))
|
||||
vec_ld(int __a, const __vector unsigned long long* __b) {
|
||||
return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
|
||||
}
|
||||
|
||||
#endif
|
46
3rdparty/zlib-ng/arch/power/power_features.c
vendored
Normal file
46
3rdparty/zlib-ng/arch/power/power_features.c
vendored
Normal file
@ -0,0 +1,46 @@
|
||||
/* power_features.c - POWER feature check
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef HAVE_SYS_AUXV_H
|
||||
# include <sys/auxv.h>
|
||||
#endif
|
||||
#ifdef __FreeBSD__
|
||||
# include <machine/cpu.h>
|
||||
#endif
|
||||
#include "../../zbuild.h"
|
||||
#include "power_features.h"
|
||||
|
||||
/* Detect PowerPC CPU capabilities from the ELF auxiliary vector.
 *
 * features - output structure; a flag is set to 1 when the matching hardware
 *            capability bit is reported. Flags are never cleared here, so the
 *            caller is expected to pass in a zero-initialised structure.
 *
 * Which probes are compiled in depends on PPC_FEATURES, POWER_FEATURES,
 * POWER8_VSX and POWER9.
 */
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
#ifdef PPC_FEATURES
    /* Start from 0: if the FreeBSD lookup fails it is not guaranteed to have
     * written the output buffer, and testing indeterminate bits could enable
     * code paths the CPU cannot execute. */
    unsigned long hwcap = 0;
#ifdef __FreeBSD__
    if (elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) != 0)
        hwcap = 0;
#else
    hwcap = getauxval(AT_HWCAP);
#endif

    if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
        features->has_altivec = 1;
#endif

#ifdef POWER_FEATURES
    /* Same reasoning as for hwcap above. */
    unsigned long hwcap2 = 0;
#ifdef __FreeBSD__
    if (elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)) != 0)
        hwcap2 = 0;
#else
    hwcap2 = getauxval(AT_HWCAP2);
#endif

#ifdef POWER8_VSX
    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
        features->has_arch_2_07 = 1;
#endif
#ifdef POWER9
    if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
        features->has_arch_3_00 = 1;
#endif
#endif
}
|
18
3rdparty/zlib-ng/arch/power/power_features.h
vendored
Normal file
18
3rdparty/zlib-ng/arch/power/power_features.h
vendored
Normal file
@ -0,0 +1,18 @@
|
||||
/* power_features.h -- check for POWER CPU features
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef POWER_H_
|
||||
#define POWER_H_
|
||||
|
||||
/* Capability flags reported by power_check_features(). */
struct power_cpu_features {
    int has_altivec;    /* non-zero when AltiVec (VMX) support was detected */
    int has_arch_2_07;  /* non-zero when the ISA 2.07 capability bit was detected */
    int has_arch_3_00;  /* non-zero when the ISA 3.00 capability bit was detected */
};
|
||||
|
||||
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
|
||||
|
||||
#endif /* POWER_H_ */
|
12
3rdparty/zlib-ng/arch/power/slide_hash_power8.c
vendored
Normal file
12
3rdparty/zlib-ng/arch/power/slide_hash_power8.c
vendored
Normal file
@ -0,0 +1,12 @@
|
||||
/* Optimized slide_hash for POWER processors
|
||||
* Copyright (C) 2019-2020 IBM Corporation
|
||||
* Author: Matheus Castanho <msc@linux.ibm.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef POWER8_VSX
|
||||
|
||||
#define SLIDE_PPC slide_hash_power8
|
||||
#include "slide_ppc_tpl.h"
|
||||
|
||||
#endif /* POWER8_VSX */
|
10
3rdparty/zlib-ng/arch/power/slide_hash_vmx.c
vendored
Normal file
10
3rdparty/zlib-ng/arch/power/slide_hash_vmx.c
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
/* Optimized slide_hash for PowerPC processors with VMX instructions
|
||||
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#ifdef PPC_VMX
|
||||
|
||||
#define SLIDE_PPC slide_hash_vmx
|
||||
#include "slide_ppc_tpl.h"
|
||||
|
||||
#endif /* PPC_VMX */
|
31
3rdparty/zlib-ng/arch/power/slide_ppc_tpl.h
vendored
Normal file
31
3rdparty/zlib-ng/arch/power/slide_ppc_tpl.h
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
/* Optimized slide_hash for PowerPC processors
|
||||
* Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include <altivec.h>
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/* Subtract wsize from every entry of a hash table using saturating vector
 * arithmetic (vec_subs), so entries smaller than wsize become 0.
 *
 * table   - first entry; read and written with vec_ld/vec_st
 * entries - number of entries; processed 8 per iteration, and at least one
 *           iteration always runs (NOTE(review): presumably always a non-zero
 *           multiple of 8 - confirm against callers)
 * wsize   - amount to subtract from each entry
 */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    const vector unsigned short window = vec_splats(wsize);
    Pos *cursor = table;
    uint32_t remaining = entries;

    do {
        vector unsigned short slid = vec_subs(vec_ld(0, cursor), window);
        vec_st(slid, 0, cursor);

        cursor += 8;
        remaining -= 8;
    } while (remaining != 0);
}
|
||||
|
||||
/* Slide both deflate hash tables (s->head and s->prev) down by the window
 * size. The function name is supplied by the including file via SLIDE_PPC. */
void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
    const uint16_t window_size = (uint16_t)s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, window_size);
    slide_hash_chain(s->prev, window_size, window_size);
}
|
45
3rdparty/zlib-ng/arch/riscv/README.md
vendored
Normal file
45
3rdparty/zlib-ng/arch/riscv/README.md
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
# Building RISC-V Target with CMake #
|
||||
|
||||
> **Warning**
|
||||
> Runtime rvv detection (using `hwcap`) requires linux kernel 6.5 or newer.
|
||||
>
|
||||
> When running on older kernels, we fall back to compile-time detection. This can cause crashes if rvv is enabled at compile time but not supported by the target cpu.
|
||||
> Therefore if older kernel support is needed, rvv should be disabled if the target cpu does not support it.
|
||||
## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##
|
||||
|
||||
If you don't have prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the source and build them. Modify the content according to your conditions (e.g., toolchain version).
|
||||
|
||||
```bash
|
||||
./prepare_riscv_toolchain_qemu.sh
|
||||
```
|
||||
|
||||
After running the script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.
|
||||
|
||||
`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
|
||||
`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.
|
||||
|
||||
You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them.
|
||||
|
||||
## Cross-Compile for RISC-V Target ##
|
||||
|
||||
```bash
|
||||
cmake -G Ninja -B ./build-riscv \
|
||||
-D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
|
||||
-D CMAKE_INSTALL_PREFIX=./build-riscv/install \
|
||||
-D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
|
||||
-D QEMU_PATH={QEMU_PATH} \
|
||||
.
|
||||
|
||||
cmake --build ./build-riscv
|
||||
```
|
||||
|
||||
Disable the option if there is no RVV support:
|
||||
```
|
||||
-D WITH_RVV=OFF
|
||||
```
|
||||
|
||||
## Run Unittests on User Mode QEMU ##
|
||||
|
||||
```bash
|
||||
cd ./build-riscv && ctest --verbose
|
||||
```
|
132
3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
vendored
Normal file
132
3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
vendored
Normal file
@ -0,0 +1,132 @@
|
||||
/* adler32_rvv.c - RVV version of adler32
|
||||
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
|
||||
* Contributed by Alex Chiang <alex.chiang@sifive.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef RISCV_RVV
|
||||
|
||||
#include <riscv_vector.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
|
||||
static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
|
||||
/* split Adler-32 into component sums */
|
||||
uint32_t sum2 = (adler >> 16) & 0xffff;
|
||||
adler &= 0xffff;
|
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */
|
||||
if (len == 1) {
|
||||
if (COPY) memcpy(dst, src, 1);
|
||||
return adler32_len_1(adler, src, sum2);
|
||||
}
|
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */
|
||||
if (src == NULL)
|
||||
return 1L;
|
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */
|
||||
if (len < 16) {
|
||||
if (COPY) memcpy(dst, src, len);
|
||||
return adler32_len_16(adler, src, len, sum2);
|
||||
}
|
||||
|
||||
size_t left = len;
|
||||
size_t vl = __riscv_vsetvlmax_e8m1();
|
||||
vl = vl > 256 ? 256 : vl;
|
||||
vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
|
||||
vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
|
||||
vuint16m2_t v_buf16_accu;
|
||||
|
||||
/*
|
||||
* We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
|
||||
* However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
|
||||
* accumulators to boost performance.
|
||||
*
|
||||
* The block_size is the largest multiple of vl that <= 256, because overflow would occur when
|
||||
* vl > 256 (255 * 256 <= UINT16_MAX).
|
||||
*
|
||||
* We accumulate 8-bit data into a 16-bit accumulator and then
|
||||
* move the data into the 32-bit accumulator at the last iteration.
|
||||
*/
|
||||
size_t block_size = (256 / vl) * vl;
|
||||
size_t nmax_limit = (NMAX / block_size);
|
||||
size_t cnt = 0;
|
||||
while (left >= block_size) {
|
||||
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
|
||||
size_t subprob = block_size;
|
||||
while (subprob > 0) {
|
||||
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
|
||||
if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
|
||||
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
|
||||
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
|
||||
src += vl;
|
||||
if (COPY) dst += vl;
|
||||
subprob -= vl;
|
||||
}
|
||||
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
|
||||
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
|
||||
left -= block_size;
|
||||
/* do modulo once each block of NMAX size */
|
||||
if (++cnt >= nmax_limit) {
|
||||
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
|
||||
cnt = 0;
|
||||
}
|
||||
}
|
||||
/* the left len <= 256 now, we can use 16-bit accum safely */
|
||||
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
|
||||
size_t res = left;
|
||||
while (left >= vl) {
|
||||
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
|
||||
if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
|
||||
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
|
||||
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
|
||||
src += vl;
|
||||
if (COPY) dst += vl;
|
||||
left -= vl;
|
||||
}
|
||||
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
|
||||
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
|
||||
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
|
||||
|
||||
vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
|
||||
vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
|
||||
vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
|
||||
|
||||
v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
|
||||
|
||||
vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
|
||||
v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
|
||||
uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
|
||||
|
||||
sum2 += (sum2_sum + adler * (len - left));
|
||||
|
||||
vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
|
||||
v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
|
||||
uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
|
||||
|
||||
adler += adler_sum;
|
||||
|
||||
while (left--) {
|
||||
if (COPY) *dst++ = *src;
|
||||
adler += *src++;
|
||||
sum2 += adler;
|
||||
}
|
||||
|
||||
sum2 %= BASE;
|
||||
adler %= BASE;
|
||||
|
||||
return adler | (sum2 << 16);
|
||||
}
|
||||
|
||||
/* Update the Adler-32 checksum over src[0..len) while copying those bytes to
 * dst. Returns the updated checksum. */
Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    const int copy_enabled = 1;
    return adler32_rvv_impl(adler, dst, src, len, copy_enabled);
}
|
||||
|
||||
/* Update the Adler-32 checksum over buf[0..len) without copying; the shared
 * kernel is called with a NULL destination and COPY disabled. */
Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
    const int copy_enabled = 0;
    return adler32_rvv_impl(adler, NULL, buf, len, copy_enabled);
}
|
||||
|
||||
#endif // RISCV_RVV
|
121
3rdparty/zlib-ng/arch/riscv/chunkset_rvv.c
vendored
Normal file
121
3rdparty/zlib-ng/arch/riscv/chunkset_rvv.c
vendored
Normal file
@ -0,0 +1,121 @@
|
||||
/* chunkset_rvv.c - RVV version of chunkset
|
||||
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
|
||||
* Contributed by Alex Chiang <alex.chiang@sifive.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include <riscv_vector.h>
|
||||
#include "zbuild.h"
|
||||
|
||||
/*
|
||||
* RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC,
|
||||
* so we prefer using large size chunk and copy memory as much as possible.
|
||||
*/
|
||||
#define CHUNK_SIZE 32
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
/*
 * Fill `chunk` (CHUNK_SIZE bytes) with repetitions of the elen-bit value found
 * at `from`. Both names are taken from the scope of the expanding function.
 *
 * The pattern is fetched with memcpy because `from` is a plain byte pointer
 * with no alignment guarantee; dereferencing it through a wider integer
 * pointer would be a misaligned, aliasing-violating access.
 */
#define CHUNK_MEMSET_RVV_IMPL(elen)                                       \
do {                                                                      \
    size_t vl, len = CHUNK_SIZE / sizeof(uint##elen##_t);                 \
    uint##elen##_t val;                                                   \
    uint##elen##_t* chunk_p = (uint##elen##_t*)chunk;                     \
    memcpy(&val, from, sizeof(val));                                      \
    do {                                                                  \
        vl = __riscv_vsetvl_e##elen##m4(len);                             \
        vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl);   \
        __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl);            \
        len -= vl; chunk_p += vl;                                         \
    } while (len > 0);                                                    \
} while (0)
|
||||
|
||||
/* We don't have a 32-byte datatype for RISC-V arch. */
|
||||
/* RISC-V has no native 32-byte scalar type, so a chunk is modelled as four
 * 64-bit words. */
typedef struct chunk_s {
    uint64_t data[4];  /* 4 * 8 = 32 bytes of payload */
} chunk_t;
|
||||
|
||||
/* Fill *chunk with repetitions of the 16-bit pattern stored at `from`. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk)
{
    CHUNK_MEMSET_RVV_IMPL(16);
}
|
||||
|
||||
/* Fill *chunk with repetitions of the 32-bit pattern stored at `from`. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk)
{
    CHUNK_MEMSET_RVV_IMPL(32);
}
|
||||
|
||||
/* Fill *chunk with repetitions of the 64-bit pattern stored at `from`. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk)
{
    CHUNK_MEMSET_RVV_IMPL(64);
}
|
||||
|
||||
/* Read CHUNK_SIZE bytes starting at s into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const void *source = s;
    memcpy(&chunk->data[0], source, CHUNK_SIZE);
}
|
||||
|
||||
/* Write the CHUNK_SIZE bytes held in *chunk to out. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const void *payload = &chunk->data[0];
    memcpy(out, payload, CHUNK_SIZE);
}
|
||||
|
||||
#define CHUNKSIZE chunksize_rvv
|
||||
#define CHUNKCOPY chunkcopy_rvv
|
||||
#define CHUNKUNROLL chunkunroll_rvv
|
||||
#define CHUNKMEMSET chunkmemset_rvv
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv
|
||||
|
||||
#define HAVE_CHUNKCOPY
|
||||
|
||||
/*
|
||||
* Assuming that the length is non-zero, and that `from` lags `out` by at least
|
||||
* sizeof chunk_t bytes, please see the comments in chunkset_tpl.h.
|
||||
*
|
||||
* We load/store a single chunk once in the `CHUNKCOPY`.
|
||||
* However, RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC,
|
||||
* so we prefer to copy a large amount of memory at once to make good use of the RVV advantage.
|
||||
*
|
||||
* To be aligned to the other platforms, we didn't modify `CHUNKCOPY` method a lot,
|
||||
* but we still copy as much memory as possible for some conditions.
|
||||
*
|
||||
* case 1: out - from >= len (no overlap)
|
||||
* We can use memcpy to copy `len` size once
|
||||
* because the memory layout would be the same.
|
||||
*
|
||||
* case 2: overlap
|
||||
* We copy N chunks using memcpy at once, aiming to achieve our goal:
|
||||
* to copy as much memory as possible.
|
||||
*
|
||||
* After using a single memcpy to copy N chunks, we have to use series of
|
||||
* loadchunk and storechunk to ensure the result is correct.
|
||||
*/
|
||||
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
|
||||
Assert(len > 0, "chunkcopy should never have a length 0");
|
||||
int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
|
||||
memcpy(out, from, sizeof(chunk_t));
|
||||
out += align;
|
||||
from += align;
|
||||
len -= align;
|
||||
ptrdiff_t dist = out - from;
|
||||
if (dist >= len) {
|
||||
memcpy(out, from, len);
|
||||
out += len;
|
||||
from += len;
|
||||
return out;
|
||||
}
|
||||
if (dist >= sizeof(chunk_t)) {
|
||||
dist = (dist / sizeof(chunk_t)) * sizeof(chunk_t);
|
||||
memcpy(out, from, dist);
|
||||
out += dist;
|
||||
from += dist;
|
||||
len -= dist;
|
||||
}
|
||||
while (len > 0) {
|
||||
memcpy(out, from, sizeof(chunk_t));
|
||||
out += sizeof(chunk_t);
|
||||
from += sizeof(chunk_t);
|
||||
len -= sizeof(chunk_t);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_rvv
|
||||
|
||||
#include "inffast_tpl.h"
|
47
3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
vendored
Normal file
47
3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
vendored
Normal file
@ -0,0 +1,47 @@
|
||||
/* compare256_rvv.c - RVV version of compare256
|
||||
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
|
||||
* Contributed by Alex Chiang <alex.chiang@sifive.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef RISCV_RVV
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
size_t vl;
|
||||
long found_diff;
|
||||
do {
|
||||
vl = __riscv_vsetvl_e8m4(256 - len);
|
||||
vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
|
||||
vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
|
||||
vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
|
||||
found_diff = __riscv_vfirst_m_b2(v_mask, vl);
|
||||
if (found_diff >= 0)
|
||||
return len + (uint32_t)found_diff;
|
||||
src0 += vl, src1 += vl, len += vl;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
/* Externally visible wrapper around the inlinable compare256_rvv_static(). */
Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t match_len = compare256_rvv_static(src0, src1);
    return match_len;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_rvv
|
||||
#define COMPARE256 compare256_rvv_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_rvv
|
||||
#define COMPARE256 compare256_rvv_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif // RISCV_RVV
|
45
3rdparty/zlib-ng/arch/riscv/riscv_features.c
vendored
Normal file
45
3rdparty/zlib-ng/arch/riscv/riscv_features.c
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/auxv.h>
|
||||
#include <sys/utsname.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "riscv_features.h"
|
||||
|
||||
#define ISA_V_HWCAP (1 << ('v' - 'a'))
|
||||
|
||||
/* Report whether the running kernel's release string is 6.5 or newer.
 *
 * Returns 1 when uname() succeeds and its release field starts with
 * "<major>.<minor>" denoting at least 6.5; returns 0 otherwise, including
 * when uname() fails or the release string cannot be parsed.
 */
int Z_INTERNAL is_kernel_version_greater_or_equal_to_6_5(void) {
    struct utsname buffer;
    int major, minor;

    /* uname() reports failure with a negative value; without this check
     * buffer.release would be read uninitialised. */
    if (uname(&buffer) < 0)
        return 0;

    if (sscanf(buffer.release, "%d.%d", &major, &minor) != 2) {
        // Release string is not in "<major>.<minor>" form
        return 0;
    }

    if (major > 6 || (major == 6 && minor >= 5))
        return 1;
    return 0;
}
|
||||
|
||||
/* Fill *features from compile-time information only: has_rvv is 1 when the
 * compiler defines both __riscv_v and __linux__, and 0 otherwise. */
void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *features) {
#if defined(__riscv_v) && defined(__linux__)
    const int rvv_compiled_in = 1;
#else
    const int rvv_compiled_in = 0;
#endif
    features->has_rvv = rvv_compiled_in;
}
|
||||
|
||||
/* Fill *features from the AT_HWCAP auxiliary-vector entry. has_rvv receives
 * the masked 'v' bit itself (ISA_V_HWCAP or 0), not a value normalised to 1. */
void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
    features->has_rvv = getauxval(AT_HWCAP) & ISA_V_HWCAP;
}
|
||||
|
||||
/* Detect RISC-V CPU features: use the runtime (hwcap) probe when the kernel is
 * 6.5 or newer, and the compile-time answer otherwise. */
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
    if (!is_kernel_version_greater_or_equal_to_6_5()) {
        riscv_check_features_compile_time(features);
        return;
    }
    riscv_check_features_runtime(features);
}
|
18
3rdparty/zlib-ng/arch/riscv/riscv_features.h
vendored
Normal file
18
3rdparty/zlib-ng/arch/riscv/riscv_features.h
vendored
Normal file
@ -0,0 +1,18 @@
|
||||
/* riscv_features.h -- check for riscv features.
|
||||
*
|
||||
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
|
||||
* Contributed by Alex Chiang <alex.chiang@sifive.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef RISCV_H_
|
||||
#define RISCV_H_
|
||||
|
||||
/* Capability flags reported by riscv_check_features(). */
struct riscv_cpu_features {
    int has_rvv;  /* non-zero when the vector extension (RVV) is considered usable */
};
|
||||
|
||||
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
|
||||
|
||||
#endif /* RISCV_H_ */
|
34
3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
vendored
Normal file
34
3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
vendored
Normal file
@ -0,0 +1,34 @@
|
||||
/* slide_hash_rvv.c - RVV version of slide_hash
|
||||
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
|
||||
* Contributed by Alex Chiang <alex.chiang@sifive.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef RISCV_RVV
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
/* Subtract wsize from each of the `entries` 16-bit values starting at `table`,
 * writing 0 for every value that is smaller than wsize. */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
    Pos *cursor = table;
    uint32_t remaining = entries;

    while (remaining != 0) {
        /* vl is how many elements the hardware takes in this pass. */
        size_t vl = __riscv_vsetvl_e16m4(remaining);
        vuint16m4_t current = __riscv_vle16_v_u16m4(cursor, vl);
        /* Lanes that would go below zero are selected by the mask and zeroed. */
        vbool4_t underflows = __riscv_vmsltu_vx_u16m4_b4(current, wsize, vl);
        vuint16m4_t shifted = __riscv_vsub_vx_u16m4(current, wsize, vl);
        vuint16m4_t updated = __riscv_vmerge_vxm_u16m4(shifted, 0, underflows, vl);
        __riscv_vse16_v_u16m4(cursor, updated, vl);

        cursor += vl;
        remaining -= (uint32_t)vl;
    }
}
|
||||
|
||||
/* Slide both deflate hash tables (s->head and s->prev) down by the window
 * size using the RVV helper above. */
Z_INTERNAL void slide_hash_rvv(deflate_state *s) {
    const uint16_t window_size = (uint16_t)s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, window_size);
    slide_hash_chain(s->prev, window_size, window_size);
}
|
||||
|
||||
#endif // RISCV_RVV
|
147
3rdparty/zlib-ng/arch/x86/Makefile.in
vendored
Normal file
147
3rdparty/zlib-ng/arch/x86/Makefile.in
vendored
Normal file
@ -0,0 +1,147 @@
|
||||
# Makefile for zlib
|
||||
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
|
||||
AVX512VNNIFLAG=-mavx512vnni
|
||||
AVX2FLAG=-mavx2
|
||||
SSE2FLAG=-msse2
|
||||
SSSE3FLAG=-mssse3
|
||||
SSE42FLAG=-msse4.2
|
||||
PCLMULFLAG=-mpclmul
|
||||
VPCLMULFLAG=-mvpclmulqdq
|
||||
XSAVEFLAG=-mxsave
|
||||
NOLTOFLAG=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: \
|
||||
x86_features.o x86_features.lo \
|
||||
adler32_avx2.o adler32_avx2.lo \
|
||||
adler32_avx512.o adler32_avx512.lo \
|
||||
adler32_avx512_vnni.o adler32_avx512_vnni.lo \
|
||||
adler32_sse42.o adler32_sse42.lo \
|
||||
adler32_ssse3.o adler32_ssse3.lo \
|
||||
chunkset_avx2.o chunkset_avx2.lo \
|
||||
chunkset_sse2.o chunkset_sse2.lo \
|
||||
chunkset_ssse3.o chunkset_ssse3.lo \
|
||||
compare256_avx2.o compare256_avx2.lo \
|
||||
compare256_sse2.o compare256_sse2.lo \
|
||||
insert_string_sse42.o insert_string_sse42.lo \
|
||||
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
|
||||
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
|
||||
slide_hash_avx2.o slide_hash_avx2.lo \
|
||||
slide_hash_sse2.o slide_hash_sse2.lo
|
||||
|
||||
# Each object lists its source file as a prerequisite so that make rebuilds it
# when the source changes (same form as the adler32_* rules in this file).
x86_features.o: $(SRCDIR)/x86_features.c
	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

x86_features.lo: $(SRCDIR)/x86_features.c
	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

chunkset_avx2.o: $(SRCDIR)/chunkset_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_avx2.lo: $(SRCDIR)/chunkset_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c

chunkset_sse2.o: $(SRCDIR)/chunkset_sse2.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_sse2.lo: $(SRCDIR)/chunkset_sse2.c
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c

chunkset_ssse3.o: $(SRCDIR)/chunkset_ssse3.c
	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

chunkset_ssse3.lo: $(SRCDIR)/chunkset_ssse3.c
	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c

compare256_avx2.o: $(SRCDIR)/compare256_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_avx2.lo: $(SRCDIR)/compare256_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

compare256_sse2.o: $(SRCDIR)/compare256_sse2.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

compare256_sse2.lo: $(SRCDIR)/compare256_sse2.c
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

insert_string_sse42.o: $(SRCDIR)/insert_string_sse42.c
	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

insert_string_sse42.lo: $(SRCDIR)/insert_string_sse42.c
	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

crc32_pclmulqdq.o: $(SRCDIR)/crc32_pclmulqdq.c
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_pclmulqdq.lo: $(SRCDIR)/crc32_pclmulqdq.c
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

crc32_vpclmulqdq.o: $(SRCDIR)/crc32_vpclmulqdq.c
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

crc32_vpclmulqdq.lo: $(SRCDIR)/crc32_vpclmulqdq.c
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c

slide_hash_avx2.o: $(SRCDIR)/slide_hash_avx2.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_avx2.lo: $(SRCDIR)/slide_hash_avx2.c
	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

slide_hash_sse2.o: $(SRCDIR)/slide_hash_sse2.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c

slide_hash_sse2.lo: $(SRCDIR)/slide_hash_sse2.c
	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
|
||||
|
||||
adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
|
||||
|
||||
adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
|
||||
|
||||
adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
|
||||
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
|
||||
|
||||
adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
|
||||
$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
|
||||
|
||||
adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
|
||||
$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
|
||||
|
||||
adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
|
||||
$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
|
||||
|
||||
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
|
||||
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
|
||||
$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
|
||||
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
|
||||
|
||||
adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
|
||||
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean: clean
|
||||
rm -f Makefile
|
154
3rdparty/zlib-ng/arch/x86/adler32_avx2.c
vendored
Normal file
154
3rdparty/zlib-ng/arch/x86/adler32_avx2.c
vendored
Normal file
@ -0,0 +1,154 @@
|
||||
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* Authors:
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_AVX2
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include <immintrin.h>
|
||||
#include "../../adler32_fold.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "adler32_avx2_p.h"
|
||||
#include "x86_intrins.h"
|
||||
|
||||
#ifdef X86_SSE42
|
||||
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
|
||||
|
||||
#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
|
||||
#define sub32(a, b, c) adler32_ssse3(a, b, c)
|
||||
#else
|
||||
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
|
||||
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
|
||||
#endif
|
||||
|
||||
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
|
||||
if (src == NULL) return 1L;
|
||||
if (len == 0) return adler;
|
||||
|
||||
uint32_t adler0, adler1;
|
||||
adler1 = (adler >> 16) & 0xffff;
|
||||
adler0 = adler & 0xffff;
|
||||
|
||||
rem_peel:
|
||||
if (len < 16) {
|
||||
if (COPY) {
|
||||
return adler32_copy_len_16(adler0, src, dst, len, adler1);
|
||||
} else {
|
||||
return adler32_len_16(adler0, src, len, adler1);
|
||||
}
|
||||
} else if (len < 32) {
|
||||
if (COPY) {
|
||||
return copy_sub32(adler, dst, src, len);
|
||||
} else {
|
||||
return sub32(adler, src, len);
|
||||
}
|
||||
}
|
||||
|
||||
__m256i vs1, vs2;
|
||||
|
||||
const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
|
||||
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
|
||||
const __m256i dot3v = _mm256_set1_epi16(1);
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
|
||||
while (len >= 32) {
|
||||
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
|
||||
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
|
||||
__m256i vs1_0 = vs1;
|
||||
__m256i vs3 = _mm256_setzero_si256();
|
||||
|
||||
size_t k = MIN(len, NMAX);
|
||||
k -= k % 32;
|
||||
len -= k;
|
||||
|
||||
while (k >= 32) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
|
||||
*/
|
||||
__m256i vbuf = _mm256_loadu_si256((__m256i*)src);
|
||||
src += 32;
|
||||
k -= 32;
|
||||
|
||||
__m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
|
||||
|
||||
if (COPY) {
|
||||
_mm256_storeu_si256((__m256i*)dst, vbuf);
|
||||
dst += 32;
|
||||
}
|
||||
|
||||
vs1 = _mm256_add_epi32(vs1, vs1_sad);
|
||||
vs3 = _mm256_add_epi32(vs3, vs1_0);
|
||||
__m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
|
||||
__m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
|
||||
vs2 = _mm256_add_epi32(vsum2, vs2);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
/* Defer the multiplication with 32 to outside of the loop */
|
||||
vs3 = _mm256_slli_epi32(vs3, 5);
|
||||
vs2 = _mm256_add_epi32(vs2, vs3);
|
||||
|
||||
/* The compiler is generating the following sequence for this integer modulus
|
||||
* when done the scalar way, in GPRs:
|
||||
|
||||
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
|
||||
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
|
||||
|
||||
mov $0x80078071,%edi // move magic constant into 32 bit register %edi
|
||||
...
|
||||
vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
|
||||
mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
|
||||
imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
|
||||
shr $0x2f,%rsi // shift right by 47
|
||||
imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
|
||||
sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
|
||||
...
|
||||
// repeats for each element with vpextract instructions
|
||||
|
||||
This is tricky with AVX2 for a number of reasons:
|
||||
1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
|
||||
2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
|
||||
back down to 32 bit precision later (there is in AVX512)
|
||||
3.) Full width integer multiplications aren't cheap
|
||||
|
||||
We can, however, do a relatively cheap sequence for horizontal sums.
|
||||
Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
|
||||
previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
|
||||
that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
|
||||
performed on the maximum possible inputs before overflow
|
||||
*/
|
||||
|
||||
|
||||
/* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
|
||||
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
|
||||
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
|
||||
* what the compiler is doing to avoid integer divisions. */
|
||||
adler0 = partial_hsum256(vs1) % BASE;
|
||||
adler1 = hsum256(vs2) % BASE;
|
||||
}
|
||||
|
||||
adler = adler0 | (adler1 << 16);
|
||||
|
||||
if (len) {
|
||||
goto rem_peel;
|
||||
}
|
||||
|
||||
return adler;
|
||||
}
|
||||
|
||||
/* AVX2 Adler-32 of len bytes at src, continuing from adler. Checksum only:
 * the shared kernel is invoked with COPY == 0, so no destination is needed. */
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
    uint8_t *const no_dst = NULL;

    return adler32_fold_copy_impl(adler, no_dst, src, len, 0);
}
|
||||
|
||||
/* AVX2 Adler-32 of len bytes at src, continuing from adler, while also copying
 * those bytes to dst (shared kernel invoked with COPY == 1). */
Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    const int copy_enabled = 1;

    return adler32_fold_copy_impl(adler, dst, src, len, copy_enabled);
}
|
||||
|
||||
#endif
|
32
3rdparty/zlib-ng/arch/x86/adler32_avx2_p.h
vendored
Normal file
32
3rdparty/zlib-ng/arch/x86/adler32_avx2_p.h
vendored
Normal file
@ -0,0 +1,32 @@
|
||||
/* adler32_avx2_p.h -- adler32 avx2 utility functions
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_AVX2_P_H_
|
||||
#define ADLER32_AVX2_P_H_
|
||||
|
||||
#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
|
||||
|
||||
/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
|
||||
static inline uint32_t hsum256(__m256i x) {
|
||||
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
|
||||
_mm256_castsi256_si128(x));
|
||||
__m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
|
||||
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
|
||||
return (uint32_t)_mm_cvtsi128_si32(sum3);
|
||||
}
|
||||
|
||||
static inline uint32_t partial_hsum256(__m256i x) {
|
||||
/* We need a permutation vector to extract every other integer. The
|
||||
* rest are going to be zeros */
|
||||
const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
|
||||
__m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
|
||||
__m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
|
||||
__m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
|
||||
__m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
|
||||
return (uint32_t)_mm_cvtsi128_si32(sum3);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
115
3rdparty/zlib-ng/arch/x86/adler32_avx512.c
vendored
Normal file
115
3rdparty/zlib-ng/arch/x86/adler32_avx512.c
vendored
Normal file
@ -0,0 +1,115 @@
|
||||
/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_AVX512
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "../../adler32_fold.h"
|
||||
#include "../../cpu_features.h"
|
||||
#include <immintrin.h>
|
||||
#include "x86_intrins.h"
|
||||
#include "adler32_avx512_p.h"
|
||||
|
||||
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
|
||||
if (src == NULL) return 1L;
|
||||
if (len == 0) return adler;
|
||||
|
||||
uint32_t adler0, adler1;
|
||||
adler1 = (adler >> 16) & 0xffff;
|
||||
adler0 = adler & 0xffff;
|
||||
|
||||
rem_peel:
|
||||
if (len < 64) {
|
||||
/* This handles the remaining copies, just call normal adler checksum after this */
|
||||
if (COPY) {
|
||||
__mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
|
||||
__m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
|
||||
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
|
||||
}
|
||||
|
||||
#ifdef X86_AVX2
|
||||
return adler32_avx2(adler, src, len);
|
||||
#elif defined(X86_SSSE3)
|
||||
return adler32_ssse3(adler, src, len);
|
||||
#else
|
||||
return adler32_len_16(adler0, src, len, adler1);
|
||||
#endif
|
||||
}
|
||||
|
||||
__m512i vbuf, vs1_0, vs3;
|
||||
|
||||
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
||||
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
|
||||
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63, 64);
|
||||
const __m512i dot3v = _mm512_set1_epi16(1);
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
size_t k;
|
||||
|
||||
while (len >= 64) {
|
||||
__m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
|
||||
__m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
|
||||
vs1_0 = vs1;
|
||||
vs3 = _mm512_setzero_si512();
|
||||
|
||||
k = MIN(len, NMAX);
|
||||
k -= k % 64;
|
||||
len -= k;
|
||||
|
||||
while (k >= 64) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
|
||||
*/
|
||||
vbuf = _mm512_loadu_si512(src);
|
||||
|
||||
if (COPY) {
|
||||
_mm512_storeu_si512(dst, vbuf);
|
||||
dst += 64;
|
||||
}
|
||||
|
||||
src += 64;
|
||||
k -= 64;
|
||||
|
||||
__m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
|
||||
__m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
|
||||
vs1 = _mm512_add_epi32(vs1_sad, vs1);
|
||||
vs3 = _mm512_add_epi32(vs3, vs1_0);
|
||||
__m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
|
||||
vs2 = _mm512_add_epi32(vsum2, vs2);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
vs3 = _mm512_slli_epi32(vs3, 6);
|
||||
vs2 = _mm512_add_epi32(vs2, vs3);
|
||||
|
||||
adler0 = partial_hsum(vs1) % BASE;
|
||||
adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
|
||||
}
|
||||
|
||||
adler = adler0 | (adler1 << 16);
|
||||
|
||||
/* Process tail (len < 64). */
|
||||
if (len) {
|
||||
goto rem_peel;
|
||||
}
|
||||
|
||||
return adler;
|
||||
}
|
||||
|
||||
/* AVX-512 Adler-32 of len bytes at src, continuing from adler, while also
 * copying those bytes to dst (shared kernel invoked with COPY == 1). */
Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    const int copy_enabled = 1;

    return adler32_fold_copy_impl(adler, dst, src, len, copy_enabled);
}
|
||||
|
||||
/* AVX-512 Adler-32 of len bytes at src, continuing from adler. Checksum only:
 * the shared kernel is invoked with COPY == 0, so no destination is needed. */
Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
    uint8_t *const no_dst = NULL;

    return adler32_fold_copy_impl(adler, no_dst, src, len, 0);
}
|
||||
|
||||
#endif
|
||||
|
46
3rdparty/zlib-ng/arch/x86/adler32_avx512_p.h
vendored
Normal file
46
3rdparty/zlib-ng/arch/x86/adler32_avx512_p.h
vendored
Normal file
@ -0,0 +1,46 @@
|
||||
#ifndef AVX512_FUNCS_H
|
||||
#define AVX512_FUNCS_H
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
/* Written because *_add_epi32(a) sets off ubsan */
|
||||
static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
|
||||
__m256i a = _mm512_extracti64x4_epi64(x, 1);
|
||||
__m256i b = _mm512_extracti64x4_epi64(x, 0);
|
||||
|
||||
__m256i a_plus_b = _mm256_add_epi32(a, b);
|
||||
__m128i c = _mm256_extracti128_si256(a_plus_b, 1);
|
||||
__m128i d = _mm256_extracti128_si256(a_plus_b, 0);
|
||||
__m128i c_plus_d = _mm_add_epi32(c, d);
|
||||
|
||||
__m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
|
||||
__m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
|
||||
__m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
|
||||
__m128i sum4 = _mm_add_epi32(sum2, sum3);
|
||||
|
||||
return _mm_cvtsi128_si32(sum4);
|
||||
}
|
||||
|
||||
static inline uint32_t partial_hsum(__m512i x) {
|
||||
/* We need a permutation vector to extract every other integer. The
|
||||
* rest are going to be zeros. Marking this const so the compiler stands
|
||||
* a better chance of keeping this resident in a register through entire
|
||||
* loop execution. We certainly have enough zmm registers (32) */
|
||||
const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
|
||||
1, 1, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
__m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
|
||||
|
||||
/* From here, it's a simple 256 bit wide reduction sum */
|
||||
__m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
|
||||
|
||||
/* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
|
||||
* pretty slow, much slower than the longer instruction sequence below */
|
||||
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
|
||||
_mm256_castsi256_si128(non_zero_avx));
|
||||
__m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
|
||||
__m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
|
||||
return (uint32_t)_mm_cvtsi128_si32(sum3);
|
||||
}
|
||||
|
||||
#endif
|
225
3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
vendored
Normal file
225
3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
vendored
Normal file
@ -0,0 +1,225 @@
|
||||
/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
|
||||
* Based on Brian Bockelman's AVX2 version
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_AVX512VNNI
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "../../cpu_features.h"
|
||||
#include <immintrin.h>
|
||||
#include "../../adler32_fold.h"
|
||||
#include "x86_intrins.h"
|
||||
#include "adler32_avx512_p.h"
|
||||
#include "adler32_avx2_p.h"
|
||||
|
||||
/* AVX-512 VNNI Adler-32 of len bytes at src, continuing from adler.
 *
 *   adler  running checksum: low 16 bits are the byte sum, high 16 bits the sum of sums
 *   src    input bytes; a NULL src yields the initial checksum value 1
 *   len    number of input bytes
 *
 * Returns the updated checksum. Inputs shorter than 64 bytes (and the final
 * sub-64-byte tail) are delegated to a narrower implementation chosen at
 * compile time. */
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 32)
#if defined(X86_SSSE3)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

    if (len < 64)
#ifdef X86_AVX2
        return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
        /* The guard must match the macro that enables adler32_ssse3(), as in the
         * len < 32 case above; testing X86_SSE3 left this branch unreachable. */
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

    /* Byte weights: _mm512_set_epi8 lists the highest byte first, so byte 0 gets 64. */
    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                          56, 57, 58, 59, 60, 61, 62, 63, 64);

    const __m512i zero = _mm512_setzero_si512();
    __m512i vs1, vs2;

    while (len >= 64) {
        /* Seed lane 0 of each accumulator with the scalar state. */
        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));

        /* At most NMAX bytes, rounded down to whole 64-byte blocks, per reduction. */
        size_t k = MIN(len, NMAX);
        k -= k % 64;
        len -= k;

        __m512i vs1_0 = vs1;                    /* vs1 as it was before the current block */
        __m512i vs3 = _mm512_setzero_si512();   /* running sum of vs1_0, scaled by 64 after the loop */
        /* Second sum-of-sums accumulator so the two halves of the unrolled loop
         * do not serialise on vs2. */
        __m512i vs2_1 = _mm512_setzero_si512();
        __m512i vbuf0, vbuf1;

        /* Remainder peeling: k is a multiple of 64; if it is an odd multiple,
         * consume one block now so the unrolled loop sees a multiple of 128. */
        if (k % 128) {
            vbuf1 = _mm512_loadu_si512((__m512i*)src);

            src += 64;
            k -= 64;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Manually unrolled by 2 for a decent amount of ILP */
        while (k >= 128) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
            */
            vbuf0 = _mm512_loadu_si512((__m512i*)src);
            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
            src += 128;
            k -= 128;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            /* Multiply-accumulate straight into 16 ints: the dot-product instruction
             * fuses the separate widening sum stages used by the non-VNNI kernels. */
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);

            /* vs1 now plays the role of vs1_0 for the second block. */
            vs3 = _mm512_add_epi32(vs3, vs1);
            vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Apply the deferred factor of 64 and merge both sum-of-sums accumulators. */
        vs3 = _mm512_slli_epi32(vs3, 6);
        vs2 = _mm512_add_epi32(vs2, vs3);
        vs2 = _mm512_add_epi32(vs2, vs2_1);

        adler0 = partial_hsum(vs1) % BASE;
        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 64). */
    if (len) {
        goto rem_peel;
    }

    return adler;
}
|
||||
|
||||
/* Adler-32 of len bytes at src, continuing from adler, while also copying
 * those bytes to dst. Unlike the checksum-only VNNI kernel this variant works
 * on 256-bit vectors (32 bytes per block) with the VNNI dot-product instruction.
 *
 *   adler  running checksum: low 16 bits are the byte sum, high 16 bits the sum of sums
 *   dst    copy destination
 *   src    input bytes; a NULL src yields the initial checksum value 1
 *   len    number of input bytes
 *
 * Returns the updated checksum. */
Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t adler0, adler1;

    if (src == NULL)
        return 1L;
    if (len == 0)
        return adler;

    adler0 = adler & 0xffff;
    adler1 = (adler >> 16) & 0xffff;

    if (len >= 32) {
        /* Byte weights: _mm256_set_epi8 lists the highest byte first, so byte 0 gets 32. */
        const __m256i weights = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
        const __m256i vzero = _mm256_setzero_si256();

        do {
            /* Seed lane 0 of each accumulator with the scalar state. */
            __m256i acc1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
            __m256i acc2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));

            /* At most NMAX bytes, rounded down to whole 32-byte blocks, per reduction. */
            size_t chunk = MIN(len, NMAX);
            chunk -= chunk % 32;
            len -= chunk;

            __m256i prev1 = acc1;                      /* acc1 as it was before the current block */
            __m256i deferred = _mm256_setzero_si256(); /* running sum of prev1, scaled by 32 later */
            /* Second sum-of-sums accumulator for the second half of the unrolled loop. */
            __m256i acc2_alt = _mm256_setzero_si256();

            /* chunk is a multiple of 32; peel one block when it is an odd multiple
             * so the unrolled loop below sees a multiple of 64. */
            if (chunk % 64) {
                const __m256i lone = _mm256_loadu_si256((__m256i*)src);
                _mm256_storeu_si256((__m256i*)dst, lone);
                dst += 32;
                src += 32;
                chunk -= 32;

                deferred = _mm256_add_epi32(deferred, prev1);
                acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(lone, vzero));
                acc2 = _mm256_dpbusd_epi32(acc2, lone, weights);
                prev1 = acc1;
            }

            /* Two 32-byte blocks per iteration. */
            while (chunk >= 64) {
                /*
                   acc1 = adler + sum(c[i])
                   acc2 = sum2 + 32 acc1 + sum( (32-i+1) c[i] )
                */
                const __m256i first = _mm256_loadu_si256((__m256i*)src);
                const __m256i second = _mm256_loadu_si256((__m256i*)(src + 32));
                _mm256_storeu_si256((__m256i*)dst, first);
                _mm256_storeu_si256((__m256i*)(dst + 32), second);
                dst += 64;
                src += 64;
                chunk -= 64;

                /* First block. The dot-product instruction multiply-accumulates
                 * directly into 8 ints. */
                deferred = _mm256_add_epi32(deferred, prev1);
                acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(first, vzero));
                acc2 = _mm256_dpbusd_epi32(acc2, first, weights);

                /* Second block: acc1 is now the "previous" byte sum. */
                deferred = _mm256_add_epi32(deferred, acc1);
                acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(second, vzero));
                acc2_alt = _mm256_dpbusd_epi32(acc2_alt, second, weights);
                prev1 = acc1;
            }

            /* Apply the deferred factor of 32 and merge both sum-of-sums accumulators. */
            deferred = _mm256_slli_epi32(deferred, 5);
            acc2 = _mm256_add_epi32(acc2, deferred);
            acc2 = _mm256_add_epi32(acc2, acc2_alt);

            adler0 = partial_hsum256(acc1) % BASE;
            adler1 = hsum256(acc2) % BASE;
        } while (len >= 32);

        adler = adler0 | (adler1 << 16);
        if (len == 0)
            return adler;
    }

    /* Tail: 0 < len < 32. Copy the leftover bytes with a masked load/store,
     * then let a narrower implementation compute the checksum. */
    {
        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
    }

#if defined(X86_SSSE3)
    return adler32_ssse3(adler, src, len);
#else
    return adler32_len_16(adler0, src, len, adler1);
#endif
}
|
||||
|
||||
#endif
|
121
3rdparty/zlib-ng/arch/x86/adler32_sse42.c
vendored
Normal file
121
3rdparty/zlib-ng/arch/x86/adler32_sse42.c
vendored
Normal file
@ -0,0 +1,121 @@
|
||||
/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "../../adler32_fold.h"
|
||||
#include "adler32_ssse3_p.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifdef X86_SSE42
|
||||
|
||||
/* Adler-32 of len bytes at src, continuing from adler, while also copying
 * those bytes to dst, using 128-bit vectors.
 *
 *   adler  running checksum: low 16 bits are the byte sum, high 16 bits the sum of sums
 *   dst    copy destination
 *   src    input bytes (not checked for NULL here)
 *   len    number of input bytes
 *
 * Returns the updated checksum. */
Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t adler0 = adler & 0xffff;
    uint32_t adler1 = (adler >> 16) & 0xffff;

    if (len >= 16) {
        const __m128i vzero = _mm_setzero_si128();
        /* Weights for a 32-byte step: 32..17 for its first 16 bytes, 16..1 for
         * the second. The 16..1 set is reused for a single 16-byte step. */
        const __m128i weights_32_17 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
        const __m128i weights_16_1 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        const __m128i ones = _mm_set1_epi16(1);

        do {
            /* At most NMAX bytes, rounded down to whole 16-byte blocks, per reduction. */
            size_t chunk = MIN(len, NMAX);
            chunk -= chunk % 16;
            len -= chunk;

            __m128i acc1 = _mm_cvtsi32_si128(adler0);
            __m128i acc2 = _mm_cvtsi32_si128(adler1);
            __m128i acc2_alt = _mm_setzero_si128();  /* sum-of-sums share of the second 16 bytes */
            __m128i deferred = _mm_setzero_si128();  /* running sum of prev1, scaled after each loop */
            __m128i prev1 = acc1;                    /* acc1 as it was before the current step */

            /* 32 bytes per step. */
            while (chunk >= 32) {
                /*
                   acc1 = adler + sum(c[i])
                   acc2 = sum2 + 32 acc1 + sum( (32-i+1) c[i] )
                */
                const __m128i first = _mm_loadu_si128((__m128i*)src);
                const __m128i second = _mm_loadu_si128((__m128i*)(src + 16));
                src += 32;
                chunk -= 32;

                _mm_storeu_si128((__m128i*)dst, first);
                _mm_storeu_si128((__m128i*)(dst + 16), second);
                dst += 32;

                deferred = _mm_add_epi32(prev1, deferred);
                acc1 = _mm_add_epi32(_mm_sad_epu8(first, vzero), acc1);
                acc1 = _mm_add_epi32(_mm_sad_epu8(second, vzero), acc1);
                acc2 = _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(first, weights_32_17), ones), acc2);
                acc2_alt = _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(second, weights_16_1), ones), acc2_alt);
                prev1 = acc1;
            }

            /* Merge the second accumulator and apply the deferred factor of 32,
             * then restart the deferred sum for the 16-byte step. */
            acc2 = _mm_add_epi32(acc2_alt, acc2);
            deferred = _mm_slli_epi32(deferred, 5);
            acc2 = _mm_add_epi32(deferred, acc2);
            deferred = _mm_setzero_si128();

            /* 16 bytes per step. */
            while (chunk >= 16) {
                /*
                   acc1 = adler + sum(c[i])
                   acc2 = sum2 + 16 acc1 + sum( (16-i+1) c[i] )
                */
                const __m128i block = _mm_loadu_si128((__m128i*)src);
                src += 16;
                chunk -= 16;

                deferred = _mm_add_epi32(prev1, deferred);
                acc1 = _mm_add_epi32(_mm_sad_epu8(block, vzero), acc1);
                acc2 = _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(block, weights_16_1), ones), acc2);
                prev1 = acc1;

                _mm_storeu_si128((__m128i*)dst, block);
                dst += 16;
            }

            /* Deferred factor of 16 for the 16-byte step. */
            deferred = _mm_slli_epi32(deferred, 4);
            acc2 = _mm_add_epi32(acc2, deferred);

            adler0 = partial_hsum(acc1) % BASE;
            adler1 = hsum(acc2) % BASE;
        } while (len >= 16);

        if (len == 0)
            return adler0 | (adler1 << 16);
    }

    /* Fewer than 16 bytes remain (or were supplied to begin with). */
    return adler32_copy_len_16(adler0, src, dst, len, adler1);
}
|
||||
|
||||
#endif
|
156
3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
vendored
Normal file
156
3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
vendored
Normal file
@ -0,0 +1,156 @@
|
||||
/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../adler32_p.h"
|
||||
#include "adler32_ssse3_p.h"
|
||||
|
||||
#ifdef X86_SSSE3
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
/* SSSE3 Adler-32 of len bytes at buf, continuing from adler.
 *
 *   adler  running checksum: low 16 bits are the byte sum, high 16 bits the sum of sums
 *   buf    input bytes; NULL yields the initial checksum value 1 (checked only after the len == 1 fast path)
 *   len    number of input bytes
 *
 * Returns the updated checksum.
 *
 * All vector locals are declared at function scope on purpose: the short
 * unaligned case jumps into the middle of the 16-byte loop, so nothing that
 * path relies on may be declared (and initialised) inside the loops. */
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
    /* Split Adler-32 into its two component sums. */
    uint32_t sum_b = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* Byte-at-a-time callers get a fast path. */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum_b);

    /* Initial Adler-32 value (check deferred until after the len == 1 path for speed). */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* Short inputs are handled entirely by the scalar helper. */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum_b);

    /* Weights for a 32-byte step: 32..17 for its first 16 bytes, 16..1 for the
     * second. The 16..1 set is reused for a single 16-byte step. */
    const __m128i weights_32_17 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    const __m128i weights_16_1 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i ones = _mm_set1_epi16(1);
    const __m128i vzero = _mm_setzero_si128();

    __m128i block, block_next, acc1, prev1, acc2, acc2_alt, deferred;

    /* For an unaligned buffer, decide between aligning with scalar sums and
     * simply paying for one unaligned load: if fewer than 16 bytes would remain
     * after aligning, the unaligned load wins. */
    size_t budget = NMAX;                        /* bytes allowed before the next modulo reduction */
    const size_t misalign = (uintptr_t)buf & 15;
    const size_t to_boundary = 16 - misalign;
    size_t chunk = 0;                            /* must be 0 so the jump below leaves the loop after one step */

    if (misalign) {
        if (len < 16 + to_boundary) {
            /* One unaligned 16-byte vector step beats 16 + (<= 15) scalar
             * sums; seed the accumulators and enter the 16-byte loop body. */
            block = _mm_loadu_si128((__m128i*)buf);
            buf += 16;
            len -= 16;
            acc1 = _mm_cvtsi32_si128(adler);
            acc2 = _mm_cvtsi32_si128(sum_b);
            deferred = _mm_setzero_si128();
            prev1 = acc1;
            goto unaligned_jmp;
        }

        /* Scalar sums up to the next 16-byte boundary. */
        for (const uint8_t *boundary = buf + to_boundary; buf != boundary; ) {
            adler += *(buf++);
            sum_b += adler;
        }

        /* Those scalar sums count against the first chunk's budget. */
        len -= to_boundary;
        budget -= to_boundary;
    }

    while (len >= 16) {
        acc1 = _mm_cvtsi32_si128(adler);
        acc2 = _mm_cvtsi32_si128(sum_b);
        deferred = _mm_setzero_si128();
        acc2_alt = _mm_setzero_si128();
        prev1 = acc1;

        /* Smaller of len and the remaining budget, rounded down to whole 16-byte blocks. */
        chunk = len;
        if (budget < chunk)
            chunk = budget;
        chunk -= chunk % 16;
        len -= chunk;

        /* 32 bytes per step, aligned loads. */
        while (chunk >= 32) {
            /*
               acc1 = adler + sum(c[i])
               acc2 = sum2 + 32 acc1 + sum( (32-i+1) c[i] )
            */
            block = _mm_load_si128((__m128i*)buf);
            block_next = _mm_load_si128((__m128i*)(buf + 16));
            buf += 32;
            chunk -= 32;

            deferred = _mm_add_epi32(prev1, deferred);
            acc1 = _mm_add_epi32(_mm_sad_epu8(block, vzero), acc1);
            acc1 = _mm_add_epi32(_mm_sad_epu8(block_next, vzero), acc1);
            acc2 = _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(block, weights_32_17), ones), acc2);
            acc2_alt = _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(block_next, weights_16_1), ones), acc2_alt);
            prev1 = acc1;
        }

        /* Merge the second accumulator and apply the deferred factor of 32,
         * then restart the deferred sum for the 16-byte step. */
        acc2 = _mm_add_epi32(acc2_alt, acc2);
        deferred = _mm_slli_epi32(deferred, 5);
        acc2 = _mm_add_epi32(deferred, acc2);
        deferred = _mm_setzero_si128();

        /* 16 bytes per step. */
        while (chunk >= 16) {
            /*
               acc1 = adler + sum(c[i])
               acc2 = sum2 + 16 acc1 + sum( (16-i+1) c[i] )
            */
            block = _mm_load_si128((__m128i*)buf);
            buf += 16;
            chunk -= 16;

unaligned_jmp:
            deferred = _mm_add_epi32(prev1, deferred);
            acc1 = _mm_add_epi32(_mm_sad_epu8(block, vzero), acc1);
            acc2 = _mm_add_epi32(_mm_madd_epi16(_mm_maddubs_epi16(block, weights_16_1), ones), acc2);
            prev1 = acc1;
        }

        /* Deferred factor of 16 for the 16-byte step. */
        deferred = _mm_slli_epi32(deferred, 4);
        acc2 = _mm_add_epi32(acc2, deferred);

        /* psadbw already performs a partial reduction into lanes 0 and 2, so
         * acc1 only needs the cheaper partial horizontal sum. */
        adler = partial_hsum(acc1) % BASE;
        sum_b = hsum(acc2) % BASE;
        budget = NMAX;
    }

    /* Process tail (len < 16). */
    return adler32_len_16(adler, buf, len, sum_b);
}
|
||||
|
||||
#endif
|
29
3rdparty/zlib-ng/arch/x86/adler32_ssse3_p.h
vendored
Normal file
29
3rdparty/zlib-ng/arch/x86/adler32_ssse3_p.h
vendored
Normal file
@ -0,0 +1,29 @@
|
||||
/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
|
||||
* Copyright (C) 2022 Adam Stylinski
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef ADLER32_SSSE3_P_H_
|
||||
#define ADLER32_SSSE3_P_H_
|
||||
|
||||
#ifdef X86_SSSE3
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* Sum of 32-bit lanes 0 and 2 of x; lanes 1 and 3 are ignored. */
static inline uint32_t partial_hsum(__m128i x) {
    /* Shifting right by 8 bytes moves lane 2 down into lane 0. */
    const __m128i lane2_in_lane0 = _mm_srli_si128(x, 8);

    return (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(x, lane2_in_lane0));
}
|
||||
|
||||
/* Horizontal sum: add all four 32-bit lanes of x (wrapping modulo 2^32). */
static inline uint32_t hsum(__m128i x) {
    /* Fold the upper 64 bits onto the lower: lanes 0/1 become x0+x2 and x1+x3. */
    const __m128i pairs = _mm_add_epi32(x, _mm_unpackhi_epi64(x, x));
    /* Bring lane 1 down to lane 0 and add, leaving the full total in lane 0. */
    const __m128i total = _mm_add_epi32(pairs, _mm_shuffle_epi32(pairs, 0x01));
    return (uint32_t)_mm_cvtsi128_si32(total);
}
|
||||
#endif
|
||||
|
||||
#endif
|
133
3rdparty/zlib-ng/arch/x86/chunkset_avx2.c
vendored
Normal file
133
3rdparty/zlib-ng/arch/x86/chunkset_avx2.c
vendored
Normal file
@ -0,0 +1,133 @@
|
||||
/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "zbuild.h"
|
||||
|
||||
#ifdef X86_AVX2
|
||||
#include <immintrin.h>
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
|
||||
typedef __m256i chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 32
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
#define HAVE_CHUNK_MAG
|
||||
|
||||
/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
|
||||
* never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
|
||||
/* Indexed by (dist - 3), so the 29 slots cover dist 3..31.
 * GET_CHUNK_MAG uses .idx as an offset into permute_table and stores .remval
 * into *chunk_rem. NOTE(review): the positional initializers below presumably
 * map to {idx, remval}; confirm against the lut_rem_pair declaration.
 * Slots for dist 4, 8 and 16 are placeholders ("don't care"). */
static const lut_rem_pair perm_idx_lut[29] = {
    { 0,                 2},  /* dist  3 */
    { 0,                 0},  /* dist  4 - don't care */
    { 1 * 32,            2},  /* dist  5 */
    { 2 * 32,            2},  /* dist  6 */
    { 3 * 32,            4},  /* dist  7 */
    { 0 * 32,            0},  /* dist  8 - don't care */
    { 4 * 32,            5},  /* dist  9 */
    { 5 * 32,           22},  /* dist 10 */
    { 6 * 32,           21},  /* dist 11 */
    { 7 * 32,           20},  /* dist 12 */
    { 8 * 32,            6},  /* dist 13 */
    { 9 * 32,            4},  /* dist 14 */
    {10 * 32,            2},  /* dist 15 */
    { 0 * 32,            0},  /* dist 16 - don't care */
    {11 * 32,           15},  /* dist 17 */
    {11 * 32 + 16,      14},  /* dist 18 */
    {11 * 32 + 16 * 2,  13},  /* dist 19 */
    {11 * 32 + 16 * 3,  12},  /* dist 20 */
    {11 * 32 + 16 * 4,  11},  /* dist 21 */
    {11 * 32 + 16 * 5,  10},  /* dist 22 */
    {11 * 32 + 16 * 6,   9},  /* dist 23 */
    {11 * 32 + 16 * 7,   8},  /* dist 24 */
    {11 * 32 + 16 * 8,   7},  /* dist 25 */
    {11 * 32 + 16 * 9,   6},  /* dist 26 */
    {11 * 32 + 16 * 10,  5},  /* dist 27 */
    {11 * 32 + 16 * 11,  4},  /* dist 28 */
    {11 * 32 + 16 * 12,  3},  /* dist 29 */
    {11 * 32 + 16 * 13,  2},  /* dist 30 */
    {11 * 32 + 16 * 14,  1}   /* dist 31 */
};
|
||||
|
||||
/* Fill *chunk by repeating the 2 bytes at `from` in every 16-bit lane.
 * memcpy is used for the read, so `from` needs no particular alignment. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm256_set1_epi16(pattern);
}
|
||||
|
||||
/* Fill *chunk by repeating the 4 bytes at `from` in every 32-bit lane.
 * memcpy is used for the read, so `from` needs no particular alignment. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm256_set1_epi32(pattern);
}
|
||||
|
||||
/* Fill *chunk by repeating the 8 bytes at `from` in every 64-bit lane.
 * memcpy is used for the read, so `from` needs no particular alignment. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm256_set1_epi64x(pattern);
}
|
||||
|
||||
/* Read 32 bytes from s (unaligned load) into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m256i *src = (const __m256i *)s;

    *chunk = _mm256_loadu_si256(src);
}
|
||||
|
||||
/* Write the 32 bytes held in *chunk to out (unaligned store). */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    __m256i *dst = (__m256i *)out;

    _mm256_storeu_si256(dst, *chunk);
}
|
||||
|
||||
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
__m256i ret_vec;
|
||||
/* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
|
||||
* compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
|
||||
* GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
|
||||
*chunk_rem = lut_rem.remval;
|
||||
|
||||
/* See note in chunkset_ssse3.c for why this is ok */
|
||||
__msan_unpoison(buf + dist, 32 - dist);
|
||||
|
||||
if (dist < 16) {
|
||||
/* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
|
||||
* broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
|
||||
* shuffles and combining the halves later */
|
||||
const __m256i permute_xform =
|
||||
_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
|
||||
__m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
|
||||
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
|
||||
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
|
||||
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
|
||||
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
|
||||
} else if (dist == 16) {
|
||||
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
|
||||
return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
|
||||
} else {
|
||||
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
|
||||
__m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
|
||||
/* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
|
||||
__m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
|
||||
__m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
|
||||
__m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
|
||||
/* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
|
||||
* shuffle those values */
|
||||
__m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
|
||||
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
|
||||
}
|
||||
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_avx2
|
||||
#define CHUNKCOPY chunkcopy_avx2
|
||||
#define CHUNKUNROLL chunkunroll_avx2
|
||||
#define CHUNKMEMSET chunkmemset_avx2
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_avx2
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
56
3rdparty/zlib-ng/arch/x86/chunkset_sse2.c
vendored
Normal file
56
3rdparty/zlib-ng/arch/x86/chunkset_sse2.c
vendored
Normal file
@ -0,0 +1,56 @@
|
||||
/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
|
||||
#ifdef X86_SSE2
|
||||
#include <immintrin.h>
|
||||
|
||||
typedef __m128i chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 16
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
/* Fill *chunk by repeating the 2 bytes at `from` in every 16-bit lane.
 * memcpy is used for the read, so `from` needs no particular alignment. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi16(pattern);
}
|
||||
|
||||
/* Fill *chunk by repeating the 4 bytes at `from` in every 32-bit lane.
 * memcpy is used for the read, so `from` needs no particular alignment. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi32(pattern);
}
|
||||
|
||||
/* Fill *chunk by repeating the 8 bytes at `from` in both 64-bit lanes.
 * memcpy is used for the read, so `from` needs no particular alignment. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi64x(pattern);
}
|
||||
|
||||
/* Read 16 bytes from s (unaligned load) into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m128i *src = (const __m128i *)s;

    *chunk = _mm_loadu_si128(src);
}
|
||||
|
||||
/* Write the 16 bytes held in *chunk to out (unaligned store). */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    __m128i *dst = (__m128i *)out;

    _mm_storeu_si128(dst, *chunk);
}
|
||||
|
||||
#define CHUNKSIZE chunksize_sse2
|
||||
#define CHUNKCOPY chunkcopy_sse2
|
||||
#define CHUNKUNROLL chunkunroll_sse2
|
||||
#define CHUNKMEMSET chunkmemset_sse2
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_sse2
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
101
3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c
vendored
Normal file
101
3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c
vendored
Normal file
@ -0,0 +1,101 @@
|
||||
/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
|
||||
/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
|
||||
* code size by sharing the chunkcopy functions, which will certainly compile
|
||||
* to identical machine code */
|
||||
#if defined(X86_SSSE3) && defined(X86_SSE2)
|
||||
#include <immintrin.h>
|
||||
#include "../generic/chunk_permute_table.h"
|
||||
|
||||
typedef __m128i chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 16
|
||||
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
#define HAVE_CHUNK_MAG
|
||||
#define HAVE_CHUNKCOPY
|
||||
#define HAVE_CHUNKUNROLL
|
||||
|
||||
/* Indexed by (dist - 3), so the 13 slots cover dist 3..15.
 * GET_CHUNK_MAG uses .idx as an offset into permute_table and stores .remval
 * into *chunk_rem. NOTE(review): the positional initializers below presumably
 * map to {idx, remval}; confirm against the lut_rem_pair declaration.
 * For every populated slot the second value equals 16 % dist; the slots for
 * dist 4 and 8 are placeholders ("don't care"). */
static const lut_rem_pair perm_idx_lut[13] = {
    { 0,      1},  /* dist  3 */
    { 0,      0},  /* dist  4 - don't care */
    { 1 * 32, 1},  /* dist  5 */
    { 2 * 32, 4},  /* dist  6 */
    { 3 * 32, 2},  /* dist  7 */
    { 0 * 32, 0},  /* dist  8 - don't care */
    { 4 * 32, 7},  /* dist  9 */
    { 5 * 32, 6},  /* dist 10 */
    { 6 * 32, 5},  /* dist 11 */
    { 7 * 32, 4},  /* dist 12 */
    { 8 * 32, 3},  /* dist 13 */
    { 9 * 32, 2},  /* dist 14 */
    {10 * 32, 1},  /* dist 15 */
};
|
||||
|
||||
|
||||
/* Fill *chunk by repeating the 2 bytes at `from` in every 16-bit lane.
 * memcpy is used for the read, so `from` needs no particular alignment. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi16(pattern);
}
|
||||
|
||||
/* Fill *chunk by repeating the 4 bytes at `from` in every 32-bit lane.
 * memcpy is used for the read, so `from` needs no particular alignment. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi32(pattern);
}
|
||||
|
||||
/* Fill *chunk by repeating the 8 bytes at `from` in both 64-bit lanes.
 * memcpy is used for the read, so `from` needs no particular alignment. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi64x(pattern);
}
|
||||
|
||||
/* Read 16 bytes from s (unaligned load) into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const __m128i *src = (const __m128i *)s;

    *chunk = _mm_loadu_si128(src);
}
|
||||
|
||||
/* Write the 16 bytes held in *chunk to out (unaligned store). */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    __m128i *dst = (__m128i *)out;

    _mm_storeu_si128(dst, *chunk);
}
|
||||
|
||||
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
|
||||
__m128i perm_vec, ret_vec;
|
||||
/* Important to note:
|
||||
* This is _not_ to subvert the memory sanitizer but to instead unpoison some
|
||||
* bytes we willingly and purposefully load uninitialized that we swizzle over
|
||||
* in a vector register, anyway. If what we assume is wrong about what is used,
|
||||
* the memory sanitizer will still usefully flag it */
|
||||
__msan_unpoison(buf + dist, 16 - dist);
|
||||
ret_vec = _mm_loadu_si128((__m128i*)buf);
|
||||
*chunk_rem = lut_rem.remval;
|
||||
|
||||
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
|
||||
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
|
||||
|
||||
return ret_vec;
|
||||
}
|
||||
|
||||
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
|
||||
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
|
||||
|
||||
#define CHUNKSIZE chunksize_ssse3
|
||||
#define CHUNKMEMSET chunkmemset_ssse3
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
|
||||
#define CHUNKCOPY chunkcopy_sse2
|
||||
#define CHUNKUNROLL chunkunroll_sse2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_ssse3
|
||||
|
||||
#include "inffast_tpl.h"
|
||||
|
||||
#endif
|
63
3rdparty/zlib-ng/arch/x86/compare256_avx2.c
vendored
Normal file
63
3rdparty/zlib-ng/arch/x86/compare256_avx2.c
vendored
Normal file
@ -0,0 +1,63 @@
|
||||
/* compare256_avx2.c -- AVX2 version of compare256
|
||||
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
|
||||
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
__m256i ymm_src0, ymm_src1, ymm_cmp;
|
||||
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
|
||||
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
|
||||
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
|
||||
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
|
||||
if (mask != 0xFFFFFFFF) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 32, src1 += 32, len += 32;
|
||||
|
||||
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
|
||||
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
|
||||
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
|
||||
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
|
||||
if (mask != 0xFFFFFFFF) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 32, src1 += 32, len += 32;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
/* Out-of-line entry point for the AVX2 compare256; forwards to the static
 * inline implementation above and returns its result unchanged. */
Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t match_len = compare256_avx2_static(src0, src1);

    return match_len;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_avx2
|
||||
#define COMPARE256 compare256_avx2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_avx2
|
||||
#define COMPARE256 compare256_avx2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
96
3rdparty/zlib-ng/arch/x86/compare256_sse2.c
vendored
Normal file
96
3rdparty/zlib-ng/arch/x86/compare256_sse2.c
vendored
Normal file
@ -0,0 +1,96 @@
|
||||
/* compare256_sse2.c -- SSE2 version of compare256
|
||||
* Copyright Adam Stylinski <kungfujesus06@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
/* Return the length (0..256) of the common prefix of src0 and src1.
 * Both buffers must provide 256 readable bytes. */
static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
    const int misalign = ((uintptr_t)src0) & 15;
    const uint8_t *const last0 = src0 + 256 - 16;
    const uint8_t *const last1 = src1 + 256 - 16;
    uint32_t len = 0;
    __m128i lhs, rhs;
    unsigned eq_bits;

    /* Do the first load unaligned; then every subsequent load of src0 can be
     * an aligned one. Sadly aligning both loads is probably unrealistic. */
    lhs = _mm_loadu_si128((const __m128i *)src0);
    rhs = _mm_loadu_si128((const __m128i *)src1);
    eq_bits = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs));

    /* Compiler _may_ turn this branch into a ptest + movemask,
     * since a lot of those uops are shared and fused */
    if (eq_bits != 0xFFFF) {
        return len + (uint32_t)__builtin_ctz(~eq_bits);
    }

    /* Step to the next 16-byte boundary of src0 (a full 16 bytes when src0 was
     * already aligned). Bytes re-read because of the overlap are known equal. */
    const int step = 16 - misalign;
    len += step;
    src0 += step;
    src1 += step;

    /* Flooring division: number of whole 16-byte blocks that still fit. */
    for (int blocks = (256 - len) / 16; blocks > 0; --blocks) {
        lhs = _mm_load_si128((const __m128i *)src0);
        rhs = _mm_loadu_si128((const __m128i *)src1);
        eq_bits = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs));

        if (eq_bits != 0xFFFF) {
            return len + (uint32_t)__builtin_ctz(~eq_bits);
        }

        len += 16;
        src0 += 16;
        src1 += 16;
    }

    /* With a misaligned src0 the aligned blocks stop short of byte 256, so
     * finish with one overlapping unaligned compare of the final 16 bytes. */
    if (misalign) {
        len = 256 - 16;

        lhs = _mm_loadu_si128((const __m128i *)last0);
        rhs = _mm_loadu_si128((const __m128i *)last1);
        eq_bits = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs));

        if (eq_bits != 0xFFFF) {
            return len + (uint32_t)__builtin_ctz(~eq_bits);
        }
    }

    return 256;
}
|
||||
|
||||
/* Out-of-line entry point for the SSE2 compare256; forwards to the static
 * inline implementation above and returns its result unchanged. */
Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t match_len = compare256_sse2_static(src0, src1);

    return match_len;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_sse2
|
||||
#define COMPARE256 compare256_sse2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_sse2
|
||||
#define COMPARE256 compare256_sse2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
186
3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
vendored
Normal file
186
3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
vendored
Normal file
@ -0,0 +1,186 @@
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
* doc/crc-pclmulqdq.pdf
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Copyright (C) 2016 Marian Beermann (support for initial value)
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
/* CRC32 folding template.
 *
 * With COPY defined this compiles as CRC32_FOLD_COPY, which additionally writes the
 * consumed input bytes to dst; otherwise it compiles as CRC32_FOLD, which takes an
 * initial CRC value (init_crc) instead of a destination.
 *
 * The four 128-bit accumulators are loaded from crc->fold on entry and written back
 * on exit. Input is consumed in this order: an unaligned head that brings src to a
 * 16-byte boundary, 256-byte steps when X86_VPCLMULQDQ is defined, 64-byte steps,
 * at most one 48/32/16-byte step, and finally a partial block of fewer than 16 bytes. */
#ifdef COPY
|
||||
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
#else
|
||||
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
|
||||
#endif
|
||||
unsigned long algn_diff;
|
||||
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
|
||||
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
|
||||
__m128i xmm_crc_part = _mm_setzero_si128();
|
||||
#ifdef COPY
|
||||
char ALIGNED_(16) partial_buf[16] = { 0 };
|
||||
#else
|
||||
/* NOTE(review): xmm_initial and first are not referenced by name in the visible code
 * except for the fold_16_vpclmulqdq call; presumably the XOR_INITIAL128 macro
 * (defined elsewhere) consumes them - confirm before renaming either. */
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
|
||||
int32_t first = init_crc != 0;
|
||||
|
||||
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
|
||||
* bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
|
||||
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
|
||||
* by definition can be up to 15 bytes + one full vector load. */
|
||||
assert(len >= 31 || first == 0);
|
||||
#endif
|
||||
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
/* Fewer than one full vector: skip the aligned paths and go straight to the partial fold. */
if (len < 16) {
|
||||
#ifdef COPY
|
||||
if (len == 0)
|
||||
return;
|
||||
|
||||
memcpy(partial_buf, src, len);
|
||||
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
|
||||
memcpy(dst, partial_buf, len);
|
||||
#endif
|
||||
goto partial;
|
||||
}
|
||||
|
||||
/* Number of bytes (0..15) needed to advance src to the next 16-byte boundary. */
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
|
||||
if (algn_diff) {
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
|
||||
#ifdef COPY
|
||||
/* Stores a full 16 bytes but only advances dst by algn_diff; the stores that
 * follow overwrite the excess. */
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
|
||||
dst += algn_diff;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_crc_part);
|
||||
|
||||
/* NOTE(review): presumably needed because a 32-bit initial CRC cannot be carried
 * by an aligning head shorter than 4 bytes, so one whole vector is folded first -
 * confirm against the XOR_INITIAL128 definition. */
if (algn_diff < 4 && init_crc != 0) {
|
||||
xmm_t0 = xmm_crc_part;
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
|
||||
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
|
||||
src += 16;
|
||||
len -= 16;
|
||||
}
|
||||
#endif
|
||||
|
||||
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
||||
|
||||
src += algn_diff;
|
||||
len -= algn_diff;
|
||||
}
|
||||
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
/* Wide path: the helper consumes n bytes and reports n back to the caller. */
if (len >= 256) {
|
||||
#ifdef COPY
|
||||
size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
|
||||
dst += n;
|
||||
#else
|
||||
size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
|
||||
xmm_initial, first);
|
||||
/* first is passed by value above, so the local copy is cleared here. */
first = 0;
|
||||
#endif
|
||||
len -= n;
|
||||
src += n;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Main loop: 64 bytes per iteration, read with four aligned 16-byte loads. */
while (len >= 64) {
|
||||
len -= 64;
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
|
||||
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
|
||||
src += 64;
|
||||
|
||||
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
||||
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
|
||||
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
|
||||
dst += 64;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_t0);
|
||||
#endif
|
||||
|
||||
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
|
||||
}
|
||||
|
||||
/*
|
||||
* len = num bytes left - 64
|
||||
*/
|
||||
/* At most one of the following branches runs: it consumes the remaining whole
 * 16-byte vectors (3, 2 or 1 of them) and leaves len < 16. */
if (len >= 48) {
|
||||
len -= 48;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
|
||||
src += 48;
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
||||
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
|
||||
dst += 48;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_t0);
|
||||
#endif
|
||||
fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
|
||||
} else if (len >= 32) {
|
||||
len -= 32;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
||||
src += 32;
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
||||
dst += 32;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_t0);
|
||||
#endif
|
||||
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
|
||||
} else if (len >= 16) {
|
||||
len -= 16;
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
src += 16;
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
dst += 16;
|
||||
#else
|
||||
XOR_INITIAL128(xmm_t0);
|
||||
#endif
|
||||
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
|
||||
}
|
||||
|
||||
/* Tail of fewer than 16 bytes; also the jump target for inputs shorter than 16. */
partial:
|
||||
if (len) {
|
||||
memcpy(&xmm_crc_part, src, len);
|
||||
#ifdef COPY
|
||||
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
|
||||
memcpy(dst, partial_buf, len);
|
||||
#endif
|
||||
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
||||
}
|
||||
|
||||
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
}
|
107
3rdparty/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
vendored
Normal file
107
3rdparty/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
vendored
Normal file
@ -0,0 +1,107 @@
|
||||
/* crc32_fold_vpclmulqdq_tpl.h -- VPCLMULQDQ-based CRC32 folding template.
|
||||
* Copyright Wangyang Guo (wangyang.guo@intel.com)
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
/* VPCLMULQDQ fast path of the CRC32 fold.
 *
 * Consumes the input in 256-byte steps using four 512-bit accumulators and returns
 * the number of bytes consumed, which is always a multiple of 256. The first 256
 * bytes are loaded unconditionally, so the caller must guarantee len >= 256.
 * On entry *xmm_crc0..3 hold the running 128-bit fold state; on return they hold
 * the updated state. With COPY defined the consumed bytes are also stored to dst;
 * without it, init_crc and first are taken by value for XOR_INITIAL512. */
#ifdef COPY
|
||||
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
|
||||
#else
|
||||
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
|
||||
__m128i init_crc, int32_t first) {
|
||||
/* NOTE(review): zmm_initial is not referenced by name in the visible code; presumably
 * the XOR_INITIAL512 macro (defined elsewhere) uses it together with first - confirm
 * before renaming. */
__m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
|
||||
#endif
|
||||
__m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
|
||||
__m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
|
||||
__m512i z0, z1, z2, z3;
|
||||
/* Remember the starting length so the consumed byte count can be returned. */
size_t len_tmp = len;
|
||||
const __m512i zmm_fold4 = _mm512_set4_epi32(
|
||||
0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
|
||||
const __m512i zmm_fold16 = _mm512_set4_epi32(
|
||||
0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
|
||||
|
||||
// zmm register init
|
||||
zmm_crc0 = _mm512_setzero_si512();
|
||||
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
|
||||
#ifndef COPY
|
||||
XOR_INITIAL512(zmm_t0);
|
||||
#endif
|
||||
/* The other three accumulators start out as the next 3 x 64 bytes of raw input. */
zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
|
||||
zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
|
||||
zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
|
||||
|
||||
/* already have intermediate CRC in xmm registers
|
||||
* fold4 with 4 xmm_crc to get zmm_crc0
|
||||
*/
|
||||
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
|
||||
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
|
||||
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
|
||||
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
|
||||
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
|
||||
/* imm8 0x96 selects a three-way XOR of the operands. */
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
|
||||
|
||||
#ifdef COPY
|
||||
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
|
||||
_mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
|
||||
_mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
|
||||
_mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
|
||||
dst += 256;
|
||||
#endif
|
||||
len -= 256;
|
||||
src += 256;
|
||||
|
||||
// fold-16 loops
|
||||
while (len >= 256) {
|
||||
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
|
||||
zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
|
||||
zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
|
||||
zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
|
||||
|
||||
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
|
||||
z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
|
||||
z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
|
||||
z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
|
||||
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
|
||||
zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
|
||||
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
|
||||
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
|
||||
|
||||
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
|
||||
zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
|
||||
zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
|
||||
zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
|
||||
|
||||
#ifdef COPY
|
||||
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
|
||||
_mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
|
||||
_mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
|
||||
_mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
|
||||
dst += 256;
|
||||
#endif
|
||||
len -= 256;
|
||||
src += 256;
|
||||
}
|
||||
// zmm_crc[0,1,2,3] -> zmm_crc0
|
||||
/* Three rounds: fold zmm_crc0 with the fold4 constants and XOR in the next accumulator. */
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
|
||||
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
|
||||
|
||||
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
|
||||
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
|
||||
|
||||
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
|
||||
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
|
||||
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
|
||||
|
||||
// zmm_crc0 -> xmm_crc[0, 1, 2, 3]
|
||||
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
|
||||
*xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
|
||||
*xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
|
||||
*xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
|
||||
|
||||
return (len_tmp - len); // return n bytes processed
|
||||
}
|
30
3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq.c
vendored
Normal file
30
3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq.c
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
* doc/crc-pclmulqdq.pdf
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Copyright (C) 2016 Marian Beermann (support for initial value)
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_PCLMULQDQ_CRC
|
||||
|
||||
#define CRC32_FOLD_COPY crc32_fold_pclmulqdq_copy
|
||||
#define CRC32_FOLD crc32_fold_pclmulqdq
|
||||
#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
|
||||
#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
|
||||
#define CRC32 crc32_pclmulqdq
|
||||
|
||||
#include "crc32_pclmulqdq_tpl.h"
|
||||
|
||||
#endif
|
363
3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
vendored
Normal file
363
3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
vendored
Normal file
@ -0,0 +1,363 @@
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
* doc/crc-pclmulqdq.pdf
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Copyright (C) 2016 Marian Beermann (support for initial value)
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include <smmintrin.h> // _mm_extract_epi32
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
# include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#include "../../crc32_fold.h"
|
||||
#include "../../crc32_braid_p.h"
|
||||
#include "x86_intrins.h"
|
||||
#include <assert.h>
|
||||
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
|
||||
int32_t first);
|
||||
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
#endif
|
||||
|
||||
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp3;
|
||||
__m128 ps_crc0, ps_crc3, ps_res;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc3 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
|
||||
|
||||
*xmm_crc0 = *xmm_crc1;
|
||||
*xmm_crc1 = *xmm_crc2;
|
||||
*xmm_crc2 = x_tmp3;
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp3, x_tmp2;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
x_tmp2 = *xmm_crc2;
|
||||
|
||||
*xmm_crc3 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
|
||||
|
||||
*xmm_crc2 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
|
||||
|
||||
*xmm_crc0 = x_tmp2;
|
||||
*xmm_crc1 = x_tmp3;
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res20);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res31);
|
||||
}
|
||||
|
||||
static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp3;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc3 = *xmm_crc2;
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
|
||||
|
||||
*xmm_crc2 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
|
||||
|
||||
*xmm_crc1 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
|
||||
|
||||
*xmm_crc0 = x_tmp3;
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res10);
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res21);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res32);
|
||||
}
|
||||
|
||||
static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
|
||||
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
|
||||
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
|
||||
|
||||
x_tmp0 = *xmm_crc0;
|
||||
x_tmp1 = *xmm_crc1;
|
||||
x_tmp2 = *xmm_crc2;
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_t0 = _mm_castsi128_ps(x_tmp0);
|
||||
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
|
||||
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_t1 = _mm_castsi128_ps(x_tmp1);
|
||||
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
|
||||
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_t2 = _mm_castsi128_ps(x_tmp2);
|
||||
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
|
||||
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
|
||||
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_t3 = _mm_castsi128_ps(x_tmp3);
|
||||
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
|
||||
|
||||
*xmm_crc0 = _mm_castps_si128(ps_res0);
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res1);
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res2);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res3);
|
||||
}
|
||||
|
||||
/* Byte-shuffle control masks used by partial_fold().
 *
 * Row (len - 1), for len = 1..15, is loaded as the "shl" mask; XORing every
 * byte of that row with 0x80 yields the matching "shr" mask. Each row is four
 * 32-bit words (16 control bytes). Control bytes with bit 7 set make
 * _mm_shuffle_epi8 write a zero byte.
 */
static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
|
||||
0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
|
||||
0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2 */
|
||||
0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3 */
|
||||
0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
|
||||
0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
|
||||
0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
|
||||
0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */
|
||||
0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */
|
||||
0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */
|
||||
0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/
|
||||
0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/
|
||||
0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/
|
||||
0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
|
||||
0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
|
||||
0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
|
||||
};
|
||||
|
||||
static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
|
||||
__m128i *xmm_crc3, __m128i *xmm_crc_part) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
|
||||
|
||||
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
|
||||
__m128i xmm_a0_0, xmm_a0_1;
|
||||
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
|
||||
|
||||
xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
|
||||
xmm_shr = xmm_shl;
|
||||
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
|
||||
|
||||
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
|
||||
|
||||
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
|
||||
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
|
||||
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
|
||||
|
||||
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
|
||||
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
|
||||
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
|
||||
|
||||
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
|
||||
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
|
||||
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
|
||||
|
||||
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
|
||||
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
|
||||
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
|
||||
|
||||
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
|
||||
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
|
||||
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
|
||||
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
|
||||
|
||||
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
|
||||
ps_res = _mm_xor_ps(ps_res, psa0_1);
|
||||
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
/* Read the four 128-bit folding lanes from the state buffer `fold`.
 *
 * fold   - start of the 64-byte folding state (four consecutive __m128i)
 * fold0..fold3 - receive lanes 0..3
 *
 * Unaligned loads are used so that this mirrors crc32_fold_save(), which
 * writes the same buffer with unaligned stores. The aligned form faults when
 * the buffer is not 16-byte aligned; the loaded values are the same.
 */
static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
    *fold0 = _mm_loadu_si128(fold + 0);
    *fold1 = _mm_loadu_si128(fold + 1);
    *fold2 = _mm_loadu_si128(fold + 2);
    *fold3 = _mm_loadu_si128(fold + 3);
}
|
||||
|
||||
/* Write the four 128-bit folding lanes back to the state buffer `fold`.
 *
 * Lanes are stored in order 0..3 at fold[0]..fold[3] using unaligned stores,
 * so `fold` does not need to be 16-byte aligned.
 */
static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
                                   const __m128i *fold2, const __m128i *fold3) {
    const __m128i *const lanes[4] = { fold0, fold1, fold2, fold3 };
    int i;

    for (i = 0; i < 4; i++)
        _mm_storeu_si128(&fold[i], *lanes[i]);
}
|
||||
|
||||
/* Initialise the folding state in crc->fold.
 *
 * Lane 0 receives the constant 0x9db42487 in its low 32 bits; lanes 1..3 are
 * cleared. Always returns 0.
 */
Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
    __m128i *const state = (__m128i *)crc->fold;
    const __m128i cleared = _mm_setzero_si128();
    const __m128i seed = _mm_cvtsi32_si128(0x9db42487);

    crc32_fold_save(state, &seed, &cleared, &cleared, &cleared);
    return 0;
}
|
||||
|
||||
#define ONCE(op) if (first) { first = 0; op; }
|
||||
#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
# define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
|
||||
#endif
|
||||
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
# include "crc32_fold_vpclmulqdq_tpl.h"
|
||||
#endif
|
||||
#include "crc32_fold_pclmulqdq_tpl.h"
|
||||
#define COPY
|
||||
#ifdef X86_VPCLMULQDQ
|
||||
# include "crc32_fold_vpclmulqdq_tpl.h"
|
||||
#endif
|
||||
#include "crc32_fold_pclmulqdq_tpl.h"
|
||||
|
||||
static const unsigned ALIGNED_(16) crc_k[] = {
|
||||
0xccaa009e, 0x00000000, /* rk1 */
|
||||
0x751997d0, 0x00000001, /* rk2 */
|
||||
0xccaa009e, 0x00000000, /* rk5 */
|
||||
0x63cd6124, 0x00000001, /* rk6 */
|
||||
0xf7011640, 0x00000001, /* rk7 */
|
||||
0xdb710640, 0x00000001 /* rk8 */
|
||||
};
|
||||
|
||||
static const unsigned ALIGNED_(16) crc_mask[4] = {
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
|
||||
};
|
||||
|
||||
static const unsigned ALIGNED_(16) crc_mask2[4] = {
|
||||
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
|
||||
};
|
||||
|
||||
/* Reduce the four-lane folding state in crc->fold to the final 32-bit CRC.
 *
 * Stage k1 folds lane 0 into 1, 1 into 2 and 2 into 3 using crc_k[0..3].
 * Stage k5 narrows lane 3 using crc_k[4..7] and clears its lowest 32 bits.
 * Stage k7 applies the crc_k[8..11] constants, keeping only the low 64 bits
 * in between. The result is the complement of 32-bit element 2 of the final
 * vector; it is stored in crc->value and returned.
 */
Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
    /* crc_mask keeps the low 64 bits; crc_mask2 clears the lowest 32 bits. */
    const __m128i keep_low64 = _mm_load_si128((__m128i *)crc_mask);
    const __m128i drop_low32 = _mm_load_si128((__m128i *)crc_mask2);
    __m128i lane0, lane1, lane2, lane3;
    __m128i k, part_a, part_b, acc, snap;

    crc32_fold_load((__m128i *)crc->fold, &lane0, &lane1, &lane2, &lane3);

    /*
     * k1
     */
    k = _mm_load_si128((__m128i *)crc_k);

    part_a = _mm_clmulepi64_si128(lane0, k, 0x10);
    part_b = _mm_clmulepi64_si128(lane0, k, 0x01);
    lane1 = _mm_xor_si128(_mm_xor_si128(lane1, part_a), part_b);

    part_a = _mm_clmulepi64_si128(lane1, k, 0x10);
    part_b = _mm_clmulepi64_si128(lane1, k, 0x01);
    lane2 = _mm_xor_si128(_mm_xor_si128(lane2, part_a), part_b);

    part_a = _mm_clmulepi64_si128(lane2, k, 0x10);
    part_b = _mm_clmulepi64_si128(lane2, k, 0x01);
    lane3 = _mm_xor_si128(_mm_xor_si128(lane3, part_a), part_b);

    /*
     * k5
     */
    k = _mm_load_si128((__m128i *)(crc_k + 4));

    acc = _mm_xor_si128(_mm_clmulepi64_si128(lane3, k, 0), _mm_srli_si128(lane3, 8));

    snap = acc;
    acc = _mm_clmulepi64_si128(_mm_slli_si128(acc, 4), k, 0x10);
    acc = _mm_and_si128(_mm_xor_si128(acc, snap), drop_low32);

    /*
     * k7
     */
    snap = acc;
    k = _mm_load_si128((__m128i *)(crc_k + 8));

    acc = _mm_xor_si128(_mm_clmulepi64_si128(acc, k, 0), snap);
    acc = _mm_and_si128(acc, keep_low64);

    part_a = _mm_clmulepi64_si128(acc, k, 0x10);
    acc = _mm_xor_si128(_mm_xor_si128(part_a, acc), snap);

    crc->value = ~((uint32_t)_mm_extract_epi32(acc, 2));

    return crc->value;
}
|
||||
|
||||
/* One-shot CRC32 of `len` bytes at `buf`, continuing from `crc32`.
 *
 * Inputs shorter than 64 bytes are handed to crc32_braid, which is faster at
 * those sizes (the CRC32 instruction might also be effective there). Longer
 * inputs run reset / fold / final on a local 16-byte-aligned folding state.
 */
Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
    crc32_fold ALIGNED_(16) crc_state;

    if (len >= 64) {
        CRC32_FOLD_RESET(&crc_state);
        CRC32_FOLD(&crc_state, buf, len, crc32);
        return CRC32_FOLD_FINAL(&crc_state);
    }

    return PREFIX(crc32_braid)(crc32, buf, len);
}
|
17
3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
vendored
Normal file
17
3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
vendored
Normal file
@ -0,0 +1,17 @@
|
||||
/* crc32_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
|
||||
* Copyright Wangyang Guo (wangyang.guo@intel.com)
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
|
||||
|
||||
#define X86_VPCLMULQDQ
|
||||
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy
|
||||
#define CRC32_FOLD crc32_fold_vpclmulqdq
|
||||
#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
|
||||
#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
|
||||
#define CRC32 crc32_vpclmulqdq
|
||||
|
||||
#include "crc32_pclmulqdq_tpl.h"
|
||||
|
||||
#endif
|
24
3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
vendored
Normal file
24
3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef X86_SSE42
|
||||
#include "../../zbuild.h"
|
||||
#include <nmmintrin.h>
|
||||
#include "../../deflate.h"
|
||||
|
||||
#define HASH_CALC(s, h, val)\
|
||||
h = _mm_crc32_u32(h, val)
|
||||
|
||||
#define HASH_CALC_VAR h
|
||||
#define HASH_CALC_VAR_INIT uint32_t h = 0
|
||||
|
||||
#define UPDATE_HASH update_hash_sse42
|
||||
#define INSERT_STRING insert_string_sse42
|
||||
#define QUICK_INSERT_STRING quick_insert_string_sse42
|
||||
|
||||
#include "../../insert_string_tpl.h"
|
||||
#endif
|
39
3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
vendored
Normal file
39
3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
vendored
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* AVX2 optimized hash slide, based on Intel's slide_sse implementation
|
||||
*
|
||||
* Copyright (C) 2017 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
* Mika T. Lindqvist <postmaster@raasu.org>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
/* Subtract `wsize` (with unsigned saturation) from every 16-bit entry of
 * `table`, walking from the end of the table towards its start in groups of
 * 16 entries. `entries` is consumed 16 at a time, so it is expected to be a
 * non-zero multiple of 16.
 */
static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
    Pos *cursor = table + entries;

    do {
        __m256i *slot;

        cursor -= 16;
        slot = (__m256i *)cursor;
        _mm256_storeu_si256(slot, _mm256_subs_epu16(_mm256_loadu_si256(slot), wsize));

        entries -= 16;
    } while (entries != 0);
}
|
||||
|
||||
/* Slide both hash tables of `s` down by the window size using AVX2:
 * s->head (HASH_SIZE entries) and s->prev (w_size entries). */
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
    const uint16_t window = (uint16_t)s->w_size;
    const __m256i delta = _mm256_set1_epi16((short)window);

    slide_hash_chain(s->head, HASH_SIZE, delta);
    slide_hash_chain(s->prev, window, delta);
}
|
62
3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
vendored
Normal file
62
3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
vendored
Normal file
@ -0,0 +1,62 @@
|
||||
/*
|
||||
* SSE optimized hash slide
|
||||
*
|
||||
* Copyright (C) 2017 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Subtract `wsize` (with unsigned saturation) from every 16-bit entry of two
 * tables: first table0 (entries0 entries), then table1 (entries1 entries).
 * Each table is walked from its end towards its start, 16 entries per step,
 * using aligned 128-bit loads and stores.
 */
static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
                                    uint32_t entries1, const __m128i wsize) {
    Pos *const chains[2] = { table0, table1 };
    const uint32_t counts[2] = { entries0, entries1 };
    int which;

    for (which = 0; which < 2; which++) {
        uint32_t remaining = counts[which];
        Pos *cursor = chains[which] + remaining;

        /* ZALLOC allocates this pointer unless the user chose a custom allocator.
         * Our alloc function is aligned to 64 byte boundaries */
        do {
            __m128i lower, upper;

            cursor -= 16;
            lower = _mm_load_si128((__m128i *)cursor);
            upper = _mm_load_si128((__m128i *)(cursor + 8));
            lower = _mm_subs_epu16(lower, wsize);
            upper = _mm_subs_epu16(upper, wsize);
            _mm_store_si128((__m128i *)cursor, lower);
            _mm_store_si128((__m128i *)(cursor + 8), upper);

            remaining -= 16;
        } while (remaining > 0);
    }
}
|
||||
|
||||
/* Slide both hash tables of `s` down by the window size using SSE2.
 * s->head and s->prev must be 16-byte aligned (checked by assert). */
Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
    const uint16_t window = (uint16_t)s->w_size;
    const __m128i delta = _mm_set1_epi16((short)window);

    assert(((uintptr_t)s->head & 15) == 0);
    assert(((uintptr_t)s->prev & 15) == 0);

    slide_hash_chain(s->head, s->prev, HASH_SIZE, window, delta);
}
|
97
3rdparty/zlib-ng/arch/x86/x86_features.c
vendored
Normal file
97
3rdparty/zlib-ng/arch/x86/x86_features.c
vendored
Normal file
@ -0,0 +1,97 @@
|
||||
/* x86_features.c - x86 feature check
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Author:
|
||||
* Jim Kukunas
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "x86_features.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# include <intrin.h>
|
||||
#else
|
||||
// Newer versions of GCC and clang come with cpuid.h
|
||||
# include <cpuid.h>
|
||||
#endif
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/* Execute CPUID for leaf `info` and return the four result registers through
 * eax/ebx/ecx/edx. Uses the MSVC intrinsic under _MSC_VER and the <cpuid.h>
 * macro otherwise. */
static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    int regs[4];

    __cpuid(regs, info);

    *eax = (unsigned)regs[0];
    *ebx = (unsigned)regs[1];
    *ecx = (unsigned)regs[2];
    *edx = (unsigned)regs[3];
#else
    unsigned a, b, c, d;

    __cpuid(info, a, b, c, d);

    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
#endif
}
|
||||
|
||||
/* Execute CPUID for leaf `info`, sub-leaf `subinfo`, and return the four
 * result registers through eax/ebx/ecx/edx. Uses the MSVC intrinsic under
 * _MSC_VER and the <cpuid.h> macro otherwise. */
static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    int regs[4];

    __cpuidex(regs, info, subinfo);

    *eax = (unsigned)regs[0];
    *ebx = (unsigned)regs[1];
    *ecx = (unsigned)regs[2];
    *edx = (unsigned)regs[3];
#else
    unsigned a, b, c, d;

    __cpuid_count(info, subinfo, a, b, c, d);

    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
#endif
}
|
||||
|
||||
/* Read extended control register `xcr` and return it as a 64-bit value
 * (EDX in the upper half, EAX in the lower half). Outside MSVC the
 * instruction is emitted as raw bytes 0f 01 d0. */
static inline uint64_t xgetbv(unsigned int xcr) {
#ifdef _MSC_VER
    return _xgetbv(xcr);
#else
    uint32_t lo, hi;

    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(lo), "=d"(hi) : "c"(xcr));

    return ((uint64_t)hi << 32) | lo;
#endif
}
|
||||
|
||||
/* Detect x86 CPU and OS capabilities and record them in `features`.
 *
 * Every flag is cleared first, so the result does not depend on the caller
 * having zeroed the structure: several flags below are only assigned on some
 * paths (OSXSAVE present, CPUID leaf 7 available, OS saves YMM/ZMM state),
 * and has_os_save_ymm / has_os_save_zmm are read further down even when the
 * OSXSAVE branch was skipped.
 *
 * A flag is non-zero when the feature is available. Most flags hold the
 * masked register bit rather than 1.
 */
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
    unsigned eax, ebx, ecx, edx;
    unsigned maxbasic;

    memset(features, 0, sizeof(*features));

    /* Leaf 0 returns the highest supported basic leaf in EAX. */
    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);

    features->has_sse2 = edx & 0x4000000;   /* leaf 1 EDX bit 26 */
    features->has_ssse3 = ecx & 0x200;      /* leaf 1 ECX bit 9 */
    features->has_sse42 = ecx & 0x100000;   /* leaf 1 ECX bit 20 */
    features->has_pclmulqdq = ecx & 0x2;    /* leaf 1 ECX bit 1 */

    /* Leaf 1 ECX bit 27 (OSXSAVE): XGETBV may be executed to query which
     * register state the OS saves. */
    if (ecx & 0x08000000) {
        uint64_t xfeature = xgetbv(0);

        /* XCR0 bits 1-2 for YMM; bits 1-2 and 5-7 for ZMM. */
        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
    }

    if (maxbasic >= 7) {
        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);

        // check VPCLMULQDQ bit (leaf 7 ECX bit 10)
        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
        features->has_vpclmulqdq = ecx & 0x400;

        // check AVX2 bit if the OS supports saving YMM registers
        if (features->has_os_save_ymm) {
            features->has_avx2 = ebx & 0x20;
        }

        // check AVX512 bits if the OS supports saving ZMM registers
        if (features->has_os_save_zmm) {
            features->has_avx512 = ebx & 0x00010000;
            features->has_avx512vnni = ecx & 0x800;
        }
    }
}
|
24
3rdparty/zlib-ng/arch/x86/x86_features.h
vendored
Normal file
24
3rdparty/zlib-ng/arch/x86/x86_features.h
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
/* x86_features.h -- check for CPU features
|
||||
* Copyright (C) 2013 Intel Corporation Jim Kukunas
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef X86_FEATURES_H_
|
||||
#define X86_FEATURES_H_
|
||||
|
||||
/* Capability flags written by x86_check_features(). A flag is non-zero when
 * the feature was detected. */
struct x86_cpu_features {
|
||||
int has_avx2; /* CPUID leaf 7 EBX bit 5; only assigned when has_os_save_ymm is set */
|
||||
int has_avx512; /* CPUID leaf 7 EBX bit 16; only assigned when has_os_save_zmm is set */
|
||||
int has_avx512vnni; /* CPUID leaf 7 ECX bit 11; only assigned when has_os_save_zmm is set */
|
||||
int has_sse2; /* CPUID leaf 1 EDX bit 26 */
|
||||
int has_ssse3; /* CPUID leaf 1 ECX bit 9 */
|
||||
int has_sse42; /* CPUID leaf 1 ECX bit 20 */
|
||||
int has_pclmulqdq; /* CPUID leaf 1 ECX bit 1 */
|
||||
int has_vpclmulqdq; /* CPUID leaf 7 ECX bit 10; only assigned when leaf 7 exists */
|
||||
int has_os_save_ymm; /* XCR0 & 0x06 == 0x06; only assigned when leaf 1 ECX bit 27 is set */
|
||||
int has_os_save_zmm; /* XCR0 & 0xe6 == 0xe6; only assigned when leaf 1 ECX bit 27 is set */
|
||||
};
|
||||
|
||||
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
|
||||
|
||||
#endif /* X86_FEATURES_H_ */
|
87
3rdparty/zlib-ng/arch/x86/x86_intrins.h
vendored
Normal file
87
3rdparty/zlib-ng/arch/x86/x86_intrins.h
vendored
Normal file
@ -0,0 +1,87 @@
|
||||
#ifndef X86_INTRINS_H
|
||||
#define X86_INTRINS_H
|
||||
|
||||
/* Unfortunately GCC didn't support these things until version 10.
|
||||
* Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
|
||||
*/
|
||||
#ifdef __AVX2__
|
||||
#include <immintrin.h>
|
||||
|
||||
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
|
||||
|| (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
|
||||
/* Replacement for the _mm256_zextsi128_si256 intrinsic on GCC < 10 and old
 * AppleClang (see the enclosing version test). The value is routed through a
 * vmovdqa in inline asm and then widened with a cast.
 * NOTE(review): presumably relies on the VEX-encoded move clearing the bits
 * above 127 of the destination register - confirm against the ISA reference. */
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
|
||||
__m128i r;
|
||||
__asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
|
||||
return _mm256_castsi128_si256(r);
|
||||
}
|
||||
|
||||
#ifdef __AVX512F__
|
||||
/* Replacement for the _mm512_zextsi128_si512 intrinsic on GCC < 10 and old
 * AppleClang (see the enclosing version test). The value is routed through a
 * vmovdqa in inline asm and then widened with a cast.
 * NOTE(review): presumably relies on the VEX-encoded move clearing the bits
 * above 127 of the destination register - confirm against the ISA reference. */
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
|
||||
__m128i r;
|
||||
__asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
|
||||
return _mm512_castsi128_si512(r);
|
||||
}
|
||||
#endif // __AVX512F__
|
||||
#endif // gcc/AppleClang version test
|
||||
|
||||
#endif // __AVX2__
|
||||
|
||||
/* GCC <9 is missing some AVX512 intrinsics.
|
||||
*/
|
||||
#ifdef __AVX512F__
|
||||
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
|
||||
#include <immintrin.h>
|
||||
|
||||
#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
|
||||
((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))
|
||||
|
||||
/* Replacement for the _mm512_set_epi8 intrinsic on GCC < 9 (see the enclosing
 * version test). The 64 byte arguments are given highest element first; each
 * group of four is packed by PACK() into one 32-bit value, first argument in
 * the most significant byte, and the 16 values are passed to
 * _mm512_set_epi32 in the same order. */
static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
|
||||
char __q59, char __q58, char __q57, char __q56,
|
||||
char __q55, char __q54, char __q53, char __q52,
|
||||
char __q51, char __q50, char __q49, char __q48,
|
||||
char __q47, char __q46, char __q45, char __q44,
|
||||
char __q43, char __q42, char __q41, char __q40,
|
||||
char __q39, char __q38, char __q37, char __q36,
|
||||
char __q35, char __q34, char __q33, char __q32,
|
||||
char __q31, char __q30, char __q29, char __q28,
|
||||
char __q27, char __q26, char __q25, char __q24,
|
||||
char __q23, char __q22, char __q21, char __q20,
|
||||
char __q19, char __q18, char __q17, char __q16,
|
||||
char __q15, char __q14, char __q13, char __q12,
|
||||
char __q11, char __q10, char __q09, char __q08,
|
||||
char __q07, char __q06, char __q05, char __q04,
|
||||
char __q03, char __q02, char __q01, char __q00) {
|
||||
return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
|
||||
PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
|
||||
PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
|
||||
PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
|
||||
PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
|
||||
PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
|
||||
PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
|
||||
PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
|
||||
}
|
||||
|
||||
#undef PACK
|
||||
|
||||
#endif // gcc version test
|
||||
#endif // __AVX512F__
|
||||
|
||||
/* Missing zero-extension AVX and AVX512 intrinsics.
|
||||
* Fixed in Microsoft Visual Studio 2017 version 15.7
|
||||
* https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
|
||||
*/
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1914
|
||||
#ifdef __AVX2__
|
||||
/* Replacement for the _mm256_zextsi128_si256 intrinsic on MSVC older than
 * _MSC_VER 1914: inserts `a` as 128-bit lane 0 of an all-zero 256-bit vector. */
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
|
||||
return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
|
||||
}
|
||||
#endif // __AVX2__
|
||||
|
||||
#ifdef __AVX512F__
|
||||
/* Replacement for the _mm512_zextsi128_si512 intrinsic on MSVC older than
 * _MSC_VER 1914: inserts `a` as 128-bit lane 0 of an all-zero 512-bit vector. */
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
|
||||
return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
|
||||
}
|
||||
#endif // __AVX512F__
|
||||
#endif // defined(_MSC_VER) && _MSC_VER < 1914
|
||||
|
||||
#endif // include guard X86_INTRINS_H
|
42
3rdparty/zlib-ng/chunkset.c
vendored
Normal file
42
3rdparty/zlib-ng/chunkset.c
vendored
Normal file
@ -0,0 +1,42 @@
|
||||
/* chunkset.c -- inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
|
||||
typedef uint64_t chunk_t;
|
||||
|
||||
#define CHUNK_SIZE 8
|
||||
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
/* Fill *chunk with two back-to-back copies of the 4 bytes at `from`. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    uint8_t *halves = (uint8_t *)chunk;
    size_t offset;

    for (offset = 0; offset < 2 * sizeof(uint32_t); offset += sizeof(uint32_t))
        memcpy(halves + offset, from, sizeof(uint32_t));
}
|
||||
|
||||
/* Fill *chunk with the 8 bytes at `from`. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    const uint8_t *pattern = from;

    memcpy(chunk, pattern, sizeof(uint64_t));
}
|
||||
|
||||
/* Read 8 bytes from `s` (no alignment requirement) into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const void *src = s;

    memcpy(chunk, src, sizeof(uint64_t));
}
|
||||
|
||||
/* Write the 8 bytes of *chunk to `out` (no alignment requirement). */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const void *src = chunk;

    memcpy(out, src, sizeof(uint64_t));
}
|
||||
|
||||
#define CHUNKSIZE chunksize_c
|
||||
#define CHUNKCOPY chunkcopy_c
|
||||
#define CHUNKUNROLL chunkunroll_c
|
||||
#define CHUNKMEMSET chunkmemset_c
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_c
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#define INFLATE_FAST inflate_fast_c
|
||||
|
||||
#include "inffast_tpl.h"
|
200
3rdparty/zlib-ng/chunkset_tpl.h
vendored
Normal file
200
3rdparty/zlib-ng/chunkset_tpl.h
vendored
Normal file
@ -0,0 +1,200 @@
|
||||
/* chunkset_tpl.h -- inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
|
||||
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
|
||||
#endif
|
||||
|
||||
/* Returns the chunk size */
|
||||
/* Report sizeof(chunk_t) for this chunkset variant. */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
    const uint32_t bytes = sizeof(chunk_t);

    return bytes;
}
|
||||
|
||||
/* Behave like memcpy, but assume that it's OK to overwrite at least
|
||||
chunk_t bytes of output even if the length is shorter than this,
|
||||
that the length is non-zero, and that `from` lags `out` by at least
|
||||
sizeof chunk_t bytes (or that they don't overlap at all or simply that
|
||||
the distance is less than the length of the copy).
|
||||
|
||||
Aside from better memory bus utilisation, this means that short copies
|
||||
(chunk_t bytes or fewer) will fall straight through the loop
|
||||
without iteration, which will hopefully make the branch prediction more
|
||||
reliable. */
|
||||
#ifndef HAVE_CHUNKCOPY
|
||||
/* Copy `len` bytes from `from` to `out` one chunk_t at a time and return the
 * advanced output pointer.
 *
 * The first store always writes a whole chunk but advances by only
 * ((len - 1) % sizeof(chunk_t)) + 1 bytes, so the remaining length is a
 * multiple of the chunk size. */
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    Assert(len > 0, "chunkcopy should never have a length 0");
    chunk_t tmp;
    const int32_t head = ((len - 1) % sizeof(chunk_t)) + 1;

    /* Leading step: full-width copy, partial advance. */
    loadchunk(from, &tmp);
    storechunk(out, &tmp);
    out += head;
    from += head;
    len -= head;

    /* Remaining length is consumed in whole chunks. */
    for (; len > 0; len -= sizeof(chunk_t)) {
        loadchunk(from, &tmp);
        storechunk(out, &tmp);
        out += sizeof(chunk_t);
        from += sizeof(chunk_t);
    }

    return out;
}
|
||||
#endif
|
||||
|
||||
/* Perform short copies until distance can be rewritten as being at least
|
||||
sizeof chunk_t.
|
||||
|
||||
This assumes that it's OK to overwrite at least the first
|
||||
2*sizeof(chunk_t) bytes of output even if the copy is shorter than this.
|
||||
This assumption holds because inflate_fast() starts every iteration with at
|
||||
least 258 bytes of output space available (258 being the maximum length
|
||||
output from a single token; see inflate_fast()'s assumptions below). */
|
||||
#ifndef HAVE_CHUNKUNROLL
|
||||
/* Repeat short copies, doubling *dist each time, until *dist is no longer
 * smaller than both *len and sizeof(chunk_t). Every step copies a whole chunk
 * from the fixed source `out - <initial *dist>`, advances `out` by the current
 * distance and reduces *len by the same amount. Returns the advanced `out`. */
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
    unsigned char const *src = out - *dist;
    chunk_t tmp;

    for (;;) {
        const unsigned step = *dist;

        if (step >= *len || step >= sizeof(chunk_t))
            break;

        loadchunk(src, &tmp);
        storechunk(out, &tmp);
        out += step;
        *len -= step;
        *dist = step + step;
    }

    return out;
}
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_CHUNK_MAG
|
||||
/* Loads a magazine to feed into memory of the pattern */
|
||||
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
|
||||
/* This code takes string of length dist from "from" and repeats
|
||||
* it for as many times as can fit in a chunk_t (vector register) */
|
||||
uint32_t cpy_dist;
|
||||
uint32_t bytes_remaining = sizeof(chunk_t);
|
||||
chunk_t chunk_load;
|
||||
uint8_t *cur_chunk = (uint8_t *)&chunk_load;
|
||||
while (bytes_remaining) {
|
||||
cpy_dist = MIN(dist, bytes_remaining);
|
||||
memcpy(cur_chunk, buf, cpy_dist);
|
||||
bytes_remaining -= cpy_dist;
|
||||
cur_chunk += cpy_dist;
|
||||
/* This allows us to bypass an expensive integer division since we're effectively
|
||||
* counting in this loop, anyway */
|
||||
*chunk_rem = cpy_dist;
|
||||
}
|
||||
|
||||
return chunk_load;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
|
||||
Return OUT + LEN. */
|
||||
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
    /* Debug performance related issues when len < sizeof(uint64_t):
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
    Assert(dist > 0, "chunkmemset cannot have a distance 0");
    /* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
    if (len <= 16) {
        return chunkmemset_ssse3(out, dist, len);
    }
#endif

    uint8_t *src = out - dist;

    /* A one byte pattern is a plain memset. */
    if (dist == 1) {
        memset(out, *src, len);
        return out + len;
    }
    /* Patterns longer than a chunk are handled by the chunked copy routine. */
    if (dist > sizeof(chunk_t)) {
        return CHUNKCOPY(out, src, len);
    }

    chunk_t pattern;
    uint32_t tail = 0;

    /* TODO: possibly build up a permutation table for this if not an even modulus */
    /* Each enabled specialisation ends in a dangling `else`, so the first one
     * that matches wins and the generic magazine loader is the final fallback. */
#ifdef HAVE_CHUNKMEMSET_2
    if (dist == 2) {
        chunkmemset_2(src, &pattern);
    } else
#endif
#ifdef HAVE_CHUNKMEMSET_4
    if (dist == 4) {
        chunkmemset_4(src, &pattern);
    } else
#endif
#ifdef HAVE_CHUNKMEMSET_8
    if (dist == 8) {
        chunkmemset_8(src, &pattern);
    } else if (dist == sizeof(chunk_t)) {
        loadchunk(src, &pattern);
    } else
#endif
    {
        pattern = GET_CHUNK_MAG(src, &tail, dist);
    }

    /* When the pattern tiles the chunk exactly (tail stayed 0) two stores per
     * iteration are issued, which the original notes is beneficial on most
     * ISAs, especially x86. */
    if (tail == 0) {
        for (; len >= (2 * sizeof(chunk_t)); len -= 2 * sizeof(chunk_t)) {
            storechunk(out, &pattern);
            storechunk(out + sizeof(chunk_t), &pattern);
            out += 2 * sizeof(chunk_t);
        }
    }

    /* A whole chunk is always written, but the cursor only advances by the
     * chunk size minus `tail` so the next store starts on a pattern boundary.
     * With tail == 0 the stride is a full chunk. */
    uint32_t stride = sizeof(chunk_t) - tail;
    for (; len >= sizeof(chunk_t); out += stride) {
        storechunk(out, &pattern);
        len -= stride;
    }

    /* Fewer than sizeof(chunk_t) bytes remain: copy just those. */
    if (len == 0)
        return out;

    memcpy(out, &pattern, len);
    return out + len;
}
|
||||
|
||||
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
#if !defined(UNALIGNED64_OK)
#  if defined(UNALIGNED_OK)
    static const uint32_t align_mask = 3;
#  else
    static const uint32_t align_mask = 7;
#  endif
#endif

    /* Never produce more than the space that is left. */
    len = MIN(len, left);
    uint8_t *src = out - dist;

#if !defined(UNALIGNED64_OK)
    /* Copy single bytes until `out` satisfies the alignment mask. */
    for (; len > 0 && ((uintptr_t)out & align_mask); --len, --left) {
        *out++ = *src++;
    }
#endif

    /* With less than three chunks of room, finish byte by byte instead of
     * calling the chunked routine. */
    if (left < (unsigned)(3 * sizeof(chunk_t))) {
        for (; len > 0; --len) {
            *out++ = *src++;
        }
        return out;
    }

    return len ? CHUNKMEMSET(out, dist, len) : out;
}
|
543
3rdparty/zlib-ng/cmake/detect-intrinsics.cmake
vendored
Normal file
543
3rdparty/zlib-ng/cmake/detect-intrinsics.cmake
vendored
Normal file
@ -0,0 +1,543 @@
|
||||
# detect-intrinsics.cmake -- Detect compiler intrinsics support
|
||||
# Licensed under the Zlib license, see LICENSE.md for details
|
||||
|
||||
# Sets HAVE_ACLE_FLAG (and, for GCC/Clang without NATIVEFLAG, ACLEFLAG) according
# to whether the compiler accepts an option enabling ACLE support.
macro(check_acle_compiler_flag)
    if(NOT MSVC)
        if(NOT NATIVEFLAG AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang"))
            set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
        endif()
        # First attempt: does a trivial program build with the chosen flags?
        set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
        check_c_source_compiles(
            "int main() { return 0; }"
            HAVE_ACLE_FLAG FAIL_REGEX "not supported")
        if(NOT (NATIVEFLAG OR HAVE_ACLE_FLAG))
            # Second attempt with +simd appended; its result replaces the first one.
            set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
            set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
            check_c_source_compiles(
                "int main() { return 0; }"
                HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
            set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
            unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
        endif()
        set(CMAKE_REQUIRED_FLAGS)
    elseif(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
        # Both ARM and ARM64-targeting msvc support intrinsics, but
        # ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
        set(HAVE_ACLE_FLAG TRUE)
    endif()
endmacro()
|
||||
|
||||
# Probes ARMv6 support: sets ARMV6FLAG when -march=armv6 is accepted, then
# HAVE_ARMV6_INLINE_ASM and HAVE_ARMV6_INTRIN from two compile tests.
macro(check_armv6_compiler_flag)
    if(NOT NATIVEFLAG AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang"))
        check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6)
        if(HAVE_MARCH_ARMV6)
            set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support")
        endif()
    endif()

    # Both probes below share the same required flags.
    set(CMAKE_REQUIRED_FLAGS "${ARMV6FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")

    # Probe 1: inline assembly using the uqsub16 instruction
    check_c_source_compiles(
        "unsigned int f(unsigned int a, unsigned int b) {
            unsigned int c;
            __asm__ __volatile__ ( \"uqsub16 %0, %1, %2\" : \"=r\" (c) : \"r\" (a), \"r\" (b) );
            return (int)c;
        }
        int main(void) { return f(1,2); }"
        HAVE_ARMV6_INLINE_ASM
    )

    # Probe 2: the uqsub16 intrinsic (MSVC and non-MSVC spellings)
    check_c_source_compiles(
        "#if defined(_MSC_VER)
        #include <intrin.h>
        #else
        #include <arm_acle.h>
        #endif
        unsigned int f(unsigned int a, unsigned int b) {
        #if defined(_MSC_VER)
            return _arm_uqsub16(a, b);
        #else
            return __uqsub16(a, b);
        #endif
        }
        int main(void) { return 0; }"
        HAVE_ARMV6_INTRIN
    )

    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses AVX512FLAG for the active compiler and sets HAVE_AVX512_INTRIN and
# HAVE_MASK_INTRIN from compile tests.
macro(check_avx512_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(NOT CMAKE_HOST_UNIX AND NOT APPLE)
            set(AVX512FLAG "/arch:AVX512")
        else()
            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        # This branch is taken even when NATIVEFLAG is set (it then selects no
        # flag), so the MSVC branch below is never reached for these compilers.
        if(NOT NATIVEFLAG)
            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
            if(NOT MSVC)
                # GCC reportedly schedules AVX512 code poorly without a sensible
                # -mtune= target, so add one: cascadelake when accepted,
                # skylake-avx512 otherwise.
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                if(NOT HAVE_CASCADE_LAKE)
                    string(APPEND AVX512FLAG " -mtune=skylake-avx512")
                else()
                    string(APPEND AVX512FLAG " -mtune=cascadelake")
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
        endif()
    elseif(MSVC)
        set(AVX512FLAG "/arch:AVX512")
    endif()

    set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")

    # Basic AVX512 intrinsics
    check_c_source_compiles(
        "#include <immintrin.h>
        __m512i f(__m512i y) {
          __m512i x = _mm512_set1_epi8(2);
          return _mm512_sub_epi8(x, y);
        }
        int main(void) { return 0; }"
        HAVE_AVX512_INTRIN
    )

    # Mask intrinsics are probed separately: evidently both GCC and clang were
    # late to implementing these
    check_c_source_compiles(
        "#include <immintrin.h>
        __mmask16 f(__mmask16 x) { return _knot_mask16(x); }
        int main(void) { return 0; }"
        HAVE_MASK_INTRIN
    )

    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses AVX512VNNIFLAG for the active compiler and sets HAVE_AVX512VNNI_INTRIN
# from a compile test using _mm512_dpbusd_epi32.
macro(check_avx512vnni_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(NOT CMAKE_HOST_UNIX AND NOT APPLE)
            set(AVX512VNNIFLAG "/arch:AVX512")
        else()
            set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        # Taken even when NATIVEFLAG is set; in that case no flag is selected.
        if(NOT NATIVEFLAG)
            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
            if(NOT MSVC)
                # Add a tuning target: cascadelake when accepted, else skylake-avx512.
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                if(NOT HAVE_CASCADE_LAKE)
                    string(APPEND AVX512VNNIFLAG " -mtune=skylake-avx512")
                else()
                    string(APPEND AVX512VNNIFLAG " -mtune=cascadelake")
                endif()
                unset(HAVE_CASCADE_LAKE)
            endif()
        endif()
    elseif(MSVC)
        set(AVX512VNNIFLAG "/arch:AVX512")
    endif()

    set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m512i f(__m512i x, __m512i y) {
            __m512i z = _mm512_setzero_epi32();
            return _mm512_dpbusd_epi32(z, x, y);
        }
        int main(void) { return 0; }"
        HAVE_AVX512VNNI_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses AVX2FLAG for the active compiler and sets HAVE_AVX2_INTRIN from a
# compile test.
macro(check_avx2_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(NOT CMAKE_HOST_UNIX AND NOT APPLE)
            set(AVX2FLAG "/arch:AVX2")
        else()
            set(AVX2FLAG "-mavx2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        # Taken even when NATIVEFLAG is set; in that case no flag is selected.
        if(NOT NATIVEFLAG)
            set(AVX2FLAG "-mavx2")
        endif()
    elseif(MSVC)
        set(AVX2FLAG "/arch:AVX2")
    endif()

    # Does the compiler accept AVX2 intrinsics with those flags?
    set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m256i f(__m256i x) {
            const __m256i y = _mm256_set1_epi16(1);
            return _mm256_subs_epu16(x, y);
        }
        int main(void) { return 0; }"
        HAVE_AVX2_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses NEONFLAG (GCC/Clang without NATIVEFLAG) and sets NEON_AVAILABLE when
# the NEON header can be included with it.
macro(check_neon_compiler_flag)
    if(NOT NATIVEFLAG AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang"))
        if(NOT "${ARCH}" MATCHES "aarch64")
            set(NEONFLAG "-mfpu=neon")
        else()
            set(NEONFLAG "-march=armv8-a+simd")
        endif()
    endif()

    # A translation unit that only includes the NEON header must build.
    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#if defined(_M_ARM64) || defined(_M_ARM64EC)
        #  include <arm64_neon.h>
        #else
        #  include <arm_neon.h>
        #endif
        int main() { return 0; }"
        NEON_AVAILABLE FAIL_REGEX "not supported")
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses NEONFLAG (GCC/Clang without NATIVEFLAG) and sets NEON_HAS_LD4 when
# vld1q_s32_x4 compiles.
macro(check_neon_ld4_intrinsics)
    if(NOT NATIVEFLAG AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang"))
        if(NOT "${ARCH}" MATCHES "aarch64")
            set(NEONFLAG "-mfpu=neon")
        else()
            set(NEONFLAG "-march=armv8-a+simd")
        endif()
    endif()

    # Can four NEON vectors be loaded into a register range in one call?
    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
        #  include <arm64_neon.h>
        #else
        #  include <arm_neon.h>
        #endif
        int32x4x4_t f(int var[16]) { return vld1q_s32_x4(var); }
        int main(void) { return 0; }"
        NEON_HAS_LD4)
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses PCLMULFLAG (GCC/Clang without NATIVEFLAG) and sets
# HAVE_PCLMULQDQ_INTRIN; the probe is skipped on 32-bit Apple targets.
macro(check_pclmulqdq_intrinsics)
    if(NOT NATIVEFLAG AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang"))
        set(PCLMULFLAG "-mpclmul")
    endif()

    if(APPLE AND "${ARCH}" MATCHES "i386")
        # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now.
        set(HAVE_PCLMULQDQ_INTRIN OFF)
    else()
        # Does the compiler accept the PCLMULQDQ intrinsic?
        set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
        check_c_source_compiles(
            "#include <immintrin.h>
            #include <wmmintrin.h>
            __m128i f(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x10); }
            int main(void) { return 0; }"
            HAVE_PCLMULQDQ_INTRIN
        )
        set(CMAKE_REQUIRED_FLAGS)
    endif()
endmacro()
|
||||
|
||||
# Chooses VPCLMULFLAG (GCC/Clang without NATIVEFLAG) and sets
# HAVE_VPCLMULQDQ_INTRIN; the probe is skipped on 32-bit Apple targets.
macro(check_vpclmulqdq_intrinsics)
    if(NOT NATIVEFLAG AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang"))
        set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
    endif()

    if(APPLE AND "${ARCH}" MATCHES "i386")
        set(HAVE_VPCLMULQDQ_INTRIN OFF)
    else()
        # Does the compiler accept the 512-bit carry-less multiply intrinsic?
        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
        check_c_source_compiles(
            "#include <immintrin.h>
            #include <wmmintrin.h>
            __m512i f(__m512i a) {
                __m512i b = _mm512_setzero_si512();
                return _mm512_clmulepi64_epi128(a, b, 0x10);
            }
            int main(void) { return 0; }"
            HAVE_VPCLMULQDQ_INTRIN
        )
        set(CMAKE_REQUIRED_FLAGS)
    endif()
endmacro()
|
||||
|
||||
# Runs three PowerPC probes: HAVE_ALTIVEC (-maltivec), HAVE_NOVSX (-maltivec
# -mno-vsx) and HAVE_VMX (hwcap query). PPCFLAGS collects the accepted flags.
macro(check_ppc_intrinsics)
    # Probe 1: AltiVec vector code with -maltivec
    set(CMAKE_REQUIRED_FLAGS "-maltivec ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <altivec.h>
        int main(void)
        {
            vector int a = vec_splats(0);
            vector int b = vec_splats(0);
            a = vec_add(a, b);
            return 0;
        }"
        HAVE_ALTIVEC
    )
    set(CMAKE_REQUIRED_FLAGS)
    if(HAVE_ALTIVEC)
        set(PPCFLAGS "-maltivec")
    endif()

    # Probe 2: the same program with VSX disabled as well
    set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <altivec.h>
        int main(void)
        {
            vector int a = vec_splats(0);
            vector int b = vec_splats(0);
            a = vec_add(a, b);
            return 0;
        }"
        HAVE_NOVSX
    )
    set(CMAKE_REQUIRED_FLAGS)
    if(HAVE_NOVSX)
        string(APPEND PPCFLAGS " -mno-vsx")
    endif()

    # Probe 3: the hwcap query for PPC_FEATURE_HAS_ALTIVEC must compile
    set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE_HAS_ALTIVEC);
        #else
            return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
        #endif
        }"
        HAVE_VMX
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses POWER8FLAG (GCC/Clang without NATIVEFLAG) and sets HAVE_POWER8_INTRIN
# when the hwcap query for PPC_FEATURE2_ARCH_2_07 compiles.
macro(check_power8_intrinsics)
    if(NOT NATIVEFLAG AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang"))
        set(POWER8FLAG "-mcpu=power8")
    endif()

    # Is everything needed for the POWER8 optimizations available?
    set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE2_ARCH_2_07);
        #else
            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
        #endif
        }"
        HAVE_POWER8_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses RISCVFLAG (GCC/Clang without NATIVEFLAG) and sets HAVE_RVV_INTRIN when
# <riscv_vector.h> can be included.
macro(check_rvv_intrinsics)
    if(NOT NATIVEFLAG AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang"))
        set(RISCVFLAG "-march=rv64gcv")
    endif()

    # Does a program including the RVV header compile?
    set(CMAKE_REQUIRED_FLAGS "${RISCVFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <riscv_vector.h>
        int main() {
            return 0;
        }"
        HAVE_RVV_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Sets HAVE_S390_INTRIN when a getauxval() query for HWCAP_S390_VXRS compiles.
# This macro does not touch CMAKE_REQUIRED_FLAGS.
macro(check_s390_intrinsics)
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifndef HWCAP_S390_VXRS
        #define HWCAP_S390_VXRS HWCAP_S390_VX
        #endif
        int main() {
            return (getauxval(AT_HWCAP) & HWCAP_S390_VXRS);
        }"
        HAVE_S390_INTRIN)
endmacro()
|
||||
|
||||
# Chooses POWER9FLAG (GCC/Clang without NATIVEFLAG) and sets HAVE_POWER9_INTRIN
# when the hwcap query for PPC_FEATURE2_ARCH_3_00 compiles.
macro(check_power9_intrinsics)
    if(NOT NATIVEFLAG AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang"))
        set(POWER9FLAG "-mcpu=power9")
    endif()

    # Is everything needed for the POWER9 optimizations available?
    set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <sys/auxv.h>
        #ifdef __FreeBSD__
        #include <machine/cpu.h>
        #endif
        int main() {
        #ifdef __FreeBSD__
            unsigned long hwcap;
            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
            return (hwcap & PPC_FEATURE2_ARCH_3_00);
        #else
            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_00);
        #endif
        }"
        HAVE_POWER9_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses SSE2FLAG for the active compiler and sets HAVE_SSE2_INTRIN from a
# compile test.
macro(check_sse2_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(NOT CMAKE_HOST_UNIX AND NOT APPLE)
            set(SSE2FLAG "/arch:SSE2")
        else()
            set(SSE2FLAG "-msse2")
        endif()
    elseif(MSVC)
        # The flag is only passed when ARCH does not match x86_64.
        if(NOT "${ARCH}" MATCHES "x86_64")
            set(SSE2FLAG "/arch:SSE2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSE2FLAG "-msse2")
        endif()
    endif()

    # Does the compiler accept SSE2 intrinsics with those flags?
    set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m128i f(__m128i x, __m128i y) { return _mm_sad_epu8(x, y); }
        int main(void) { return 0; }"
        HAVE_SSE2_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses SSSE3FLAG for the active compiler and sets HAVE_SSSE3_INTRIN from a
# compile test.
macro(check_ssse3_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(CMAKE_HOST_UNIX OR APPLE)
            set(SSSE3FLAG "-mssse3")
        else()
            set(SSSE3FLAG "/arch:SSSE3")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSSE3FLAG "-mssse3")
        endif()
    endif()
    # Check whether compiler supports SSSE3 intrinsics
    set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <immintrin.h>
        __m128i f(__m128i u) {
          __m128i v = _mm_set1_epi32(1);
          return _mm_hadd_epi32(u, v);
        }
        int main(void) { return 0; }"
        HAVE_SSSE3_INTRIN
    )
    # Macros share the caller's scope, so the required flags must be cleared
    # here; otherwise the SSSE3 flags stay active for every later check_* call.
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses SSE42FLAG for the active compiler and sets HAVE_SSE42_INTRIN from a
# compile test using _mm_crc32_u32.
macro(check_sse42_intrinsics)
    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
        if(NOT CMAKE_HOST_UNIX AND NOT APPLE)
            set(SSE42FLAG "/arch:SSE4.2")
        else()
            set(SSE42FLAG "-msse4.2")
        endif()
    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
        if(NOT NATIVEFLAG)
            set(SSE42FLAG "-msse4.2")
        endif()
    endif()

    # Does the compiler accept SSE4.2 intrinsics with those flags?
    set(CMAKE_REQUIRED_FLAGS "${SSE42FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <nmmintrin.h>
        unsigned int f(unsigned int a, unsigned int b) { return _mm_crc32_u32(a, b); }
        int main(void) { return 0; }"
        HAVE_SSE42_INTRIN
    )
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Builds VGFMAFLAG (unless NATIVEFLAG is set) and sets HAVE_VGFMA_INTRIN when
# vec_gfmsum_accum_128 compiles.
macro(check_vgfma_intrinsics)
    if(NOT NATIVEFLAG)
        set(VGFMAFLAG "-march=z13")
        # Compiler specific additions on top of the -march selection
        if(CMAKE_C_COMPILER_ID MATCHES "GNU")
            string(APPEND VGFMAFLAG " -mzarch")
        endif()
        if(CMAKE_C_COMPILER_ID MATCHES "Clang")
            string(APPEND VGFMAFLAG " -fzvector")
        endif()
    endif()

    # Probe for the "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic
    set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#include <vecintrin.h>
        int main(void) {
            unsigned long long a __attribute__((vector_size(16))) = { 0 };
            unsigned long long b __attribute__((vector_size(16))) = { 0 };
            unsigned char c __attribute__((vector_size(16))) = { 0 };
            c = vec_gfmsum_accum_128(a, b, c);
            return c[0];
        }"
        HAVE_VGFMA_INTRIN FAIL_REGEX "not supported")
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
||||
|
||||
# Chooses XSAVEFLAG (non-MSVC, no NATIVEFLAG) and sets HAVE_XSAVE_INTRIN when
# _xgetbv compiles.
macro(check_xsave_intrinsics)
    if(NOT (NATIVEFLAG OR MSVC))
        set(XSAVEFLAG "-mxsave")
    endif()

    set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
    check_c_source_compiles(
        "#ifdef _MSC_VER
        #  include <intrin.h>
        #else
        #  include <x86gprintrin.h>
        #endif
        unsigned int f(unsigned int a) { return (int) _xgetbv(a); }
        int main(void) { return 0; }"
        HAVE_XSAVE_INTRIN FAIL_REGEX "not supported")
    set(CMAKE_REQUIRED_FLAGS)
endmacro()
|
19
3rdparty/zlib-ng/cmake/fallback-macros.cmake
vendored
Normal file
19
3rdparty/zlib-ng/cmake/fallback-macros.cmake
vendored
Normal file
@ -0,0 +1,19 @@
|
||||
# fallback-macros.cmake -- CMake fallback macros
|
||||
# Copyright (C) 2022 Nathan Moinvaziri
|
||||
# Licensed under the Zlib license, see LICENSE.md for details
|
||||
|
||||
# CMake less than version 3.5.2
|
||||
if(NOT COMMAND add_compile_options)
    # Fallback for CMake versions without add_compile_options(): append every
    # given option to the global C and C++ flag strings.
    #   options - first option; any further arguments are handled the same way.
    macro(add_compile_options options)
        # ${ARGV} holds the named argument plus all extras, so none are dropped.
        foreach(zng_fallback_option ${ARGV})
            # The leading space keeps the new option from fusing with the
            # previous flag. set() is used instead of string(APPEND) so this
            # fallback does not itself need a newer CMake (string(APPEND)
            # arrived in 3.4).
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${zng_fallback_option}")
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${zng_fallback_option}")
        endforeach()
    endmacro()
endif()
|
||||
|
||||
# CMake less than version 3.14
|
||||
if(NOT COMMAND add_link_options)
    # Fallback for CMake versions without add_link_options(): append every
    # given option to the executable and shared-library linker flag strings.
    #   options - first option; any further arguments are handled the same way.
    macro(add_link_options options)
        # ${ARGV} holds the named argument plus all extras, so none are dropped.
        foreach(zng_fallback_option ${ARGV})
            # The leading space keeps the new option from fusing with the
            # previous flag.
            string(APPEND CMAKE_EXE_LINKER_FLAGS " ${zng_fallback_option}")
            string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${zng_fallback_option}")
        endforeach()
    endmacro()
endif()
|
180
3rdparty/zlib-ng/compare256.c
vendored
Normal file
180
3rdparty/zlib-ng/compare256.c
vendored
Normal file
@ -0,0 +1,180 @@
|
||||
/* compare256.c -- 256 byte memory comparison with match length return
|
||||
* Copyright (C) 2020 Nathan Moinvaziri
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
/* ALIGNED, byte comparison */
|
||||
/* Returns the number of leading bytes (0..256) that are equal in src0 and src1.
 * Bytes are compared one at a time, eight comparisons per loop pass. */
static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t pos;

    for (pos = 0; pos < 256; pos += 8) {
        if (src0[pos + 0] != src1[pos + 0])
            return pos + 0;
        if (src0[pos + 1] != src1[pos + 1])
            return pos + 1;
        if (src0[pos + 2] != src1[pos + 2])
            return pos + 2;
        if (src0[pos + 3] != src1[pos + 3])
            return pos + 3;
        if (src0[pos + 4] != src1[pos + 4])
            return pos + 4;
        if (src0[pos + 5] != src1[pos + 5])
            return pos + 5;
        if (src0[pos + 6] != src1[pos + 6])
            return pos + 6;
        if (src0[pos + 7] != src1[pos + 7])
            return pos + 7;
    }

    return 256;
}
|
||||
|
||||
/* Non-inline entry point that forwards to compare256_c_static. */
Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t matched = compare256_c_static(src0, src1);
    return matched;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_c
|
||||
#define COMPARE256 compare256_c_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_c
|
||||
#define COMPARE256 compare256_c_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
/* 16-bit unaligned integer comparison */
|
||||
/* Returns the number of leading equal bytes (0..256), comparing two bytes at a
 * time with zng_memcmp_2, four pairs per loop pass. When a pair differs, the
 * first byte of that pair decides whether one more byte matched. */
static inline uint32_t compare256_unaligned_16_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t pos;

    for (pos = 0; pos < 256; pos += 8) {
        if (zng_memcmp_2(src0 + pos, src1 + pos) != 0)
            return pos + (src0[pos] == src1[pos]);

        if (zng_memcmp_2(src0 + pos + 2, src1 + pos + 2) != 0)
            return pos + 2 + (src0[pos + 2] == src1[pos + 2]);

        if (zng_memcmp_2(src0 + pos + 4, src1 + pos + 4) != 0)
            return pos + 4 + (src0[pos + 4] == src1[pos + 4]);

        if (zng_memcmp_2(src0 + pos + 6, src1 + pos + 6) != 0)
            return pos + 6 + (src0[pos + 6] == src1[pos + 6]);
    }

    return 256;
}
|
||||
|
||||
/* Non-inline entry point that forwards to compare256_unaligned_16_static. */
Z_INTERNAL uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t matched = compare256_unaligned_16_static(src0, src1);
    return matched;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_unaligned_16
|
||||
#define COMPARE256 compare256_unaligned_16_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_unaligned_16
|
||||
#define COMPARE256 compare256_unaligned_16_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#ifdef HAVE_BUILTIN_CTZ
|
||||
/* 32-bit unaligned integer comparison */
|
||||
/* Returns the number of leading equal bytes (0..256), comparing four bytes at a
 * time. The words are loaded with memcpy; on a mismatch the count of trailing
 * zero bits in the XOR, divided by 8, gives the offset of the first differing
 * byte within the word. */
static inline uint32_t compare256_unaligned_32_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t pos;

    for (pos = 0; pos < 256; pos += 4) {
        uint32_t lhs, rhs, delta;

        memcpy(&lhs, src0 + pos, sizeof(lhs));
        memcpy(&rhs, src1 + pos, sizeof(rhs));

        delta = lhs ^ rhs;
        if (delta != 0) {
            uint32_t byte_in_word = __builtin_ctz(delta) / 8;
            return pos + byte_in_word;
        }
    }

    return 256;
}
|
||||
|
||||
/* Non-inline entry point that forwards to compare256_unaligned_32_static. */
Z_INTERNAL uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t matched = compare256_unaligned_32_static(src0, src1);
    return matched;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_unaligned_32
|
||||
#define COMPARE256 compare256_unaligned_32_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_unaligned_32
|
||||
#define COMPARE256 compare256_unaligned_32_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
/* UNALIGNED64_OK, 64-bit integer comparison */
|
||||
/* Returns the number of leading equal bytes (0..256), comparing eight bytes at
 * a time. The words are loaded with memcpy; on a mismatch the count of trailing
 * zero bits in the XOR, divided by 8, gives the offset of the first differing
 * byte within the word. */
static inline uint32_t compare256_unaligned_64_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t pos;

    for (pos = 0; pos < 256; pos += 8) {
        uint64_t lhs, rhs, delta;

        memcpy(&lhs, src0 + pos, sizeof(lhs));
        memcpy(&rhs, src1 + pos, sizeof(rhs));

        delta = lhs ^ rhs;
        if (delta != 0) {
            uint64_t byte_in_word = __builtin_ctzll(delta) / 8;
            return pos + (uint32_t)byte_in_word;
        }
    }

    return 256;
}
|
||||
|
||||
/* Non-inline entry point that forwards to compare256_unaligned_64_static. */
Z_INTERNAL uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
    const uint32_t matched = compare256_unaligned_64_static(src0, src1);
    return matched;
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_unaligned_64
|
||||
#define COMPARE256 compare256_unaligned_64_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_unaligned_64
|
||||
#define COMPARE256 compare256_unaligned_64_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
134
3rdparty/zlib-ng/compare256_rle.h
vendored
Normal file
134
3rdparty/zlib-ng/compare256_rle.h
vendored
Normal file
@ -0,0 +1,134 @@
|
||||
/* compare256_rle.h -- 256 byte run-length encoding comparison
|
||||
* Copyright (C) 2022 Nathan Moinvaziri
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
|
||||
|
||||
/* ALIGNED, byte comparison */
|
||||
/* Returns how many leading bytes of src1 (0..256) equal the single byte *src0.
 * Only src1 advances; eight comparisons are made per loop pass. */
static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
    uint32_t pos;

    for (pos = 0; pos < 256; pos += 8) {
        if (*src0 != src1[pos + 0])
            return pos + 0;
        if (*src0 != src1[pos + 1])
            return pos + 1;
        if (*src0 != src1[pos + 2])
            return pos + 2;
        if (*src0 != src1[pos + 3])
            return pos + 3;
        if (*src0 != src1[pos + 4])
            return pos + 4;
        if (*src0 != src1[pos + 5])
            return pos + 5;
        if (*src0 != src1[pos + 6])
            return pos + 6;
        if (*src0 != src1[pos + 7])
            return pos + 7;
    }

    return 256;
}
|
||||
|
||||
#ifdef UNALIGNED_OK
|
||||
/* 16-bit unaligned integer comparison */
|
||||
/* Compares src1, two bytes at a time, against the 16-bit value read once from
 * src0; four pairs are checked per loop pass. When a pair differs, one more
 * byte is counted if the first byte of that pair still equals *src0. */
static inline uint32_t compare256_rle_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
    uint32_t pos;
    uint16_t pattern, probe;

    memcpy(&pattern, src0, sizeof(pattern));

    for (pos = 0; pos < 256; pos += 8) {
        memcpy(&probe, src1 + pos, sizeof(probe));
        if (pattern != probe)
            return pos + (*src0 == src1[pos]);

        memcpy(&probe, src1 + pos + 2, sizeof(probe));
        if (pattern != probe)
            return pos + 2 + (*src0 == src1[pos + 2]);

        memcpy(&probe, src1 + pos + 4, sizeof(probe));
        if (pattern != probe)
            return pos + 4 + (*src0 == src1[pos + 4]);

        memcpy(&probe, src1 + pos + 6, sizeof(probe));
        if (pattern != probe)
            return pos + 6 + (*src0 == src1[pos + 6]);
    }

    return 256;
}
|
||||
|
||||
#ifdef HAVE_BUILTIN_CTZ
|
||||
/* 32-bit unaligned integer comparison */
|
||||
/* Compares src1, four bytes at a time, against the 16-bit value from src0
 * repeated to fill 32 bits. On a mismatch the count of trailing zero bits in
 * the XOR, divided by 8, gives the offset of the first differing byte. */
static inline uint32_t compare256_rle_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
    uint32_t pos, pattern;
    uint16_t seed;

    memcpy(&seed, src0, sizeof(seed));
    pattern = ((uint32_t)seed << 16) | seed;

    for (pos = 0; pos < 256; pos += 4) {
        uint32_t probe, delta;

        memcpy(&probe, src1 + pos, sizeof(probe));

        delta = pattern ^ probe;
        if (delta != 0) {
            uint32_t byte_in_word = __builtin_ctz(delta) / 8;
            return pos + byte_in_word;
        }
    }

    return 256;
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
/* Run-length comparison, 64 bits at a time.
 *
 * The first two bytes of src0 are replicated into a 64-bit pattern, which is
 * XORed against consecutive 64-bit loads from src1 (at most 256 bytes).
 * On a mismatch the index of the lowest set bit of the XOR, divided by 8,
 * is added to the running count; this is the offset of the first differing
 * byte in memory on a little-endian target.
 * Returns 256 when all 32 words are equal to the pattern.
 */
static inline uint32_t compare256_rle_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
    uint16_t pair;
    uint32_t quad, matched;
    uint64_t pattern;

    /* Widen the 2-byte seed: 16 -> 32 -> 64 bits. */
    memcpy(&pair, src0, sizeof(pair));
    quad = ((uint32_t)pair << 16) | pair;
    pattern = ((uint64_t)quad << 32) | quad;

    for (matched = 0; matched < 256; matched += 8, src1 += 8) {
        uint64_t window, mismatch;

        memcpy(&window, src1, sizeof(window));
        mismatch = pattern ^ window;
        if (mismatch != 0) {
            /* Bit index of the first difference -> byte index. */
            return matched + (uint32_t)(__builtin_ctzll(mismatch) / 8);
        }
    }

    return 256;
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
98
3rdparty/zlib-ng/compress.c
vendored
Normal file
98
3rdparty/zlib-ng/compress.c
vendored
Normal file
@ -0,0 +1,98 @@
|
||||
/* compress.c -- compress a memory buffer
|
||||
* Copyright (C) 1995-2005, 2014, 2016 Jean-loup Gailly, Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
|
||||
/* ===========================================================================
|
||||
* Architecture-specific hooks.
|
||||
*/
|
||||
#ifdef S390_DFLTCC_DEFLATE
|
||||
# include "arch/s390/dfltcc_common.h"
|
||||
#else
|
||||
/* Returns the upper bound on compressed data length based on uncompressed data length, assuming default settings.
|
||||
* Zero means that arch-specific deflation code behaves identically to the regular zlib-ng algorithms. */
|
||||
# define DEFLATE_BOUND_COMPLEN(source_len) 0
|
||||
#endif
|
||||
|
||||
/* ===========================================================================
|
||||
Compresses the source buffer into the destination buffer. The level
|
||||
parameter has the same meaning as in deflateInit. sourceLen is the byte
|
||||
length of the source buffer. Upon entry, destLen is the total size of the
|
||||
destination buffer, which must be at least 0.1% larger than sourceLen plus
|
||||
12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
|
||||
|
||||
compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
|
||||
memory, Z_BUF_ERROR if there was not enough room in the output buffer,
|
||||
Z_STREAM_ERROR if the level parameter is invalid.
|
||||
*/
|
||||
int Z_EXPORT PREFIX(compress2)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source,
|
||||
z_uintmax_t sourceLen, int level) {
|
||||
PREFIX3(stream) stream;
|
||||
int err;
|
||||
const unsigned int max = (unsigned int)-1;
|
||||
z_size_t left;
|
||||
|
||||
left = *destLen;
|
||||
*destLen = 0;
|
||||
|
||||
stream.zalloc = NULL;
|
||||
stream.zfree = NULL;
|
||||
stream.opaque = NULL;
|
||||
|
||||
err = PREFIX(deflateInit)(&stream, level);
|
||||
if (err != Z_OK)
|
||||
return err;
|
||||
|
||||
stream.next_out = dest;
|
||||
stream.avail_out = 0;
|
||||
stream.next_in = (z_const unsigned char *)source;
|
||||
stream.avail_in = 0;
|
||||
|
||||
do {
|
||||
if (stream.avail_out == 0) {
|
||||
stream.avail_out = left > (unsigned long)max ? max : (unsigned int)left;
|
||||
left -= stream.avail_out;
|
||||
}
|
||||
if (stream.avail_in == 0) {
|
||||
stream.avail_in = sourceLen > (unsigned long)max ? max : (unsigned int)sourceLen;
|
||||
sourceLen -= stream.avail_in;
|
||||
}
|
||||
err = PREFIX(deflate)(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH);
|
||||
} while (err == Z_OK);
|
||||
|
||||
*destLen = stream.total_out;
|
||||
PREFIX(deflateEnd)(&stream);
|
||||
return err == Z_STREAM_END ? Z_OK : err;
|
||||
}
|
||||
|
||||
/* ===========================================================================
|
||||
*/
|
||||
int Z_EXPORT PREFIX(compress)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source, z_uintmax_t sourceLen) {
|
||||
return PREFIX(compress2)(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION);
|
||||
}
|
||||
|
||||
/* ===========================================================================
|
||||
If the default memLevel or windowBits for deflateInit() is changed, then
|
||||
this function needs to be updated.
|
||||
*/
|
||||
z_uintmax_t Z_EXPORT PREFIX(compressBound)(z_uintmax_t sourceLen) {
|
||||
z_uintmax_t complen = DEFLATE_BOUND_COMPLEN(sourceLen);
|
||||
|
||||
if (complen > 0)
|
||||
/* Architecture-specific code provided an upper bound. */
|
||||
return complen + ZLIB_WRAPLEN;
|
||||
|
||||
#ifndef NO_QUICK_STRATEGY
|
||||
return sourceLen /* The source size itself */
|
||||
+ (sourceLen == 0 ? 1 : 0) /* Always at least one byte for any input */
|
||||
+ (sourceLen < 9 ? 1 : 0) /* One extra byte for lengths less than 9 */
|
||||
+ DEFLATE_QUICK_OVERHEAD(sourceLen) /* Source encoding overhead, padded to next full byte */
|
||||
+ DEFLATE_BLOCK_OVERHEAD /* Deflate block overhead bytes */
|
||||
+ ZLIB_WRAPLEN; /* zlib wrapper */
|
||||
#else
|
||||
return sourceLen + (sourceLen >> 4) + 7 + ZLIB_WRAPLEN;
|
||||
#endif
|
||||
}
|
23
3rdparty/zlib-ng/cpu_features.c
vendored
Normal file
23
3rdparty/zlib-ng/cpu_features.c
vendored
Normal file
@ -0,0 +1,23 @@
|
||||
/* cpu_features.c -- CPU architecture feature check
|
||||
* Copyright (C) 2017 Hans Kristian Rosbach
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "cpu_features.h"
|
||||
#include <string.h>
|
||||
|
||||
Z_INTERNAL void cpu_check_features(struct cpu_features *features) {
|
||||
memset(features, 0, sizeof(struct cpu_features));
|
||||
#if defined(X86_FEATURES)
|
||||
x86_check_features(&features->x86);
|
||||
#elif defined(ARM_FEATURES)
|
||||
arm_check_features(&features->arm);
|
||||
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
|
||||
power_check_features(&features->power);
|
||||
#elif defined(S390_FEATURES)
|
||||
s390_check_features(&features->s390);
|
||||
#elif defined(RISCV_FEATURES)
|
||||
riscv_check_features(&features->riscv);
|
||||
#endif
|
||||
}
|
303
3rdparty/zlib-ng/cpu_features.h
vendored
Normal file
303
3rdparty/zlib-ng/cpu_features.h
vendored
Normal file
@ -0,0 +1,303 @@
|
||||
/* cpu_features.h -- CPU architecture feature check
|
||||
* Copyright (C) 2017 Hans Kristian Rosbach
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef CPU_FEATURES_H_
|
||||
#define CPU_FEATURES_H_
|
||||
|
||||
#include "adler32_fold.h"
|
||||
#include "crc32_fold.h"
|
||||
|
||||
#if defined(X86_FEATURES)
|
||||
# include "arch/x86/x86_features.h"
|
||||
# include "fallback_builtins.h"
|
||||
#elif defined(ARM_FEATURES)
|
||||
# include "arch/arm/arm_features.h"
|
||||
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
|
||||
# include "arch/power/power_features.h"
|
||||
#elif defined(S390_FEATURES)
|
||||
# include "arch/s390/s390_features.h"
|
||||
#elif defined(RISCV_FEATURES)
|
||||
# include "arch/riscv/riscv_features.h"
|
||||
#endif
|
||||
|
||||
/* Container for the CPU feature flags of the target architecture.
 * Exactly one member is compiled in, chosen by the same *_FEATURES macro
 * chain that selects the architecture header included above. */
struct cpu_features {
#if defined(X86_FEATURES)
    struct x86_cpu_features x86;
#elif defined(ARM_FEATURES)
    struct arm_cpu_features arm;
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
    struct power_cpu_features power;
#elif defined(S390_FEATURES)
    struct s390_cpu_features s390;
#elif defined(RISCV_FEATURES)
    struct riscv_cpu_features riscv;
#else
    /* Placeholder when no feature macro is defined (presumably so the
     * struct is never empty -- confirm). */
    char empty;
#endif
};
|
||||
|
||||
extern void cpu_check_features(struct cpu_features *features);
|
||||
|
||||
/* adler32 */
|
||||
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
|
||||
extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
#ifdef ARM_NEON
|
||||
extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
#ifdef PPC_VMX
|
||||
extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
#ifdef RISCV_RVV
|
||||
extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
#ifdef X86_SSSE3
|
||||
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
#ifdef X86_AVX2
|
||||
extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
#ifdef X86_AVX512
|
||||
extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
#ifdef X86_AVX512VNNI
|
||||
extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
#ifdef POWER8_VSX
|
||||
extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
|
||||
/* adler32 folding */
|
||||
#ifdef RISCV_RVV
|
||||
extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
#endif
|
||||
#ifdef X86_SSE42
|
||||
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
#endif
|
||||
#ifdef X86_AVX2
|
||||
extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
#endif
|
||||
#ifdef X86_AVX512
|
||||
extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
#endif
|
||||
#ifdef X86_AVX512VNNI
|
||||
extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
#endif
|
||||
|
||||
/* CRC32 folding */
|
||||
#ifdef X86_PCLMULQDQ_CRC
|
||||
extern uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
|
||||
extern void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
|
||||
extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
|
||||
extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
|
||||
extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
|
||||
extern void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
extern void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
|
||||
extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
|
||||
extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
|
||||
/* memory chunking */
|
||||
extern uint32_t chunksize_c(void);
|
||||
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
#ifdef X86_SSE2
|
||||
extern uint32_t chunksize_sse2(void);
|
||||
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
#endif
|
||||
#ifdef X86_SSSE3
|
||||
extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
#endif
|
||||
#ifdef X86_AVX2
|
||||
extern uint32_t chunksize_avx2(void);
|
||||
extern uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
#endif
|
||||
#ifdef ARM_NEON
|
||||
extern uint32_t chunksize_neon(void);
|
||||
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
#endif
|
||||
#ifdef POWER8_VSX
|
||||
extern uint32_t chunksize_power8(void);
|
||||
extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
#endif
|
||||
#ifdef RISCV_RVV
|
||||
extern uint32_t chunksize_rvv(void);
|
||||
extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
#endif
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
typedef struct z_stream_s z_stream;
|
||||
#else
|
||||
typedef struct zng_stream_s zng_stream;
|
||||
#endif
|
||||
|
||||
/* inflate fast loop */
|
||||
extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
|
||||
#ifdef X86_SSE2
|
||||
extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
#ifdef X86_SSSE3
|
||||
extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
#ifdef X86_AVX2
|
||||
extern void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
#ifdef ARM_NEON
|
||||
extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
#ifdef POWER8_VSX
|
||||
extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
#ifdef RISCV_RVV
|
||||
extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
|
||||
#endif
|
||||
|
||||
/* CRC32 */
|
||||
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
|
||||
|
||||
extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
#ifdef ARM_ACLE
|
||||
extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
#elif defined(POWER8_VSX)
|
||||
extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
#elif defined(S390_CRC32_VX)
|
||||
extern uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
#endif
|
||||
|
||||
/* compare256 */
|
||||
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
|
||||
|
||||
extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
|
||||
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
extern uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
|
||||
#ifdef HAVE_BUILTIN_CTZ
|
||||
extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
|
||||
#endif
|
||||
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
|
||||
#endif
|
||||
#endif
|
||||
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
|
||||
extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
|
||||
#endif
|
||||
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
|
||||
extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
|
||||
#endif
|
||||
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
|
||||
extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
|
||||
#endif
|
||||
#ifdef POWER9
|
||||
extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
|
||||
#endif
|
||||
#ifdef RISCV_RVV
|
||||
extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
|
||||
#endif
|
||||
|
||||
#ifdef DEFLATE_H_
|
||||
/* insert_string */
|
||||
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
|
||||
#ifdef X86_SSE42
|
||||
extern void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count);
|
||||
#elif defined(ARM_ACLE)
|
||||
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
|
||||
#endif
|
||||
|
||||
/* longest_match */
|
||||
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
|
||||
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
|
||||
#ifdef HAVE_BUILTIN_CTZ
|
||||
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#endif
|
||||
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
|
||||
extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
|
||||
extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
|
||||
extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#ifdef POWER9
|
||||
extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#ifdef RISCV_RVV
|
||||
extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
|
||||
/* longest_match_slow */
|
||||
extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
|
||||
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
|
||||
extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
|
||||
#ifdef UNALIGNED64_OK
|
||||
extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#endif
|
||||
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
|
||||
extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
|
||||
extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
|
||||
extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#ifdef POWER9
|
||||
extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
#ifdef RISCV_RVV
|
||||
extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
|
||||
#endif
|
||||
|
||||
/* quick_insert_string */
|
||||
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
|
||||
#ifdef X86_SSE42
|
||||
extern Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str);
|
||||
#elif defined(ARM_ACLE)
|
||||
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
|
||||
#endif
|
||||
|
||||
/* slide_hash */
|
||||
typedef void (*slide_hash_func)(deflate_state *s);
|
||||
|
||||
#ifdef X86_SSE2
|
||||
extern void slide_hash_sse2(deflate_state *s);
|
||||
#endif
|
||||
#if defined(ARM_SIMD)
|
||||
extern void slide_hash_armv6(deflate_state *s);
|
||||
#endif
|
||||
#if defined(ARM_NEON)
|
||||
extern void slide_hash_neon(deflate_state *s);
|
||||
#endif
|
||||
#if defined(PPC_VMX)
|
||||
extern void slide_hash_vmx(deflate_state *s);
|
||||
#endif
|
||||
#if defined(POWER8_VSX)
|
||||
extern void slide_hash_power8(deflate_state *s);
|
||||
#endif
|
||||
#if defined(RISCV_RVV)
|
||||
extern void slide_hash_rvv(deflate_state *s);
|
||||
#endif
|
||||
#ifdef X86_AVX2
|
||||
extern void slide_hash_avx2(deflate_state *s);
|
||||
#endif
|
||||
|
||||
/* update_hash */
|
||||
extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
|
||||
#ifdef X86_SSE42
|
||||
extern uint32_t update_hash_sse42(deflate_state *const s, uint32_t h, uint32_t val);
|
||||
#elif defined(ARM_ACLE)
|
||||
extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
267
3rdparty/zlib-ng/crc32_braid.c
vendored
Normal file
267
3rdparty/zlib-ng/crc32_braid.c
vendored
Normal file
@ -0,0 +1,267 @@
|
||||
/* crc32_braid.c -- compute the CRC-32 of a data stream
|
||||
* Copyright (C) 1995-2022 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
* This interleaved implementation of a CRC makes use of pipelined multiple
|
||||
* arithmetic-logic units, commonly found in modern CPU cores. It is due to
|
||||
* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "functable.h"
|
||||
#include "crc32_braid_p.h"
|
||||
#include "crc32_braid_tbl.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
/* Expose the byte-wise CRC lookup table (crc_table) as a pointer to uint32_t. */
const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
    const uint32_t *table = (const uint32_t *)crc_table;

    return table;
}
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
|
||||
if (buf == NULL) return 0;
|
||||
|
||||
return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
|
||||
if (buf == NULL) return 0;
|
||||
|
||||
return functable.crc32(crc, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
|
||||
return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
|
||||
return PREFIX(crc32_z)(crc, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
/*
|
||||
A CRC of a message is computed on N braids of words in the message, where
|
||||
each word consists of W bytes (4 or 8). If N is 3, for example, then three
|
||||
running sparse CRCs are calculated respectively on each braid, at these
|
||||
indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
|
||||
This is done starting at a word boundary, and continues until as many blocks
|
||||
of N * W bytes as are available have been processed. The results are combined
|
||||
into a single CRC at the end. For this code, N must be in the range 1..6 and
|
||||
W must be 4 or 8. The upper limit on N can be increased if desired by adding
|
||||
more #if blocks, extending the patterns apparent in the code. In addition,
|
||||
crc32 tables would need to be regenerated, if the maximum N value is increased.
|
||||
|
||||
N and W are chosen empirically by benchmarking the execution time on a given
|
||||
processor. The choices for N and W below were based on testing on Intel Kaby
|
||||
Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
|
||||
Octeon II processors. The Intel, AMD, and ARM processors were all fastest
|
||||
with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
|
||||
They were all tested with either gcc or clang, all using the -O3 optimization
|
||||
level. Your mileage may vary.
|
||||
*/
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
# define ZSWAPWORD(word) (word)
|
||||
# define BRAID_TABLE crc_braid_table
|
||||
#elif BYTE_ORDER == BIG_ENDIAN
|
||||
# if W == 8
|
||||
# define ZSWAPWORD(word) ZSWAP64(word)
|
||||
# elif W == 4
|
||||
# define ZSWAPWORD(word) ZSWAP32(word)
|
||||
# endif
|
||||
# define BRAID_TABLE crc_braid_big_table
|
||||
#else
|
||||
# error "No endian defined"
|
||||
#endif
|
||||
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
|
||||
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef W
/*
  CRC of the W bytes held in one z_word_t, with no pre- or post-conditioning.
  The least-significant byte of the word is treated as the first data byte.
  crc32_braid() uses this to fold the per-braid CRCs into a single value.
  The little-endian variant shifts right through crc_table; the big-endian
  variant shifts left through crc_big_table and keeps the result as a word.
*/
#if BYTE_ORDER == LITTLE_ENDIAN
static uint32_t crc_word(z_word_t data) {
    int remaining = W;

    /* One table step per byte of the word. */
    while (remaining-- > 0) {
        data = (data >> 8) ^ crc_table[data & 0xff];
    }
    return (uint32_t)data;
}
#elif BYTE_ORDER == BIG_ENDIAN
static z_word_t crc_word(z_word_t data) {
    int remaining;

    for (remaining = W; remaining > 0; remaining--) {
        /* The top byte of the word selects the table entry. */
        const z_word_t top = (data >> ((W - 1) << 3)) & 0xff;

        data = (data << 8) ^ crc_big_table[top];
    }
    return data;
}
#endif /* BYTE_ORDER */

#endif /* W */
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Braided CRC-32 of buf[0..len-1], continuing from the running value crc.
 *
 * When W is defined and enough input is available, the buffer is first
 * advanced byte-wise to a z_word_t boundary, then consumed in blocks of N
 * words with one independent CRC per braid; the last block merges the braids
 * through crc_word(). Whatever remains (and all input when braiding is not
 * used) is processed one byte at a time through crc_table.
 *
 * The local `c` and the parameters `buf`/`len` keep their names because the
 * DO1/DO8 macros refer to `c` and `buf` directly.
 */
Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
    Z_REGISTER uint32_t c;

    /* Pre-condition the CRC */
    c = (~crc) & 0xffffffff;

#ifdef W
    /* Braid only if a full block of N words is guaranteed to remain after
       spending up to W-1 bytes on alignment. */
    if (len >= N * W + W - 1) {
        size_t nblocks;
        z_word_t const *wp;
        int byte_idx;
        z_word_t crc0, word0, merged;
#if N > 1
        z_word_t crc1, word1;
#endif
#if N > 2
        z_word_t crc2, word2;
#endif
#if N > 3
        z_word_t crc3, word3;
#endif
#if N > 4
        z_word_t crc4, word4;
#endif
#if N > 5
        z_word_t crc5, word5;
#endif

        /* Byte-wise CRC until buf sits on a z_word_t boundary. */
        while (len != 0 && ((uintptr_t)buf & (W - 1)) != 0) {
            DO1;
            len--;
        }

        /* Number of whole N-word blocks available, and the tail left over. */
        nblocks = len / (N * W);
        len -= nblocks * N * W;
        wp = (z_word_t const *)buf;

        /* Braid 0 carries the running CRC; the others start from zero. */
        crc0 = ZSWAPWORD(c);
#if N > 1
        crc1 = 0;
#endif
#if N > 2
        crc2 = 0;
#endif
#if N > 3
        crc3 = 0;
#endif
#if N > 4
        crc4 = 0;
#endif
#if N > 5
        crc5 = 0;
#endif

        /* All blocks except the last: each braid advances independently. */
        for (--nblocks; nblocks != 0; --nblocks) {
            /* Fold the next word of each braid into its CRC. */
            word0 = crc0 ^ wp[0];
#if N > 1
            word1 = crc1 ^ wp[1];
#endif
#if N > 2
            word2 = crc2 ^ wp[2];
#endif
#if N > 3
            word3 = crc3 ^ wp[3];
#endif
#if N > 4
            word4 = crc4 ^ wp[4];
#endif
#if N > 5
            word5 = crc5 ^ wp[5];
#endif
            wp += N;

            /* Table lookup for byte 0 of every braid... */
            crc0 = BRAID_TABLE[0][word0 & 0xff];
#if N > 1
            crc1 = BRAID_TABLE[0][word1 & 0xff];
#endif
#if N > 2
            crc2 = BRAID_TABLE[0][word2 & 0xff];
#endif
#if N > 3
            crc3 = BRAID_TABLE[0][word3 & 0xff];
#endif
#if N > 4
            crc4 = BRAID_TABLE[0][word4 & 0xff];
#endif
#if N > 5
            crc5 = BRAID_TABLE[0][word5 & 0xff];
#endif
            /* ...then bytes 1..W-1. The loop should get unrolled. */
            for (byte_idx = 1; byte_idx < W; byte_idx++) {
                const int shift = byte_idx << 3;

                crc0 ^= BRAID_TABLE[byte_idx][(word0 >> shift) & 0xff];
#if N > 1
                crc1 ^= BRAID_TABLE[byte_idx][(word1 >> shift) & 0xff];
#endif
#if N > 2
                crc2 ^= BRAID_TABLE[byte_idx][(word2 >> shift) & 0xff];
#endif
#if N > 3
                crc3 ^= BRAID_TABLE[byte_idx][(word3 >> shift) & 0xff];
#endif
#if N > 4
                crc4 ^= BRAID_TABLE[byte_idx][(word4 >> shift) & 0xff];
#endif
#if N > 5
                crc5 ^= BRAID_TABLE[byte_idx][(word5 >> shift) & 0xff];
#endif
            }
        }

        /* Last block: merge the N braid CRCs while consuming its words. */
        merged = crc_word(crc0 ^ wp[0]);
#if N > 1
        merged = crc_word(crc1 ^ wp[1] ^ merged);
#endif
#if N > 2
        merged = crc_word(crc2 ^ wp[2] ^ merged);
#endif
#if N > 3
        merged = crc_word(crc3 ^ wp[3] ^ merged);
#endif
#if N > 4
        merged = crc_word(crc4 ^ wp[4] ^ merged);
#endif
#if N > 5
        merged = crc_word(crc5 ^ wp[5] ^ merged);
#endif
        wp += N;
        c = ZSWAPWORD(merged);

        /* Continue byte-wise from where the braided pass stopped. */
        buf = (const unsigned char *)wp;
    }

#endif /* W */

    /* Finish the remaining bytes, eight at a time and then singly. */
    for (; len >= 8; len -= 8) {
        DO8;
    }
    for (; len != 0; len--) {
        DO1;
    }

    /* Return the CRC, post-conditioned. */
    return c ^ 0xffffffff;
}
|
57
3rdparty/zlib-ng/crc32_braid_comb.c
vendored
Normal file
57
3rdparty/zlib-ng/crc32_braid_comb.c
vendored
Normal file
@ -0,0 +1,57 @@
|
||||
/* crc32_braid_comb.c -- compute the CRC-32 of a data stream
|
||||
* Copyright (C) 1995-2022 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
* This interleaved implementation of a CRC makes use of pipelined multiple
|
||||
* arithmetic-logic units, commonly found in modern CPU cores. It is due to
|
||||
* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "crc32_braid_p.h"
|
||||
#include "crc32_braid_tbl.h"
|
||||
#include "crc32_braid_comb_p.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
/* Shared worker for the crc32_combine entry points: multiplies crc1 by the
 * operator x2nmodp(len2, 3) -- x^(8*len2) mod p(x) per x2nmodp's contract --
 * and XORs crc2 into the product. */
static uint32_t crc32_combine_(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
    const uint32_t shift_op = x2nmodp(len2, 3);
    const uint32_t shifted = multmodp(shift_op, crc1);

    return shifted ^ crc2;
}
|
||||
/* Shared worker for the crc32_combine_gen entry points: returns the operator
 * x2nmodp(len2, 3), which crc32_combine_op_() accepts as its `op` argument. */
static uint32_t crc32_combine_gen_(z_off64_t len2) {
    const uint32_t op = x2nmodp(len2, 3);

    return op;
}
|
||||
/* Shared worker for the crc32_combine_op entry points: applies a previously
 * generated operator `op` to crc1 and XORs crc2 into the result. */
static uint32_t crc32_combine_op_(uint32_t crc1, uint32_t crc2, const uint32_t op) {
    const uint32_t shifted = multmodp(op, crc1);

    return shifted ^ crc2;
}
|
||||
|
||||
/* ========================================================================= */
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
unsigned long Z_EXPORT PREFIX(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off_t len2) {
|
||||
return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
|
||||
}
|
||||
unsigned long Z_EXPORT PREFIX4(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off64_t len2) {
|
||||
return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
|
||||
}
|
||||
unsigned long Z_EXPORT PREFIX(crc32_combine_gen)(z_off_t len2) {
|
||||
return crc32_combine_gen_(len2);
|
||||
}
|
||||
unsigned long Z_EXPORT PREFIX4(crc32_combine_gen)(z_off64_t len2) {
|
||||
return crc32_combine_gen_(len2);
|
||||
}
|
||||
unsigned long Z_EXPORT PREFIX(crc32_combine_op)(unsigned long crc1, unsigned long crc2, const unsigned long op) {
|
||||
return (unsigned long)crc32_combine_op_((uint32_t)crc1, (uint32_t)crc2, (uint32_t)op);
|
||||
}
|
||||
#else
|
||||
uint32_t Z_EXPORT PREFIX4(crc32_combine)(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
|
||||
return crc32_combine_(crc1, crc2, len2);
|
||||
}
|
||||
uint32_t Z_EXPORT PREFIX(crc32_combine_gen)(z_off64_t len2) {
|
||||
return crc32_combine_gen_(len2);
|
||||
}
|
||||
uint32_t Z_EXPORT PREFIX(crc32_combine_op)(uint32_t crc1, uint32_t crc2, const uint32_t op) {
|
||||
return crc32_combine_op_(crc1, crc2, op);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ========================================================================= */
|
42
3rdparty/zlib-ng/crc32_braid_comb_p.h
vendored
Normal file
42
3rdparty/zlib-ng/crc32_braid_comb_p.h
vendored
Normal file
@ -0,0 +1,42 @@
|
||||
#ifndef CRC32_BRAID_COMB_P_H_
|
||||
#define CRC32_BRAID_COMB_P_H_
|
||||
|
||||
/*
|
||||
Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
|
||||
reflected. For speed, this requires that a not be zero.
|
||||
*/
|
||||
static uint32_t multmodp(uint32_t a, uint32_t b) {
|
||||
uint32_t m, p;
|
||||
|
||||
m = (uint32_t)1 << 31;
|
||||
p = 0;
|
||||
for (;;) {
|
||||
if (a & m) {
|
||||
p ^= b;
|
||||
if ((a & (m - 1)) == 0)
|
||||
break;
|
||||
}
|
||||
m >>= 1;
|
||||
b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/*
|
||||
Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
|
||||
initialized.
|
||||
*/
|
||||
static uint32_t x2nmodp(z_off64_t n, unsigned k) {
|
||||
uint32_t p;
|
||||
|
||||
p = (uint32_t)1 << 31; /* x^0 == 1 */
|
||||
while (n) {
|
||||
if (n & 1)
|
||||
p = multmodp(x2n_table[k & 31], p);
|
||||
n >>= 1;
|
||||
k++;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
#endif /* CRC32_BRAID_COMB_P_H_ */
|
50
3rdparty/zlib-ng/crc32_braid_p.h
vendored
Normal file
50
3rdparty/zlib-ng/crc32_braid_p.h
vendored
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef CRC32_BRAID_P_H_
|
||||
#define CRC32_BRAID_P_H_
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zendian.h"
|
||||
|
||||
/* Define N */
|
||||
#ifdef Z_TESTN
|
||||
# define N Z_TESTN
|
||||
#else
|
||||
# define N 5
|
||||
#endif
|
||||
#if N < 1 || N > 6
|
||||
# error N must be in 1..6
|
||||
#endif
|
||||
|
||||
/*
|
||||
Define W and the associated z_word_t type. If W is not defined, then a
|
||||
braided calculation is not used, and the associated tables and code are not
|
||||
compiled.
|
||||
*/
|
||||
#ifdef Z_TESTW
|
||||
# if Z_TESTW-1 != -1
|
||||
# define W Z_TESTW
|
||||
# endif
|
||||
#else
|
||||
# ifndef W
|
||||
# if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__)
|
||||
# define W 8
|
||||
# else
|
||||
# define W 4
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
#ifdef W
|
||||
# if W == 8
|
||||
typedef uint64_t z_word_t;
|
||||
# else
|
||||
# undef W
|
||||
# define W 4
|
||||
typedef uint32_t z_word_t;
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* CRC polynomial. */
|
||||
#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
|
||||
|
||||
extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
|
||||
#endif /* CRC32_BRAID_P_H_ */
|
9446
3rdparty/zlib-ng/crc32_braid_tbl.h
vendored
Normal file
9446
3rdparty/zlib-ng/crc32_braid_tbl.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
33
3rdparty/zlib-ng/crc32_fold.c
vendored
Normal file
33
3rdparty/zlib-ng/crc32_fold.c
vendored
Normal file
@ -0,0 +1,33 @@
|
||||
/* crc32_fold.c -- crc32 folding interface
|
||||
* Copyright (C) 2021 Nathan Moinvaziri
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "zbuild.h"
|
||||
#include "functable.h"
|
||||
|
||||
#include "crc32_fold.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
/* Reset the running CRC stored in *crc to CRC32_INITIAL_VALUE and return the
 * value that was stored. */
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
    return crc->value = CRC32_INITIAL_VALUE;
}
|
||||
|
||||
/* Update the running CRC in *crc with len bytes from src (through
 * functable.crc32), then copy those same len bytes from src to dst. */
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
    const uint32_t updated = functable.crc32(crc->value, src, len);

    crc->value = updated;
    memcpy(dst, src, len);
}
|
||||
|
||||
/* Update the running CRC in *crc with len bytes from src, without copying.
 *
 * This is essentially the plain CRC function. It still needs its own
 * functable entry so that callers can dispatch generically, with the same
 * arguments, to implementations that really do a folding CRC when no copy is
 * wanted. init_crc is deliberately ignored here. */
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
    uint32_t updated;

    Z_UNUSED(init_crc);
    updated = functable.crc32(crc->value, src, len);
    crc->value = updated;
}
|
||||
|
||||
/* Return the CRC accumulated so far in *crc; the state is not modified. */
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
    const uint32_t result = crc->value;

    return result;
}
|
21
3rdparty/zlib-ng/crc32_fold.h
vendored
Normal file
21
3rdparty/zlib-ng/crc32_fold.h
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
/* crc32_fold.h -- crc32 folding interface
|
||||
* Copyright (C) 2021 Nathan Moinvaziri
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#ifndef CRC32_FOLD_H_
|
||||
#define CRC32_FOLD_H_
|
||||
|
||||
#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
|
||||
/* sizeof(__m128i) * (4 folds) */
|
||||
|
||||
typedef struct crc32_fold_s {
|
||||
uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
|
||||
uint32_t value;
|
||||
} crc32_fold;
|
||||
|
||||
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
|
||||
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
|
||||
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
|
||||
|
||||
#endif
|
1410
3rdparty/zlib-ng/deflate.c
vendored
Normal file
1410
3rdparty/zlib-ng/deflate.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
408
3rdparty/zlib-ng/deflate.h
vendored
Normal file
408
3rdparty/zlib-ng/deflate.h
vendored
Normal file
@ -0,0 +1,408 @@
|
||||
#ifndef DEFLATE_H_
|
||||
#define DEFLATE_H_
|
||||
/* deflate.h -- internal compression state
|
||||
* Copyright (C) 1995-2016 Jean-loup Gailly
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
/* WARNING: this file should *not* be used by applications. It is
|
||||
part of the implementation of the compression library and is
|
||||
subject to change. Applications should only use zlib.h.
|
||||
*/
|
||||
|
||||
#include "zutil.h"
|
||||
#include "zendian.h"
|
||||
#include "adler32_fold.h"
|
||||
#include "crc32_fold.h"
|
||||
|
||||
/* define NO_GZIP when compiling if you want to disable gzip header and
|
||||
trailer creation by deflate(). NO_GZIP would be used to avoid linking in
|
||||
the crc code when it is not needed. For shared libraries, gzip encoding
|
||||
should be left enabled. */
|
||||
#ifndef NO_GZIP
|
||||
# define GZIP
|
||||
#endif
|
||||
|
||||
/* ===========================================================================
|
||||
* Internal compression state.
|
||||
*/
|
||||
|
||||
#define LENGTH_CODES 29
|
||||
/* number of length codes, not counting the special END_BLOCK code */
|
||||
|
||||
#define LITERALS 256
|
||||
/* number of literal bytes 0..255 */
|
||||
|
||||
#define L_CODES (LITERALS+1+LENGTH_CODES)
|
||||
/* number of Literal or Length codes, including the END_BLOCK code */
|
||||
|
||||
#define D_CODES 30
|
||||
/* number of distance codes */
|
||||
|
||||
#define BL_CODES 19
|
||||
/* number of codes used to transfer the bit lengths */
|
||||
|
||||
#define HEAP_SIZE (2*L_CODES+1)
|
||||
/* maximum heap size */
|
||||
|
||||
#define BIT_BUF_SIZE 64
|
||||
/* size of bit buffer in bi_buf */
|
||||
|
||||
#define END_BLOCK 256
|
||||
/* end of block literal code */
|
||||
|
||||
#define INIT_STATE 1 /* zlib header -> BUSY_STATE */
|
||||
#ifdef GZIP
|
||||
# define GZIP_STATE 4 /* gzip header -> BUSY_STATE | EXTRA_STATE */
|
||||
# define EXTRA_STATE 5 /* gzip extra block -> NAME_STATE */
|
||||
# define NAME_STATE 6 /* gzip file name -> COMMENT_STATE */
|
||||
# define COMMENT_STATE 7 /* gzip comment -> HCRC_STATE */
|
||||
# define HCRC_STATE 8 /* gzip header CRC -> BUSY_STATE */
|
||||
#endif
|
||||
#define BUSY_STATE 2 /* deflate -> FINISH_STATE */
|
||||
#define FINISH_STATE 3 /* stream complete */
|
||||
#ifdef GZIP
|
||||
# define MAX_STATE HCRC_STATE
|
||||
#else
|
||||
# define MAX_STATE FINISH_STATE
|
||||
#endif
|
||||
/* Stream status */
|
||||
|
||||
#define HASH_BITS 16u /* log2(HASH_SIZE) */
|
||||
#ifndef HASH_SIZE
|
||||
# define HASH_SIZE 65536u /* number of elements in hash table */
|
||||
#endif
|
||||
#define HASH_MASK (HASH_SIZE - 1u) /* HASH_SIZE-1 */
|
||||
|
||||
|
||||
/* One tree entry: a single value together with its code string. Each union
 * overlays two fields that share the same storage. */
typedef struct ct_data_s {
    union {
        /* frequency count */
        uint16_t freq;
        /* bit string */
        uint16_t code;
    } fc;
    union {
        /* father node in Huffman tree */
        uint16_t dad;
        /* length of bit string */
        uint16_t len;
    } dl;
} ct_data;
|
||||
|
||||
#define Freq fc.freq
|
||||
#define Code fc.code
|
||||
#define Dad dl.dad
|
||||
#define Len dl.len
|
||||
|
||||
typedef struct static_tree_desc_s static_tree_desc;
|
||||
|
||||
typedef struct tree_desc_s {
|
||||
ct_data *dyn_tree; /* the dynamic tree */
|
||||
int max_code; /* largest code with non zero frequency */
|
||||
const static_tree_desc *stat_desc; /* the corresponding static tree */
|
||||
} tree_desc;
|
||||
|
||||
typedef uint16_t Pos;
|
||||
|
||||
/* A Pos is an index in the character window. We use short instead of int to
|
||||
* save space in the various tables.
|
||||
*/
|
||||
/* Type definitions for hash callbacks */
|
||||
typedef struct internal_state deflate_state;
|
||||
|
||||
typedef uint32_t (* update_hash_cb) (deflate_state *const s, uint32_t h, uint32_t val);
|
||||
typedef void (* insert_string_cb) (deflate_state *const s, uint32_t str, uint32_t count);
|
||||
typedef Pos (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);
|
||||
|
||||
struct internal_state {
|
||||
PREFIX3(stream) *strm; /* pointer back to this zlib stream */
|
||||
unsigned char *pending_buf; /* output still pending */
|
||||
unsigned char *pending_out; /* next pending byte to output to the stream */
|
||||
uint32_t pending_buf_size; /* size of pending_buf */
|
||||
uint32_t pending; /* nb of bytes in the pending buffer */
|
||||
int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
|
||||
uint32_t gzindex; /* where in extra, name, or comment */
|
||||
PREFIX(gz_headerp) gzhead; /* gzip header information to write */
|
||||
int status; /* as the name implies */
|
||||
int last_flush; /* value of flush param for previous deflate call */
|
||||
int reproducible; /* Whether reproducible compression results are required. */
|
||||
|
||||
int block_open;
|
||||
/* Whether or not a block is currently open for the QUICK deflation scheme.
|
||||
* This is set to 1 if there is an active block, or 0 if the block was just closed.
|
||||
*/
|
||||
|
||||
/* used by deflate.c: */
|
||||
|
||||
unsigned int w_size; /* LZ77 window size (32K by default) */
|
||||
unsigned int w_bits; /* log2(w_size) (8..16) */
|
||||
unsigned int w_mask; /* w_size - 1 */
|
||||
unsigned int lookahead; /* number of valid bytes ahead in window */
|
||||
|
||||
unsigned int high_water;
|
||||
/* High water mark offset in window for initialized bytes -- bytes above
|
||||
* this are set to zero in order to avoid memory check warnings when
|
||||
* longest match routines access bytes past the input. This is then
|
||||
* updated to the new high water mark.
|
||||
*/
|
||||
|
||||
unsigned int window_size;
|
||||
/* Actual size of window: 2*wSize, except when the user input buffer
|
||||
* is directly used as sliding window.
|
||||
*/
|
||||
|
||||
unsigned char *window;
|
||||
/* Sliding window. Input bytes are read into the second half of the window,
|
||||
* and move to the first half later to keep a dictionary of at least wSize
|
||||
* bytes. With this organization, matches are limited to a distance of
|
||||
* wSize-STD_MAX_MATCH bytes, but this ensures that IO is always
|
||||
* performed with a length multiple of the block size. Also, it limits
|
||||
* the window size to 64K, which is quite useful on MSDOS.
|
||||
* To do: use the user input buffer as sliding window.
|
||||
*/
|
||||
|
||||
Pos *prev;
|
||||
/* Link to older string with same hash index. To limit the size of this
|
||||
* array to 64K, this link is maintained only for the last 32K strings.
|
||||
* An index in this array is thus a window index modulo 32K.
|
||||
*/
|
||||
|
||||
Pos *head; /* Heads of the hash chains or 0. */
|
||||
|
||||
uint32_t ins_h; /* hash index of string to be inserted */
|
||||
|
||||
int block_start;
|
||||
/* Window position at the beginning of the current output block. Gets
|
||||
* negative when the window is moved backwards.
|
||||
*/
|
||||
|
||||
unsigned int match_length; /* length of best match */
|
||||
Pos prev_match; /* previous match */
|
||||
int match_available; /* set if previous match exists */
|
||||
unsigned int strstart; /* start of string to insert */
|
||||
unsigned int match_start; /* start of matching string */
|
||||
|
||||
unsigned int prev_length;
|
||||
/* Length of the best match at previous step. Matches not greater than this
|
||||
* are discarded. This is used in the lazy match evaluation.
|
||||
*/
|
||||
|
||||
unsigned int max_chain_length;
|
||||
/* To speed up deflation, hash chains are never searched beyond this length.
|
||||
* A higher limit improves compression ratio but degrades the speed.
|
||||
*/
|
||||
|
||||
unsigned int max_lazy_match;
|
||||
/* Attempt to find a better match only when the current match is strictly smaller
|
||||
* than this value. This mechanism is used only for compression levels >= 4.
|
||||
*/
|
||||
# define max_insert_length max_lazy_match
|
||||
/* Insert new strings in the hash table only if the match length is not
|
||||
* greater than this length. This saves time but degrades compression.
|
||||
* max_insert_length is used only for compression levels <= 3.
|
||||
*/
|
||||
|
||||
update_hash_cb update_hash;
|
||||
insert_string_cb insert_string;
|
||||
quick_insert_string_cb quick_insert_string;
|
||||
/* Hash function callbacks that can be configured depending on the deflate
|
||||
* algorithm being used */
|
||||
|
||||
int level; /* compression level (1..9) */
|
||||
int strategy; /* favor or force Huffman coding*/
|
||||
|
||||
unsigned int good_match;
|
||||
/* Use a faster search when the previous match is longer than this */
|
||||
|
||||
int nice_match; /* Stop searching when current match exceeds this */
|
||||
|
||||
struct crc32_fold_s ALIGNED_(16) crc_fold;
|
||||
|
||||
/* used by trees.c: */
|
||||
/* Didn't use ct_data typedef below to suppress compiler warning */
|
||||
struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */
|
||||
struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
|
||||
struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */
|
||||
|
||||
struct tree_desc_s l_desc; /* desc. for literal tree */
|
||||
struct tree_desc_s d_desc; /* desc. for distance tree */
|
||||
struct tree_desc_s bl_desc; /* desc. for bit length tree */
|
||||
|
||||
uint16_t bl_count[MAX_BITS+1];
|
||||
/* number of codes at each bit length for an optimal tree */
|
||||
|
||||
int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
|
||||
int heap_len; /* number of elements in the heap */
|
||||
int heap_max; /* element of largest frequency */
|
||||
/* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
|
||||
* The same heap array is used to build all trees.
|
||||
*/
|
||||
|
||||
unsigned char depth[2*L_CODES+1];
|
||||
/* Depth of each subtree used as tie breaker for trees of equal frequency
|
||||
*/
|
||||
|
||||
unsigned int lit_bufsize;
|
||||
/* Size of match buffer for literals/lengths. There are 4 reasons for
|
||||
* limiting lit_bufsize to 64K:
|
||||
* - frequencies can be kept in 16 bit counters
|
||||
* - if compression is not successful for the first block, all input
|
||||
* data is still in the window so we can still emit a stored block even
|
||||
* when input comes from standard input. (This can also be done for
|
||||
* all blocks if lit_bufsize is not greater than 32K.)
|
||||
* - if compression is not successful for a file smaller than 64K, we can
|
||||
* even emit a stored file instead of a stored block (saving 5 bytes).
|
||||
* This is applicable only for zip (not gzip or zlib).
|
||||
* - creating new Huffman trees less frequently may not provide fast
|
||||
* adaptation to changes in the input data statistics. (Take for
|
||||
* example a binary file with poorly compressible code followed by
|
||||
* a highly compressible string table.) Smaller buffer sizes give
|
||||
* fast adaptation but have of course the overhead of transmitting
|
||||
* trees more frequently.
|
||||
* - I can't count above 4
|
||||
*/
|
||||
|
||||
unsigned char *sym_buf; /* buffer for distances and literals/lengths */
|
||||
unsigned int sym_next; /* running index in sym_buf */
|
||||
unsigned int sym_end; /* symbol table full when sym_next reaches this */
|
||||
|
||||
unsigned long opt_len; /* bit length of current block with optimal trees */
|
||||
unsigned long static_len; /* bit length of current block with static trees */
|
||||
unsigned int matches; /* number of string matches in current block */
|
||||
unsigned int insert; /* bytes at end of window left to insert */
|
||||
|
||||
/* compressed_len and bits_sent are only used if ZLIB_DEBUG is defined */
|
||||
unsigned long compressed_len; /* total bit length of compressed file mod 2^32 */
|
||||
unsigned long bits_sent; /* bit length of compressed data sent mod 2^32 */
|
||||
|
||||
/* Reserved for future use and alignment purposes */
|
||||
char *reserved_p;
|
||||
|
||||
uint64_t bi_buf;
|
||||
/* Output buffer. bits are inserted starting at the bottom (least significant bits). */
|
||||
|
||||
int32_t bi_valid;
|
||||
/* Number of valid bits in bi_buf. All bits above the last valid bit are always zero. */
|
||||
|
||||
/* Reserved for future use and alignment purposes */
|
||||
int32_t reserved[11];
|
||||
} ALIGNED_(8);
|
||||
|
||||
/* Result code returned by the block compression routines (for example
 * deflate_fast() and deflate_huff()). */
typedef enum {
    /* block not completed, need more input or more output */
    need_more,
    /* block flush performed */
    block_done,
    /* finish started, need only more output at next deflate */
    finish_started,
    /* finish done, accept no more input or output */
    finish_done
} block_state;
|
||||
|
||||
/* Output a byte on the stream.
 * IN assertion: there is enough room in pending_buf.
 *
 * s: pointer to a state object with pending_buf and pending members.
 * c: value to store; it is truncated to unsigned char.
 *
 * The s argument is parenthesized in the expansion so that any pointer
 * expression (e.g. &states[0] or p + 1) binds correctly to the -> operator.
 * Note that s is still evaluated twice, so it must not have side effects.
 */
#define put_byte(s, c) { \
    (s)->pending_buf[(s)->pending++] = (unsigned char)(c); \
}
|
||||
|
||||
/* ===========================================================================
|
||||
* Output a short LSB first on the stream.
|
||||
* IN assertion: there is enough room in pending_buf.
|
||||
*/
|
||||
static inline void put_short(deflate_state *s, uint16_t w) {
|
||||
#if BYTE_ORDER == BIG_ENDIAN
|
||||
w = ZSWAP16(w);
|
||||
#endif
|
||||
memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
|
||||
s->pending += 2;
|
||||
}
|
||||
|
||||
/* ===========================================================================
|
||||
* Output a short MSB first on the stream.
|
||||
* IN assertion: there is enough room in pending_buf.
|
||||
*/
|
||||
static inline void put_short_msb(deflate_state *s, uint16_t w) {
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
w = ZSWAP16(w);
|
||||
#endif
|
||||
memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
|
||||
s->pending += 2;
|
||||
}
|
||||
|
||||
/* ===========================================================================
|
||||
* Output a 32-bit unsigned int LSB first on the stream.
|
||||
* IN assertion: there is enough room in pending_buf.
|
||||
*/
|
||||
static inline void put_uint32(deflate_state *s, uint32_t dw) {
|
||||
#if BYTE_ORDER == BIG_ENDIAN
|
||||
dw = ZSWAP32(dw);
|
||||
#endif
|
||||
memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
|
||||
s->pending += 4;
|
||||
}
|
||||
|
||||
/* ===========================================================================
|
||||
* Output a 32-bit unsigned int MSB first on the stream.
|
||||
* IN assertion: there is enough room in pending_buf.
|
||||
*/
|
||||
static inline void put_uint32_msb(deflate_state *s, uint32_t dw) {
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
dw = ZSWAP32(dw);
|
||||
#endif
|
||||
memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
|
||||
s->pending += 4;
|
||||
}
|
||||
|
||||
/* ===========================================================================
|
||||
* Output a 64-bit unsigned int LSB first on the stream.
|
||||
* IN assertion: there is enough room in pending_buf.
|
||||
*/
|
||||
static inline void put_uint64(deflate_state *s, uint64_t lld) {
|
||||
#if BYTE_ORDER == BIG_ENDIAN
|
||||
lld = ZSWAP64(lld);
|
||||
#endif
|
||||
memcpy(&s->pending_buf[s->pending], &lld, sizeof(lld));
|
||||
s->pending += 8;
|
||||
}
|
||||
|
||||
#define MIN_LOOKAHEAD (STD_MAX_MATCH + STD_MIN_MATCH + 1)
|
||||
/* Minimum amount of lookahead, except at the end of the input file.
|
||||
* See deflate.c for comments about the STD_MIN_MATCH+1.
|
||||
*/
|
||||
|
||||
#define MAX_DIST(s) ((s)->w_size - MIN_LOOKAHEAD)
|
||||
/* In order to simplify the code, particularly on 16 bit machines, match
|
||||
* distances are limited to MAX_DIST instead of WSIZE.
|
||||
*/
|
||||
|
||||
#define WIN_INIT STD_MAX_MATCH
|
||||
/* Number of bytes after end of data in window to initialize in order to avoid
|
||||
memory checker errors from longest match routines */
|
||||
|
||||
|
||||
void Z_INTERNAL PREFIX(fill_window)(deflate_state *s);
|
||||
void Z_INTERNAL slide_hash_c(deflate_state *s);
|
||||
|
||||
/* in trees.c */
|
||||
void Z_INTERNAL zng_tr_init(deflate_state *s);
|
||||
void Z_INTERNAL zng_tr_flush_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
|
||||
void Z_INTERNAL zng_tr_flush_bits(deflate_state *s);
|
||||
void Z_INTERNAL zng_tr_align(deflate_state *s);
|
||||
void Z_INTERNAL zng_tr_stored_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
|
||||
uint16_t Z_INTERNAL PREFIX(bi_reverse)(unsigned code, int len);
|
||||
void Z_INTERNAL PREFIX(flush_pending)(PREFIX3(streamp) strm);
|
||||
#define d_code(dist) ((dist) < 256 ? zng_dist_code[dist] : zng_dist_code[256+((dist)>>7)])
|
||||
/* Mapping from a distance to a distance code. dist is the distance - 1 and
|
||||
* must not have side effects. zng_dist_code[256] and zng_dist_code[257] are never
|
||||
* used.
|
||||
*/
|
||||
|
||||
/* Bit buffer and compress bits calculation debugging */
|
||||
#ifdef ZLIB_DEBUG
|
||||
# define cmpr_bits_add(s, len) s->compressed_len += (len)
|
||||
# define cmpr_bits_align(s) s->compressed_len = (s->compressed_len + 7) & ~7L
|
||||
# define sent_bits_add(s, bits) s->bits_sent += (bits)
|
||||
# define sent_bits_align(s) s->bits_sent = (s->bits_sent + 7) & ~7L
|
||||
#else
|
||||
# define cmpr_bits_add(s, len) Z_UNUSED(len)
|
||||
# define cmpr_bits_align(s)
|
||||
# define sent_bits_add(s, bits) Z_UNUSED(bits)
|
||||
# define sent_bits_align(s)
|
||||
#endif
|
||||
|
||||
#endif /* DEFLATE_H_ */
|
102
3rdparty/zlib-ng/deflate_fast.c
vendored
Normal file
102
3rdparty/zlib-ng/deflate_fast.c
vendored
Normal file
@ -0,0 +1,102 @@
|
||||
/* deflate_fast.c -- compress data using the fast strategy of deflation algorithm
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
/* ===========================================================================
|
||||
* Compress as much as possible from the input stream, return the current
|
||||
* block state.
|
||||
* This function does not perform lazy evaluation of matches and inserts
|
||||
* new strings in the dictionary only for unmatched strings or for short
|
||||
* matches. It is used only for the fast compression options.
|
||||
*/
|
||||
Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
|
||||
Pos hash_head; /* head of the hash chain */
|
||||
int bflush = 0; /* set if current block must be flushed */
|
||||
int64_t dist;
|
||||
uint32_t match_len = 0;
|
||||
|
||||
for (;;) {
|
||||
/* Make sure that we always have enough lookahead, except
|
||||
* at the end of the input file. We need STD_MAX_MATCH bytes
|
||||
* for the next match, plus WANT_MIN_MATCH bytes to insert the
|
||||
* string following the next match.
|
||||
*/
|
||||
if (s->lookahead < MIN_LOOKAHEAD) {
|
||||
PREFIX(fill_window)(s);
|
||||
if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
|
||||
return need_more;
|
||||
}
|
||||
if (UNLIKELY(s->lookahead == 0))
|
||||
break; /* flush the current block */
|
||||
}
|
||||
|
||||
/* Insert the string window[strstart .. strstart+2] in the
|
||||
* dictionary, and set hash_head to the head of the hash chain:
|
||||
*/
|
||||
if (s->lookahead >= WANT_MIN_MATCH) {
|
||||
hash_head = functable.quick_insert_string(s, s->strstart);
|
||||
dist = (int64_t)s->strstart - hash_head;
|
||||
|
||||
/* Find the longest match, discarding those <= prev_length.
|
||||
* At this point we have always match length < WANT_MIN_MATCH
|
||||
*/
|
||||
if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
|
||||
/* To simplify the code, we prevent matches with the string
|
||||
* of window index 0 (in particular we have to avoid a match
|
||||
* of the string with itself at the start of the input file).
|
||||
*/
|
||||
match_len = functable.longest_match(s, hash_head);
|
||||
/* longest_match() sets match_start */
|
||||
}
|
||||
}
|
||||
|
||||
if (match_len >= WANT_MIN_MATCH) {
|
||||
check_match(s, s->strstart, s->match_start, match_len);
|
||||
|
||||
bflush = zng_tr_tally_dist(s, s->strstart - s->match_start, match_len - STD_MIN_MATCH);
|
||||
|
||||
s->lookahead -= match_len;
|
||||
|
||||
/* Insert new strings in the hash table only if the match length
|
||||
* is not too large. This saves time but degrades compression.
|
||||
*/
|
||||
if (match_len <= s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
|
||||
match_len--; /* string at strstart already in table */
|
||||
s->strstart++;
|
||||
|
||||
functable.insert_string(s, s->strstart, match_len);
|
||||
s->strstart += match_len;
|
||||
} else {
|
||||
s->strstart += match_len;
|
||||
functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);
|
||||
|
||||
/* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not
|
||||
* matter since it will be recomputed at next deflate call.
|
||||
*/
|
||||
}
|
||||
match_len = 0;
|
||||
} else {
|
||||
/* No match, output a literal byte */
|
||||
bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
|
||||
s->lookahead--;
|
||||
s->strstart++;
|
||||
}
|
||||
if (UNLIKELY(bflush))
|
||||
FLUSH_BLOCK(s, 0);
|
||||
}
|
||||
s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
|
||||
if (UNLIKELY(flush == Z_FINISH)) {
|
||||
FLUSH_BLOCK(s, 1);
|
||||
return finish_done;
|
||||
}
|
||||
if (UNLIKELY(s->sym_next))
|
||||
FLUSH_BLOCK(s, 0);
|
||||
return block_done;
|
||||
}
|
45
3rdparty/zlib-ng/deflate_huff.c
vendored
Normal file
45
3rdparty/zlib-ng/deflate_huff.c
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
/* deflate_huff.c -- compress data using huffman encoding only strategy
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
/* ===========================================================================
|
||||
* For Z_HUFFMAN_ONLY, do not look for matches. Do not maintain a hash table.
|
||||
* (It will be regenerated if this run of deflate switches away from Huffman.)
|
||||
*/
|
||||
Z_INTERNAL block_state deflate_huff(deflate_state *s, int flush) {
|
||||
int bflush = 0; /* set if current block must be flushed */
|
||||
|
||||
for (;;) {
|
||||
/* Make sure that we have a literal to write. */
|
||||
if (s->lookahead == 0) {
|
||||
PREFIX(fill_window)(s);
|
||||
if (s->lookahead == 0) {
|
||||
if (flush == Z_NO_FLUSH)
|
||||
return need_more;
|
||||
break; /* flush the current block */
|
||||
}
|
||||
}
|
||||
|
||||
/* Output a literal byte */
|
||||
bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
|
||||
s->lookahead--;
|
||||
s->strstart++;
|
||||
if (bflush)
|
||||
FLUSH_BLOCK(s, 0);
|
||||
}
|
||||
s->insert = 0;
|
||||
if (flush == Z_FINISH) {
|
||||
FLUSH_BLOCK(s, 1);
|
||||
return finish_done;
|
||||
}
|
||||
if (s->sym_next)
|
||||
FLUSH_BLOCK(s, 0);
|
||||
return block_done;
|
||||
}
|
293
3rdparty/zlib-ng/deflate_medium.c
vendored
Normal file
293
3rdparty/zlib-ng/deflate_medium.c
vendored
Normal file
@ -0,0 +1,293 @@
|
||||
/* deflate_medium.c -- The deflate_medium deflate strategy
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#ifndef NO_MEDIUM_STRATEGY
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
/* Candidate match handled by the medium strategy; every member is a 16-bit
 * quantity. */
struct match {
    /* window position where the matching (earlier) string starts */
    uint16_t match_start;
    /* length of the match */
    uint16_t match_length;
    /* window position of the string being matched */
    uint16_t strstart;
    /* lower bound for hash insertion, compared against strstart in insert_match() */
    uint16_t orgstart;
};
|
||||
|
||||
/* Emit one match into the symbol buffer and reduce s->lookahead by its
 * length. Matches shorter than WANT_MIN_MATCH are written as individual
 * literals instead. Returns the sum of the flush indicators reported by the
 * tally routines (nonzero means the block should be flushed).
 */
static int emit_match(deflate_state *s, struct match match) {
    int flush_hits = 0;

    if (match.match_length >= WANT_MIN_MATCH) {
        check_match(s, match.strstart, match.match_start, match.match_length);

        flush_hits += zng_tr_tally_dist(s, match.strstart - match.match_start, match.match_length - STD_MIN_MATCH);

        s->lookahead -= match.match_length;
        return flush_hits;
    }

    /* matches that are not long enough we need to emit as literals */
    for (; match.match_length != 0; match.match_length--) {
        flush_hits += zng_tr_tally_lit(s, s->window[match.strstart]);
        s->lookahead--;
        match.strstart++;
    }
    return flush_hits;
}
|
||||
|
||||
/* Add the strings covered by a match to the hash table. Nothing is inserted
 * when the remaining lookahead does not exceed match_length + WANT_MIN_MATCH.
 * The match is received by value, so only the hash table is affected.
 */
static void insert_match(deflate_state *s, struct match match) {
    /* 16-bit working copies so wrap-around matches the struct fields. */
    uint16_t pos = match.strstart;
    uint16_t len = match.match_length;
    const uint16_t org = match.orgstart;

    if (UNLIKELY(s->lookahead <= (unsigned int)(len + WANT_MIN_MATCH)))
        return;

    /* matches that are not long enough we need to emit as literals */
    if (LIKELY(len < WANT_MIN_MATCH)) {
        pos++;
        len--;
        if (UNLIKELY(len > 0) && pos >= org) {
            if (pos + len - 1 >= org)
                functable.insert_string(s, pos, len);
            else
                functable.insert_string(s, pos, org - pos + 1);
        }
        return;
    }

    /* Insert new strings in the hash table only if the match length
     * is not too large. This saves time but degrades compression.
     */
    if (len <= 16 * s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
        len--; /* string at strstart already in table */
        pos++;

        if (LIKELY(pos >= org)) {
            if (LIKELY(pos + len - 1 >= org))
                functable.insert_string(s, pos, len);
            else
                functable.insert_string(s, pos, org - pos + 1);
        } else if (org < pos + len) {
            functable.insert_string(s, org, pos + len - org);
        }
        return;
    }

    /* Match too long to insert string by string: hash only its tail. */
    pos += len;
    if (pos >= (STD_MIN_MATCH - 2))
        functable.quick_insert_string(s, pos + 2 - STD_MIN_MATCH);

    /* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not
     * matter since it will be recomputed at next deflate call.
     */
}
|
||||
|
||||
/* Try to grow *next backwards at the expense of *current: while the bytes
 * just before both ends of the next match agree, the next match is moved one
 * position to the left and the current match is shortened by one. The result
 * is written back only if the current match ends up with a length of at most
 * 1 and the next match length is not 2; otherwise both matches are left
 * untouched.
 */
static void fizzle_matches(deflate_state *s, struct match *current, struct match *next) {
    Pos lower_bound;
    unsigned char *cand, *ref;
    int shifted = 0;
    struct match cur, nxt;

    /* step zero: sanity checks */

    if (current->match_length <= 1)
        return;

    if (UNLIKELY(current->match_length > 1 + next->match_start))
        return;

    if (UNLIKELY(current->match_length > 1 + next->strstart))
        return;

    cand = s->window - current->match_length + 1 + next->match_start;
    ref  = s->window - current->match_length + 1 + next->strstart;

    /* quick exit check.. if this fails then don't bother with anything else */
    if (LIKELY(*cand != *ref))
        return;

    cur = *current;
    nxt = *next;

    /* step one: try to move the "next" match to the left as much as possible */
    if (next->strstart > MAX_DIST(s))
        lower_bound = next->strstart - (Pos)MAX_DIST(s);
    else
        lower_bound = 0;

    cand = s->window + nxt.match_start - 1;
    ref = s->window + nxt.strstart - 1;

    while (*cand == *ref) {
        /* Stop as soon as any of the limits is reached. */
        if (UNLIKELY(cur.match_length < 1) ||
            UNLIKELY(nxt.strstart <= lower_bound) ||
            UNLIKELY(nxt.match_length >= 256) ||
            UNLIKELY(nxt.match_start <= 1))
            break;

        nxt.strstart--;
        nxt.match_start--;
        nxt.match_length++;
        cur.match_length--;
        cand--;
        ref--;
        shifted++;
    }

    if (!shifted)
        return;

    /* Commit only when the current match has been (almost) fully absorbed. */
    if (cur.match_length <= 1 && nxt.match_length != 2) {
        nxt.orgstart++;
        *current = cur;
        *next = nxt;
    }
}
|
||||
|
||||
/* Compress as much as possible from the input stream using the "medium"
 * strategy and return the resulting block state.
 *
 * Each round determines a match (or a one byte literal) at the current
 * position, inserts its strings in the hash table, and -- unless the level is
 * below 5 -- also looks up the match at the position right after it so that
 * fizzle_matches() can rebalance the two before the current one is emitted.
 * The looked-ahead match is kept in next_match and reused on the next round.
 *
 * s:     deflate state; strstart, lookahead, match_start, insert and the
 *        hash table are updated as input is consumed.
 * flush: flush mode requested by the caller (Z_NO_FLUSH, Z_FINISH, ...).
 *
 * Returns need_more when more input is required (or, via FLUSH_BLOCK, when
 * the output buffer is full), finish_done / finish_started for Z_FINISH, and
 * block_done otherwise.
 */
Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
    /* Align the first struct to start on a new cacheline, this allows us to fit both structs in one cacheline */
    ALIGNED_(16) struct match current_match;
                 struct match next_match;

    /* For levels below 5, don't check the next position for a better match */
    int early_exit = s->level < 5;

    memset(&current_match, 0, sizeof(struct match));
    memset(&next_match, 0, sizeof(struct match));

    for (;;) {
        Pos hash_head = 0;    /* head of the hash chain */
        int bflush = 0;       /* set if current block must be flushed */
        int64_t dist;

        /* Make sure that we always have enough lookahead, except
         * at the end of the input file. We need STD_MAX_MATCH bytes
         * for the next match, plus WANT_MIN_MATCH bytes to insert the
         * string following the next current_match.
         */
        if (s->lookahead < MIN_LOOKAHEAD) {
            PREFIX(fill_window)(s);
            if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
                return need_more;
            }
            if (UNLIKELY(s->lookahead == 0))
                break; /* flush the current block */
            /* Any match looked up before the refill is discarded. */
            next_match.match_length = 0;
        }

        /* Insert the string window[strstart .. strstart+2] in the
         * dictionary, and set hash_head to the head of the hash chain:
         */

        /* If we already have a future match from a previous round, just use that */
        if (!early_exit && next_match.match_length > 0) {
            current_match = next_match;
            next_match.match_length = 0;
        } else {
            hash_head = 0;
            if (s->lookahead >= WANT_MIN_MATCH) {
                hash_head = functable.quick_insert_string(s, s->strstart);
            }

            current_match.strstart = (uint16_t)s->strstart;
            current_match.orgstart = current_match.strstart;

            /* Find the longest match, discarding those <= prev_length.
             * At this point we have always match_length < WANT_MIN_MATCH
             */

            dist = (int64_t)s->strstart - hash_head;
            if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
                /* To simplify the code, we prevent matches with the string
                 * of window index 0 (in particular we have to avoid a match
                 * of the string with itself at the start of the input file).
                 */
                current_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
                current_match.match_start = (uint16_t)s->match_start;
                /* Too short to be worth a match: demote to a literal. */
                if (UNLIKELY(current_match.match_length < WANT_MIN_MATCH))
                    current_match.match_length = 1;
                if (UNLIKELY(current_match.match_start >= current_match.strstart)) {
                    /* this can happen due to some restarts */
                    current_match.match_length = 1;
                }
            } else {
                /* Set up the match to be a 1 byte literal */
                current_match.match_start = 0;
                current_match.match_length = 1;
            }
        }

        insert_match(s, current_match);

        /* now, look ahead one */
        if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) {
            /* Temporarily move strstart past the current match; it is
             * restored once the next match has been looked up.
             */
            s->strstart = current_match.strstart + current_match.match_length;
            hash_head = functable.quick_insert_string(s, s->strstart);

            next_match.strstart = (uint16_t)s->strstart;
            next_match.orgstart = next_match.strstart;

            /* Find the longest match, discarding those <= prev_length.
             * At this point we have always match_length < WANT_MIN_MATCH
             */

            dist = (int64_t)s->strstart - hash_head;
            if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
                /* To simplify the code, we prevent matches with the string
                 * of window index 0 (in particular we have to avoid a match
                 * of the string with itself at the start of the input file).
                 */
                next_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
                next_match.match_start = (uint16_t)s->match_start;
                if (UNLIKELY(next_match.match_start >= next_match.strstart)) {
                    /* this can happen due to some restarts */
                    next_match.match_length = 1;
                }
                if (next_match.match_length < WANT_MIN_MATCH)
                    next_match.match_length = 1;
                else
                    fizzle_matches(s, &current_match, &next_match);
            } else {
                /* Set up the match to be a 1 byte literal */
                next_match.match_start = 0;
                next_match.match_length = 1;
            }

            s->strstart = current_match.strstart;
        } else {
            next_match.match_length = 0;
        }

        /* now emit the current match */
        bflush = emit_match(s, current_match);

        /* move the "cursor" forward */
        s->strstart += current_match.match_length;

        if (UNLIKELY(bflush))
            FLUSH_BLOCK(s, 0);
    }
    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
    if (flush == Z_FINISH) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    /* Symbols are still buffered: flush them as a non-final block. */
    if (UNLIKELY(s->sym_next))
        FLUSH_BLOCK(s, 0);

    return block_done;
}
|
||||
#endif
|
116
3rdparty/zlib-ng/deflate_p.h
vendored
Normal file
116
3rdparty/zlib-ng/deflate_p.h
vendored
Normal file
@ -0,0 +1,116 @@
|
||||
/* deflate_p.h -- Private inline functions and macros shared with more than
|
||||
* one deflate method
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef DEFLATE_P_H
|
||||
#define DEFLATE_P_H
|
||||
|
||||
/* Forward declare common non-inlined functions declared in deflate.c */
|
||||
|
||||
#ifdef ZLIB_DEBUG
|
||||
/* ===========================================================================
|
||||
* Check that the match at match_start is indeed a match.
|
||||
*/
|
||||
/* Debug-only validation of a match (compiled when ZLIB_DEBUG is defined).
 *
 * s:      deflate state whose window holds the data
 * start:  window index of the string being encoded
 * match:  window index of the earlier occurrence
 * length: claimed match length
 *
 * Reports through z_error() when the length lies outside
 * [STD_MIN_MATCH, STD_MAX_MATCH], when match == start, or when the two
 * window regions differ.  With z_verbose > 1 the distance, length and the
 * matched bytes are also written to stderr.
 */
static inline void check_match(deflate_state *s, Pos start, Pos match, int length) {
    /* The length must be within the limits of the deflate format. */
    if (length < STD_MIN_MATCH || length > STD_MAX_MATCH) {
        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
        z_error("invalid match length");
    }

    /* A string cannot be matched against itself. */
    if (match == start) {
        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
        z_error("invalid match position");
    }

    /* Both regions must hold the same bytes; dump them side by side if not. */
    if (memcmp(s->window + match, s->window + start, length) != 0) {
        int32_t i = 0;

        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
        for (;;) {
            fprintf(stderr, "  %03d: match [%02x] start [%02x]\n", i++,
                    s->window[match++], s->window[start++]);
            if (--length == 0)
                break;
        }
        z_error("invalid match");
    }

    if (z_verbose > 1) {
        fprintf(stderr, "\\[%u,%d]", start-match, length);
        for (;;) {
            putc(s->window[start++], stderr);
            if (--length == 0)
                break;
        }
    }
}
|
||||
#else
|
||||
#define check_match(s, start, match, length)
|
||||
#endif
|
||||
|
||||
Z_INTERNAL void PREFIX(flush_pending)(PREFIX3(stream) *strm);
|
||||
Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
|
||||
|
||||
/* ===========================================================================
|
||||
* Save the match info and tally the frequency counts. Return true if
|
||||
* the current block must be flushed.
|
||||
*/
|
||||
|
||||
extern const unsigned char Z_INTERNAL zng_length_code[];
|
||||
extern const unsigned char Z_INTERNAL zng_dist_code[];
|
||||
|
||||
/* Record one literal byte in the symbol buffer and count it.
 *
 * A literal takes three bytes in sym_buf: two zero bytes followed by the
 * literal itself.  The frequency of the literal in dyn_ltree is bumped.
 *
 * s: deflate state
 * c: the unmatched byte
 *
 * Returns non-zero once sym_next has reached sym_end.
 */
static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
    s->sym_buf[s->sym_next + 0] = 0;
    s->sym_buf[s->sym_next + 1] = 0;
    s->sym_buf[s->sym_next + 2] = c;
    s->sym_next += 3;

    s->dyn_ltree[c].Freq++;

    Tracevv((stderr, "%c", c));
    Assert(c <= (STD_MAX_MATCH-STD_MIN_MATCH), "zng_tr_tally: bad literal");

    return (s->sym_next == s->sym_end);
}
|
||||
|
||||
/* Record one (distance, length) pair in the symbol buffer and count it.
 *
 * The pair takes three bytes in sym_buf: low byte of dist, next byte of
 * dist, then len.  The match counter and the frequencies of the length code
 * and of the distance code are updated.
 *
 * s:    deflate state
 * dist: distance of matched string
 * len:  match length - STD_MIN_MATCH
 *
 * Returns non-zero once sym_next has reached sym_end.
 */
static inline int zng_tr_tally_dist(deflate_state *s, uint32_t dist, uint32_t len) {
    /* The distance code is looked up on dist - 1. */
    const uint32_t dist_m1 = dist - 1;

    s->sym_buf[s->sym_next + 0] = (uint8_t)(dist);
    s->sym_buf[s->sym_next + 1] = (uint8_t)(dist >> 8);
    s->sym_buf[s->sym_next + 2] = (uint8_t)len;
    s->sym_next += 3;

    s->matches++;

    Assert(dist_m1 < MAX_DIST(s) && (uint16_t)d_code(dist_m1) < (uint16_t)D_CODES,
        "zng_tr_tally: bad match");

    s->dyn_ltree[zng_length_code[len]+LITERALS+1].Freq++;
    s->dyn_dtree[d_code(dist_m1)].Freq++;

    return (s->sym_next == s->sym_end);
}
|
||||
|
||||
/* ===========================================================================
|
||||
* Flush the current block, with given end-of-file flag.
|
||||
* IN assertion: strstart is set to the end of the current match.
|
||||
*/
|
||||
#define FLUSH_BLOCK_ONLY(s, last) { \
    /* Pass the bytes from block_start up to strstart to the tree coder; \
     * a negative block_start is passed on as a NULL buffer. */ \
    zng_tr_flush_block(s, \
                       (s->block_start >= 0 ? (char *)&s->window[(unsigned)s->block_start] : NULL), \
                       (uint32_t)((int)s->strstart - s->block_start), \
                       (last)); \
    /* The next block starts at the current position. */ \
    s->block_start = (int)s->strstart; \
    PREFIX(flush_pending)(s->strm); \
}
|
||||
|
||||
/* Same but force premature exit if necessary. */
|
||||
#define FLUSH_BLOCK(s, last) { \
    FLUSH_BLOCK_ONLY(s, last); \
    /* Output buffer exhausted: return from the *enclosing* function. */ \
    if (s->strm->avail_out == 0) { \
        if (last) \
            return finish_started; \
        else \
            return need_more; \
    } \
}
|
||||
|
||||
/* Maximum stored block length in deflate format (not including header). */
|
||||
#define MAX_STORED 65535
|
||||
|
||||
/* Compression function. Returns the block state after the call. */
|
||||
typedef block_state (*compress_func) (deflate_state *s, int flush);
|
||||
/* Match function. Returns the longest match. */
|
||||
typedef uint32_t (*match_func) (deflate_state *const s, Pos cur_match);
|
||||
|
||||
#endif
|
129
3rdparty/zlib-ng/deflate_quick.c
vendored
Normal file
129
3rdparty/zlib-ng/deflate_quick.c
vendored
Normal file
@ -0,0 +1,129 @@
|
||||
/*
|
||||
* The deflate_quick deflate strategy, designed to be used when cycles are
|
||||
* at a premium.
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* Portions are Copyright (C) 2016 12Sided Technology, LLC.
|
||||
* Author:
|
||||
* Phil Vachon <pvachon@12sidedtech.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
#include "trees_emit.h"
|
||||
|
||||
extern const ct_data static_ltree[L_CODES+2];
|
||||
extern const ct_data static_dtree[D_CODES];
|
||||
|
||||
#define QUICK_START_BLOCK(s, last) { \
    /* Emit a static-trees block header. */ \
    zng_tr_emit_tree(s, STATIC_TREES, last); \
    /* block_open becomes 1 for a regular block and 2 for the last block. */ \
    s->block_open = 1 + (int)last; \
    s->block_start = (int)s->strstart; \
}
|
||||
|
||||
#define QUICK_END_BLOCK(s, last) { \
    /* Nothing to do unless a block is currently open. */ \
    if (s->block_open) { \
        zng_tr_emit_end_block(s, static_ltree, last); \
        s->block_open = 0; \
        s->block_start = (int)s->strstart; \
        PREFIX(flush_pending)(s->strm); \
        /* Output buffer exhausted: return from the *enclosing* function. */ \
        if (s->strm->avail_out == 0) { \
            if (last) \
                return finish_started; \
            else \
                return need_more; \
        } \
    } \
}
|
||||
|
||||
/* Compress using the "quick" strategy: only the hash-chain head is tried as
 * a match candidate and every symbol is emitted immediately with the static
 * trees (no symbol buffer is used).
 *
 * s:     deflate state
 * flush: flush mode requested by the caller
 *
 * Returns need_more, block_done, finish_started or finish_done.
 */
Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
    Pos head;
    int64_t dist;
    unsigned len;
    unsigned is_final = (flush == Z_FINISH) ? 1 : 0;

    if (UNLIKELY(is_final && s->block_open != 2)) {
        /* Close whatever block is open, then open the last block. */
        QUICK_END_BLOCK(s, 0);
        QUICK_START_BLOCK(s, is_final);
    } else if (UNLIKELY(s->block_open == 0 && s->lookahead > 0)) {
        /* Open a block only when there is data to put in it, so that an
         * empty block is not written when no input is given.
         */
        QUICK_START_BLOCK(s, is_final);
    }

    for (;;) {
        /* Keep room in the pending buffer for one more bit-buffer flush. */
        if (UNLIKELY(s->pending + ((BIT_BUF_SIZE + 7) >> 3) >= s->pending_buf_size)) {
            PREFIX(flush_pending)(s->strm);
            if (s->strm->avail_out == 0) {
                if (is_final && s->strm->avail_in == 0 && s->bi_valid == 0 && s->block_open == 0)
                    return finish_started;
                return need_more;
            }
        }

        if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD)) {
            PREFIX(fill_window)(s);
            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
                return need_more;
            }
            if (UNLIKELY(s->lookahead == 0))
                break;

            if (UNLIKELY(s->block_open == 0)) {
                /* Data has arrived: now a block may be opened. */
                QUICK_START_BLOCK(s, is_final);
            }
        }

        if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
            head = functable.quick_insert_string(s, s->strstart);
            dist = (int64_t)s->strstart - head;

            if (dist <= MAX_DIST(s) && dist > 0) {
                const uint8_t *cur = s->window + s->strstart;
                const uint8_t *cand = s->window + head;

                /* Compare the first two bytes before the full comparison. */
                if (zng_memcmp_2(cur, cand) == 0) {
                    len = functable.compare256(cur+2, cand+2) + 2;

                    if (len >= WANT_MIN_MATCH) {
                        /* Clamp to the available input and the format limit. */
                        if (UNLIKELY(len > s->lookahead))
                            len = s->lookahead;
                        if (UNLIKELY(len > STD_MAX_MATCH))
                            len = STD_MAX_MATCH;

                        check_match(s, s->strstart, head, len);

                        zng_tr_emit_dist(s, static_ltree, static_dtree, len - STD_MIN_MATCH, (uint32_t)dist);
                        s->lookahead -= len;
                        s->strstart += len;
                        continue;
                    }
                }
            }
        }

        /* No usable match: emit the current byte as a literal. */
        zng_tr_emit_lit(s, static_ltree, s->window[s->strstart]);
        s->strstart++;
        s->lookahead--;
    }

    if (s->strstart < (STD_MIN_MATCH - 1))
        s->insert = s->strstart;
    else
        s->insert = (STD_MIN_MATCH - 1);

    if (UNLIKELY(is_final)) {
        QUICK_END_BLOCK(s, 1);
        return finish_done;
    }

    QUICK_END_BLOCK(s, 0);
    return block_done;
}
|
85
3rdparty/zlib-ng/deflate_rle.c
vendored
Normal file
85
3rdparty/zlib-ng/deflate_rle.c
vendored
Normal file
@ -0,0 +1,85 @@
|
||||
/* deflate_rle.c -- compress data using RLE strategy of deflation algorithm
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "compare256_rle.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
#ifdef UNALIGNED_OK
|
||||
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
# define compare256_rle compare256_rle_unaligned_64
|
||||
# elif defined(HAVE_BUILTIN_CTZ)
|
||||
# define compare256_rle compare256_rle_unaligned_32
|
||||
# else
|
||||
# define compare256_rle compare256_rle_unaligned_16
|
||||
# endif
|
||||
#else
|
||||
# define compare256_rle compare256_rle_c
|
||||
#endif
|
||||
|
||||
/* ===========================================================================
|
||||
* For Z_RLE, simply look for runs of bytes, generate matches only of distance
|
||||
* one. Do not maintain a hash table. (It will be regenerated if this run of
|
||||
* deflate switches away from Z_RLE.)
|
||||
*/
|
||||
/* Compress using the Z_RLE strategy: only runs of a repeated byte are
 * encoded as matches, always with distance one, and the hash table is not
 * used.
 *
 * s:     deflate state
 * flush: flush mode requested by the caller
 *
 * Returns need_more, block_done, finish_started or finish_done.
 */
Z_INTERNAL block_state deflate_rle(deflate_state *s, int flush) {
    uint32_t run_len = 0;   /* kept across iterations; cleared after each emitted run */
    int must_flush = 0;     /* set if current block must be flushed */
    unsigned char *prev;    /* points at the byte just before strstart */

    for (;;) {
        /* Make sure that we always have enough lookahead, except at the end
         * of the input file: STD_MAX_MATCH bytes for the longest run, plus
         * one for the unrolled loop.
         */
        if (s->lookahead <= STD_MAX_MATCH) {
            PREFIX(fill_window)(s);
            if (flush == Z_NO_FLUSH && s->lookahead <= STD_MAX_MATCH)
                return need_more;
            if (s->lookahead == 0)
                break; /* flush the current block */
        }

        /* Count how many times the previous byte repeats. */
        if (s->lookahead >= STD_MIN_MATCH && s->strstart > 0) {
            prev = s->window + s->strstart - 1;
            if (prev[0] == prev[1] && prev[1] == prev[2]) {
                run_len = compare256_rle(prev, prev+3)+2;
                if (run_len > s->lookahead)
                    run_len = s->lookahead;
                if (run_len > STD_MAX_MATCH)
                    run_len = STD_MAX_MATCH;
            }
            Assert(prev+run_len <= s->window + s->window_size - 1, "wild scan");
        }

        if (run_len < STD_MIN_MATCH) {
            /* Run too short: output a literal byte. */
            must_flush = zng_tr_tally_lit(s, s->window[s->strstart]);
            s->lookahead--;
            s->strstart++;
        } else {
            /* Emit the run as a match of distance one. */
            check_match(s, s->strstart, s->strstart - 1, run_len);

            must_flush = zng_tr_tally_dist(s, 1, run_len - STD_MIN_MATCH);

            s->lookahead -= run_len;
            s->strstart += run_len;
            run_len = 0;
        }

        if (must_flush)
            FLUSH_BLOCK(s, 0);
    }

    s->insert = 0;
    if (flush == Z_FINISH) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    /* Symbols are still buffered: flush them as a non-final block. */
    if (s->sym_next)
        FLUSH_BLOCK(s, 0);
    return block_done;
}
|
143
3rdparty/zlib-ng/deflate_slow.c
vendored
Normal file
143
3rdparty/zlib-ng/deflate_slow.c
vendored
Normal file
@ -0,0 +1,143 @@
|
||||
/* deflate_slow.c -- compress data using the slow strategy of deflation algorithm
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
/* ===========================================================================
|
||||
* Same as deflate_medium, but achieves better compression. We use a lazy
|
||||
* evaluation for matches: a match is finally adopted only if there is
|
||||
* no better match at the next window position.
|
||||
*/
|
||||
/* Compress using lazy match evaluation: the match found at one position is
 * only emitted after the following position has been examined and did not
 * yield a longer match; otherwise the earlier position becomes a literal.
 *
 * s:     deflate state
 * flush: flush mode requested by the caller
 *
 * Returns need_more, block_done, finish_started or finish_done.
 */
Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
    Pos head;               /* head of hash chain */
    int must_flush;         /* set if current block must be flushed */
    int64_t dist;
    uint32_t cur_len;       /* length of the match at the current position */
    /* Pointer to the functable slot (not a copy of its value), chosen by
     * chain length.
     */
    match_func *finder = (s->max_chain_length <= 1024) ? &functable.longest_match
                                                       : &functable.longest_match_slow;

    /* Process the input block. */
    for (;;) {
        /* Make sure that we always have enough lookahead, except at the end
         * of the input file: STD_MAX_MATCH bytes for the next match, plus
         * WANT_MIN_MATCH bytes to insert the string following it.
         */
        if (s->lookahead < MIN_LOOKAHEAD) {
            PREFIX(fill_window)(s);
            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
                return need_more;
            }
            if (UNLIKELY(s->lookahead == 0))
                break; /* flush the current block */
        }

        /* Insert the string starting at strstart in the dictionary and
         * fetch the head of its hash chain.
         */
        head = 0;
        if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
            head = s->quick_insert_string(s, s->strstart);
        }

        /* Remember the previous match start, then look for a match here. */
        s->prev_match = (Pos)s->match_start;
        cur_len = STD_MIN_MATCH - 1;
        dist = (int64_t)s->strstart - head;

        if (dist <= MAX_DIST(s) && dist > 0 && s->prev_length < s->max_lazy_match && head != 0) {
            /* Matches with the string of window index 0 are excluded by the
             * head != 0 test above.
             */
            cur_len = (*finder)(s, head);
            /* The match finder sets match_start. */

            if (cur_len <= 5 && (s->strategy == Z_FILTERED)) {
                /* Z_FILTERED ignores short matches; match_start may be
                 * garbage here but the match is dropped anyway.
                 */
                cur_len = STD_MIN_MATCH - 1;
            }
        }

        if (s->prev_length >= STD_MIN_MATCH && cur_len <= s->prev_length) {
            /* The previous match is at least as good: output it. */
            unsigned int max_insert = s->strstart + s->lookahead - STD_MIN_MATCH;
            /* Do not insert strings in hash table beyond this. */
            unsigned int mov_fwd;

            check_match(s, s->strstart-1, s->prev_match, s->prev_length);

            must_flush = zng_tr_tally_dist(s, s->strstart -1 - s->prev_match, s->prev_length - STD_MIN_MATCH);

            /* Insert in hash table all strings up to the end of the match.
             * strstart-1 and strstart are already inserted. If there is not
             * enough lookahead, the last two strings are not inserted in
             * the hash table.
             */
            s->prev_length -= 1;
            s->lookahead -= s->prev_length;

            mov_fwd = s->prev_length - 1;
            if (max_insert > s->strstart) {
                unsigned int insert_cnt = mov_fwd;
                if (UNLIKELY(insert_cnt > max_insert - s->strstart))
                    insert_cnt = max_insert - s->strstart;
                s->insert_string(s, s->strstart + 1, insert_cnt);
            }
            s->prev_length = 0;
            s->match_available = 0;
            s->strstart += mov_fwd + 1;

            if (UNLIKELY(must_flush))
                FLUSH_BLOCK(s, 0);

        } else if (s->match_available) {
            /* Either there was no match at the previous position, or the
             * current match is longer: the previous byte goes out as a
             * single literal.
             */
            must_flush = zng_tr_tally_lit(s, s->window[s->strstart-1]);
            if (UNLIKELY(must_flush))
                FLUSH_BLOCK_ONLY(s, 0);
            s->prev_length = cur_len;
            s->strstart++;
            s->lookahead--;
            if (UNLIKELY(s->strm->avail_out == 0))
                return need_more;
        } else {
            /* Nothing to compare with yet: decide at the next step. */
            s->prev_length = cur_len;
            s->match_available = 1;
            s->strstart++;
            s->lookahead--;
        }
    }

    Assert(flush != Z_NO_FLUSH, "no flush?");
    if (UNLIKELY(s->match_available)) {
        /* A byte is still waiting for a decision: emit it as a literal. */
        (void) zng_tr_tally_lit(s, s->window[s->strstart-1]);
        s->match_available = 0;
    }

    if (s->strstart < (STD_MIN_MATCH - 1))
        s->insert = s->strstart;
    else
        s->insert = (STD_MIN_MATCH - 1);

    if (UNLIKELY(flush == Z_FINISH)) {
        FLUSH_BLOCK(s, 1);
        return finish_done;
    }
    /* Symbols are still buffered: flush them as a non-final block. */
    if (UNLIKELY(s->sym_next))
        FLUSH_BLOCK(s, 0);
    return block_done;
}
|
186
3rdparty/zlib-ng/deflate_stored.c
vendored
Normal file
186
3rdparty/zlib-ng/deflate_stored.c
vendored
Normal file
@ -0,0 +1,186 @@
|
||||
/* deflate_stored.c -- store data without compression using deflation algorithm
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
/* ===========================================================================
|
||||
* Copy without compression as much as possible from the input stream, return
|
||||
* the current block state.
|
||||
*
|
||||
* In case deflateParams() is used to later switch to a non-zero compression
|
||||
* level, s->matches (otherwise unused when storing) keeps track of the number
|
||||
* of hash table slides to perform. If s->matches is 1, then one hash table
|
||||
* slide will be done when switching. If s->matches is 2, the maximum value
|
||||
* allowed here, then the hash table will be cleared, since two or more slides
|
||||
* is the same as a clear.
|
||||
*
|
||||
* deflate_stored() is written to minimize the number of times an input byte is
|
||||
* copied. It is most efficient with large input and output buffers, which
|
||||
* maximizes the opportunities to have a single copy from next_in to next_out.
|
||||
*/
|
||||
/* Emit input as stored (uncompressed) blocks.
 *
 * Phase 1 writes stored blocks straight to next_out, taking bytes first from
 * the window and then directly from next_in.  Phase 2 brings the sliding
 * window up to date with whatever was copied directly.  Phase 3 reads any
 * remaining input into the window and, when worthwhile or when flushing,
 * writes one stored block through the pending buffer.
 *
 * s:     deflate state
 * flush: flush mode requested by the caller
 *
 * Returns finish_done, block_done, finish_started or need_more.
 */
Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush) {
    /* Smallest block worth writing when not flushing or finishing. */
    unsigned worthy = MIN(s->pending_buf_size - 5, s->w_size);

    unsigned chunk;         /* length of the stored block being written */
    unsigned in_window;     /* bytes between block_start and strstart */
    unsigned room;          /* header size, then space available for data */
    unsigned is_last = 0;   /* 1 once the final block has been written */
    unsigned consumed = s->strm->avail_in;  /* becomes the directly copied input count */

    /* ---- Phase 1: copy stored blocks directly to next_out. ---- */
    for (;;) {
        chunk = MAX_STORED;                 /* maximum deflate stored block length */
        room = (s->bi_valid + 42) >> 3;     /* number of header bytes */
        if (s->strm->avail_out < room)      /* need room for header */
            break;

        /* Maximum stored block length that will fit in avail_out. */
        room = s->strm->avail_out - room;
        in_window = (int)s->strstart - s->block_start;
        if (chunk > (unsigned long)in_window + s->strm->avail_in)
            chunk = in_window + s->strm->avail_in;  /* limit to the input */
        chunk = MIN(chunk, room);                   /* limit to the output */

        /* If the block would be shorter than worthy, or not all of the
         * available input can be copied when flushing, fall through to the
         * window / pending-buffer path.  An empty block is not written here
         * when flushing either.
         */
        if (chunk < worthy && ((chunk == 0 && flush != Z_FINISH) || flush == Z_NO_FLUSH || chunk != in_window + s->strm->avail_in))
            break;

        /* Write an empty stored block to pending to obtain the header
         * bytes, including any pending bits.
         */
        is_last = (flush == Z_FINISH && chunk == in_window + s->strm->avail_in) ? 1 : 0;
        zng_tr_stored_block(s, (char *)0, 0L, is_last);

        /* Overwrite the two length fields of that header with the real length. */
        s->pending -= 4;
        put_short(s, (uint16_t)chunk);
        put_short(s, (uint16_t)~chunk);

        /* Write the stored block header bytes. */
        PREFIX(flush_pending)(s->strm);

        /* Update debugging counts for the data about to be copied. */
        cmpr_bits_add(s, chunk << 3);
        sent_bits_add(s, chunk << 3);

        /* First the bytes that already sit in the window. */
        if (in_window) {
            in_window = MIN(in_window, chunk);
            memcpy(s->strm->next_out, s->window + s->block_start, in_window);
            s->strm->next_out += in_window;
            s->strm->avail_out -= in_window;
            s->strm->total_out += in_window;
            s->block_start += (int)in_window;
            chunk -= in_window;
        }

        /* Then the rest straight from next_in to next_out. */
        if (chunk) {
            PREFIX(read_buf)(s->strm, s->strm->next_out, chunk);
            s->strm->next_out += chunk;
            s->strm->avail_out -= chunk;
            s->strm->total_out += chunk;
        }

        if (is_last != 0)
            break;
    }

    /* ---- Phase 2: bring the sliding window up to date. ---- */
    consumed -= s->strm->avail_in;      /* number of input bytes directly copied */
    if (consumed) {
        /* If any input was used, no unused input remains in the window,
         * therefore s->block_start == s->strstart.
         */
        if (consumed >= s->w_size) {    /* supplant the previous history */
            s->matches = 2;             /* clear hash */
            memcpy(s->window, s->strm->next_in - s->w_size, s->w_size);
            s->strstart = s->w_size;
            s->insert = s->strstart;
        } else {
            if (s->window_size - s->strstart <= consumed) {
                /* Slide the window down. */
                s->strstart -= s->w_size;
                memcpy(s->window, s->window + s->w_size, s->strstart);
                if (s->matches < 2)
                    s->matches++;       /* add a pending slide_hash() */
                s->insert = MIN(s->insert, s->strstart);
            }
            memcpy(s->window + s->strstart, s->strm->next_in - consumed, consumed);
            s->strstart += consumed;
            s->insert += MIN(consumed, s->w_size - s->insert);
        }
        s->block_start = (int)s->strstart;
    }
    if (s->high_water < s->strstart)
        s->high_water = s->strstart;

    /* If the last block was written to next_out, then done. */
    if (is_last)
        return finish_done;

    /* If flushing and all input has been consumed, then done. */
    if (flush != Z_NO_FLUSH && flush != Z_FINISH && s->strm->avail_in == 0 && (int)s->strstart == s->block_start)
        return block_done;

    /* ---- Phase 3: buffer remaining input in the window. ---- */
    room = s->window_size - s->strstart;
    if (s->strm->avail_in > room && s->block_start >= (int)s->w_size) {
        /* Slide the window down. */
        s->block_start -= (int)s->w_size;
        s->strstart -= s->w_size;
        memcpy(s->window, s->window + s->w_size, s->strstart);
        if (s->matches < 2)
            s->matches++;               /* add a pending slide_hash() */
        room += s->w_size;              /* more space now */
        s->insert = MIN(s->insert, s->strstart);
    }

    room = MIN(room, s->strm->avail_in);
    if (room) {
        PREFIX(read_buf)(s->strm, s->window + s->strstart, room);
        s->strstart += room;
        s->insert += MIN(room, s->w_size - s->insert);
    }
    if (s->high_water < s->strstart)
        s->high_water = s->strstart;

    /* There was not enough avail_out to write a complete worthy or flushed
     * stored block to next_out.  Write a stored block to pending instead, if
     * there is enough input for a worthy block, or if flushing and the
     * remaining input fits in the pending buffer as one stored block.
     */
    room = (s->bi_valid + 42) >> 3;     /* number of header bytes */
    /* Maximum stored block length that will fit in pending. */
    room = MIN(s->pending_buf_size - room, MAX_STORED);
    worthy = MIN(room, s->w_size);
    in_window = (int)s->strstart - s->block_start;
    if (in_window >= worthy || ((in_window || flush == Z_FINISH) && flush != Z_NO_FLUSH && s->strm->avail_in == 0 && in_window <= room)) {
        chunk = MIN(in_window, room);
        is_last = (flush == Z_FINISH && s->strm->avail_in == 0 && chunk == in_window) ? 1 : 0;
        zng_tr_stored_block(s, (char *)s->window + s->block_start, chunk, is_last);
        s->block_start += (int)chunk;
        PREFIX(flush_pending)(s->strm);
    }

    /* We've done all we can with the available input and output. */
    if (is_last)
        return finish_started;
    return need_more;
}
|
50
3rdparty/zlib-ng/fallback_builtins.h
vendored
Normal file
50
3rdparty/zlib-ng/fallback_builtins.h
vendored
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef FALLBACK_BUILTINS_H
|
||||
#define FALLBACK_BUILTINS_H
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
||||
|
||||
#include <intrin.h>
|
||||
#ifdef X86_FEATURES
|
||||
# include "arch/x86/x86_features.h"
|
||||
#endif
|
||||
|
||||
/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
|
||||
* Because of that assumption trailing_zero is not initialized and the return value is not checked.
|
||||
* Tzcnt and bsf give identical results except when input value is 0, therefore this can not be allowed.
|
||||
* If tzcnt instruction is not supported, the cpu will itself execute bsf instead.
|
||||
* Performance tzcnt/bsf is identical on Intel cpu, tzcnt is faster than bsf on AMD cpu.
|
||||
*/
|
||||
/* MSVC stand-in for __builtin_ctz: index of the lowest set bit of a 32-bit value.
 * The caller must pass a non-zero value; this is asserted, and the scan result
 * is used without checking whether a bit was found.
 */
static __forceinline int __builtin_ctz(unsigned int value) {
#if defined(X86_FEATURES) && _MSC_VER >= 1700
    Assert(value != 0, "Invalid input value: 0");
    return (int)_tzcnt_u32(value);
#else
    unsigned long bit_index;

    Assert(value != 0, "Invalid input value: 0");
    _BitScanForward(&bit_index, value);
    return (int)bit_index;
#endif
}
|
||||
#define HAVE_BUILTIN_CTZ
|
||||
|
||||
#ifdef _M_AMD64
|
||||
/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0.
|
||||
* Because of that assumption trailing_zero is not initialized and the return value is not checked.
|
||||
*/
|
||||
/* MSVC stand-in for __builtin_ctzll: index of the lowest set bit of a 64-bit value.
 * The caller must pass a non-zero value; this is asserted, and the scan result
 * is used without checking whether a bit was found.
 */
static __forceinline int __builtin_ctzll(unsigned long long value) {
#if defined(X86_FEATURES) && _MSC_VER >= 1700
    Assert(value != 0, "Invalid input value: 0");
    return (int)_tzcnt_u64(value);
#else
    unsigned long bit_index;

    Assert(value != 0, "Invalid input value: 0");
    _BitScanForward64(&bit_index, value);
    return (int)bit_index;
#endif
}
|
||||
#define HAVE_BUILTIN_CTZLL
|
||||
#endif // Microsoft AMD64
|
||||
|
||||
#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
|
||||
#endif // _MSC_VER & !clang
|
||||
|
||||
#endif // include guard FALLBACK_BUILTINS_H
|
403
3rdparty/zlib-ng/functable.c
vendored
Normal file
403
3rdparty/zlib-ng/functable.c
vendored
Normal file
@ -0,0 +1,403 @@
|
||||
/* functable.c -- Choose relevant optimized functions at runtime
|
||||
* Copyright (C) 2017 Hans Kristian Rosbach
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zendian.h"
|
||||
#include "crc32_braid_p.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
#include "cpu_features.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
# include <intrin.h>
|
||||
#endif
|
||||
|
||||
/* Platform has pointer size atomic store */
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
|
||||
__atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
|
||||
# define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
|
||||
#elif defined(_MSC_VER)
|
||||
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
|
||||
_InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
|
||||
# if defined(_M_ARM) || defined(_M_ARM64)
|
||||
# define FUNCTABLE_BARRIER() do { \
|
||||
_ReadWriteBarrier(); \
|
||||
__dmb(0xB); /* _ARM_BARRIER_ISH */ \
|
||||
_ReadWriteBarrier(); \
|
||||
} while (0)
|
||||
# else
|
||||
# define FUNCTABLE_BARRIER() _ReadWriteBarrier()
|
||||
# endif
|
||||
#else
|
||||
# warning Unable to detect atomic intrinsic support.
|
||||
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
|
||||
*((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
|
||||
# define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
|
||||
#endif
|
||||
|
||||
/* No-op installed as functable.force_init by init_functable(). */
static void force_init_empty(void) {
    /* intentionally does nothing */
}
|
||||
|
||||
static void init_functable(void) {
|
||||
struct functable_s ft;
|
||||
struct cpu_features cf;
|
||||
|
||||
cpu_check_features(&cf);
|
||||
|
||||
// Generic code
|
||||
ft.force_init = &force_init_empty;
|
||||
ft.adler32 = &adler32_c;
|
||||
ft.adler32_fold_copy = &adler32_fold_copy_c;
|
||||
ft.chunkmemset_safe = &chunkmemset_safe_c;
|
||||
ft.chunksize = &chunksize_c;
|
||||
ft.crc32 = &PREFIX(crc32_braid);
|
||||
ft.crc32_fold = &crc32_fold_c;
|
||||
ft.crc32_fold_copy = &crc32_fold_copy_c;
|
||||
ft.crc32_fold_final = &crc32_fold_final_c;
|
||||
ft.crc32_fold_reset = &crc32_fold_reset_c;
|
||||
ft.inflate_fast = &inflate_fast_c;
|
||||
ft.insert_string = &insert_string_c;
|
||||
ft.quick_insert_string = &quick_insert_string_c;
|
||||
ft.slide_hash = &slide_hash_c;
|
||||
ft.update_hash = &update_hash_c;
|
||||
|
||||
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
|
||||
ft.longest_match = &longest_match_unaligned_64;
|
||||
ft.longest_match_slow = &longest_match_slow_unaligned_64;
|
||||
ft.compare256 = &compare256_unaligned_64;
|
||||
# elif defined(HAVE_BUILTIN_CTZ)
|
||||
ft.longest_match = &longest_match_unaligned_32;
|
||||
ft.longest_match_slow = &longest_match_slow_unaligned_32;
|
||||
ft.compare256 = &compare256_unaligned_32;
|
||||
# else
|
||||
ft.longest_match = &longest_match_unaligned_16;
|
||||
ft.longest_match_slow = &longest_match_slow_unaligned_16;
|
||||
ft.compare256 = &compare256_unaligned_16;
|
||||
# endif
|
||||
#else
|
||||
ft.longest_match = &longest_match_c;
|
||||
ft.longest_match_slow = &longest_match_slow_c;
|
||||
ft.compare256 = &compare256_c;
|
||||
#endif
|
||||
|
||||
|
||||
// Select arch-optimized functions
|
||||
|
||||
// X86 - SSE2
|
||||
#ifdef X86_SSE2
|
||||
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
|
||||
if (cf.x86.has_sse2)
|
||||
# endif
|
||||
{
|
||||
ft.chunkmemset_safe = &chunkmemset_safe_sse2;
|
||||
ft.chunksize = &chunksize_sse2;
|
||||
ft.inflate_fast = &inflate_fast_sse2;
|
||||
ft.slide_hash = &slide_hash_sse2;
|
||||
# ifdef HAVE_BUILTIN_CTZ
|
||||
ft.compare256 = &compare256_sse2;
|
||||
ft.longest_match = &longest_match_sse2;
|
||||
ft.longest_match_slow = &longest_match_slow_sse2;
|
||||
# endif
|
||||
}
|
||||
#endif
|
||||
// X86 - SSSE3
|
||||
#ifdef X86_SSSE3
|
||||
if (cf.x86.has_ssse3) {
|
||||
ft.adler32 = &adler32_ssse3;
|
||||
# ifdef X86_SSE2
|
||||
ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
|
||||
ft.inflate_fast = &inflate_fast_ssse3;
|
||||
# endif
|
||||
}
|
||||
#endif
|
||||
// X86 - SSE4.2
|
||||
#ifdef X86_SSE42
|
||||
if (cf.x86.has_sse42) {
|
||||
ft.adler32_fold_copy = &adler32_fold_copy_sse42;
|
||||
ft.insert_string = &insert_string_sse42;
|
||||
ft.quick_insert_string = &quick_insert_string_sse42;
|
||||
ft.update_hash = &update_hash_sse42;
|
||||
}
|
||||
#endif
|
||||
// X86 - PCLMUL
|
||||
#ifdef X86_PCLMULQDQ_CRC
|
||||
if (cf.x86.has_pclmulqdq) {
|
||||
ft.crc32 = &crc32_pclmulqdq;
|
||||
ft.crc32_fold = &crc32_fold_pclmulqdq;
|
||||
ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
|
||||
ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
|
||||
ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
|
||||
}
|
||||
#endif
|
||||
// X86 - AVX
|
||||
#ifdef X86_AVX2
|
||||
if (cf.x86.has_avx2) {
|
||||
ft.adler32 = &adler32_avx2;
|
||||
ft.adler32_fold_copy = &adler32_fold_copy_avx2;
|
||||
ft.chunkmemset_safe = &chunkmemset_safe_avx2;
|
||||
ft.chunksize = &chunksize_avx2;
|
||||
ft.inflate_fast = &inflate_fast_avx2;
|
||||
ft.slide_hash = &slide_hash_avx2;
|
||||
# ifdef HAVE_BUILTIN_CTZ
|
||||
ft.compare256 = &compare256_avx2;
|
||||
ft.longest_match = &longest_match_avx2;
|
||||
ft.longest_match_slow = &longest_match_slow_avx2;
|
||||
# endif
|
||||
}
|
||||
#endif
|
||||
#ifdef X86_AVX512
|
||||
if (cf.x86.has_avx512) {
|
||||
ft.adler32 = &adler32_avx512;
|
||||
ft.adler32_fold_copy = &adler32_fold_copy_avx512;
|
||||
}
|
||||
#endif
|
||||
#ifdef X86_AVX512VNNI
|
||||
if (cf.x86.has_avx512vnni) {
|
||||
ft.adler32 = &adler32_avx512_vnni;
|
||||
ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
|
||||
}
|
||||
#endif
|
||||
// X86 - VPCLMULQDQ
|
||||
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
|
||||
if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) {
|
||||
ft.crc32 = &crc32_vpclmulqdq;
|
||||
ft.crc32_fold = &crc32_fold_vpclmulqdq;
|
||||
ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
|
||||
ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
|
||||
ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// ARM - SIMD
|
||||
#ifdef ARM_SIMD
|
||||
# ifndef ARM_NOCHECK_SIMD
|
||||
if (cf.arm.has_simd)
|
||||
# endif
|
||||
{
|
||||
ft.slide_hash = &slide_hash_armv6;
|
||||
}
|
||||
#endif
|
||||
// ARM - NEON
|
||||
#ifdef ARM_NEON
|
||||
# ifndef ARM_NOCHECK_NEON
|
||||
if (cf.arm.has_neon)
|
||||
# endif
|
||||
{
|
||||
ft.adler32 = &adler32_neon;
|
||||
ft.chunkmemset_safe = &chunkmemset_safe_neon;
|
||||
ft.chunksize = &chunksize_neon;
|
||||
ft.inflate_fast = &inflate_fast_neon;
|
||||
ft.slide_hash = &slide_hash_neon;
|
||||
# ifdef HAVE_BUILTIN_CTZLL
|
||||
ft.compare256 = &compare256_neon;
|
||||
ft.longest_match = &longest_match_neon;
|
||||
ft.longest_match_slow = &longest_match_slow_neon;
|
||||
# endif
|
||||
}
|
||||
#endif
|
||||
// ARM - ACLE
|
||||
#ifdef ARM_ACLE
|
||||
if (cf.arm.has_crc32) {
|
||||
ft.crc32 = &crc32_acle;
|
||||
ft.insert_string = &insert_string_acle;
|
||||
ft.quick_insert_string = &quick_insert_string_acle;
|
||||
ft.update_hash = &update_hash_acle;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// Power - VMX
|
||||
#ifdef PPC_VMX
|
||||
if (cf.power.has_altivec) {
|
||||
ft.adler32 = &adler32_vmx;
|
||||
ft.slide_hash = &slide_hash_vmx;
|
||||
}
|
||||
#endif
|
||||
// Power8 - VSX
|
||||
#ifdef POWER8_VSX
|
||||
if (cf.power.has_arch_2_07) {
|
||||
ft.adler32 = &adler32_power8;
|
||||
ft.chunkmemset_safe = &chunkmemset_safe_power8;
|
||||
ft.chunksize = &chunksize_power8;
|
||||
ft.inflate_fast = &inflate_fast_power8;
|
||||
ft.slide_hash = &slide_hash_power8;
|
||||
}
|
||||
#endif
|
||||
#ifdef POWER8_VSX_CRC32
|
||||
if (cf.power.has_arch_2_07)
|
||||
ft.crc32 = &crc32_power8;
|
||||
#endif
|
||||
// Power9
|
||||
#ifdef POWER9
|
||||
if (cf.power.has_arch_3_00) {
|
||||
ft.compare256 = &compare256_power9;
|
||||
ft.longest_match = &longest_match_power9;
|
||||
ft.longest_match_slow = &longest_match_slow_power9;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// RISCV - RVV
|
||||
#ifdef RISCV_RVV
|
||||
if (cf.riscv.has_rvv) {
|
||||
ft.adler32 = &adler32_rvv;
|
||||
ft.adler32_fold_copy = &adler32_fold_copy_rvv;
|
||||
ft.chunkmemset_safe = &chunkmemset_safe_rvv;
|
||||
ft.chunksize = &chunksize_rvv;
|
||||
ft.compare256 = &compare256_rvv;
|
||||
ft.inflate_fast = &inflate_fast_rvv;
|
||||
ft.longest_match = &longest_match_rvv;
|
||||
ft.longest_match_slow = &longest_match_slow_rvv;
|
||||
ft.slide_hash = &slide_hash_rvv;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// S390
|
||||
#ifdef S390_CRC32_VX
|
||||
if (cf.s390.has_vx)
|
||||
ft.crc32 = crc32_s390_vx;
|
||||
#endif
|
||||
|
||||
// Assign function pointers individually for atomic operation
|
||||
FUNCTABLE_ASSIGN(ft, force_init);
|
||||
FUNCTABLE_ASSIGN(ft, adler32);
|
||||
FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
|
||||
FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
|
||||
FUNCTABLE_ASSIGN(ft, chunksize);
|
||||
FUNCTABLE_ASSIGN(ft, compare256);
|
||||
FUNCTABLE_ASSIGN(ft, crc32);
|
||||
FUNCTABLE_ASSIGN(ft, crc32_fold);
|
||||
FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
|
||||
FUNCTABLE_ASSIGN(ft, crc32_fold_final);
|
||||
FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
|
||||
FUNCTABLE_ASSIGN(ft, inflate_fast);
|
||||
FUNCTABLE_ASSIGN(ft, insert_string);
|
||||
FUNCTABLE_ASSIGN(ft, longest_match);
|
||||
FUNCTABLE_ASSIGN(ft, longest_match_slow);
|
||||
FUNCTABLE_ASSIGN(ft, quick_insert_string);
|
||||
FUNCTABLE_ASSIGN(ft, slide_hash);
|
||||
FUNCTABLE_ASSIGN(ft, update_hash);
|
||||
|
||||
// Memory barrier for weak memory order CPUs
|
||||
FUNCTABLE_BARRIER();
|
||||
}
|
||||
|
||||
/* stub functions */
|
||||
/* Initial functable.force_init entry: calling it fills in the dispatch table. */
static void force_init_stub(void) {
    init_functable();
}
|
||||
|
||||
static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
|
||||
init_functable();
|
||||
return functable.adler32(adler, buf, len);
|
||||
}
|
||||
|
||||
static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
|
||||
init_functable();
|
||||
return functable.adler32_fold_copy(adler, dst, src, len);
|
||||
}
|
||||
|
||||
static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) {
|
||||
init_functable();
|
||||
return functable.chunkmemset_safe(out, dist, len, left);
|
||||
}
|
||||
|
||||
static uint32_t chunksize_stub(void) {
|
||||
init_functable();
|
||||
return functable.chunksize();
|
||||
}
|
||||
|
||||
static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
|
||||
init_functable();
|
||||
return functable.compare256(src0, src1);
|
||||
}
|
||||
|
||||
static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
|
||||
init_functable();
|
||||
return functable.crc32(crc, buf, len);
|
||||
}
|
||||
|
||||
/* Initial functable.crc32_fold entry: fills in the table, then forwards the
 * call to the implementation that init_functable() installed.
 */
static void crc32_fold_stub(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
    init_functable();

    functable.crc32_fold(crc, src, len, init_crc);
}
|
||||
|
||||
/* Initial functable.crc32_fold_copy entry: fills in the table, then forwards
 * the call to the implementation that init_functable() installed.
 */
static void crc32_fold_copy_stub(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
    init_functable();

    functable.crc32_fold_copy(crc, dst, src, len);
}
|
||||
|
||||
/* Initial functable.crc32_fold_final entry: fills in the table, then forwards
 * the call to the implementation that init_functable() installed.
 */
static uint32_t crc32_fold_final_stub(crc32_fold *crc) {
    uint32_t result;

    init_functable();
    result = functable.crc32_fold_final(crc);
    return result;
}
|
||||
|
||||
/* Initial functable.crc32_fold_reset entry: fills in the table, then forwards
 * the call to the implementation that init_functable() installed.
 */
static uint32_t crc32_fold_reset_stub(crc32_fold *crc) {
    uint32_t result;

    init_functable();
    result = functable.crc32_fold_reset(crc);
    return result;
}
|
||||
|
||||
/* Initial functable.inflate_fast entry: fills in the table, then forwards the
 * call to the implementation that init_functable() installed.
 */
static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    init_functable();

    functable.inflate_fast(strm, start);
}
|
||||
|
||||
/* Initial functable.insert_string entry: fills in the table, then forwards the
 * call to the implementation that init_functable() installed.
 */
static void insert_string_stub(deflate_state *const s, uint32_t str, uint32_t count) {
    init_functable();

    functable.insert_string(s, str, count);
}
|
||||
|
||||
/* Initial functable.longest_match entry: fills in the table, then forwards the
 * call to the implementation that init_functable() installed.
 */
static uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
    uint32_t result;

    init_functable();
    result = functable.longest_match(s, cur_match);
    return result;
}
|
||||
|
||||
/* Initial functable.longest_match_slow entry: fills in the table, then forwards
 * the call to the implementation that init_functable() installed.
 */
static uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_match) {
    uint32_t result;

    init_functable();
    result = functable.longest_match_slow(s, cur_match);
    return result;
}
|
||||
|
||||
/* Initial functable.quick_insert_string entry: fills in the table, then
 * forwards the call to the implementation that init_functable() installed.
 */
static Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    Pos result;

    init_functable();
    result = functable.quick_insert_string(s, str);
    return result;
}
|
||||
|
||||
/* Initial functable.slide_hash entry: fills in the table, then forwards the
 * call to the implementation that init_functable() installed.
 */
static void slide_hash_stub(deflate_state *s) {
    init_functable();

    functable.slide_hash(s);
}
|
||||
|
||||
/* Initial functable.update_hash entry: fills in the table, then forwards the
 * call to the implementation that init_functable() installed.
 */
static uint32_t update_hash_stub(deflate_state *const s, uint32_t h, uint32_t val) {
    uint32_t result;

    init_functable();
    result = functable.update_hash(s, h, val);
    return result;
}
|
||||
|
||||
/* functable init */
|
||||
Z_INTERNAL struct functable_s functable = {
|
||||
force_init_stub,
|
||||
adler32_stub,
|
||||
adler32_fold_copy_stub,
|
||||
chunkmemset_safe_stub,
|
||||
chunksize_stub,
|
||||
compare256_stub,
|
||||
crc32_stub,
|
||||
crc32_fold_stub,
|
||||
crc32_fold_copy_stub,
|
||||
crc32_fold_final_stub,
|
||||
crc32_fold_reset_stub,
|
||||
inflate_fast_stub,
|
||||
insert_string_stub,
|
||||
longest_match_stub,
|
||||
longest_match_slow_stub,
|
||||
quick_insert_string_stub,
|
||||
slide_hash_stub,
|
||||
update_hash_stub
|
||||
};
|
42
3rdparty/zlib-ng/functable.h
vendored
Normal file
42
3rdparty/zlib-ng/functable.h
vendored
Normal file
@ -0,0 +1,42 @@
|
||||
/* functable.h -- Struct containing function pointers to optimized functions
|
||||
* Copyright (C) 2017 Hans Kristian Rosbach
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef FUNCTABLE_H_
|
||||
#define FUNCTABLE_H_
|
||||
|
||||
#include "deflate.h"
|
||||
#include "crc32_fold.h"
|
||||
#include "adler32_fold.h"
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
typedef struct z_stream_s z_stream;
|
||||
#else
|
||||
typedef struct zng_stream_s zng_stream;
|
||||
#endif
|
||||
|
||||
struct functable_s {
|
||||
void (* force_init) (void);
|
||||
uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len);
|
||||
uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
uint8_t* (* chunkmemset_safe) (uint8_t *out, unsigned dist, unsigned len, unsigned left);
|
||||
uint32_t (* chunksize) (void);
|
||||
uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1);
|
||||
uint32_t (* crc32) (uint32_t crc, const uint8_t *buf, size_t len);
|
||||
void (* crc32_fold) (struct crc32_fold_s *crc, const uint8_t *src, size_t len, uint32_t init_crc);
|
||||
void (* crc32_fold_copy) (struct crc32_fold_s *crc, uint8_t *dst, const uint8_t *src, size_t len);
|
||||
uint32_t (* crc32_fold_final) (struct crc32_fold_s *crc);
|
||||
uint32_t (* crc32_fold_reset) (struct crc32_fold_s *crc);
|
||||
void (* inflate_fast) (PREFIX3(stream) *strm, uint32_t start);
|
||||
void (* insert_string) (deflate_state *const s, uint32_t str, uint32_t count);
|
||||
uint32_t (* longest_match) (deflate_state *const s, Pos cur_match);
|
||||
uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match);
|
||||
Pos (* quick_insert_string)(deflate_state *const s, uint32_t str);
|
||||
void (* slide_hash) (deflate_state *s);
|
||||
uint32_t (* update_hash) (deflate_state *const s, uint32_t h, uint32_t val);
|
||||
};
|
||||
|
||||
Z_INTERNAL extern struct functable_s functable;
|
||||
|
||||
#endif
|
144
3rdparty/zlib-ng/gzguts.h
vendored
Normal file
144
3rdparty/zlib-ng/gzguts.h
vendored
Normal file
@ -0,0 +1,144 @@
|
||||
#ifndef GZGUTS_H_
|
||||
#define GZGUTS_H_
|
||||
/* gzguts.h -- zlib internal header definitions for gz* operations
|
||||
* Copyright (C) 2004-2019 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef _LARGEFILE64_SOURCE
|
||||
# ifndef _LARGEFILE_SOURCE
|
||||
# define _LARGEFILE_SOURCE 1
|
||||
# endif
|
||||
# undef _FILE_OFFSET_BITS
|
||||
# undef _TIME_BITS
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_VISIBILITY_INTERNAL)
|
||||
# define Z_INTERNAL __attribute__((visibility ("internal")))
|
||||
#elif defined(HAVE_VISIBILITY_HIDDEN)
|
||||
# define Z_INTERNAL __attribute__((visibility ("hidden")))
|
||||
#else
|
||||
# define Z_INTERNAL
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#if defined(ZLIB_COMPAT)
|
||||
# include "zlib.h"
|
||||
#else
|
||||
# include "zlib-ng.h"
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
# include <stddef.h>
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
# include <io.h>
|
||||
# define WIDECHAR
|
||||
#endif
|
||||
|
||||
#ifdef WINAPI_FAMILY
|
||||
# define open _open
|
||||
# define read _read
|
||||
# define write _write
|
||||
# define close _close
|
||||
#endif
|
||||
|
||||
/* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
|
||||
#if !defined(STDC99) && !defined(__CYGWIN__) && !defined(__MINGW__) && defined(_WIN32)
|
||||
# if !defined(vsnprintf)
|
||||
# if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 )
|
||||
# define vsnprintf _vsnprintf
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* unlike snprintf (which is required in C99), _snprintf does not guarantee
|
||||
null termination of the result -- however this is only used in gzlib.c
|
||||
where the result is assured to fit in the space provided */
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1900
|
||||
# define snprintf _snprintf
|
||||
#endif
|
||||
|
||||
/* get errno and strerror definition */
|
||||
#ifndef NO_STRERROR
|
||||
# include <errno.h>
|
||||
# define zstrerror() strerror(errno)
|
||||
#else
|
||||
# define zstrerror() "stdio error (consult errno)"
|
||||
#endif
|
||||
|
||||
/* default memLevel */
|
||||
#if MAX_MEM_LEVEL >= 8
|
||||
# define DEF_MEM_LEVEL 8
|
||||
#else
|
||||
# define DEF_MEM_LEVEL MAX_MEM_LEVEL
|
||||
#endif
|
||||
|
||||
/* default i/o buffer size -- double this for output when reading (this and
|
||||
twice this must be able to fit in an unsigned type) */
|
||||
#ifndef GZBUFSIZE
|
||||
# define GZBUFSIZE 131072
|
||||
#endif
|
||||
|
||||
/* gzip modes, also provide a little integrity check on the passed structure */
|
||||
#define GZ_NONE 0
|
||||
#define GZ_READ 7247
|
||||
#define GZ_WRITE 31153
|
||||
#define GZ_APPEND 1 /* mode set to GZ_WRITE after the file is opened */
|
||||
|
||||
/* values for gz_state how */
|
||||
#define LOOK 0 /* look for a gzip header */
|
||||
#define COPY 1 /* copy input directly */
|
||||
#define GZIP 2 /* decompress a gzip stream */
|
||||
|
||||
/* internal gzip file state data structure */
|
||||
typedef struct {
|
||||
/* exposed contents for gzgetc() macro */
|
||||
struct gzFile_s x; /* "x" for exposed */
|
||||
/* x.have: number of bytes available at x.next */
|
||||
/* x.next: next output data to deliver or write */
|
||||
/* x.pos: current position in uncompressed data */
|
||||
/* used for both reading and writing */
|
||||
int mode; /* see gzip modes above */
|
||||
int fd; /* file descriptor */
|
||||
char *path; /* path or fd for error messages */
|
||||
unsigned size; /* buffer size, zero if not allocated yet */
|
||||
unsigned want; /* requested buffer size, default is GZBUFSIZE */
|
||||
unsigned char *in; /* input buffer (double-sized when writing) */
|
||||
unsigned char *out; /* output buffer (double-sized when reading) */
|
||||
int direct; /* 0 if processing gzip, 1 if transparent */
|
||||
/* just for reading */
|
||||
int how; /* 0: get header, 1: copy, 2: decompress */
|
||||
z_off64_t start; /* where the gzip data started, for rewinding */
|
||||
int eof; /* true if end of input file reached */
|
||||
int past; /* true if read requested past end */
|
||||
/* just for writing */
|
||||
int level; /* compression level */
|
||||
int strategy; /* compression strategy */
|
||||
int reset; /* true if a reset is pending after a Z_FINISH */
|
||||
/* seek request */
|
||||
z_off64_t skip; /* amount to skip (already rewound if backwards) */
|
||||
int seek; /* true if seek request pending */
|
||||
/* error information */
|
||||
int err; /* error code */
|
||||
char *msg; /* error message */
|
||||
/* zlib inflate or deflate stream */
|
||||
PREFIX3(stream) strm; /* stream structure in-place (not a pointer) */
|
||||
} gz_state;
|
||||
typedef gz_state *gz_statep;
|
||||
|
||||
/* shared functions */
|
||||
void Z_INTERNAL gz_error(gz_state *, int, const char *);
|
||||
|
||||
/* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
|
||||
value -- needed when comparing unsigned to z_off64_t, which is signed
|
||||
(possible z_off64_t types off_t, off64_t, and long are all signed) */
|
||||
#define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)
|
||||
|
||||
#endif /* GZGUTS_H_ */
|
525
3rdparty/zlib-ng/gzlib.c
vendored
Normal file
525
3rdparty/zlib-ng/gzlib.c
vendored
Normal file
@ -0,0 +1,525 @@
|
||||
/* gzlib.c -- zlib functions common to reading and writing gzip files
|
||||
* Copyright (C) 2004-2019 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "gzguts.h"
|
||||
|
||||
#if defined(_WIN32)
|
||||
# define LSEEK _lseeki64
|
||||
#else
|
||||
#if defined(_LARGEFILE64_SOURCE) && _LFS64_LARGEFILE-0
|
||||
# define LSEEK lseek64
|
||||
#else
|
||||
# define LSEEK lseek
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Local functions */
|
||||
static void gz_reset(gz_state *);
|
||||
static gzFile gz_open(const void *, int, const char *);
|
||||
|
||||
/* Reset gzip file state */
|
||||
/* Return a gzip file state to its just-opened condition: no buffered output,
 * no pending seek, no error, position zero and no stream input. Readers also
 * go back to looking for a gzip header; writers drop any pending reset.
 */
static void gz_reset(gz_state *state) {
    state->x.have = 0;                  /* nothing buffered for output */

    if (state->mode != GZ_READ) {
        /* writing */
        state->reset = 0;               /* no deflateReset pending */
    } else {
        /* reading */
        state->eof = 0;                 /* not at end of file */
        state->past = 0;                /* nothing read past the end yet */
        state->how = LOOK;              /* look for a gzip header */
    }

    state->seek = 0;                    /* no seek request pending */
    gz_error(state, Z_OK, NULL);        /* clear error */
    state->x.pos = 0;                   /* no uncompressed data yet */
    state->strm.avail_in = 0;           /* no input data yet */
}
|
||||
|
||||
/* Open a gzip file either by name or file descriptor. */
|
||||
static gzFile gz_open(const void *path, int fd, const char *mode) {
|
||||
gz_state *state;
|
||||
size_t len;
|
||||
int oflag;
|
||||
#ifdef O_CLOEXEC
|
||||
int cloexec = 0;
|
||||
#endif
|
||||
#ifdef O_EXCL
|
||||
int exclusive = 0;
|
||||
#endif
|
||||
|
||||
/* check input */
|
||||
if (path == NULL)
|
||||
return NULL;
|
||||
|
||||
/* allocate gzFile structure to return */
|
||||
state = (gz_state *)zng_alloc(sizeof(gz_state));
|
||||
if (state == NULL)
|
||||
return NULL;
|
||||
state->size = 0; /* no buffers allocated yet */
|
||||
state->want = GZBUFSIZE; /* requested buffer size */
|
||||
state->msg = NULL; /* no error message yet */
|
||||
|
||||
/* interpret mode */
|
||||
state->mode = GZ_NONE;
|
||||
state->level = Z_DEFAULT_COMPRESSION;
|
||||
state->strategy = Z_DEFAULT_STRATEGY;
|
||||
state->direct = 0;
|
||||
while (*mode) {
|
||||
if (*mode >= '0' && *mode <= '9') {
|
||||
state->level = *mode - '0';
|
||||
} else {
|
||||
switch (*mode) {
|
||||
case 'r':
|
||||
state->mode = GZ_READ;
|
||||
break;
|
||||
#ifndef NO_GZCOMPRESS
|
||||
case 'w':
|
||||
state->mode = GZ_WRITE;
|
||||
break;
|
||||
case 'a':
|
||||
state->mode = GZ_APPEND;
|
||||
break;
|
||||
#endif
|
||||
case '+': /* can't read and write at the same time */
|
||||
zng_free(state);
|
||||
return NULL;
|
||||
case 'b': /* ignore -- will request binary anyway */
|
||||
break;
|
||||
#ifdef O_CLOEXEC
|
||||
case 'e':
|
||||
cloexec = 1;
|
||||
break;
|
||||
#endif
|
||||
#ifdef O_EXCL
|
||||
case 'x':
|
||||
exclusive = 1;
|
||||
break;
|
||||
#endif
|
||||
case 'f':
|
||||
state->strategy = Z_FILTERED;
|
||||
break;
|
||||
case 'h':
|
||||
state->strategy = Z_HUFFMAN_ONLY;
|
||||
break;
|
||||
case 'R':
|
||||
state->strategy = Z_RLE;
|
||||
break;
|
||||
case 'F':
|
||||
state->strategy = Z_FIXED;
|
||||
break;
|
||||
case 'T':
|
||||
state->direct = 1;
|
||||
break;
|
||||
default: /* could consider as an error, but just ignore */
|
||||
{}
|
||||
}
|
||||
}
|
||||
mode++;
|
||||
}
|
||||
|
||||
/* must provide an "r", "w", or "a" */
|
||||
if (state->mode == GZ_NONE) {
|
||||
zng_free(state);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* can't force transparent read */
|
||||
if (state->mode == GZ_READ) {
|
||||
if (state->direct) {
|
||||
zng_free(state);
|
||||
return NULL;
|
||||
}
|
||||
state->direct = 1; /* for empty file */
|
||||
}
|
||||
|
||||
/* save the path name for error messages */
|
||||
#ifdef WIDECHAR
|
||||
if (fd == -2) {
|
||||
len = wcstombs(NULL, (const wchar_t *)path, 0);
|
||||
if (len == (size_t)-1)
|
||||
len = 0;
|
||||
} else
|
||||
#endif
|
||||
len = strlen((const char *)path);
|
||||
state->path = (char *)malloc(len + 1);
|
||||
if (state->path == NULL) {
|
||||
zng_free(state);
|
||||
return NULL;
|
||||
}
|
||||
#ifdef WIDECHAR
|
||||
if (fd == -2)
|
||||
if (len) {
|
||||
wcstombs(state->path, (const wchar_t *)path, len + 1);
|
||||
} else {
|
||||
*(state->path) = 0;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
(void)snprintf(state->path, len + 1, "%s", (const char *)path);
|
||||
|
||||
/* compute the flags for open() */
|
||||
oflag =
|
||||
#ifdef O_LARGEFILE
|
||||
O_LARGEFILE |
|
||||
#endif
|
||||
#ifdef O_BINARY
|
||||
O_BINARY |
|
||||
#endif
|
||||
#ifdef O_CLOEXEC
|
||||
(cloexec ? O_CLOEXEC : 0) |
|
||||
#endif
|
||||
(state->mode == GZ_READ ?
|
||||
O_RDONLY :
|
||||
(O_WRONLY | O_CREAT |
|
||||
#ifdef O_EXCL
|
||||
(exclusive ? O_EXCL : 0) |
|
||||
#endif
|
||||
(state->mode == GZ_WRITE ?
|
||||
O_TRUNC :
|
||||
O_APPEND)));
|
||||
|
||||
/* open the file with the appropriate flags (or just use fd) */
|
||||
state->fd = fd > -1 ? fd : (
|
||||
#if defined(_WIN32)
|
||||
fd == -2 ? _wopen((const wchar_t *)path, oflag, 0666) :
|
||||
#elif __CYGWIN__
|
||||
fd == -2 ? open(state->path, oflag, 0666) :
|
||||
#endif
|
||||
open((const char *)path, oflag, 0666));
|
||||
if (state->fd == -1) {
|
||||
free(state->path);
|
||||
zng_free(state);
|
||||
return NULL;
|
||||
}
|
||||
if (state->mode == GZ_APPEND) {
|
||||
LSEEK(state->fd, 0, SEEK_END); /* so gzoffset() is correct */
|
||||
state->mode = GZ_WRITE; /* simplify later checks */
|
||||
}
|
||||
|
||||
/* save the current position for rewinding (only if reading) */
|
||||
if (state->mode == GZ_READ) {
|
||||
state->start = LSEEK(state->fd, 0, SEEK_CUR);
|
||||
if (state->start == -1) state->start = 0;
|
||||
}
|
||||
|
||||
/* initialize stream */
|
||||
gz_reset(state);
|
||||
|
||||
/* return stream */
|
||||
return (gzFile)state;
|
||||
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Open the named file; fd -1 tells gz_open() to open by (char) path. */
gzFile Z_EXPORT PREFIX(gzopen)(const char *path, const char *mode) {
    gzFile handle = gz_open(path, -1, mode);

    return handle;
}
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
/* PREFIX4 variant of gzopen (ZLIB_COMPAT builds only); same behaviour: fd -1
 * tells gz_open() to open by (char) path.
 */
gzFile Z_EXPORT PREFIX4(gzopen)(const char *path, const char *mode) {
    gzFile handle = gz_open(path, -1, mode);

    return handle;
}
|
||||
#endif
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Wrap an already-open file descriptor in a gzFile. A temporary "<fd:N>" label
 * is handed to gz_open() as the path for error messages and freed afterwards.
 * Returns NULL when fd is -1, when the label cannot be allocated, or when
 * gz_open() fails.
 */
gzFile Z_EXPORT PREFIX(gzdopen)(int fd, const char *mode) {
    const size_t label_size = 7 + 3 * sizeof(int);
    char *label;                        /* identifier for error messages */
    gzFile handle;

    if (fd == -1)
        return NULL;

    label = (char *)malloc(label_size);
    if (label == NULL)
        return NULL;

    (void)snprintf(label, label_size, "<fd:%d>", fd);   /* for debugging */
    handle = gz_open(label, fd, mode);
    free(label);

    return handle;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
#ifdef WIDECHAR
|
||||
/* Open a file named by a wide-character path; fd -2 marks the path as wchar_t
 * for gz_open().
 */
gzFile Z_EXPORT PREFIX(gzopen_w)(const wchar_t *path, const char *mode) {
    gzFile handle = gz_open(path, -2, mode);

    return handle;
}
|
||||
#endif
|
||||
|
||||
/* Close a gzFile by delegating to gzclose_r for handles in read mode and to
 * gzclose_w otherwise. Returns Z_STREAM_ERROR for a NULL handle. When built
 * with NO_GZCOMPRESS, every handle goes straight to gzclose_r.
 */
int Z_EXPORT PREFIX(gzclose)(gzFile file) {
#ifdef NO_GZCOMPRESS
    return PREFIX(gzclose_r)(file);
#else
    gz_state *state;

    if (file == NULL)
        return Z_STREAM_ERROR;

    state = (gz_state *)file;
    if (state->mode == GZ_READ)
        return PREFIX(gzclose_r)(file);
    return PREFIX(gzclose_w)(file);
#endif
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Record the requested I/O buffer size in state->want. Returns 0 on success
 * and -1 if the handle is NULL, is in neither read nor write mode, already has
 * buffers allocated, or if the size cannot be doubled without overflowing.
 * Sizes below 8 are raised to 8.
 */
int Z_EXPORT PREFIX(gzbuffer)(gzFile file, unsigned size) {
    gz_state *state;

    /* get internal structure and check integrity */
    if (file == NULL)
        return -1;
    state = (gz_state *)file;

    /* must be a valid handle whose buffers are not allocated yet */
    if ((state->mode != GZ_READ && state->mode != GZ_WRITE) || state->size != 0)
        return -1;

    /* the size has to be doubleable */
    if ((size << 1) < size)
        return -1;

    /* at least 8 is needed to behave well with flushing */
    state->want = (size < 8) ? 8 : size;
    return 0;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Seek the underlying descriptor back to state->start and reset the stream
   state via gz_reset().  Only valid for a handle in read mode with no error
   other than Z_BUF_ERROR recorded.  Returns 0 on success, -1 otherwise. */
int Z_EXPORT PREFIX(gzrewind)(gzFile file) {
    gz_state *gz = (gz_state *)file;

    if (gz == NULL)
        return -1;

    /* must be reading, and must not be in a hard error state */
    if (gz->mode != GZ_READ)
        return -1;
    if (gz->err != Z_OK && gz->err != Z_BUF_ERROR)
        return -1;

    /* reposition the descriptor, then start over */
    if (LSEEK(gz->fd, gz->start, SEEK_SET) == -1)
        return -1;
    gz_reset(gz);
    return 0;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Move the logical (uncompressed) position of a gzFile.  Only SEEK_SET and
   SEEK_CUR are accepted.  When reading in COPY mode the descriptor is moved
   directly with LSEEK; otherwise the request is turned into a pending skip
   (state->seek / state->skip), rewinding first for a backward seek while
   reading.  Returns the resulting position, or -1 on any failure. */
z_off64_t Z_EXPORT PREFIX4(gzseek)(gzFile file, z_off64_t offset, int whence) {
    gz_state *gz = (gz_state *)file;
    z_off64_t moved;
    unsigned drop;

    /* validate the handle */
    if (gz == NULL)
        return -1;
    if (gz->mode != GZ_READ && gz->mode != GZ_WRITE)
        return -1;

    /* no seeking while an error other than Z_BUF_ERROR is recorded */
    if (gz->err != Z_OK && gz->err != Z_BUF_ERROR)
        return -1;

    /* turn the request into an offset relative to the current position;
       anything other than SEEK_SET / SEEK_CUR is rejected */
    switch (whence) {
    case SEEK_SET:
        offset -= gz->x.pos;
        break;
    case SEEK_CUR:
        if (gz->seek)
            offset += gz->skip; /* fold in a skip that is still pending */
        break;
    default:
        return -1;
    }
    gz->seek = 0;

    /* reading raw data and the target is not before the start: seek the
       descriptor itself, compensating for what is already buffered */
    if (gz->mode == GZ_READ && gz->how == COPY && gz->x.pos + offset >= 0) {
        moved = LSEEK(gz->fd, offset - (z_off64_t)gz->x.have, SEEK_CUR);
        if (moved == -1)
            return -1;
        gz->x.have = 0;
        gz->eof = 0;
        gz->past = 0;
        gz->seek = 0;
        gz_error(gz, Z_OK, NULL);
        gz->strm.avail_in = 0;
        gz->x.pos += offset;
        return gz->x.pos;
    }

    /* backward seek: only possible when reading, by rewinding and then
       skipping forward to the absolute target */
    if (offset < 0) {
        if (gz->mode != GZ_READ) /* writing -- can't go backwards */
            return -1;
        offset += gz->x.pos;
        if (offset < 0) /* before start of file! */
            return -1;
        if (PREFIX(gzrewind)(file) == -1)
            return -1;
    }

    /* when reading, consume as much of the skip as the output buffer holds */
    if (gz->mode == GZ_READ) {
        if (GT_OFF(gz->x.have) || (z_off64_t)gz->x.have > offset)
            drop = (unsigned)offset;
        else
            drop = gz->x.have;
        gz->x.have -= drop;
        gz->x.next += drop;
        gz->x.pos += drop;
        offset -= drop;
    }

    /* whatever is left becomes a pending skip request */
    if (offset) {
        gz->seek = 1;
        gz->skip = offset;
    }
    return gz->x.pos + offset;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
#ifdef ZLIB_COMPAT
/* z_off_t flavour of gzseek(): calls the z_off64_t implementation and
   returns its result when it survives the narrowing to z_off_t unchanged,
   otherwise -1. */
z_off_t Z_EXPORT PREFIX(gzseek)(gzFile file, z_off_t offset, int whence) {
    z_off64_t wide = PREFIX4(gzseek)(file, (z_off64_t)offset, whence);

    if (wide != (z_off_t)wide)
        return -1;
    return (z_off_t)wide;
}
#endif
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Report the current logical position: state->x.pos, plus the pending skip
   amount when a seek request has not been processed yet.  Returns -1 for a
   NULL handle or one that is neither reading nor writing. */
z_off64_t Z_EXPORT PREFIX4(gztell)(gzFile file) {
    gz_state *gz = (gz_state *)file;

    if (gz == NULL)
        return -1;
    if (gz->mode != GZ_READ && gz->mode != GZ_WRITE)
        return -1;

    if (gz->seek)
        return gz->x.pos + gz->skip;
    return gz->x.pos;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
#ifdef ZLIB_COMPAT
/* z_off_t flavour of gztell(): calls the z_off64_t implementation and
   returns its result when it survives the narrowing to z_off_t unchanged,
   otherwise -1. */
z_off_t Z_EXPORT PREFIX(gztell)(gzFile file) {
    z_off64_t wide = PREFIX4(gztell)(file);

    if (wide != (z_off_t)wide)
        return -1;
    return (z_off_t)wide;
}
#endif
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Report the position of the underlying descriptor as seen by LSEEK.  When
   reading, input that is buffered but not yet consumed (strm.avail_in) is
   subtracted.  Returns -1 for an invalid handle or when LSEEK fails. */
z_off64_t Z_EXPORT PREFIX4(gzoffset)(gzFile file) {
    gz_state *gz = (gz_state *)file;
    z_off64_t raw;

    if (gz == NULL)
        return -1;
    if (gz->mode != GZ_READ && gz->mode != GZ_WRITE)
        return -1;

    /* ask where the descriptor currently is */
    raw = LSEEK(gz->fd, 0, SEEK_CUR);
    if (raw == -1)
        return -1;

    /* don't count buffered input when reading */
    if (gz->mode == GZ_READ)
        raw -= gz->strm.avail_in;
    return raw;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
#ifdef ZLIB_COMPAT
/* z_off_t flavour of gzoffset(): calls the z_off64_t implementation and
   returns its result when it survives the narrowing to z_off_t unchanged,
   otherwise -1. */
z_off_t Z_EXPORT PREFIX(gzoffset)(gzFile file) {
    z_off64_t wide = PREFIX4(gzoffset)(file);

    if (wide != (z_off_t)wide)
        return -1;
    return (z_off_t)wide;
}
#endif
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Return the "read past end" flag (state->past) for a handle in read mode.
   Every other case -- NULL handle, write mode, or an unrecognised mode --
   yields 0. */
int Z_EXPORT PREFIX(gzeof)(gzFile file) {
    gz_state *gz = (gz_state *)file;

    if (gz == NULL)
        return 0;

    /* only a reader can have run past the end of its input */
    if (gz->mode == GZ_READ)
        return gz->past;
    return 0;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Return the message for the last error on the handle and, when errnum is
   not NULL, store the error code through it.  Z_MEM_ERROR always maps to the
   literal "out of memory"; a missing message maps to the empty string.
   Returns NULL for a NULL handle or one that is neither reading nor
   writing. */
const char * Z_EXPORT PREFIX(gzerror)(gzFile file, int *errnum) {
    gz_state *gz = (gz_state *)file;

    if (gz == NULL)
        return NULL;
    if (gz->mode != GZ_READ && gz->mode != GZ_WRITE)
        return NULL;

    /* hand back the numeric code when the caller asked for it */
    if (errnum != NULL)
        *errnum = gz->err;

    if (gz->err == Z_MEM_ERROR)
        return "out of memory";
    if (gz->msg == NULL)
        return "";
    return gz->msg;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Reset the error state of a handle to Z_OK with no message.  For a reader
   the eof and past flags are cleared as well.  Does nothing for a NULL
   handle or one that is neither reading nor writing. */
void Z_EXPORT PREFIX(gzclearerr)(gzFile file) {
    gz_state *gz = (gz_state *)file;

    if (gz == NULL)
        return;
    if (gz->mode != GZ_READ && gz->mode != GZ_WRITE)
        return;

    /* readers also forget that the end of file was seen */
    if (gz->mode == GZ_READ) {
        gz->eof = 0;
        gz->past = 0;
    }
    gz_error(gz, Z_OK, NULL);
}
|
||||
|
||||
/* Create an error message in allocated memory and set state->err and
|
||||
state->msg accordingly. Free any previous error message already there. Do
|
||||
not try to free or allocate space if the error is Z_MEM_ERROR (out of
|
||||
memory). Simply save the error message as a static string. If there is an
|
||||
allocation failure constructing the error message, then convert the error to
|
||||
out of memory. */
|
||||
void Z_INTERNAL gz_error(gz_state *state, int err, const char *msg) {
|
||||
/* free previously allocated message and clear */
|
||||
if (state->msg != NULL) {
|
||||
if (state->err != Z_MEM_ERROR)
|
||||
free(state->msg);
|
||||
state->msg = NULL;
|
||||
}
|
||||
|
||||
/* if fatal, set state->x.have to 0 so that the gzgetc() macro fails */
|
||||
if (err != Z_OK && err != Z_BUF_ERROR)
|
||||
state->x.have = 0;
|
||||
|
||||
/* set error code, and if no message, then done */
|
||||
state->err = err;
|
||||
if (msg == NULL)
|
||||
return;
|
||||
|
||||
/* for an out of memory error, return literal string when requested */
|
||||
if (err == Z_MEM_ERROR)
|
||||
return;
|
||||
|
||||
/* construct error message with path */
|
||||
if ((state->msg = (char *)malloc(strlen(state->path) + strlen(msg) + 3)) == NULL) {
|
||||
state->err = Z_MEM_ERROR;
|
||||
return;
|
||||
}
|
||||
(void)snprintf(state->msg, strlen(state->path) + strlen(msg) + 3, "%s%s%s", state->path, ": ", msg);
|
||||
}
|
606
3rdparty/zlib-ng/gzread.c.in
vendored
Normal file
606
3rdparty/zlib-ng/gzread.c.in
vendored
Normal file
@ -0,0 +1,606 @@
|
||||
/* gzread.c -- zlib functions for reading gzip files
 * Copyright (C) 2004-2017 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil_p.h"
|
||||
#include "gzguts.h"
|
||||
|
||||
/* Local functions */
|
||||
static int gz_load(gz_state *, unsigned char *, unsigned, unsigned *);
|
||||
static int gz_avail(gz_state *);
|
||||
static int gz_look(gz_state *);
|
||||
static int gz_decomp(gz_state *);
|
||||
static int gz_fetch(gz_state *);
|
||||
static int gz_skip(gz_state *, z_off64_t);
|
||||
static size_t gz_read(gz_state *, void *, size_t);
|
||||
|
||||
/* Use read() to load a buffer -- return -1 on error, otherwise 0. Read from
|
||||
state->fd, and update state->eof, state->err, and state->msg as appropriate.
|
||||
This function needs to loop on read(), since read() is not guaranteed to
|
||||
read the number of bytes requested, depending on the type of descriptor. */
|
||||
static int gz_load(gz_state *state, unsigned char *buf, unsigned len, unsigned *have) {
|
||||
ssize_t ret;
|
||||
|
||||
*have = 0;
|
||||
do {
|
||||
ret = read(state->fd, buf + *have, len - *have);
|
||||
if (ret <= 0)
|
||||
break;
|
||||
*have += (unsigned)ret;
|
||||
} while (*have < len);
|
||||
if (ret < 0) {
|
||||
gz_error(state, Z_ERRNO, zstrerror());
|
||||
return -1;
|
||||
}
|
||||
if (ret == 0)
|
||||
state->eof = 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Load up input buffer and set eof flag if last data loaded -- return -1 on
|
||||
error, 0 otherwise. Note that the eof flag is set when the end of the input
|
||||
file is reached, even though there may be unused data in the buffer. Once
|
||||
that data has been used, no more attempts will be made to read the file.
|
||||
If strm->avail_in != 0, then the current data is moved to the beginning of
|
||||
the input buffer, and then the remainder of the buffer is loaded with the
|
||||
available data from the input file. */
|
||||
static int gz_avail(gz_state *state) {
|
||||
unsigned got;
|
||||
PREFIX3(stream) *strm = &(state->strm);
|
||||
|
||||
if (state->err != Z_OK && state->err != Z_BUF_ERROR)
|
||||
return -1;
|
||||
if (state->eof == 0) {
|
||||
if (strm->avail_in) { /* copy what's there to the start */
|
||||
unsigned char *p = state->in;
|
||||
unsigned const char *q = strm->next_in;
|
||||
unsigned n = strm->avail_in;
|
||||
do {
|
||||
*p++ = *q++;
|
||||
} while (--n);
|
||||
}
|
||||
if (gz_load(state, state->in + strm->avail_in, state->size - strm->avail_in, &got) == -1)
|
||||
return -1;
|
||||
strm->avail_in += got;
|
||||
strm->next_in = state->in;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Look for gzip header, set up for inflate or copy. state->x.have must be 0.
|
||||
If this is the first time in, allocate required memory. state->how will be
|
||||
left unchanged if there is no more input data available, will be set to COPY
|
||||
if there is no gzip header and direct copying will be performed, or it will
|
||||
be set to GZIP for decompression. If direct copying, then leftover input
|
||||
data from the input buffer will be copied to the output buffer. In that
|
||||
case, all further file reads will be directly to either the output buffer or
|
||||
a user buffer. If decompressing, the inflate state will be initialized.
|
||||
gz_look() will return 0 on success or -1 on failure. */
|
||||
static int gz_look(gz_state *state) {
|
||||
PREFIX3(stream) *strm = &(state->strm);
|
||||
|
||||
/* allocate read buffers and inflate memory */
|
||||
if (state->size == 0) {
|
||||
/* allocate buffers */
|
||||
state->in = (unsigned char *)zng_alloc(state->want);
|
||||
state->out = (unsigned char *)zng_alloc(state->want << 1);
|
||||
if (state->in == NULL || state->out == NULL) {
|
||||
zng_free(state->out);
|
||||
zng_free(state->in);
|
||||
gz_error(state, Z_MEM_ERROR, "out of memory");
|
||||
return -1;
|
||||
}
|
||||
state->size = state->want;
|
||||
|
||||
/* allocate inflate memory */
|
||||
state->strm.zalloc = NULL;
|
||||
state->strm.zfree = NULL;
|
||||
state->strm.opaque = NULL;
|
||||
state->strm.avail_in = 0;
|
||||
state->strm.next_in = NULL;
|
||||
if (PREFIX(inflateInit2)(&(state->strm), MAX_WBITS + 16) != Z_OK) { /* gunzip */
|
||||
zng_free(state->out);
|
||||
zng_free(state->in);
|
||||
state->size = 0;
|
||||
gz_error(state, Z_MEM_ERROR, "out of memory");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* get at least the magic bytes in the input buffer */
|
||||
if (strm->avail_in < 2) {
|
||||
if (gz_avail(state) == -1)
|
||||
return -1;
|
||||
if (strm->avail_in == 0)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* look for gzip magic bytes -- if there, do gzip decoding (note: there is
|
||||
a logical dilemma here when considering the case of a partially written
|
||||
gzip file, to wit, if a single 31 byte is written, then we cannot tell
|
||||
whether this is a single-byte file, or just a partially written gzip
|
||||
file -- for here we assume that if a gzip file is being written, then
|
||||
the header will be written in a single operation, so that reading a
|
||||
single byte is sufficient indication that it is not a gzip file) */
|
||||
if (strm->avail_in > 1 &&
|
||||
strm->next_in[0] == 31 && strm->next_in[1] == 139) {
|
||||
PREFIX(inflateReset)(strm);
|
||||
state->how = GZIP;
|
||||
state->direct = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* no gzip header -- if we were decoding gzip before, then this is trailing
|
||||
garbage. Ignore the trailing garbage and finish. */
|
||||
if (state->direct == 0) {
|
||||
strm->avail_in = 0;
|
||||
state->eof = 1;
|
||||
state->x.have = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* doing raw i/o, copy any leftover input to output -- this assumes that
|
||||
the output buffer is larger than the input buffer, which also assures
|
||||
space for gzungetc() */
|
||||
state->x.next = state->out;
|
||||
memcpy(state->x.next, strm->next_in, strm->avail_in);
|
||||
state->x.have = strm->avail_in;
|
||||
strm->avail_in = 0;
|
||||
state->how = COPY;
|
||||
state->direct = 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Decompress from input to the provided next_out and avail_out in the state.
|
||||
On return, state->x.have and state->x.next point to the just decompressed
|
||||
data. If the gzip stream completes, state->how is reset to LOOK to look for
|
||||
the next gzip stream or raw data, once state->x.have is depleted. Returns 0
|
||||
on success, -1 on failure. */
|
||||
static int gz_decomp(gz_state *state) {
|
||||
int ret = Z_OK;
|
||||
unsigned had;
|
||||
PREFIX3(stream) *strm = &(state->strm);
|
||||
|
||||
/* fill output buffer up to end of deflate stream */
|
||||
had = strm->avail_out;
|
||||
do {
|
||||
/* get more input for inflate() */
|
||||
if (strm->avail_in == 0 && gz_avail(state) == -1)
|
||||
return -1;
|
||||
if (strm->avail_in == 0) {
|
||||
gz_error(state, Z_BUF_ERROR, "unexpected end of file");
|
||||
break;
|
||||
}
|
||||
|
||||
/* decompress and handle errors */
|
||||
ret = PREFIX(inflate)(strm, Z_NO_FLUSH);
|
||||
if (ret == Z_STREAM_ERROR || ret == Z_NEED_DICT) {
|
||||
gz_error(state, Z_STREAM_ERROR, "internal error: inflate stream corrupt");
|
||||
return -1;
|
||||
}
|
||||
if (ret == Z_MEM_ERROR) {
|
||||
gz_error(state, Z_MEM_ERROR, "out of memory");
|
||||
return -1;
|
||||
}
|
||||
if (ret == Z_DATA_ERROR) { /* deflate stream invalid */
|
||||
gz_error(state, Z_DATA_ERROR, strm->msg == NULL ? "compressed data error" : strm->msg);
|
||||
return -1;
|
||||
}
|
||||
} while (strm->avail_out && ret != Z_STREAM_END);
|
||||
|
||||
/* update available output */
|
||||
state->x.have = had - strm->avail_out;
|
||||
state->x.next = strm->next_out - state->x.have;
|
||||
|
||||
/* if the gzip stream completed successfully, look for another */
|
||||
if (ret == Z_STREAM_END)
|
||||
state->how = LOOK;
|
||||
|
||||
/* good decompression */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Fetch data and put it in the output buffer. Assumes state->x.have is 0.
|
||||
Data is either copied from the input file or decompressed from the input
|
||||
file depending on state->how. If state->how is LOOK, then a gzip header is
|
||||
looked for to determine whether to copy or decompress. Returns -1 on error,
|
||||
otherwise 0. gz_fetch() will leave state->how as COPY or GZIP unless the
|
||||
end of the input file has been reached and all data has been processed. */
|
||||
static int gz_fetch(gz_state *state) {
|
||||
PREFIX3(stream) *strm = &(state->strm);
|
||||
|
||||
do {
|
||||
switch (state->how) {
|
||||
case LOOK: /* -> LOOK, COPY (only if never GZIP), or GZIP */
|
||||
if (gz_look(state) == -1)
|
||||
return -1;
|
||||
if (state->how == LOOK)
|
||||
return 0;
|
||||
break;
|
||||
case COPY: /* -> COPY */
|
||||
if (gz_load(state, state->out, state->size << 1, &(state->x.have))
|
||||
== -1)
|
||||
return -1;
|
||||
state->x.next = state->out;
|
||||
return 0;
|
||||
case GZIP: /* -> GZIP or LOOK (if end of gzip stream) */
|
||||
strm->avail_out = state->size << 1;
|
||||
strm->next_out = state->out;
|
||||
if (gz_decomp(state) == -1)
|
||||
return -1;
|
||||
}
|
||||
} while (state->x.have == 0 && (!state->eof || strm->avail_in));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Skip len uncompressed bytes of output. Return -1 on error, 0 on success. */
|
||||
static int gz_skip(gz_state *state, z_off64_t len) {
|
||||
unsigned n;
|
||||
|
||||
/* skip over len bytes or reach end-of-file, whichever comes first */
|
||||
while (len)
|
||||
/* skip over whatever is in output buffer */
|
||||
if (state->x.have) {
|
||||
n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > len ?
|
||||
(unsigned)len : state->x.have;
|
||||
state->x.have -= n;
|
||||
state->x.next += n;
|
||||
state->x.pos += n;
|
||||
len -= n;
|
||||
} else if (state->eof && state->strm.avail_in == 0) {
|
||||
/* output buffer empty -- return if we're at the end of the input */
|
||||
break;
|
||||
} else {
|
||||
/* need more data to skip -- load up output buffer */
|
||||
/* get more output, looking for header if required */
|
||||
if (gz_fetch(state) == -1)
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Read len bytes into buf from file, or less than len up to the end of the
|
||||
input. Return the number of bytes read. If zero is returned, either the
|
||||
end of file was reached, or there was an error. state->err must be
|
||||
consulted in that case to determine which. */
|
||||
static size_t gz_read(gz_state *state, void *buf, size_t len) {
|
||||
size_t got;
|
||||
unsigned n;
|
||||
|
||||
/* if len is zero, avoid unnecessary operations */
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
/* process a skip request */
|
||||
if (state->seek) {
|
||||
state->seek = 0;
|
||||
if (gz_skip(state, state->skip) == -1)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* get len bytes to buf, or less than len if at the end */
|
||||
got = 0;
|
||||
do {
|
||||
/* set n to the maximum amount of len that fits in an unsigned int */
|
||||
n = (unsigned)-1;
|
||||
if (n > len)
|
||||
n = (unsigned)len;
|
||||
|
||||
/* first just try copying data from the output buffer */
|
||||
if (state->x.have) {
|
||||
if (state->x.have < n)
|
||||
n = state->x.have;
|
||||
memcpy(buf, state->x.next, n);
|
||||
state->x.next += n;
|
||||
state->x.have -= n;
|
||||
}
|
||||
|
||||
/* output buffer empty -- return if we're at the end of the input */
|
||||
else if (state->eof && state->strm.avail_in == 0) {
|
||||
state->past = 1; /* tried to read past end */
|
||||
break;
|
||||
}
|
||||
|
||||
/* need output data -- for small len or new stream load up our output
|
||||
buffer */
|
||||
else if (state->how == LOOK || n < (state->size << 1)) {
|
||||
/* get more output, looking for header if required */
|
||||
if (gz_fetch(state) == -1)
|
||||
return 0;
|
||||
continue; /* no progress yet -- go back to copy above */
|
||||
/* the copy above assures that we will leave with space in the
|
||||
output buffer, allowing at least one gzungetc() to succeed */
|
||||
}
|
||||
|
||||
/* large len -- read directly into user buffer */
|
||||
else if (state->how == COPY) { /* read directly */
|
||||
if (gz_load(state, (unsigned char *)buf, n, &n) == -1)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* large len -- decompress directly into user buffer */
|
||||
else { /* state->how == GZIP */
|
||||
state->strm.avail_out = n;
|
||||
state->strm.next_out = (unsigned char *)buf;
|
||||
if (gz_decomp(state) == -1)
|
||||
return 0;
|
||||
n = state->x.have;
|
||||
state->x.have = 0;
|
||||
}
|
||||
|
||||
/* update progress */
|
||||
len -= n;
|
||||
buf = (char *)buf + n;
|
||||
got += n;
|
||||
state->x.pos += n;
|
||||
} while (len);
|
||||
|
||||
/* return number of bytes read into user buffer */
|
||||
return got;
|
||||
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Read up to len bytes into buf.  Returns the number of bytes read, or -1
   when the handle is NULL, is not a reader, is in an error state other than
   Z_BUF_ERROR, when len does not fit in an int, or when nothing was read and
   such an error state has been entered. */
int Z_EXPORT PREFIX(gzread)(gzFile file, void *buf, unsigned len) {
    gz_state *gz = (gz_state *)file;
    unsigned count;

    if (gz == NULL)
        return -1;

    /* must be reading, with no (serious) error pending */
    if (gz->mode != GZ_READ)
        return -1;
    if (gz->err != Z_OK && gz->err != Z_BUF_ERROR)
        return -1;

    /* the result is an int, so the request has to fit in one (this avoids a
       flaw in the interface) */
    if ((int)len < 0) {
        gz_error(gz, Z_STREAM_ERROR, "request does not fit in an int");
        return -1;
    }

    count = (unsigned)gz_read(gz, buf, len);

    /* zero bytes together with a hard error is reported as failure */
    if (count == 0 && gz->err != Z_OK && gz->err != Z_BUF_ERROR)
        return -1;

    /* count is assured to fit in an int */
    return (int)count;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* fread()-style read: fetch up to nitems items of size bytes each into buf
   and return the number of complete items read.  Returns 0 when size is 0,
   the handle is NULL or unusable for reading, or size * nitems would
   overflow a size_t (which also records Z_STREAM_ERROR). */
size_t Z_EXPORT PREFIX(gzfread)(void *buf, size_t size, size_t nitems, gzFile file) {
    gz_state *gz = (gz_state *)file;
    size_t bytes;

    /* a zero item size yields nothing and would divide by zero below */
    if (size == 0)
        return 0;
    if (gz == NULL)
        return 0;

    /* must be reading, with no (serious) error pending */
    if (gz->mode != GZ_READ)
        return 0;
    if (gz->err != Z_OK && gz->err != Z_BUF_ERROR)
        return 0;

    /* size is non-zero here, so the division is safe */
    if (SIZE_MAX / size < nitems) {
        gz_error(gz, Z_STREAM_ERROR, "request does not fit in a size_t");
        return 0;
    }
    bytes = nitems * size;
    if (bytes == 0)
        return 0;

    /* partial trailing items are not counted */
    return gz_read(gz, buf, bytes) / size;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
#undef @ZLIB_SYMBOL_PREFIX@gzgetc
|
||||
#undef @ZLIB_SYMBOL_PREFIX@zng_gzgetc
|
||||
int Z_EXPORT PREFIX(gzgetc)(gzFile file) {
|
||||
unsigned char buf[1];
|
||||
gz_state *state;
|
||||
|
||||
/* get internal structure */
|
||||
if (file == NULL)
|
||||
return -1;
|
||||
state = (gz_state *)file;
|
||||
|
||||
/* check that we're reading and that there's no (serious) error */
|
||||
if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
|
||||
return -1;
|
||||
|
||||
/* try output buffer (no need to check for skip request) */
|
||||
if (state->x.have) {
|
||||
state->x.have--;
|
||||
state->x.pos++;
|
||||
return *(state->x.next)++;
|
||||
}
|
||||
|
||||
/* nothing there -- try gz_read() */
|
||||
return gz_read(state, buf, 1) < 1 ? -1 : buf[0];
|
||||
}
|
||||
|
||||
#ifdef ZLIB_COMPAT
/* Compat-only alias that simply calls gzgetc() and returns its result. */
int Z_EXPORT PREFIX(gzgetc_)(gzFile file) {
    int ch = PREFIX(gzgetc)(file);

    return ch;
}
#endif
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Push byte c back so that it is the next byte read.  Returns c on success,
   or -1 when the handle is NULL or not a usable reader, a pending skip
   fails, c is negative, or the output buffer has no room left (which also
   records Z_DATA_ERROR). */
int Z_EXPORT PREFIX(gzungetc)(int c, gzFile file) {
    gz_state *gz = (gz_state *)file;

    if (gz == NULL)
        return -1;

    /* in case this was just opened, set up the input buffer */
    if (gz->mode == GZ_READ && gz->how == LOOK && gz->x.have == 0)
        (void)gz_look(gz);

    /* must be reading, with no (serious) error pending */
    if (gz->mode != GZ_READ || (gz->err != Z_OK && gz->err != Z_BUF_ERROR))
        return -1;

    /* honour a pending seek */
    if (gz->seek) {
        gz->seek = 0;
        if (gz_skip(gz, gz->skip) == -1)
            return -1;
    }

    /* can't push EOF */
    if (c < 0)
        return -1;

    if (gz->x.have == 0) {
        /* empty buffer: start from the very end so that further pushes
           have room in front */
        gz->x.next = gz->out + (gz->size << 1);
    } else {
        /* a completely full buffer cannot take another byte */
        if (gz->x.have == (gz->size << 1)) {
            gz_error(gz, Z_DATA_ERROR, "out of room to push characters");
            return -1;
        }

        /* data sits at the front: slide it to the back, copying from the
           last byte downwards, to open space before it */
        if (gz->x.next == gz->out) {
            unsigned char *from = gz->out + gz->x.have;
            unsigned char *to = gz->out + (gz->size << 1);

            while (from > gz->out)
                *--to = *--from;
            gz->x.next = to;
        }
    }

    /* insert the byte just before the current data */
    gz->x.have++;
    gz->x.next--;
    gz->x.next[0] = (unsigned char)c;
    gz->x.pos--;
    gz->past = 0;
    return c;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Read a line: copy bytes into buf up to and including the first '\n', or
   until len - 1 bytes are stored, or until the input ends, then append a
   terminating zero.  Returns buf, or NULL on bad arguments, an unusable
   handle, a read error, or when no byte at all could be stored.  Zero bytes
   in the data are not treated specially. */
char * Z_EXPORT PREFIX(gzgets)(gzFile file, char *buf, int len) {
    gz_state *gz;
    char *out;
    unsigned room, take;
    unsigned char *newline = NULL;

    /* validate the arguments */
    if (file == NULL || buf == NULL || len < 1)
        return NULL;
    gz = (gz_state *)file;

    /* must be reading, with no (serious) error pending */
    if (gz->mode != GZ_READ || (gz->err != Z_OK && gz->err != Z_BUF_ERROR))
        return NULL;

    /* honour a pending seek */
    if (gz->seek) {
        gz->seek = 0;
        if (gz_skip(gz, gz->skip) == -1)
            return NULL;
    }

    out = buf;
    room = (unsigned)len - 1; /* keep one byte for the terminator */
    while (room && newline == NULL) {
        /* make sure there is something in the output buffer */
        if (gz->x.have == 0 && gz_fetch(gz) == -1)
            return NULL; /* error */
        if (gz->x.have == 0) { /* end of file */
            gz->past = 1; /* read past end */
            break; /* return what we have */
        }

        /* look for an end-of-line within what may be copied */
        take = gz->x.have > room ? room : gz->x.have;
        newline = (unsigned char *)memchr(gz->x.next, '\n', take);
        if (newline != NULL)
            take = (unsigned)(newline - gz->x.next) + 1;

        /* copy through the end-of-line, or everything if none was found */
        memcpy(out, gz->x.next, take);
        gz->x.have -= take;
        gz->x.next += take;
        gz->x.pos += take;
        room -= take;
        out += take;
    }

    /* nothing stored means end of file */
    if (out == buf)
        return NULL;
    out[0] = 0;
    return buf;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Return state->direct: 1 when data is passed through transparently, 0 when
   a gzip stream is being processed.  A NULL handle yields 0.  For a reader
   whose mode has not been determined yet (mainly right after gzopen() or
   gzdopen()), gz_look() is run first so that the answer is known. */
int Z_EXPORT PREFIX(gzdirect)(gzFile file) {
    gz_state *gz = (gz_state *)file;

    if (gz == NULL)
        return 0;

    /* find out now if we can and have not done so yet */
    if (gz->mode == GZ_READ && gz->how == LOOK && gz->x.have == 0)
        (void)gz_look(gz);

    return gz->direct;
}
|
||||
|
||||
/* -- see zlib.h -- */
|
||||
/* Close a handle that was opened for reading and release everything it
   owns.  Returns Z_STREAM_ERROR for a NULL handle or one not in read mode,
   Z_ERRNO when close() reports failure, Z_BUF_ERROR when that was the
   recorded error at close time, and Z_OK otherwise. */
int Z_EXPORT PREFIX(gzclose_r)(gzFile file) {
    gz_state *gz = (gz_state *)file;
    int close_rc, result;

    if (gz == NULL)
        return Z_STREAM_ERROR;

    /* only readers are handled here */
    if (gz->mode != GZ_READ)
        return Z_STREAM_ERROR;

    /* buffers and inflate state exist only once size has been set */
    if (gz->size) {
        PREFIX(inflateEnd)(&(gz->strm));
        zng_free(gz->out);
        zng_free(gz->in);
    }

    /* remember a Z_BUF_ERROR before the error state is cleared */
    result = Z_OK;
    if (gz->err == Z_BUF_ERROR)
        result = Z_BUF_ERROR;
    gz_error(gz, Z_OK, NULL);

    free(gz->path);
    close_rc = close(gz->fd);
    zng_free(gz);

    if (close_rc)
        return Z_ERRNO;
    return result;
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user