Merge pull request #26113 from FantasqueX:zlib-ng-2-2-1

Update zlib-ng to 2.2.1 #26113

Release: https://github.com/zlib-ng/zlib-ng/releases/tag/2.2.1
ARM diagnostics patch: https://github.com/zlib-ng/zlib-ng/pull/1774

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable
      The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
FantasqueX 2024-09-12 21:05:24 +08:00 committed by GitHub
parent 7de3a8e960
commit 85923c8f30
132 changed files with 7214 additions and 1751 deletions

File diff suppressed because it is too large

View File

@@ -1,4 +1,4 @@
(C) 1995-2013 Jean-loup Gailly and Mark Adler
(C) 1995-2024 Jean-loup Gailly and Mark Adler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages

View File

@@ -21,7 +21,6 @@ Features
* Support for CPU intrinsics when available
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
@@ -96,13 +95,14 @@ Build Options
-------------
| CMake | configure | Description | Default |
|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
|:---------------------------|:-------------------------|:------------------------------------------------------------------------------------|---------|
| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF |
| ZLIB_ENABLE_TESTS | | Build test binaries | ON |
| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON |
| WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
| WITH_RUNTIME_CPU_DETECTION | | Compiles with runtime CPU detection | ON |
| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
| WITH_GTEST | | Build gtest_zlib | ON |
| WITH_FUZZERS | | Build test/fuzz | OFF |
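
For reference, a hypothetical CMake invocation exercising a few of the options above, including the new `WITH_RUNTIME_CPU_DETECTION` toggle (source/build directory layout assumed):

```
cmake -S zlib-ng -B build \
      -DZLIB_COMPAT=ON \
      -DWITH_RUNTIME_CPU_DETECTION=ON \
      -DWITH_NATIVE_INSTRUCTIONS=OFF
cmake --build build
```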

View File

@@ -7,70 +7,24 @@
#include "functable.h"
#include "adler32_p.h"
/* ========================================================================= */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
unsigned n;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
#ifdef UNROLL_MORE
n = NMAX / 16; /* NMAX is divisible by 16 */
#else
n = NMAX / 8; /* NMAX is divisible by 8 */
#endif
do {
#ifdef UNROLL_MORE
DO16(adler, sum2, buf); /* 16 sums unrolled */
buf += 16;
#else
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
buf += 8;
#endif
} while (--n);
adler %= BASE;
sum2 %= BASE;
}
/* do remaining bytes (less than NMAX, still just one modulo) */
return adler32_len_64(adler, buf, len, sum2);
}
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
return functable.adler32(adler, buf, len);
return FUNCTABLE_CALL(adler32)(adler, buf, len);
}
#endif
/* ========================================================================= */
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
return (unsigned long)FUNCTABLE_CALL(adler32)((uint32_t)adler, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
return functable.adler32(adler, buf, len);
return FUNCTABLE_CALL(adler32)(adler, buf, len);
}
#endif
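
The `functable.adler32(...)` to `FUNCTABLE_CALL(adler32)(...)` changes above reflect zlib-ng 2.2's new dispatch macro. A rough sketch of the idea follows; the exact definitions live in zlib-ng's functable.h, so treat this as an illustration rather than the upstream code:

```
/* Illustrative sketch: with runtime CPU detection enabled the macro
 * resolves through the function-pointer table, and when detection is
 * compiled out it can collapse to the statically chosen native_* symbol. */
#ifdef DISABLE_RUNTIME_CPU_DETECTION
#  define FUNCTABLE_CALL(name) native_ ## name
#else
#  define FUNCTABLE_CALL(name) functable.name
#endif
```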

View File

@@ -1,11 +0,0 @@
/* adler32_fold.h -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

View File

@@ -1,2 +0,0 @@
# ignore Makefiles; they're all automatically generated
Makefile

View File

@@ -25,7 +25,6 @@ all: \
crc32_acle.o crc32_acle.lo \
slide_hash_neon.o slide_hash_neon.lo \
slide_hash_armv6.o slide_hash_armv6.lo \
insert_string_acle.o insert_string_acle.lo
adler32_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
@@ -69,12 +68,6 @@ slide_hash_armv6.o:
slide_hash_armv6.lo:
$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
insert_string_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
insert_string_acle.lo:
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~

View File

@@ -7,8 +7,8 @@
*/
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "zbuild.h"
#include "adler32_p.h"
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
static const uint16_t ALIGNED_(16) taps[64] = {

View File

@@ -1,4 +1,4 @@
#include "../../zbuild.h"
#include "zbuild.h"
#include "arm_features.h"
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
@@ -11,6 +11,11 @@
# ifndef ID_AA64ISAR0_CRC32_VAL
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
# endif
#elif defined(__OpenBSD__) && defined(__aarch64__)
# include <machine/armreg.h>
# include <machine/cpu.h>
# include <sys/sysctl.h>
# include <sys/types.h>
#elif defined(__APPLE__)
# if !defined(_DARWIN_C_SOURCE)
# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
@@ -30,6 +35,16 @@ static int arm_has_crc32() {
#elif defined(__FreeBSD__) && defined(__aarch64__)
return getenv("QEMU_EMULATING") == NULL
&& ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
#elif defined(__OpenBSD__) && defined(__aarch64__)
int hascrc32 = 0;
int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
uint64_t isar0 = 0;
size_t len = sizeof(isar0);
if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE)
hascrc32 = 1;
}
return hascrc32;
#elif defined(__APPLE__)
int hascrc32;
size_t size = sizeof(hascrc32);

View File

@@ -2,8 +2,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ARM_H_
#define ARM_H_
#ifndef ARM_FEATURES_H_
#define ARM_FEATURES_H_
struct arm_cpu_features {
int has_simd;
@@ -13,4 +13,4 @@ struct arm_cpu_features {
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
#endif /* ARM_H_ */
#endif /* ARM_FEATURES_H_ */

View File

@@ -0,0 +1,65 @@
/* arm_functions.h -- ARM implementations for arch-specific functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ARM_FUNCTIONS_H_
#define ARM_FUNCTIONS_H_
#ifdef ARM_NEON
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_neon(void);
uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZLL
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
# endif
void slide_hash_neon(deflate_state *s);
void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef ARM_ACLE
uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
#endif
#ifdef ARM_SIMD
void slide_hash_armv6(deflate_state *s);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// ARM - SIMD
# if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD)
# undef native_slide_hash
# define native_slide_hash slide_hash_armv6
# endif
// ARM - NEON
# if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON
# undef native_adler32
# define native_adler32 adler32_neon
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_neon
# undef native_chunksize
# define native_chunksize chunksize_neon
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_neon
# undef native_slide_hash
# define native_slide_hash slide_hash_neon
# ifdef HAVE_BUILTIN_CTZLL
# undef native_compare256
# define native_compare256 compare256_neon
# undef native_longest_match
# define native_longest_match longest_match_neon
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_neon
# endif
# endif
// ARM - ACLE
# if defined(ARM_ACLE) && defined(__ARM_ACLE) && defined(__ARM_FEATURE_CRC32)
# undef native_crc32
# define native_crc32 crc32_acle
# endif
#endif
#endif /* ARM_FUNCTIONS_H_ */

View File

@@ -4,8 +4,8 @@
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../generic/chunk_permute_table.h"
#include "zbuild.h"
#include "arch/generic/chunk_permute_table.h"
typedef uint8x16_t chunk_t;

View File

@@ -3,8 +3,9 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)

View File

@@ -7,7 +7,7 @@
#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "zbuild.h"
Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
Z_REGISTER uint32_t c;

View File

@@ -1,24 +0,0 @@
/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifdef ARM_ACLE
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
#define HASH_CALC(s, h, val) \
h = __crc32w(0, val)
#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0
#define UPDATE_HASH Z_TARGET_CRC update_hash_acle
#define INSERT_STRING Z_TARGET_CRC insert_string_acle
#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle
#include "../../insert_string_tpl.h"
#endif

View File

@@ -25,6 +25,13 @@
out.val[3] = vqsubq_u16(a.val[3], b); \
} while (0)
# if defined(__clang__) && defined(__arm__) && defined(__ANDROID__)
/* Clang for 32-bit Android has too strict alignment requirement (:256) for x4 NEON intrinsics */
# undef ARM_NEON_HASLD4
# undef vld1q_u16_x4
# undef vld1q_u8_x4
# undef vst1q_u16_x4
# endif
# ifndef ARM_NEON_HASLD4

View File

@@ -5,8 +5,8 @@
#if defined(ARM_SIMD)
#include "acle_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {

View File

@@ -10,8 +10,8 @@
#ifdef ARM_NEON
#include "neon_intrins.h"
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {

View File

@@ -1,5 +1,6 @@
# Makefile for zlib
# Makefile for zlib-ng
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# Copyright (C) 2024 Hans Kristian Rosbach
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
@@ -11,12 +12,62 @@ SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all:
all: \
adler32_c.o adler32_c.lo \
adler32_fold_c.o adler32_fold_c.lo \
chunkset_c.o chunkset_c.lo \
compare256_c.o compare256_c.lo \
crc32_braid_c.o crc32_braid_c.lo \
crc32_fold_c.o crc32_fold_c.lo \
slide_hash_c.o slide_hash_c.lo
adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
adler32_c.lo: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zutil_p.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~ \
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov

View File

@@ -0,0 +1,54 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011, 2016 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "functable.h"
#include "adler32_p.h"
/* ========================================================================= */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
unsigned n;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
#ifdef UNROLL_MORE
n = NMAX / 16; /* NMAX is divisible by 16 */
#else
n = NMAX / 8; /* NMAX is divisible by 8 */
#endif
do {
#ifdef UNROLL_MORE
DO16(adler, sum2, buf); /* 16 sums unrolled */
buf += 16;
#else
DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
buf += 8;
#endif
} while (--n);
adler %= BASE;
sum2 %= BASE;
}
/* do remaining bytes (less than NMAX, still just one modulo) */
return adler32_len_64(adler, buf, len, sum2);
}
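
As a cross-check on the unrolled loop above: Adler-32 keeps two 16-bit sums modulo BASE = 65521 (the largest prime below 2^16), and NMAX = 5552 is the largest block length whose deferred sums still fit in 32 bits. A naive scalar equivalent, for reference only (`adler32_naive` is a hypothetical helper, not part of zlib-ng):

```
#include <stddef.h>
#include <stdint.h>

/* Reference Adler-32: applies the modulo per byte instead of once
 * per NMAX-byte block, trading speed for obvious correctness. */
static uint32_t adler32_naive(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t a = adler & 0xffff;          /* running byte sum    */
    uint32_t b = (adler >> 16) & 0xffff;  /* running sum of sums */
    for (size_t i = 0; i < len; i++) {
        a = (a + buf[i]) % 65521;
        b = (b + a) % 65521;
    }
    return (b << 16) | a;
}
```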

View File

@@ -5,12 +5,11 @@
#include "zbuild.h"
#include "functable.h"
#include "adler32_fold.h"
#include <limits.h>
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
adler = functable.adler32(adler, src, len);
adler = FUNCTABLE_CALL(adler32)(adler, src, len);
memcpy(dst, src, len);
return adler;
}

View File

@@ -5,6 +5,7 @@
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
/* ALIGNED, byte comparison */

View File

@@ -8,43 +8,9 @@
*/
#include "zbuild.h"
#include "zutil.h"
#include "functable.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
/* ========================================================================= */
const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
return (const uint32_t *)crc_table;
}
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
if (buf == NULL) return 0;
return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
if (buf == NULL) return 0;
return functable.crc32(crc, buf, len);
}
#endif
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
return PREFIX(crc32_z)(crc, buf, len);
}
#endif
/* ========================================================================= */
/*
A CRC of a message is computed on N braids of words in the message, where
each word consists of W bytes (4 or 8). If N is 3, for example, then three
@@ -66,24 +32,6 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t
level. Your mileage may vary.
*/
/* ========================================================================= */
#if BYTE_ORDER == LITTLE_ENDIAN
# define ZSWAPWORD(word) (word)
# define BRAID_TABLE crc_braid_table
#elif BYTE_ORDER == BIG_ENDIAN
# if W == 8
# define ZSWAPWORD(word) ZSWAP64(word)
# elif W == 4
# define ZSWAPWORD(word) ZSWAP32(word)
# endif
# define BRAID_TABLE crc_braid_big_table
#else
# error "No endian defined"
#endif
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
/* ========================================================================= */
#ifdef W
/*
@@ -112,7 +60,7 @@ static z_word_t crc_word(z_word_t data) {
/* ========================================================================= */
Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
Z_REGISTER uint32_t c;
uint32_t c;
/* Pre-condition the CRC */
c = (~crc) & 0xffffffff;

View File

@@ -3,11 +3,9 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil.h"
#include "functable.h"
#include "crc32_fold.h"
#include <limits.h>
#include "crc32.h"
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
crc->value = CRC32_INITIAL_VALUE;
@@ -15,7 +13,7 @@ Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
}
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
crc->value = functable.crc32(crc->value, src, len);
crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
memcpy(dst, src, len);
}
@@ -25,7 +23,7 @@ Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, ui
* same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The
* init_crc is an unused argument in this context */
Z_UNUSED(init_crc);
crc->value = functable.crc32(crc->value, src, len);
crc->value = FUNCTABLE_CALL(crc32)(crc->value, src, len);
}
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {

View File

@@ -0,0 +1,106 @@
/* generic_functions.h -- generic C implementations for arch-specific functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef GENERIC_FUNCTIONS_H_
#define GENERIC_FUNCTIONS_H_
#include "zendian.h"
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_c(void);
uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
# endif
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
# endif
#endif
typedef void (*slide_hash_func)(deflate_state *s);
void slide_hash_c(deflate_state *s);
uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
# ifdef HAVE_BUILTIN_CTZ
uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
# endif
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
# endif
# endif
uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
# ifdef UNALIGNED64_OK
uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
# endif
# endif
// Select the generic implementations for the longest_match, longest_match_slow and compare256 functions.
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
# define longest_match_generic longest_match_unaligned_64
# define longest_match_slow_generic longest_match_slow_unaligned_64
# define compare256_generic compare256_unaligned_64
# elif defined(HAVE_BUILTIN_CTZ)
# define longest_match_generic longest_match_unaligned_32
# define longest_match_slow_generic longest_match_slow_unaligned_32
# define compare256_generic compare256_unaligned_32
# else
# define longest_match_generic longest_match_unaligned_16
# define longest_match_slow_generic longest_match_slow_unaligned_16
# define compare256_generic compare256_unaligned_16
# endif
#else
# define longest_match_generic longest_match_c
# define longest_match_slow_generic longest_match_slow_c
# define compare256_generic compare256_c
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Generic code
# define native_adler32 adler32_c
# define native_adler32_fold_copy adler32_fold_copy_c
# define native_chunkmemset_safe chunkmemset_safe_c
# define native_chunksize chunksize_c
# define native_crc32 PREFIX(crc32_braid)
# define native_crc32_fold crc32_fold_c
# define native_crc32_fold_copy crc32_fold_copy_c
# define native_crc32_fold_final crc32_fold_final_c
# define native_crc32_fold_reset crc32_fold_reset_c
# define native_inflate_fast inflate_fast_c
# define native_slide_hash slide_hash_c
# define native_longest_match longest_match_generic
# define native_longest_match_slow longest_match_slow_generic
# define native_compare256 compare256_generic
#endif
#endif
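
Note that the `native_*` defines above only take effect when runtime dispatch is compiled out; assuming `WITH_RUNTIME_CPU_DETECTION=OFF` is what defines `DISABLE_RUNTIME_CPU_DETECTION` (my reading of the build options table earlier in this diff), that corresponds to a build like:

```
cmake -S zlib-ng -B build -DWITH_RUNTIME_CPU_DETECTION=OFF
```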

View File

@@ -1,6 +1,6 @@
/* slide_hash.c -- slide hash table C implementation
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/

View File

@@ -4,7 +4,7 @@
#ifdef POWER8_VSX
#include <altivec.h>
#include "../../zbuild.h"
#include "zbuild.h"
typedef vector unsigned char chunk_t;

View File

@@ -5,8 +5,10 @@
#ifdef POWER9
#include <altivec.h>
#include "../../zbuild.h"
#include "../../zendian.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "zendian.h"
/* Older versions of GCC misimplemented semantics for these bit counting builtins.
* https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */

View File

@@ -1,16 +1,19 @@
/* power_features.c - POWER feature check
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
* Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifdef POWER_NEED_AUXVEC_H
# include <linux/auxvec.h>
#endif
#ifdef __FreeBSD__
# include <machine/cpu.h>
#endif
#include "../../zbuild.h"
#include "zbuild.h"
#include "power_features.h"
void Z_INTERNAL power_check_features(struct power_cpu_features *features) {

View File

@@ -4,8 +4,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_H_
#define POWER_H_
#ifndef POWER_FEATURES_H_
#define POWER_FEATURES_H_
struct power_cpu_features {
int has_altivec;
@@ -15,4 +15,4 @@ struct power_cpu_features {
void Z_INTERNAL power_check_features(struct power_cpu_features *features);
#endif /* POWER_H_ */
#endif /* POWER_FEATURES_H_ */

View File

@@ -0,0 +1,67 @@
/* power_functions.h -- POWER implementations for arch-specific functions.
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_FUNCTIONS_H_
#define POWER_FUNCTIONS_H_
#ifdef PPC_VMX
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
void slide_hash_vmx(deflate_state *s);
#endif
#ifdef POWER8_VSX
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_power8(void);
uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
void slide_hash_power8(deflate_state *s);
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef POWER9
uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Power - VMX
# if defined(PPC_VMX) && defined(__ALTIVEC__)
# undef native_adler32
# define native_adler32 adler32_vmx
# undef native_slide_hash
# define native_slide_hash slide_hash_vmx
# endif
// Power8 - VSX
# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__)
# undef native_adler32
# define native_adler32 adler32_power8
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_power8
# undef native_chunksize
# define native_chunksize chunksize_power8
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_power8
# undef native_slide_hash
# define native_slide_hash slide_hash_power8
# endif
# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__)
# undef native_crc32
# define native_crc32 crc32_power8
# endif
// Power9
# if defined(POWER9) && defined(_ARCH_PWR9)
# undef native_compare256
# define native_compare256 compare256_power9
# undef native_longest_match
# define native_longest_match longest_match_power9
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_power9
# endif
#endif
#endif /* POWER_FUNCTIONS_H_ */

View File

@@ -9,8 +9,8 @@
#include <riscv_vector.h>
#include <stdint.h>
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "zbuild.h"
#include "adler32_p.h"
static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
/* split Adler-32 into component sums */

View File

@@ -6,7 +6,9 @@
#ifdef RISCV_RVV
#include "../../zbuild.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
#include <riscv_vector.h>

View File

@@ -1,10 +1,13 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/auxv.h>
#include <sys/utsname.h>
#include "../../zbuild.h"
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
# include <sys/auxv.h>
#endif
#include "zbuild.h"
#include "riscv_features.h"
#define ISA_V_HWCAP (1 << ('v' - 'a'))
@@ -33,7 +36,11 @@ void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *fea
}
void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
unsigned long hw_cap = getauxval(AT_HWCAP);
#else
unsigned long hw_cap = 0;
#endif
features->has_rvv = hw_cap & ISA_V_HWCAP;
}
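
(For reference: `ISA_V_HWCAP` is `1 << ('v' - 'a')`, i.e. bit 21, matching how Linux reports single-letter RISC-V ISA extensions in the `AT_HWCAP` auxiliary vector.)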

View File

@@ -6,8 +6,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef RISCV_H_
#define RISCV_H_
#ifndef RISCV_FEATURES_H_
#define RISCV_FEATURES_H_
struct riscv_cpu_features {
int has_rvv;
@@ -15,4 +15,4 @@ struct riscv_cpu_features {
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
#endif /* RISCV_H_ */
#endif /* RISCV_FEATURES_H_ */

View File

@@ -0,0 +1,49 @@
/* riscv_functions.h -- RISCV implementations for arch-specific functions.
*
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* Contributed by Alex Chiang <alex.chiang@sifive.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef RISCV_FUNCTIONS_H_
#define RISCV_FUNCTIONS_H_
#ifdef RISCV_RVV
uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_rvv(void);
uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
void slide_hash_rvv(deflate_state *s);
void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// RISCV - RVV
# if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__)
# undef native_adler32
# define native_adler32 adler32_rvv
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_rvv
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_rvv
# undef native_chunksize
# define native_chunksize chunksize_rvv
# undef native_compare256
# define native_compare256 compare256_rvv
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_rvv
# undef native_longest_match
# define native_longest_match longest_match_rvv
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_rvv
# undef native_slide_hash
# define native_slide_hash slide_hash_rvv
# endif
#endif
#endif /* RISCV_FUNCTIONS_H_ */

View File

@@ -8,18 +8,16 @@
#include <riscv_vector.h>
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
size_t vl;
while (entries > 0) {
vl = __riscv_vsetvl_e16m4(entries);
vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
__riscv_vse16_v_u16m4(table, v_tab, vl);
vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl);
__riscv_vse16_v_u16m4(table, v_diff, vl);
table += vl, entries -= vl;
}
}
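
(The simplification works because `__riscv_vssubu_vx_u16m4` is an unsigned saturating subtract: entries smaller than `wsize` clamp to zero, which is exactly what the removed compare-and-merge sequence produced.)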

3rdparty/zlib-ng/arch/s390/Makefile.in (new vendored file, 48 lines)
View File

@@ -0,0 +1,48 @@
# Makefile for zlib-ng
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
VGFMAFLAG=
NOLTOFLAG=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
s390_features.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
s390_features.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
dfltcc_deflate.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
dfltcc_deflate.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
dfltcc_inflate.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
dfltcc_inflate.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
crc32-vx.o:
$(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
crc32-vx.lo:
$(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~
rm -rf objs
rm -f *.gcda *.gcno *.gcov
distclean: clean
rm -f Makefile

3rdparty/zlib-ng/arch/s390/README.md (new vendored file, 277 lines)
View File

@@ -0,0 +1,277 @@
# Introduction
This directory contains SystemZ deflate hardware acceleration support.
It can be enabled using the following build commands:
$ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
$ make
or
$ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
$ make
When built like this, zlib-ng compresses using hardware on level 1 and
using software on all other levels. Decompression always happens in
hardware. To enable hardware compression for levels 1-6 (i.e. to make it
the default) one can add `-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when
building zlib-ng.
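For example, a hypothetical build enabling hardware compression on levels 1-6 (the mask value is taken from the paragraph above):
```
$ CFLAGS="-DDFLTCC_LEVEL_MASK=0x7e" ./configure --with-dfltcc-deflate --with-dfltcc-inflate
$ make
```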
SystemZ deflate hardware acceleration is available on [IBM z15](
https://www.ibm.com/products/z15) and newer machines under the name [
"Integrated Accelerator for zEnterprise Data Compression"](
https://www.ibm.com/support/z-content-solutions/compression/). The
programming interface to it is a machine instruction called DEFLATE
CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
the code and the rest of this document refer to this feature simply as
"DFLTCC".
# Performance
Performance figures are published [here](
https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
). The compression speed-up can be as high as 110x and the decompression
speed-up can be as high as 15x.
# Limitations
Two DFLTCC compression calls with identical inputs are not guaranteed to
produce identical outputs, so care should be taken when using hardware
compression where reproducible results are desired. In particular, the
zlib-ng-specific `zng_deflateSetParams` call allows setting the
`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
particular stream.
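A minimal sketch of opting a stream out of DFLTCC this way, assuming the `zng_deflate_param_value` layout from zlib-ng.h (error handling elided; `request_reproducible` is a hypothetical helper):
```
#include "zlib-ng.h"

/* Request reproducible output on an already-initialized stream,
 * which disables DFLTCC for it. Returns Z_OK on success. */
static int32_t request_reproducible(zng_stream *strm) {
    int reproducible = 1;
    zng_deflate_param_value value;
    value.param = Z_DEFLATE_REPRODUCIBLE;
    value.buf = &reproducible;
    value.size = sizeof(reproducible);
    return zng_deflateSetParams(strm, &value, 1);
}
```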
DFLTCC does not support every single zlib-ng feature, in particular:
* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
* `inflateMark()`
* `inflatePrime()`
* `inflateSyncPoint()`
When used, these functions either switch to software or, when that is
not possible, fail gracefully.
# Code structure
All SystemZ-specific code lives in `arch/s390` directory and is
integrated with the rest of zlib-ng using hook macros.
## Hook macros
DFLTCC takes as arguments a parameter block, an input buffer, an output
buffer, and a window. Parameter blocks are stored alongside zlib states;
buffers are forwarded from the caller; and the window, which must be
4k-aligned and is always 64k large, is managed using the `PAD_WINDOW()`,
`WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW`, `DEFLATE_ADJUST_WINDOW_SIZE()`
and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.
Software and hardware window formats do not match, therefore,
`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
and `inflateGetDictionary()` need special handling, which is triggered using
`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
`INFLATE_RESET_KEEP_HOOK()` macros.
`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
calls gracefully fail.
`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
software compression mid-stream using `deflateParams()`. Switching
normally entails flushing the current block, which might not be possible
in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
in order to detect and gracefully handle such situations.
The algorithm implemented in hardware has different compression ratio
than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
return the correct results for the hardware implementation.
Actual compression and decompression are handled by `DEFLATE_HOOK()` and
`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
window on its own, calling `updatewindow()` is suppressed using
`INFLATE_NEED_UPDATEWINDOW()` macro.
In addition to compression, DFLTCC computes CRC-32 and Adler-32
checksums, therefore, whenever it's used, software checksumming is
suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
macros.
While software always produces reproducible compression results, this
is not the case for DFLTCC. Therefore, zlib-ng users are given the
ability to specify whether or not reproducible compression results
are required. While it is always possible to specify this setting
before compression begins, it is not always possible to do so in the
middle of a deflate stream; the exact conditions for that are determined
by the `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
## SystemZ-specific code
When zlib-ng is built with DFLTCC, the hooks described above are
converted to calls to functions, which are implemented in
`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
categories:
* Base DFLTCC support, e.g. wrapping the machine instruction - `dfltcc()`.
* Translating between software and hardware data formats, e.g.
`dfltcc_deflate_set_dictionary()`.
* Translating between software and hardware state machines, e.g.
`dfltcc_deflate()` and `dfltcc_inflate()`.
The functions from the first two categories are fairly simple; however,
various quirks in both software and hardware state machines make the
functions from the third category quite complicated.
### `dfltcc_deflate()` function
This function is called by `deflate()` and has the following
responsibilities:
* Checking whether DFLTCC can be used with the current stream. If this
is not the case, then it returns `0`, making `deflate()` use some
other function in order to compress in software. Otherwise it returns
`1`.
* Block management and Huffman table generation. DFLTCC ends blocks only
when explicitly instructed to do so by the software. Furthermore,
whether to use fixed or dynamic Huffman tables must also be determined
by the software. Since looking at data in order to gather statistics
would negate performance benefits, the following approach is used: the
first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
dynamic blocks.
* Writing EOBS. Block Closing Control bit in the parameter block
instructs DFLTCC to write EOBS, however, certain conditions need to be
met: input data length must be non-zero or Continuation Flag must be
set. To put this in simpler terms, DFLTCC will silently refuse to
write EOBS if this is the only thing that it is asked to do. Since the
code has to be able to emit EOBS in software anyway, in order to avoid
tricky corner cases Block Closing Control is never used. Whether to
write EOBS is instead controlled by `soft_bcc` variable.
* Triggering block post-processing. Depending on flush mode, `deflate()`
must perform various additional actions when a block or a stream ends.
`dfltcc_deflate()` informs `deflate()` about this using
`block_state *result` parameter.
* Converting software state fields into hardware parameter block fields,
and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
and Sub-Byte Boundary. Certain fields cannot be translated and must
persist untouched in the parameter block between calls, for example,
Continuation Flag or Continuation State Buffer.
* Handling flush modes and low-memory situations. These aspects are
quite intertwined and pervasive. The general idea here is that when the
Continuation Flag is set, the code must not do anything in software,
whether explicitly (e.g. by calling `send_eobs()`) or implicitly (by
returning to `deflate()` with certain return and `*result` values).
* Ending streams. When a new block is started and flush mode is
`Z_FINISH`, Block Header Final parameter block bit is used to mark
this block as final. However, sometimes an empty final block is
needed, and, unfortunately, just like with EOBS, DFLTCC will silently
refuse to do this. The general idea of DFLTCC implementation is to
rely as much as possible on the existing code. Here in order to do
this, the code pretends that it does not support DFLTCC, which makes
`deflate()` call a software compression function, which writes an
empty final block. Whether this is required is controlled by
`need_empty_block` variable.
* Error handling. This is simply converting
Operation-Ending-Supplemental Code to string. Errors can only happen
due to things like memory corruption, and therefore they don't affect
the `deflate()` return code.
### `dfltcc_inflate()` function
This function is called by `inflate()` from the `TYPEDO` state (that is,
when all the metadata is parsed and the stream is positioned at the type
bits of deflate block header) and it's responsible for the following:
* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
Unfortunately, there is no way to ask DFLTCC to stop decompressing on
block or tree boundary.
* `inflate()` decompression loop management. This is controlled using
the return value, which can be either `DFLTCC_INFLATE_BREAK` or
`DFLTCC_INFLATE_CONTINUE`.
* Converting software state fields into hardware parameter block fields,
and vice versa. For example, `whave` and History Length or `wnext` and
History Offset.
* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
and is controlled by `last` state field.
* Error handling. Like deflate, error handling comprises
Operation-Ending-Supplemental Code to string conversion. Unlike
deflate, errors may happen due to bad inputs, therefore they are
propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
# Testing
Given the complexity of the DFLTCC machine instruction, it is not clear
whether QEMU TCG will ever support it. At the time of writing, one needs
access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
DFLTCC is a non-privileged instruction, neither special VM/LPAR
configuration nor root are required.
zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
testing. There is no official IBM Z GitHub Actions runner, so we build
one inspired by `anup-kodlekere/gaplib`.
Future updates to actions-runner might need an updated patch. The .NET
version-number patch has been split into its own file to avoid having to
change the main patch constantly.
## Configuring the builder.
### Install prerequisites.
```
sudo dnf install podman
```
### Add actions-runner service.
```
sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
sudo systemctl daemon-reload
```
### Create a config file (requires a GitHub personal access token).
```
# Create file /etc/actions-runner
repo=<owner>/<name>
access_token=<ghp_***>
```
The access token should have the repo scope; consult
https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
for details.
### Autostart actions-runner.
```
$ sudo systemctl enable --now actions-runner
```
## Rebuilding the container
In order to update the `gaplib-actions-runner` podman container, e.g. to get the
latest OS security fixes, follow these steps:
```
# Stop actions-runner service
sudo systemctl stop actions-runner
# Delete old container
sudo podman container rm gaplib-actions-runner
# Delete old image
sudo podman image rm localhost/zlib-ng/actions-runner
# Build image
sudo podman build --squash -f Dockerfile.zlib-ng --tag zlib-ng/actions-runner .
# Build container
sudo podman create --name=gaplib-actions-runner --env-file=/etc/actions-runner --init --interactive --volume=actions-runner-temp:/home/actions-runner zlib-ng/actions-runner
# Start actions-runner service
sudo systemctl start actions-runner
```

3rdparty/zlib-ng/arch/s390/crc32-vx.c (new vendored file, 222 lines)
View File

@@ -0,0 +1,222 @@
/*
* Hardware-accelerated CRC-32 variants for Linux on z Systems
*
* Use the z/Architecture Vector Extension Facility to accelerate the
* computing of bitreflected CRC-32 checksums.
*
* This CRC-32 implementation algorithm is bitreflected and processes
* the least-significant bit first (Little-Endian).
*
* This code was originally written by Hendrik Brueckner
* <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
* relicensed under the zlib license.
*/
#include "zbuild.h"
#include "arch_functions.h"
#include <vecintrin.h>
typedef unsigned char uv16qi __attribute__((vector_size(16)));
typedef unsigned int uv4si __attribute__((vector_size(16)));
typedef unsigned long long uv2di __attribute__((vector_size(16)));
static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
/*
* The CRC-32 constant block contains reduction constants to fold and
* process particular chunks of the input data stream in parallel.
*
* For the CRC-32 variants, the constants are precomputed according to
* these definitions:
*
* R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
* R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
* R3 = [(x128+32 mod P'(x) << 32)]' << 1
* R4 = [(x128-32 mod P'(x) << 32)]' << 1
* R5 = [(x64 mod P'(x) << 32)]' << 1
* R6 = [(x32 mod P'(x) << 32)]' << 1
*
* The bitreflected Barret reduction constant, u', is defined as
* the bit reversal of floor(x**64 / P(x)).
*
* where P(x) is the polynomial in the normal domain and the P'(x) is the
* polynomial in the reversed (bitreflected) domain.
*
* CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
*
* P(x) = 0x04C11DB7
* P'(x) = 0xEDB88320
*/
const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
const uv2di r5 = {0, 0x163CD6124}; /* R5 */
const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
/*
* Load the initial CRC value.
*
* The CRC value is loaded into the rightmost word of the
* vector register and is later XORed with the LSB portion
* of the loaded input data.
*/
uv2di v0 = {0, 0};
v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
/* Load a 64-byte data chunk and XOR with CRC */
uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
v1 ^= v0;
buf += 64;
len -= 64;
while (len >= 64) {
/* Load the next 64-byte data chunk */
uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
/*
* Perform a GF(2) multiplication of the doublewords in V1 with
* the R1 and R2 reduction constants in V0. The intermediate result
* is then folded (accumulated) with the next data chunk in PART1 and
* stored in V1. Repeat this step for the register contents
* in V2, V3, and V4 respectively.
*/
v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
buf += 64;
len -= 64;
}
/*
* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
* and R4 and accumulating the next 128-bit chunk until a single 128-bit
* value remains.
*/
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
while (len >= 16) {
/* Load next data chunk */
v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
/* Fold next data chunk */
v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
buf += 16;
len -= 16;
}
/*
* Set up a vector register for byte shifts. The shift value must
* be loaded in bits 1-4 in byte element 7 of a vector register.
* Shift by 8 bytes: 0x40
* Shift by 4 bytes: 0x20
*/
uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
v9 = vec_insert((unsigned char)0x40, v9, 7);
/*
* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
* to move R4 into the rightmost doubleword and set the leftmost
* doubleword to 0x1.
*/
v0 = vec_srb(r4r3, (uv2di)v9);
v0[0] = 1;
/*
* Compute GF(2) product of V1 and V0. The rightmost doubleword
* of V1 is multiplied with R4. The leftmost doubleword of V1 is
* multiplied by 0x1 and is then XORed with rightmost product.
* Implicitly, the intermediate leftmost product becomes padded
*/
v1 = (uv2di)vec_gfmsum_128(v0, v1);
/*
* Now do the final 32-bit fold by multiplying the rightmost word
* in V1 with R5 and XOR the result with the remaining bits in V1.
*
* To achieve this by a single VGFMAG, right shift V1 by a word
* and store the result in V2 which is then accumulated. Use the
* vector unpack instruction to load the rightmost half of the
* doubleword into the rightmost doubleword element of V1; the other
* half is loaded in the leftmost doubleword.
* The vector register with CONST_R5 contains the R5 constant in the
* rightmost doubleword and the leftmost doubleword is zero to ignore
* the leftmost product of V1.
*/
v9 = vec_insert((unsigned char)0x20, v9, 7);
v2 = vec_srb(v1, (uv2di)v9);
v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
/*
* Apply a Barret reduction to compute the final 32-bit CRC value.
*
* The input values to the Barret reduction are the degree-63 polynomial
* in V1 (R(x)), degree-32 generator polynomial, and the reduction
* constant u. The Barret reduction result is the CRC value of R(x) mod
* P(x).
*
* The Barret reduction algorithm is defined as:
*
* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
* 3. C(x) = R(x) XOR T2(x) mod x^32
*
* Note: The leftmost doubleword of vector register containing
* CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
* is zero and does not contribute to the final result.
*/
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
v2 = vec_unpackl((uv4si)v1);
v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
/*
* Compute the GF(2) product of the CRC polynomial with T1(x) in
* V2 and XOR the intermediate result, T2(x), with the value in V1.
* The final result is stored in word element 2 of V2.
*/
v2 = vec_unpackl((uv4si)v2);
v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
return ((uv4si)v2)[2];
}
#define VX_MIN_LEN 64
#define VX_ALIGNMENT 16L
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
size_t prealign, aligned, remaining;
if (len < VX_MIN_LEN + VX_ALIGN_MASK)
return PREFIX(crc32_braid)(crc, buf, len);
if ((uintptr_t)buf & VX_ALIGN_MASK) {
prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
len -= prealign;
crc = PREFIX(crc32_braid)(crc, buf, prealign);
buf += prealign;
}
aligned = len & ~VX_ALIGN_MASK;
remaining = len & VX_ALIGN_MASK;
crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
if (remaining)
crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);
return crc;
}
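
As a worked example of the alignment split above: with `buf % 16 == 3` and `len == 100`, `prealign` is 13, leaving 87 bytes, of which `aligned = 80` go through `crc32_le_vgfm_16()` and the `remaining = 7` through the braid fallback.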

View File

@@ -0,0 +1,119 @@
#ifndef DFLTCC_COMMON_H
#define DFLTCC_COMMON_H
#include "zutil.h"
/*
Parameter Block for Query Available Functions.
*/
struct dfltcc_qaf_param {
char fns[16];
char reserved1[8];
char fmts[2];
char reserved2[6];
} ALIGNED_(8);
/*
Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
*/
struct dfltcc_param_v0 {
uint16_t pbvn; /* Parameter-Block-Version Number */
uint8_t mvn; /* Model-Version Number */
uint8_t ribm; /* Reserved for IBM use */
uint32_t reserved32 : 31;
uint32_t cf : 1; /* Continuation Flag */
uint8_t reserved64[8];
uint32_t nt : 1; /* New Task */
uint32_t reserved129 : 1;
uint32_t cvt : 1; /* Check Value Type */
uint32_t reserved131 : 1;
uint32_t htt : 1; /* Huffman-Table Type */
uint32_t bcf : 1; /* Block-Continuation Flag */
uint32_t bcc : 1; /* Block Closing Control */
uint32_t bhf : 1; /* Block Header Final */
uint32_t reserved136 : 1;
uint32_t reserved137 : 1;
uint32_t dhtgc : 1; /* DHT Generation Control */
uint32_t reserved139 : 5;
uint32_t reserved144 : 5;
uint32_t sbb : 3; /* Sub-Byte Boundary */
uint8_t oesc; /* Operation-Ending-Supplemental Code */
uint32_t reserved160 : 12;
uint32_t ifs : 4; /* Incomplete-Function Status */
uint16_t ifl; /* Incomplete-Function Length */
uint8_t reserved192[8];
uint8_t reserved256[8];
uint8_t reserved320[4];
uint16_t hl; /* History Length */
uint32_t reserved368 : 1;
uint16_t ho : 15; /* History Offset */
uint32_t cv; /* Check Value */
uint32_t eobs : 15; /* End-of-block Symbol */
uint32_t reserved431: 1;
uint8_t eobl : 4; /* End-of-block Length */
uint32_t reserved436 : 12;
uint32_t reserved448 : 4;
uint16_t cdhtl : 12; /* Compressed-Dynamic-Huffman Table
Length */
uint8_t reserved464[6];
uint8_t cdht[288]; /* Compressed-Dynamic-Huffman Table */
uint8_t reserved[24];
uint8_t ribm2[8]; /* Reserved for IBM use */
uint8_t csb[1152]; /* Continuation-State Buffer */
} ALIGNED_(8);
/*
Extension of inflate_state and deflate_state.
*/
struct dfltcc_state {
struct dfltcc_param_v0 param; /* Parameter block. */
struct dfltcc_qaf_param af; /* Available functions. */
char msg[64]; /* Buffer for strm->msg */
};
typedef struct {
struct dfltcc_state common;
uint16_t level_mask; /* Levels on which to use DFLTCC */
uint32_t block_size; /* New block each X bytes */
size_t block_threshold; /* New block after total_in > X */
uint32_t dht_threshold; /* New block only if avail_in >= X */
} arch_deflate_state;
typedef struct {
struct dfltcc_state common;
} arch_inflate_state;
/*
History buffer size.
*/
#define HB_BITS 15
#define HB_SIZE (1 << HB_BITS)
/*
Sizes of deflate block parts.
*/
#define DFLTCC_BLOCK_HEADER_BITS 3
#define DFLTCC_HLITS_COUNT_BITS 5
#define DFLTCC_HDISTS_COUNT_BITS 5
#define DFLTCC_HCLENS_COUNT_BITS 4
#define DFLTCC_MAX_HCLENS 19
#define DFLTCC_HCLEN_BITS 3
#define DFLTCC_MAX_HLITS 286
#define DFLTCC_MAX_HDISTS 30
#define DFLTCC_MAX_HLIT_HDIST_BITS 7
#define DFLTCC_MAX_SYMBOL_BITS 16
#define DFLTCC_MAX_EOBS_BITS 15
#define DFLTCC_MAX_PADDING_BITS 7
#define DEFLATE_BOUND_COMPLEN(source_len) \
((DFLTCC_BLOCK_HEADER_BITS + \
DFLTCC_HLITS_COUNT_BITS + \
DFLTCC_HDISTS_COUNT_BITS + \
DFLTCC_HCLENS_COUNT_BITS + \
DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
(DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
(source_len) * DFLTCC_MAX_SYMBOL_BITS + \
DFLTCC_MAX_EOBS_BITS + \
DFLTCC_MAX_PADDING_BITS) >> 3)
#endif
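/* Worked example (added for illustration; not in the original header): with
 * source_len = 1024 the fixed overhead adds up to
 *   3 + 5 + 5 + 4 + 19*3 + (286 + 30)*7 + 15 + 7 = 2308 bits,
 * so DEFLATE_BOUND_COMPLEN(1024) = (2308 + 1024*16) >> 3 = 2336 bytes -
 * roughly two bytes per input byte plus ~289 bytes of header overhead,
 * reflecting the worst case of 16 bits per symbol. */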

View File

@ -0,0 +1,383 @@
/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
/*
Use the following commands to build zlib-ng with DFLTCC compression support:
$ ./configure --with-dfltcc-deflate
or
$ cmake -DWITH_DFLTCC_DEFLATE=1 .
and then
$ make
*/
#include "zbuild.h"
#include "deflate.h"
#include "trees_emit.h"
#include "dfltcc_deflate.h"
#include "dfltcc_detail.h"
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
arch_deflate_state *dfltcc_state = &state->arch;
dfltcc_reset_state(&dfltcc_state->common);
/* Initialize tuning parameters */
dfltcc_state->level_mask = DFLTCC_LEVEL_MASK;
dfltcc_state->block_size = DFLTCC_BLOCK_SIZE;
dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
}
static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
int reproducible) {
deflate_state *state = (deflate_state *)strm->state;
arch_deflate_state *dfltcc_state = &state->arch;
/* Unsupported compression settings */
if ((dfltcc_state->level_mask & (1 << level)) == 0)
return 0;
if (window_bits != HB_BITS)
return 0;
if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY)
return 0;
if (reproducible)
return 0;
/* Unsupported hardware */
if (!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_GDHT) ||
!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_CMPR) ||
!is_bit_set(dfltcc_state->common.af.fmts, DFLTCC_FMT0))
return 0;
return 1;
}
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
return dfltcc_can_deflate_with_params(strm, state->level, state->w_bits, state->strategy, state->reproducible);
}
static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
size_t avail_in = strm->avail_in;
dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL);
}
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
size_t avail_in = strm->avail_in;
size_t avail_out = strm->avail_out;
dfltcc_cc cc;
cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
param, &strm->next_out, &avail_out,
&strm->next_in, &avail_in, state->window);
strm->total_in += (strm->avail_in - avail_in);
strm->total_out += (strm->avail_out - avail_out);
strm->avail_in = avail_in;
strm->avail_out = avail_out;
return cc;
}
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
deflate_state *state = (deflate_state *)strm->state;
send_bits(state, PREFIX(bi_reverse)(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
PREFIX(flush_pending)(strm);
if (state->pending != 0) {
/* The remaining data is located in pending_out[0:pending]. If someone
* calls put_byte() - this might happen in deflate() - the byte will be
* placed into pending_buf[pending], which is incorrect. Move the
* remaining data to the beginning of pending_buf so that put_byte() is
* usable again.
*/
memmove(state->pending_buf, state->pending_out, state->pending);
state->pending_out = state->pending_buf;
}
#ifdef ZLIB_DEBUG
state->compressed_len += param->eobl;
#endif
}
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
deflate_state *state = (deflate_state *)strm->state;
arch_deflate_state *dfltcc_state = &state->arch;
struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
uInt masked_avail_in;
dfltcc_cc cc;
int need_empty_block;
int soft_bcc;
int no_flush;
if (!PREFIX(dfltcc_can_deflate)(strm)) {
/* Clear history. */
if (flush == Z_FULL_FLUSH)
param->hl = 0;
return 0;
}
again:
masked_avail_in = 0;
soft_bcc = 0;
no_flush = flush == Z_NO_FLUSH;
/* No input data. Return, except when Continuation Flag is set, which means
* that DFLTCC has buffered some output in the parameter block and needs to
* be called again in order to flush it.
*/
if (strm->avail_in == 0 && !param->cf) {
/* A block is still open, and the hardware does not support closing
* blocks without adding data. Thus, close it manually.
*/
if (!no_flush && param->bcf) {
send_eobs(strm, param);
param->bcf = 0;
}
/* Let one of deflate_* functions write a trailing empty block. */
if (flush == Z_FINISH)
return 0;
/* Clear history. */
if (flush == Z_FULL_FLUSH)
param->hl = 0;
/* Trigger block post-processing if necessary. */
*result = no_flush ? need_more : block_done;
return 1;
}
/* There is an open non-BFINAL block, we are not going to close it just
* yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
* more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
* DHT in order to adapt to a possibly changed input data distribution.
*/
if (param->bcf && no_flush &&
strm->total_in > dfltcc_state->block_threshold &&
strm->avail_in >= dfltcc_state->dht_threshold) {
if (param->cf) {
/* We need to flush the DFLTCC buffer before writing the
* End-of-block Symbol. Mask the input data and proceed as usual.
*/
masked_avail_in += strm->avail_in;
strm->avail_in = 0;
no_flush = 0;
} else {
/* DFLTCC buffer is empty, so we can manually write the
* End-of-block Symbol right away.
*/
send_eobs(strm, param);
param->bcf = 0;
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
}
}
/* No space for compressed data. If we proceed, dfltcc_cmpr() will return
* DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
* set BCF=1, which is wrong. Avoid complications and return early.
*/
if (strm->avail_out == 0) {
*result = need_more;
return 1;
}
/* The caller gave us too much data. Pass only one block worth of
* uncompressed data to DFLTCC and mask the rest, so that on the next
* iteration we start a new block.
*/
if (no_flush && strm->avail_in > dfltcc_state->block_size) {
masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
strm->avail_in = dfltcc_state->block_size;
}
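    /* Illustrative example (added): with block_size = 1 MiB and
     * avail_in = 3 MiB, 2 MiB are masked away here, so this iteration
     * compresses exactly one block; the masked bytes are restored below
     * ("Unmask the input data") before the next pass through the again:
     * loop. */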
/* When we have an open non-BFINAL deflate block and caller indicates that
* the stream is ending, we need to close an open deflate block and open a
* BFINAL one.
*/
need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
/* Translate stream to parameter block */
param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
if (!no_flush)
/* We need to close a block. Always do this in software - when there is
* no input data, the hardware will not honor BCC. */
soft_bcc = 1;
if (flush == Z_FINISH && !param->bcf)
/* We are about to open a BFINAL block, set Block Header Final bit
* until the stream ends.
*/
param->bhf = 1;
/* DFLTCC-CMPR will write to next_out, so make sure that buffers with
* higher precedence are empty.
*/
Assert(state->pending == 0, "There must be no pending bytes");
Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
param->sbb = (unsigned int)state->bi_valid;
if (param->sbb > 0)
*strm->next_out = (unsigned char)state->bi_buf;
/* Honor history and check value */
param->nt = 0;
if (state->wrap == 1)
param->cv = strm->adler;
else if (state->wrap == 2)
param->cv = ZSWAP32(state->crc_fold.value);
/* When opening a block, choose a Huffman-Table Type */
if (!param->bcf) {
if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
param->htt = HTT_FIXED;
else {
param->htt = HTT_DYNAMIC;
dfltcc_gdht(strm);
}
}
/* Deflate */
do {
cc = dfltcc_cmpr(strm);
if (strm->avail_in < 4096 && masked_avail_in > 0)
/* We are about to call DFLTCC with a small input buffer, which is
* inefficient. Since there is masked data, there will be at least
* one more DFLTCC call, so skip the current one and make the next
* one handle more data.
*/
break;
} while (cc == DFLTCC_CC_AGAIN);
/* Translate parameter block to stream */
strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
state->bi_valid = param->sbb;
if (state->bi_valid == 0)
state->bi_buf = 0; /* Avoid accessing next_out */
else
state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
if (state->wrap == 1)
strm->adler = param->cv;
else if (state->wrap == 2)
state->crc_fold.value = ZSWAP32(param->cv);
/* Unmask the input data */
strm->avail_in += masked_avail_in;
masked_avail_in = 0;
/* If we encounter an error, it means there is a bug in DFLTCC call */
Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
/* Update Block-Continuation Flag. It will be used to check whether to call
* GDHT the next time.
*/
if (cc == DFLTCC_CC_OK) {
if (soft_bcc) {
send_eobs(strm, param);
param->bcf = 0;
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
} else
param->bcf = 1;
if (flush == Z_FINISH) {
if (need_empty_block)
/* Make the current deflate() call also close the stream */
return 0;
else {
bi_windup(state);
*result = finish_done;
}
} else {
if (flush == Z_FULL_FLUSH)
param->hl = 0; /* Clear history */
*result = flush == Z_NO_FLUSH ? need_more : block_done;
}
} else {
param->bcf = 1;
*result = need_more;
}
if (strm->avail_in != 0 && strm->avail_out != 0)
goto again; /* deflate() must use all input or all output */
return 1;
}
/*
Switching between hardware and software compression.
DFLTCC does not support all zlib settings, e.g. generation of non-compressed
blocks or alternative window sizes. When such settings are applied on the
fly with deflateParams, we need to convert between hardware and software
window formats.
*/
static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
return strm->total_in > 0 || param->nt == 0 || param->hl > 0;
}
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
deflate_state *state = (deflate_state *)strm->state;
int could_deflate = PREFIX(dfltcc_can_deflate)(strm);
int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible);
if (can_deflate == could_deflate)
/* We continue to work in the same mode - no changes needed */
return Z_OK;
if (!dfltcc_was_deflate_used(strm))
/* DFLTCC was not used yet - no changes needed */
return Z_OK;
/* For now, do not convert between window formats - simply get rid of the old data instead */
*flush = Z_FULL_FLUSH;
return Z_OK;
}
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
/* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might
* close the block without resetting the compression state. Detect this
* situation and return that deflation is not done.
*/
if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
return 0;
/* Return that deflation is not done if DFLTCC is used and either it
* buffered some data (Continuation Flag is set), or has not written EOBS
* yet (Block-Continuation Flag is set).
*/
return !PREFIX(dfltcc_can_deflate)(strm) || (!param->cf && !param->bcf);
}
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
deflate_state *state = (deflate_state *)strm->state;
return reproducible != state->reproducible && !dfltcc_was_deflate_used(strm);
}
/*
Preloading history.
*/
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
append_history(param, state->window, dictionary, dict_length);
state->strstart = 1; /* Add FDICT to zlib header */
state->block_start = state->strstart; /* Make deflate_stored happy */
return Z_OK;
}
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
if (dictionary)
get_history(param, state->window, dictionary);
if (dict_length)
*dict_length = param->hl;
return Z_OK;
}

View File

@ -0,0 +1,58 @@
#ifndef DFLTCC_DEFLATE_H
#define DFLTCC_DEFLATE_H
#include "deflate.h"
#include "dfltcc_common.h"
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length);
int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_deflate)((strm))) \
return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_deflate)((strm))) \
return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
do { \
int err; \
\
err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
if (err == Z_STREAM_ERROR) \
return err; \
} while (0)
#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
do { \
if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
(complen) = DEFLATE_BOUND_COMPLEN(source_len); \
} while (0)
#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
#endif

View File

@ -0,0 +1,275 @@
#include "zbuild.h"
#include <stdio.h>
#ifdef HAVE_SYS_SDT_H
#include <sys/sdt.h>
#endif
/*
Tuning parameters.
*/
#ifndef DFLTCC_LEVEL_MASK
#define DFLTCC_LEVEL_MASK 0x2
#endif
#ifndef DFLTCC_BLOCK_SIZE
#define DFLTCC_BLOCK_SIZE 1048576
#endif
#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
#endif
#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
#endif
#ifndef DFLTCC_RIBM
#define DFLTCC_RIBM 0
#endif
#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
#define DFLTCC_SIZEOF_QAF 32
static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
static inline int is_bit_set(const char *bits, int n) {
return bits[n / 8] & (1 << (7 - (n % 8)));
}
static inline void clear_bit(char *bits, int n) {
bits[n / 8] &= ~(1 << (7 - (n % 8)));
}
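/* Worked example (added; not in the original source): the facility list uses
 * IBM's MSB-first bit numbering, so for n = 151 (the DFLTCC facility)
 * is_bit_set() evaluates
 *   bits[151 / 8] & (1 << (7 - 151 % 8))  ==  bits[18] & 0x01,
 * i.e. the least significant bit of the 19th byte returned by STFLE. */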
#define DFLTCC_FACILITY 151
static inline int is_dfltcc_enabled(void) {
uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
Z_REGISTER uint8_t r0 __asm__("r0");
memset(facilities, 0, sizeof(facilities));
r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
/* STFLE is supported since z9-109 and only in z/Architecture mode. When
* compiling with -m31, gcc defaults to ESA mode, however, since the kernel
* is 64-bit, it's always z/Architecture mode at runtime.
*/
__asm__ volatile(
#ifndef __clang__
".machinemode push\n"
".machinemode zarch\n"
#endif
"stfle %[facilities]\n"
#ifndef __clang__
".machinemode pop\n"
#endif
: [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
}
#define DFLTCC_FMT0 0
#define CVT_CRC32 0
#define CVT_ADLER32 1
#define HTT_FIXED 0
#define HTT_DYNAMIC 1
#define DFLTCC_SIZEOF_GDHT_V0 384
#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
static inline z_const char *oesc_msg(char *buf, int oesc) {
if (oesc == 0x00)
return NULL; /* Successful completion */
else {
sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
return buf;
}
}
/*
C wrapper for the DEFLATE CONVERSION CALL instruction.
*/
typedef enum {
DFLTCC_CC_OK = 0,
DFLTCC_CC_OP1_TOO_SHORT = 1,
DFLTCC_CC_OP2_TOO_SHORT = 2,
DFLTCC_CC_OP2_CORRUPT = 2,
DFLTCC_CC_AGAIN = 3,
} dfltcc_cc;
#define DFLTCC_QAF 0
#define DFLTCC_GDHT 1
#define DFLTCC_CMPR 2
#define DFLTCC_XPND 4
#define HBT_CIRCULAR (1 << 7)
#define DFLTCC_FN_MASK ((1 << 7) - 1)
/* Return lengths of high (starting at param->ho) and low (starting at 0) fragments of the circular history buffer. */
static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) {
*hl_high = MIN(param->hl, HB_SIZE - param->ho);
*hl_low = param->hl - *hl_high;
}
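/* Worked example (added for illustration): with HB_SIZE = 0x8000,
 * param->ho = 0x7F00 and param->hl = 0x8000 the history wraps, so
 *   hl_high = MIN(0x8000, 0x8000 - 0x7F00) = 0x0100 bytes at hist + 0x7F00,
 *   hl_low  = 0x8000 - 0x0100             = 0x7F00 bytes at hist + 0,
 * and the two accesses in the helpers below cover the buffer exactly once. */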
/* Notify instrumentation about an upcoming read/write access to the circular history buffer. */
static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) {
size_t hl_high, hl_low;
get_history_lengths(param, &hl_high, &hl_low);
instrument_read_write(hist + param->ho, hl_high);
instrument_read_write(hist, hl_low);
}
/* Notify MSan about a completed write to the circular history buffer. */
static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) {
size_t hl_high, hl_low;
get_history_lengths(param, &hl_high, &hl_low);
__msan_unpoison(hist + param->ho, hl_high);
__msan_unpoison(hist, hl_low);
}
static inline dfltcc_cc dfltcc(int fn, void *param,
unsigned char **op1, size_t *len1,
z_const unsigned char **op2, size_t *len2, void *hist) {
unsigned char *t2 = op1 ? *op1 : NULL;
unsigned char *orig_t2 = t2;
size_t t3 = len1 ? *len1 : 0;
z_const unsigned char *t4 = op2 ? *op2 : NULL;
size_t t5 = len2 ? *len2 : 0;
Z_REGISTER int r0 __asm__("r0");
Z_REGISTER void *r1 __asm__("r1");
Z_REGISTER unsigned char *r2 __asm__("r2");
Z_REGISTER size_t r3 __asm__("r3");
Z_REGISTER z_const unsigned char *r4 __asm__("r4");
Z_REGISTER size_t r5 __asm__("r5");
int cc;
/* Insert pre-instrumentation for DFLTCC. */
switch (fn & DFLTCC_FN_MASK) {
case DFLTCC_QAF:
instrument_write(param, DFLTCC_SIZEOF_QAF);
break;
case DFLTCC_GDHT:
instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0);
instrument_read(t4, t5);
break;
case DFLTCC_CMPR:
case DFLTCC_XPND:
instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
instrument_read(t4, t5);
instrument_write(t2, t3);
instrument_read_write_hist(param, hist);
break;
}
r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5;
__asm__ volatile(
#ifdef HAVE_SYS_SDT_H
STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
#endif
".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
#ifdef HAVE_SYS_SDT_H
STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
#endif
"ipm %[cc]\n"
: [r2] "+r" (r2)
, [r3] "+r" (r3)
, [r4] "+r" (r4)
, [r5] "+r" (r5)
, [cc] "=r" (cc)
: [r0] "r" (r0)
, [r1] "r" (r1)
, [hist] "r" (hist)
#ifdef HAVE_SYS_SDT_H
, STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
#endif
: "cc", "memory");
t2 = r2; t3 = r3; t4 = r4; t5 = r5;
/* Insert post-instrumentation for DFLTCC. */
switch (fn & DFLTCC_FN_MASK) {
case DFLTCC_QAF:
__msan_unpoison(param, DFLTCC_SIZEOF_QAF);
break;
case DFLTCC_GDHT:
__msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
break;
case DFLTCC_CMPR:
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
__msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
msan_unpoison_hist(param, hist);
break;
case DFLTCC_XPND:
__msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
__msan_unpoison(orig_t2, t2 - orig_t2);
msan_unpoison_hist(param, hist);
break;
}
if (op1)
*op1 = t2;
if (len1)
*len1 = t3;
if (op2)
*op2 = t4;
if (len2)
*len2 = t5;
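    /* Added note: IPM places the PSW condition code in bits 2-3 of the result
     * register (IBM's MSB-first numbering), i.e. bits 29-28 counting from the
     * LSB, so the shift by 28 and mask with 3 below recover DFLTCC's cc. */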
return (cc >> 28) & 3;
}
#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
/* Initialize available functions */
if (is_dfltcc_enabled()) {
dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL);
memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af));
} else
memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
/* Initialize parameter block */
memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
dfltcc_state->param.nt = 1;
dfltcc_state->param.ribm = DFLTCC_RIBM;
}
static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
memcpy(dst, src, ALIGN_UP(size, 8) + extension_size);
}
static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
const unsigned char *buf, uInt count) {
size_t offset;
size_t n;
/* Do not use more than 32K */
if (count > HB_SIZE) {
buf += count - HB_SIZE;
count = HB_SIZE;
}
offset = (param->ho + param->hl) % HB_SIZE;
if (offset + count <= HB_SIZE)
/* Circular history buffer does not wrap - copy one chunk */
memcpy(history + offset, buf, count);
else {
/* Circular history buffer wraps - copy two chunks */
n = HB_SIZE - offset;
memcpy(history + offset, buf, n);
memcpy(history, buf + n, count - n);
}
n = param->hl + count;
if (n <= HB_SIZE)
/* All history fits into buffer - no need to discard anything */
param->hl = n;
else {
/* History does not fit into buffer - discard extra bytes */
param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE;
param->hl = HB_SIZE;
}
}
static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
unsigned char *buf) {
size_t hl_high, hl_low;
get_history_lengths(param, &hl_high, &hl_low);
memcpy(buf, history + param->ho, hl_high);
memcpy(buf + hl_high, history, hl_low);
}
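/* Usage sketch (added; illustrative only): after
 *   append_history(param, window, dict, n);
 * a matching
 *   get_history(param, window, buf);
 * writes the most recent MIN(n, HB_SIZE) dictionary bytes to buf in their
 * original order, regardless of where the circular buffer wrapped. */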

View File

@ -0,0 +1,191 @@
/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */
/*
Use the following commands to build zlib-ng with DFLTCC decompression support:
$ ./configure --with-dfltcc-inflate
or
$ cmake -DWITH_DFLTCC_INFLATE=1 .
and then
$ make
*/
#include "zbuild.h"
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "dfltcc_inflate.h"
#include "dfltcc_detail.h"
void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
dfltcc_reset_state(&state->arch.common);
}
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = &state->arch.common;
/* Unsupported hardware */
return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
}
static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
size_t avail_in = strm->avail_in;
size_t avail_out = strm->avail_out;
dfltcc_cc cc;
cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
param, &strm->next_out, &avail_out,
&strm->next_in, &avail_in, state->window);
strm->avail_in = avail_in;
strm->avail_out = avail_out;
return cc;
}
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = &state->arch.common;
struct dfltcc_param_v0 *param = &dfltcc_state->param;
dfltcc_cc cc;
if (flush == Z_BLOCK || flush == Z_TREES) {
/* DFLTCC does not support stopping on block boundaries */
if (PREFIX(dfltcc_inflate_disable)(strm)) {
*ret = Z_STREAM_ERROR;
return DFLTCC_INFLATE_BREAK;
} else
return DFLTCC_INFLATE_SOFTWARE;
}
if (state->last) {
if (state->bits != 0) {
strm->next_in++;
strm->avail_in--;
state->bits = 0;
}
state->mode = CHECK;
return DFLTCC_INFLATE_CONTINUE;
}
if (strm->avail_in == 0 && !param->cf)
return DFLTCC_INFLATE_BREAK;
/* if window not in use yet, initialize */
if (state->wsize == 0)
state->wsize = 1U << state->wbits;
/* Translate stream to parameter block */
param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
param->sbb = state->bits;
if (param->hl)
param->nt = 0; /* Honor history for the first block */
if (state->wrap & 4)
param->cv = state->flags ? ZSWAP32(state->check) : state->check;
/* Inflate */
do {
cc = dfltcc_xpnd(strm);
} while (cc == DFLTCC_CC_AGAIN);
/* Translate parameter block to stream */
strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
state->last = cc == DFLTCC_CC_OK;
state->bits = param->sbb;
if (state->wrap & 4)
strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
/* Report an error if stream is corrupted */
state->mode = BAD;
return DFLTCC_INFLATE_CONTINUE;
}
state->mode = TYPEDO;
/* Break if operands are exhausted, otherwise continue looping */
return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
}
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
return !state->arch.common.param.nt;
}
/*
Rotates a circular buffer.
The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
*/
static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
unsigned char *p = pivot;
unsigned char tmp;
while (p != start) {
tmp = *start;
*start = *p;
*p = tmp;
start++;
p++;
if (p == end)
p = pivot;
else if (start == pivot)
pivot = p;
}
}
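/* Example (added for illustration): rotating "abcdef" around the pivot 'c',
 *   unsigned char buf[] = "abcdef";
 *   rotate(buf, buf + 2, buf + 6);
 * leaves buf as "cdefab": the [pivot, end) range moves to the front, which is
 * exactly what converting the wrapped hardware window to the linear software
 * format requires. */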
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = &state->arch.common;
struct dfltcc_param_v0 *param = &dfltcc_state->param;
if (!PREFIX(dfltcc_can_inflate)(strm))
return 0;
if (PREFIX(dfltcc_was_inflate_used)(strm))
/* DFLTCC has already decompressed some data. Since there is not
* enough information to resume decompression in software, the call
* must fail.
*/
return 1;
/* DFLTCC was not used yet - decompress in software */
memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
/* Convert the window from the hardware to the software format */
rotate(state->window, state->window + param->ho, state->window + HB_SIZE);
state->whave = state->wnext = MIN(param->hl, state->wsize);
return 0;
}
/*
Preloading history.
*/
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
/* if window not in use yet, initialize */
if (state->wsize == 0)
state->wsize = 1U << state->wbits;
append_history(param, state->window, dictionary, dict_length);
state->havedict = 1;
return Z_OK;
}
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
unsigned char *dictionary, uInt *dict_length) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_param_v0 *param = &state->arch.common.param;
if (dictionary && state->window)
get_history(param, state->window, dictionary);
if (dict_length)
*dict_length = param->hl;
return Z_OK;
}

View File

@ -0,0 +1,67 @@
#ifndef DFLTCC_INFLATE_H
#define DFLTCC_INFLATE_H
#include "dfltcc_common.h"
void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
typedef enum {
DFLTCC_INFLATE_CONTINUE,
DFLTCC_INFLATE_BREAK,
DFLTCC_INFLATE_SOFTWARE,
} dfltcc_inflate_action;
dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length);
int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
unsigned char *dictionary, uInt* dict_length);
#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)
#define INFLATE_PRIME_HOOK(strm, bits, value) \
do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)
#define INFLATE_TYPEDO_HOOK(strm, flush) \
if (PREFIX(dfltcc_can_inflate)((strm))) { \
dfltcc_inflate_action action; \
\
RESTORE(); \
action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
LOAD(); \
if (action == DFLTCC_INFLATE_CONTINUE) \
break; \
else if (action == DFLTCC_INFLATE_BREAK) \
goto inf_leave; \
}
#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
#define INFLATE_MARK_HOOK(strm) \
do { \
if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
} while (0)
#define INFLATE_SYNC_POINT_HOOK(strm) \
do { \
if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
} while (0)
#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_inflate)((strm))) \
return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
if (PREFIX(dfltcc_can_inflate)((strm))) \
return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
} while (0)
#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
#endif

View File

@ -0,0 +1,14 @@
#include "zbuild.h"
#include "s390_features.h"
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifndef HWCAP_S390_VXRS
#define HWCAP_S390_VXRS HWCAP_S390_VX
#endif
void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
}

View File

@ -0,0 +1,14 @@
/* s390_features.h -- check for s390 features.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef S390_FEATURES_H_
#define S390_FEATURES_H_
struct s390_cpu_features {
int has_vx;
};
void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
#endif

View File

@ -0,0 +1,20 @@
/* s390_functions.h -- s390 implementations for arch-specific functions.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef S390_FUNCTIONS_H_
#define S390_FUNCTIONS_H_
#ifdef S390_CRC32_VX
uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
# if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
# undef native_crc32
# define native_crc32 crc32_s390_vx
# endif
#endif
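/* Added note: when runtime CPU detection is disabled, the functable
 * indirection is bypassed and native_crc32 resolves at compile time to the
 * vector implementation, which the z13 (__ARCH__ >= 11, __VX__) target
 * flags guarantee is available. */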
#endif

View File

@ -0,0 +1,47 @@
# Self-Hosted IBM Z Github Actions Runner.
FROM almalinux:9
RUN dnf update -y -q && \
dnf install -y -q --enablerepo=crb wget git which sudo jq \
cmake make automake autoconf m4 libtool ninja-build python3-pip \
gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \
glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel
RUN dnf install -y -q dotnet-sdk-6.0 && \
echo "Using SDK - `dotnet --version`"
COPY runner-s390x.patch /tmp/runner.patch
COPY runner-global.json /tmp/global.json
RUN cd /tmp && \
git clone -q https://github.com/actions/runner && \
cd runner && \
git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -b build && \
git apply /tmp/runner.patch && \
cp -f /tmp/global.json src/global.json
RUN cd /tmp/runner/src && \
./dev.sh layout && \
./dev.sh package && \
rm -rf /root/.dotnet /root/.nuget
RUN useradd -c "Action Runner" -m actions-runner && \
usermod -L actions-runner
RUN tar -xf /tmp/runner/_package/*.tar.gz -C /home/actions-runner && \
chown -R actions-runner:actions-runner /home/actions-runner
#VOLUME /home/actions-runner
RUN rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \
dnf clean all
USER actions-runner
# Scripts.
COPY fs/ /
WORKDIR /home/actions-runner
ENTRYPOINT ["/usr/bin/entrypoint"]
CMD ["/usr/bin/actions-runner"]

View File

@ -0,0 +1,18 @@
[Unit]
Description=Podman container: Gaplib Github Actions Runner
Wants=network-online.target
After=network-online.target
StartLimitIntervalSec=1
RequiresMountsFor=/run/user/1001/containers
[Service]
Environment=PODMAN_SYSTEMD_UNIT=%n
Restart=always
TimeoutStopSec=61
ExecStart=/usr/bin/podman start gaplib-actions-runner
ExecStop=/usr/bin/podman stop -t 1 gaplib-actions-runner
ExecStopPost=/usr/bin/podman stop -t 1 gaplib-actions-runner
Type=forking
[Install]
WantedBy=default.target

View File

@ -0,0 +1,5 @@
{
"sdk": {
"version": "6.0.421"
}
}

View File

@ -0,0 +1,243 @@
diff --git a/src/Directory.Build.props b/src/Directory.Build.props
index 9db5fac..f02e235 100644
--- a/src/Directory.Build.props
+++ b/src/Directory.Build.props
@@ -44,6 +44,9 @@
<PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-arm64'">
<DefineConstants>$(DefineConstants);ARM64</DefineConstants>
</PropertyGroup>
+ <PropertyGroup Condition="'$(BUILD_OS)' == 'Linux' AND '$(PackageRuntime)' == 'linux-s390x'">
+ <DefineConstants>$(DefineConstants);S390X</DefineConstants>
+ </PropertyGroup>
<!-- Set TRACE/DEBUG vars -->
<PropertyGroup>
diff --git a/src/Misc/externals.sh b/src/Misc/externals.sh
index 383221e..1555f67 100755
--- a/src/Misc/externals.sh
+++ b/src/Misc/externals.sh
@@ -189,3 +189,8 @@ if [[ "$PACKAGERUNTIME" == "linux-arm" ]]; then
acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-armv7l.tar.gz" node16 fix_nested_dir
acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-armv7l.tar.gz" node20 fix_nested_dir
fi
+
+if [[ "$PACKAGERUNTIME" == "linux-s390x" ]]; then
+ acquireExternalTool "$NODE_URL/v${NODE16_VERSION}/node-v${NODE16_VERSION}-linux-s390x.tar.gz" node16 fix_nested_dir
+ acquireExternalTool "$NODE_URL/v${NODE20_VERSION}/node-v${NODE20_VERSION}-linux-s390x.tar.gz" node20 fix_nested_dir
+fi
diff --git a/src/Misc/layoutroot/config.sh b/src/Misc/layoutroot/config.sh
index 14cc6ba..9b5b8e6 100755
--- a/src/Misc/layoutroot/config.sh
+++ b/src/Misc/layoutroot/config.sh
@@ -20,25 +20,29 @@ then
message="Execute sudo ./bin/installdependencies.sh to install any missing Dotnet Core 6.0 dependencies."
- ldd ./bin/libcoreclr.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
- fi
+ ARCH=`uname -m`
+ if [ "${ARCH}" != "s390x" -a "${ARCH}" != "ppc64le" ]
+ then
+ ldd ./bin/libcoreclr.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi
- ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
- fi
+ ldd ./bin/libSystem.Security.Cryptography.Native.OpenSsl.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi
- ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
- if [ $? -eq 0 ]; then
- echo "Dependencies is missing for Dotnet Core 6.0"
- echo $message
- exit 1
+ ldd ./bin/libSystem.IO.Compression.Native.so | grep 'not found'
+ if [ $? -eq 0 ]; then
+ echo "Dependencies is missing for Dotnet Core 6.0"
+ echo $message
+ exit 1
+ fi
fi
if ! [ -x "$(command -v ldconfig)" ]; then
diff --git a/src/Runner.Common/Constants.cs b/src/Runner.Common/Constants.cs
index 177e3c9..9545981 100644
--- a/src/Runner.Common/Constants.cs
+++ b/src/Runner.Common/Constants.cs
@@ -58,7 +58,8 @@ namespace GitHub.Runner.Common
X86,
X64,
Arm,
- Arm64
+ Arm64,
+ S390x
}
public static class Runner
@@ -81,6 +82,8 @@ namespace GitHub.Runner.Common
public static readonly Architecture PlatformArchitecture = Architecture.Arm;
#elif ARM64
public static readonly Architecture PlatformArchitecture = Architecture.Arm64;
+#elif S390X
+ public static readonly Architecture PlatformArchitecture = Architecture.S390x;
#else
public static readonly Architecture PlatformArchitecture = Architecture.X64;
#endif
diff --git a/src/Runner.Common/Util/VarUtil.cs b/src/Runner.Common/Util/VarUtil.cs
index 97273a1..2a34430 100644
--- a/src/Runner.Common/Util/VarUtil.cs
+++ b/src/Runner.Common/Util/VarUtil.cs
@@ -53,6 +53,8 @@ namespace GitHub.Runner.Common.Util
return "ARM";
case Constants.Architecture.Arm64:
return "ARM64";
+ case Constants.Architecture.S390x:
+ return "S390X";
default:
throw new NotSupportedException(); // Should never reach here.
}
diff --git a/src/Test/L0/ConstantGenerationL0.cs b/src/Test/L0/ConstantGenerationL0.cs
index 2042485..a9d8b46 100644
--- a/src/Test/L0/ConstantGenerationL0.cs
+++ b/src/Test/L0/ConstantGenerationL0.cs
@@ -20,6 +20,7 @@ namespace GitHub.Runner.Common.Tests
"linux-x64",
"linux-arm",
"linux-arm64",
+ "linux-s390x",
"osx-x64",
"osx-arm64"
};
diff --git a/src/Test/L0/Listener/SelfUpdaterL0.cs b/src/Test/L0/Listener/SelfUpdaterL0.cs
index 26ba65e..6791df3 100644
--- a/src/Test/L0/Listener/SelfUpdaterL0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterL0.cs
@@ -1,4 +1,4 @@
-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
using System;
using System.Collections.Generic;
using System.IO;
@@ -16,6 +16,7 @@ using Xunit;
namespace GitHub.Runner.Common.Tests.Listener
{
+#if !S390X // Self-update is not currently supported on S390X
public sealed class SelfUpdaterL0
{
private Mock<IRunnerServer> _runnerServer;
@@ -291,5 +292,6 @@ namespace GitHub.Runner.Common.Tests.Listener
}
}
}
+#endif
}
#endif
diff --git a/src/Test/L0/Listener/SelfUpdaterV2L0.cs b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
index 5115a6b..dd8d198 100644
--- a/src/Test/L0/Listener/SelfUpdaterV2L0.cs
+++ b/src/Test/L0/Listener/SelfUpdaterV2L0.cs
@@ -1,4 +1,4 @@
-#if !(OS_WINDOWS && ARM64)
+#if !(OS_WINDOWS && ARM64) && !S390X
using System;
using System.Collections.Generic;
using System.IO;
diff --git a/src/Test/L0/Worker/StepHostL0.cs b/src/Test/L0/Worker/StepHostL0.cs
index f6b5889..26f8e21 100644
--- a/src/Test/L0/Worker/StepHostL0.cs
+++ b/src/Test/L0/Worker/StepHostL0.cs
@@ -31,7 +31,7 @@ namespace GitHub.Runner.Common.Tests.Worker
return hc;
}
-#if OS_LINUX
+#if OS_LINUX && !S390X
[Fact]
[Trait("Level", "L0")]
[Trait("Category", "Worker")]
diff --git a/src/dev.sh b/src/dev.sh
index fa637d1..8c66f37 100755
--- a/src/dev.sh
+++ b/src/dev.sh
@@ -54,6 +54,7 @@ elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
case $CPU_NAME in
armv7l) RUNTIME_ID="linux-arm";;
aarch64) RUNTIME_ID="linux-arm64";;
+ s390x) RUNTIME_ID="linux-s390x";;
esac
fi
elif [[ "$CURRENT_PLATFORM" == 'darwin' ]]; then
@@ -80,7 +81,7 @@ if [[ "$CURRENT_PLATFORM" == 'windows' ]]; then
exit 1
fi
elif [[ "$CURRENT_PLATFORM" == 'linux' ]]; then
- if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') ]]; then
+ if [[ ("$RUNTIME_ID" != 'linux-x64') && ("$RUNTIME_ID" != 'linux-x86') && ("$RUNTIME_ID" != 'linux-arm64') && ("$RUNTIME_ID" != 'linux-arm') && ("$RUNTIME_ID" != 'linux-s390x') ]]; then
echo "Failed: Can't build $RUNTIME_ID package $CURRENT_PLATFORM" >&2
exit 1
fi
@@ -199,7 +200,8 @@ function package ()
popd > /dev/null
}
-if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet") ]]; then
+if [[ "${RUNTIME_ID}" != "linux-s390x" && ((! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}") || (! -e "${DOTNETSDK_INSTALLDIR}/dotnet")) ]]; then
+
# Download dotnet SDK to ../_dotnetsdk directory
heading "Ensure Dotnet SDK"
@@ -224,8 +226,10 @@ if [[ (! -d "${DOTNETSDK_INSTALLDIR}") || (! -e "${DOTNETSDK_INSTALLDIR}/.${DOTN
echo "${DOTNETSDK_VERSION}" > "${DOTNETSDK_INSTALLDIR}/.${DOTNETSDK_VERSION}"
fi
-echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
-export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+if [[ -d "${DOTNETSDK_INSTALLDIR}" ]]; then
+ echo "Prepend ${DOTNETSDK_INSTALLDIR} to %PATH%"
+ export PATH=${DOTNETSDK_INSTALLDIR}:$PATH
+fi
heading "Dotnet SDK Version"
dotnet --version
diff --git a/src/dir.proj b/src/dir.proj
index 056a312..8370922 100644
--- a/src/dir.proj
+++ b/src/dir.proj
@@ -41,8 +41,18 @@
</ItemGroup>
<Target Name="Build" DependsOnTargets="GenerateConstant">
- <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" />
- <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);RuntimeIdentifier=$(PackageRuntime);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
+ <PropertyGroup>
+ <!-- Normally we want to publish a self-contained app for $(PackageRuntime) -->
+ <PublishRuntimeIdentifier>RuntimeIdentifier=$(PackageRuntime)</PublishRuntimeIdentifier>
+ <!-- However, on s390x there are no apphost or runtime packages on nuget.org, so self-contained publishing is not supported.
+ Perform a non-self-contained publish using the current runtime identifier (normally something like rhel.8-s390x) instead.
+ In addition, when not using an explicit runtime identifier, the SDK will copy runtime assets from dependent packages;
+ as this would confuse the expected layout, disable that behavior as well. -->
+ <PublishRuntimeIdentifier Condition="'$(PackageRuntime)' == 'linux-s390x'">SelfContained=false;CopyLocalRuntimeTargetAssets=false</PublishRuntimeIdentifier>
+ </PropertyGroup>
+
+ <MSBuild Targets="Restore" Projects="@(ProjectFiles)" StopOnFirstFailure="true" Properties="$(PublishRuntimeIdentifier)" />
+ <MSBuild Targets="Publish" Projects="@(ProjectFiles)" BuildInParallel="false" StopOnFirstFailure="true" Properties="Configuration=$(BUILDCONFIG);PackageRuntime=$(PackageRuntime);Version=$(RunnerVersion);$(PublishRuntimeIdentifier);PublishDir=$(MSBuildProjectDirectory)/../_layout/bin" />
<Exec Command="%22$(DesktopMSBuild)%22 Runner.Service/Windows/RunnerService.csproj /p:Configuration=$(BUILDCONFIG) /p:PackageRuntime=$(PackageRuntime) /p:OutputPath=%22$(MSBuildProjectDirectory)/../_layout/bin%22" ConsoleToMSBuild="true" Condition="'$(PackageRuntime)' == 'win-x64' Or '$(PackageRuntime)' == 'win-x86' Or '$(PackageRuntime)' == 'win-arm64'" />
</Target>

View File

@ -35,7 +35,6 @@ all: \
chunkset_ssse3.o chunkset_ssse3.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
insert_string_sse42.o insert_string_sse42.lo \
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
slide_hash_avx2.o slide_hash_avx2.lo \
@ -77,12 +76,6 @@ compare256_sse2.o:
compare256_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
insert_string_sse42.o:
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
insert_string_sse42.lo:
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
crc32_pclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
@ -90,10 +83,10 @@ crc32_pclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
crc32_vpclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
crc32_vpclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
slide_hash_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c

View File

@ -9,24 +9,15 @@
#ifdef X86_AVX2
#include "../../zbuild.h"
#include "zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "../../adler32_p.h"
#include "adler32_p.h"
#include "adler32_avx2_p.h"
#include "x86_intrins.h"
#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
@ -44,9 +35,9 @@ rem_peel:
}
} else if (len < 32) {
if (COPY) {
return copy_sub32(adler, dst, src, len);
return adler32_fold_copy_sse42(adler, dst, src, len);
} else {
return sub32(adler, src, len);
return adler32_ssse3(adler, src, len);
}
}

View File

@ -8,10 +8,9 @@
#ifdef X86_AVX512
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "../../cpu_features.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
@ -33,13 +32,7 @@ rem_peel:
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
}
#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}
__m512i vbuf, vs1_0, vs3;

View File

@ -9,11 +9,10 @@
#ifdef X86_AVX512VNNI
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "arch_functions.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "x86_intrins.h"
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
@ -28,20 +27,10 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size
rem_peel:
if (len < 32)
#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
if (len < 64)
#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
#elif defined(X86_SSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
@ -135,11 +124,7 @@ rem_peel_copy:
__m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
_mm256_mask_storeu_epi8(dst, storemask, copy_vec);
#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
#else
return adler32_len_16(adler0, src, len, adler1);
#endif
}
const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,

View File

@ -6,9 +6,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../adler32_fold.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"
#include <immintrin.h>

View File

@ -6,8 +6,8 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "zbuild.h"
#include "adler32_p.h"
#include "adler32_ssse3_p.h"
#ifdef X86_SSSE3

View File

@ -4,10 +4,7 @@
#include "zbuild.h"
/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
* code size by sharing the chunkcopy functions, which will certainly compile
* to identical machine code */
#if defined(X86_SSSE3) && defined(X86_SSE2)
#if defined(X86_SSSE3)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
@ -19,8 +16,6 @@ typedef __m128i chunk_t;
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL
static const lut_rem_pair perm_idx_lut[13] = {
{0, 1}, /* 3 */
@ -83,14 +78,11 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
return ret_vec;
}
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
#define CHUNKSIZE chunksize_ssse3
#define CHUNKMEMSET chunkmemset_ssse3
#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKUNROLL chunkunroll_sse2
#define CHUNKCOPY chunkcopy_ssse3
#define CHUNKUNROLL chunkunroll_ssse3
#include "chunkset_tpl.h"

View File

@ -3,8 +3,9 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)

View File

@ -3,8 +3,9 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

View File

@ -26,27 +26,26 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i xmm_crc_part = _mm_setzero_si128();
#ifdef COPY
char ALIGNED_(16) partial_buf[16] = { 0 };
#else
#ifndef COPY
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
int32_t first = init_crc != 0;
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
* bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
* by definition can be up to 15 bytes + one full vector load. */
assert(len >= 31 || first == 0);
/* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
* for the aligning load that occurs. If there's an initial CRC, to carry it forward through
* the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
* up to 15 bytes + one full vector load. */
assert(len >= 16 || first == 0);
#endif
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
if (len < 16) {
#ifdef COPY
if (len == 0)
return;
memcpy(partial_buf, src, len);
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
#ifdef COPY
memcpy(dst, partial_buf, len);
#endif
goto partial;
@ -63,9 +62,23 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
if (algn_diff < 4 && init_crc != 0) {
xmm_t0 = xmm_crc_part;
if (len >= 32) {
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
} else {
memcpy(partial_buf, src + 16, len - 16);
xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
src += 16;
len -= 16;
#ifdef COPY
dst -= algn_diff;
#endif
goto partial;
}
src += 16;
len -= 16;
}

View File

@ -17,7 +17,7 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include <immintrin.h>
#include <wmmintrin.h>
@ -26,8 +26,9 @@
# include <immintrin.h>
#endif
#include "../../crc32_fold.h"
#include "../../crc32_braid_p.h"
#include "crc32.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "x86_intrins.h"
#include <assert.h>
@ -350,11 +351,22 @@ Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
return crc->value;
}
static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) {
uint32_t c = (~crc) & 0xffffffff;
while (len) {
len--;
DO1;
}
return c ^ 0xffffffff;
}
Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
/* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
* these short lengths might also prove to be effective */
if (len < 64)
return PREFIX(crc32_braid)(crc32, buf, len);
/* For lens smaller than ~12, crc32_small method is faster.
* But there are also minimum requirements for the pclmul functions due to alignment */
if (len < 16)
return crc32_small(crc32, buf, len);
crc32_fold ALIGNED_(16) crc_state;
CRC32_FOLD_RESET(&crc_state);

View File

@ -3,7 +3,7 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
#ifdef X86_VPCLMULQDQ_CRC
#define X86_VPCLMULQDQ
#define CRC32_FOLD_COPY crc32_fold_vpclmulqdq_copy

View File

@ -1,24 +0,0 @@
/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#ifdef X86_SSE42
#include "../../zbuild.h"
#include <nmmintrin.h>
#include "../../deflate.h"
#define HASH_CALC(s, h, val)\
h = _mm_crc32_u32(h, val)
#define HASH_CALC_VAR h
#define HASH_CALC_VAR_INIT uint32_t h = 0
#define UPDATE_HASH update_hash_sse42
#define INSERT_STRING insert_string_sse42
#define QUICK_INSERT_STRING quick_insert_string_sse42
#include "../../insert_string_tpl.h"
#endif

View File

@ -9,8 +9,8 @@
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
#include <immintrin.h>

View File

@ -8,8 +8,8 @@
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include "zbuild.h"
#include "deflate.h"
#include <immintrin.h>
#include <assert.h>

View File

@ -7,7 +7,7 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "zbuild.h"
#include "x86_features.h"
#ifdef _MSC_VER
@ -15,6 +15,13 @@
#else
// Newer versions of GCC and clang come with cpuid.h
# include <cpuid.h>
# ifdef X86_HAVE_XSAVE_INTRIN
# if __GNUC__ == 8
# include <xsaveintrin.h>
# else
# include <immintrin.h>
# endif
# endif
#endif
#include <string.h>
@ -29,6 +36,7 @@ static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx,
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}
@ -43,12 +51,13 @@ static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx,
*ecx = registers[2];
*edx = registers[3];
#else
*eax = *ebx = *ecx = *edx = 0;
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}
static inline uint64_t xgetbv(unsigned int xcr) {
#ifdef _MSC_VER
#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
return _xgetbv(xcr);
#else
uint32_t eax, edx;
@ -90,7 +99,16 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
// check AVX512 bits if the OS supports saving ZMM registers
if (features->has_os_save_zmm) {
features->has_avx512 = ebx & 0x00010000;
features->has_avx512f = ebx & 0x00010000;
if (features->has_avx512f) {
// According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
// AVX512(DQ,BW,VL).
features->has_avx512dq = ebx & 0x00020000;
features->has_avx512bw = ebx & 0x40000000;
features->has_avx512vl = ebx & 0x80000000;
}
features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
&& features->has_avx512vl;
features->has_avx512vnni = ecx & 0x800;
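    /* Added note: the masks above follow CPUID leaf 7, subleaf 0 - EBX bit 16
     * is AVX512F, bit 17 AVX512DQ, bit 30 AVX512BW, bit 31 AVX512VL, and
     * ECX bit 11 is AVX512-VNNI. */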
}
}

View File

@ -8,7 +8,11 @@
struct x86_cpu_features {
int has_avx2;
int has_avx512;
int has_avx512f;
int has_avx512dq;
int has_avx512bw;
int has_avx512vl;
int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
int has_avx512vnni;
int has_sse2;
int has_ssse3;
@ -21,4 +25,4 @@ struct x86_cpu_features {
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
#endif /* CPU_H_ */
#endif /* X86_FEATURES_H_ */
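A hedged sketch of how a runtime dispatcher might consume the new has_avx512_common flag. The adler32_* names come from the prototypes elsewhere in this diff and adler32_c from the generic code, but the preference order shown is an assumption, not zlib-ng's functable logic:

#include <stdint.h>
#include <stddef.h>

typedef uint32_t (*adler32_f)(uint32_t adler, const uint8_t *buf, size_t len);

static adler32_f pick_adler32(const struct x86_cpu_features *f) {
#ifdef X86_AVX512VNNI
    if (f->has_avx512_common && f->has_avx512vnni) return adler32_avx512_vnni;
#endif
#ifdef X86_AVX512
    if (f->has_avx512_common) return adler32_avx512;   /* needs F+DQ+BW+VL */
#endif
#ifdef X86_AVX2
    if (f->has_avx2) return adler32_avx2;
#endif
#ifdef X86_SSSE3
    if (f->has_ssse3) return adler32_ssse3;
#endif
    return adler32_c;                                  /* portable fallback */
}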


@ -0,0 +1,172 @@
/* x86_functions.h -- x86 implementations for arch-specific functions.
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef X86_FUNCTIONS_H_
#define X86_FUNCTIONS_H_
#ifdef X86_SSE2
uint32_t chunksize_sse2(void);
uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
void slide_hash_sse2(deflate_state *s);
# endif
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX2
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx2(void);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
void slide_hash_avx2(deflate_state *s);
# endif
void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_PCLMULQDQ_CRC
uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef X86_VPCLMULQDQ_CRC
uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// X86 - SSE2
# if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_sse2
# undef native_chunksize
# define native_chunksize chunksize_sse2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_sse2
# undef native_slide_hash
# define native_slide_hash slide_hash_sse2
# ifdef HAVE_BUILTIN_CTZ
# undef native_compare256
# define native_compare256 compare256_sse2
# undef native_longest_match
# define native_longest_match longest_match_sse2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_sse2
# endif
#endif
// X86 - SSSE3
# if defined(X86_SSSE3) && defined(__SSSE3__)
# undef native_adler32
# define native_adler32 adler32_ssse3
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_ssse3
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_ssse3
# endif
// X86 - SSE4.2
# if defined(X86_SSE42) && defined(__SSE4_2__)
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_sse42
# endif
// X86 - PCLMUL
#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__)
# undef native_crc32
# define native_crc32 crc32_pclmulqdq
# undef native_crc32_fold
# define native_crc32_fold crc32_fold_pclmulqdq
# undef native_crc32_fold_copy
# define native_crc32_fold_copy crc32_fold_pclmulqdq_copy
# undef native_crc32_fold_final
# define native_crc32_fold_final crc32_fold_pclmulqdq_final
# undef native_crc32_fold_reset
# define native_crc32_fold_reset crc32_fold_pclmulqdq_reset
#endif
// X86 - AVX
# if defined(X86_AVX2) && defined(__AVX2__)
# undef native_adler32
# define native_adler32 adler32_avx2
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx2
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx2
# undef native_chunksize
# define native_chunksize chunksize_avx2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx2
# undef native_slide_hash
# define native_slide_hash slide_hash_avx2
# ifdef HAVE_BUILTIN_CTZ
# undef native_compare256
# define native_compare256 compare256_avx2
# undef native_longest_match
# define native_longest_match longest_match_avx2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_avx2
# endif
# endif
// X86 - AVX512 (F,DQ,BW,VL)
# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
# undef native_adler32
# define native_adler32 adler32_avx512
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx512
// X86 - AVX512 (VNNI)
# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
# undef native_adler32
# define native_adler32 adler32_avx512_vnni
# undef native_adler32_fold_copy
# define native_adler32_fold_copy adler32_fold_copy_avx512_vnni
# endif
// X86 - VPCLMULQDQ
# if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
# undef native_crc32
# define native_crc32 crc32_vpclmulqdq
# undef native_crc32_fold
# define native_crc32_fold crc32_fold_vpclmulqdq
# undef native_crc32_fold_copy
# define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy
# undef native_crc32_fold_final
# define native_crc32_fold_final crc32_fold_vpclmulqdq_final
# undef native_crc32_fold_reset
# define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset
# endif
# endif
#endif
#endif /* X86_FUNCTIONS_H_ */
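Illustrative caller, not part of zlib-ng: with DISABLE_RUNTIME_CPU_DETECTION defined and x86_functions.h in scope, the #undef/#define chain above resolves each native_* name at preprocessing time, so a -mavx2 build compiles this to a direct call to adler32_avx2 with no runtime dispatch at all:

#include <stdint.h>
#include <stddef.h>

uint32_t checksum_block(uint32_t adler, const uint8_t *buf, size_t len) {
    /* expands to adler32_avx2(...) when X86_AVX2 && __AVX2__,
       to adler32_ssse3(...) on an SSSE3-only build, and so on */
    return native_adler32(adler, buf, len);
}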


@ -7,7 +7,7 @@
#ifdef __AVX2__
#include <immintrin.h>
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
|| (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
__m128i r;
@ -29,7 +29,7 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
/* GCC <9 is missing some AVX512 intrinsics.
*/
#ifdef __AVX512F__
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>
#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \

3rdparty/zlib-ng/arch_functions.h (new file)

@ -0,0 +1,29 @@
/* arch_functions.h -- Arch-specific function prototypes.
* Copyright (C) 2017 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CPU_FUNCTIONS_H_
#define CPU_FUNCTIONS_H_
#include "zbuild.h"
#include "zutil.h"
#include "crc32.h"
#include "deflate.h"
#include "fallback_builtins.h"
#include "arch/generic/generic_functions.h"
#if defined(X86_FEATURES)
# include "arch/x86/x86_functions.h"
#elif defined(ARM_FEATURES)
# include "arch/arm/arm_functions.h"
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
# include "arch/power/power_functions.h"
#elif defined(S390_FEATURES)
# include "arch/s390/s390_functions.h"
#elif defined(RISCV_FEATURES)
# include "arch/riscv/riscv_functions.h"
#endif
#endif


@ -5,7 +5,7 @@
#include "zbuild.h"
#include <stdlib.h>
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif
@ -25,7 +25,7 @@ Z_INTERNAL uint32_t CHUNKSIZE(void) {
without iteration, which will hopefully make the branch prediction more
reliable. */
#ifndef HAVE_CHUNKCOPY
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
chunk_t chunk;
int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
@ -54,7 +54,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
least 258 bytes of output space available (258 being the maximum length
output from a single token; see inflate_fast()'s assumptions below). */
#ifndef HAVE_CHUNKUNROLL
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
unsigned char const *from = out - *dist;
chunk_t chunk;
while (*dist < *len && *dist < sizeof(chunk_t)) {
@ -98,7 +98,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
/* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
if (len <= 16) {
return chunkmemset_ssse3(out, dist, len);
}
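The ((len - 1) % sizeof(chunk_t)) + 1 computation in CHUNKCOPY above is what removes the scalar cleanup loop: the first (possibly overlapping) chunk store covers the 1..sizeof(chunk_t) tail bytes, and every store after it is a whole chunk. A standalone sketch with memcpy standing in for the vector loads/stores; note that, like the real code, it may read and write past len, which callers must guarantee is safe:

#include <string.h>
#include <stdint.h>

typedef struct { uint8_t b[16]; } chunk16;   /* stand-in for chunk_t */

static uint8_t *chunkcopy_sketch(uint8_t *out, const uint8_t *from, unsigned len) {
    chunk16 c;
    unsigned align = ((len - 1) % sizeof(c)) + 1;  /* 1..16, never 0 */
    memcpy(&c, from, sizeof(c));
    memcpy(out, &c, sizeof(c));     /* writes a full chunk; only align bytes count */
    out += align; from += align; len -= align;
    while (len) {                   /* len is now a multiple of sizeof(chunk16) */
        memcpy(&c, from, sizeof(c));
        memcpy(out, &c, sizeof(c));
        out += sizeof(c); from += sizeof(c); len -= sizeof(c);
    }
    return out;
}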

3rdparty/zlib-ng/cmake/detect-arch.c (new file)

@ -0,0 +1,115 @@
// archdetect.c -- Detect compiler architecture and raise preprocessor error
// containing a simple arch identifier.
// Copyright (C) 2019 Hans Kristian Rosbach
// Licensed under the Zlib license, see LICENSE.md for details
// x86_64
#if defined(__x86_64__) || defined(_M_X64)
#error archfound x86_64
// x86
#elif defined(__i386) || defined(_M_IX86)
#error archfound i686
// ARM
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#error archfound aarch64
#elif defined(__arm__) || defined(__arm) || defined(_M_ARM) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM64_ARCH_8__) || defined(__ARMv8__) || defined(__ARMv8_A__)
#error archfound armv8
#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
#error archfound armv7
#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6M__)
#error archfound armv6
#elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
#error archfound armv5
#elif defined(__ARM_ARCH_4T__) || defined(__TARGET_ARCH_5E__)
#error archfound armv4
#elif defined(__ARM_ARCH_3__) || defined(__TARGET_ARCH_3M__)
#error archfound armv3
#elif defined(__ARM_ARCH_2__)
#error archfound armv2
#endif
// PowerPC
#elif defined(__powerpc__) || defined(_ppc__) || defined(__PPC__)
#if defined(__64BIT__) || defined(__powerpc64__) || defined(__ppc64__)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#error archfound powerpc64le
#else
#error archfound powerpc64
#endif
#else
#error archfound powerpc
#endif
// --------------- Less common architectures alphabetically below ---------------
// ALPHA
#elif defined(__alpha__) || defined(__alpha)
#error archfound alpha
// Blackfin
#elif defined(__BFIN__)
#error archfound blackfin
// Itanium
#elif defined(__ia64) || defined(_M_IA64)
#error archfound ia64
// MIPS
#elif defined(__mips__) || defined(__mips)
#error archfound mips
// Motorola 68000-series
#elif defined(__m68k__)
#error archfound m68k
// SuperH
#elif defined(__sh__)
#error archfound sh
// SPARC
#elif defined(__sparc__) || defined(__sparc)
#if defined(__sparcv9) || defined(__sparc_v9__)
#error archfound sparc9
#elif defined(__sparcv8) || defined(__sparc_v8__)
#error archfound sparc8
#endif
// SystemZ
#elif defined(__370__)
#error archfound s370
#elif defined(__s390__)
#error archfound s390
#elif defined(__s390x) || defined(__zarch__)
#error archfound s390x
// PARISC
#elif defined(__hppa__)
#error archfound parisc
// RS-6000
#elif defined(__THW_RS6000)
#error archfound rs6000
// RISC-V
#elif defined(__riscv)
#if __riscv_xlen == 64
#error archfound riscv64
#elif __riscv_xlen == 32
#error archfound riscv32
#endif
// LOONGARCH
#elif defined(__loongarch_lp64)
#error archfound loongarch64
// Emscripten (WebAssembly)
#elif defined(__EMSCRIPTEN__)
#error archfound wasm32
// return 'unrecognized' if we do not know what architecture this is
#else
#error archfound unrecognized
#endif
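The file works by never compiling successfully: the selected #error message smuggles the architecture name out through the compiler's diagnostic output, which the CMake side then regex-extracts. A minimal hypothetical probe showing the round trip; running e.g. cc -c probe.c on an x86-64 host aborts with a diagnostic containing "archfound x86_64":

/* probe.c -- hypothetical, reduced version of the trick above */
#if defined(__x86_64__) || defined(_M_X64)
#error archfound x86_64
#else
#error archfound unrecognized
#endif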

3rdparty/zlib-ng/cmake/detect-arch.cmake (new file)

@ -0,0 +1,104 @@
# detect-arch.cmake -- Detect compiler architecture and set ARCH and BASEARCH
# Copyright (C) 2019 Hans Kristian Rosbach
# Licensed under the Zlib license, see LICENSE.md for details
set(ARCHDETECT_FOUND TRUE)
if(CMAKE_OSX_ARCHITECTURES)
# If multiple architectures are requested (universal build), pick only the first
list(GET CMAKE_OSX_ARCHITECTURES 0 ARCH)
elseif(MSVC)
if("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "X86")
set(ARCH "i686")
elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "x64")
set(ARCH "x86_64")
elseif("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARMV7")
set(ARCH "arm")
elseif ("${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64" OR "${MSVC_C_ARCHITECTURE_ID}" STREQUAL "ARM64EC")
set(ARCH "aarch64")
endif()
elseif(EMSCRIPTEN)
set(ARCH "wasm32")
elseif(CMAKE_CROSSCOMPILING)
set(ARCH ${CMAKE_C_COMPILER_TARGET})
else()
# Let preprocessor parse archdetect.c and raise an error containing the arch identifier
enable_language(C)
try_run(
run_result_unused
compile_result_unused
${CMAKE_CURRENT_BINARY_DIR}
${CMAKE_CURRENT_LIST_DIR}/detect-arch.c
COMPILE_OUTPUT_VARIABLE RAWOUTPUT
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
)
# Find basearch tag, and extract the arch word into BASEARCH variable
string(REGEX REPLACE ".*archfound ([a-zA-Z0-9_]+).*" "\\1" ARCH "${RAWOUTPUT}")
if(NOT ARCH)
set(ARCH unknown)
endif()
endif()
# Make sure we have ARCH set
if(NOT ARCH OR ARCH STREQUAL "unknown")
set(ARCH ${CMAKE_SYSTEM_PROCESSOR})
message(STATUS "Arch not recognized, falling back to cmake arch: '${ARCH}'")
else()
message(STATUS "Arch detected: '${ARCH}'")
endif()
# Base arch detection
if("${ARCH}" MATCHES "(x86_64|AMD64|i[3-6]86)")
set(BASEARCH "x86")
set(BASEARCH_X86_FOUND TRUE)
elseif("${ARCH}" MATCHES "(arm(v[0-9])?|aarch64|cortex)")
set(BASEARCH "arm")
set(BASEARCH_ARM_FOUND TRUE)
elseif("${ARCH}" MATCHES "ppc(64(le)?)?|powerpc(64(le)?)?")
set(BASEARCH "ppc")
set(BASEARCH_PPC_FOUND TRUE)
elseif("${ARCH}" MATCHES "alpha")
set(BASEARCH "alpha")
set(BASEARCH_ALPHA_FOUND TRUE)
elseif("${ARCH}" MATCHES "blackfin")
set(BASEARCH "blackfin")
set(BASEARCH_BLACKFIN_FOUND TRUE)
elseif("${ARCH}" MATCHES "ia64")
set(BASEARCH "ia64")
set(BASEARCH_IA64_FOUND TRUE)
elseif("${ARCH}" MATCHES "mips")
set(BASEARCH "mips")
set(BASEARCH_MIPS_FOUND TRUE)
elseif("${ARCH}" MATCHES "m68k")
set(BASEARCH "m68k")
set(BASEARCH_M68K_FOUND TRUE)
elseif("${ARCH}" MATCHES "sh")
set(BASEARCH "sh")
set(BASEARCH_SH_FOUND TRUE)
elseif("${ARCH}" MATCHES "sparc[89]?")
set(BASEARCH "sparc")
set(BASEARCH_SPARC_FOUND TRUE)
elseif("${ARCH}" MATCHES "s3[679]0x?")
set(BASEARCH "s360")
set(BASEARCH_S360_FOUND TRUE)
elseif("${ARCH}" MATCHES "parisc")
set(BASEARCH "parisc")
set(BASEARCH_PARISC_FOUND TRUE)
elseif("${ARCH}" MATCHES "rs6000")
set(BASEARCH "rs6000")
set(BASEARCH_RS6000_FOUND TRUE)
elseif("${ARCH}" MATCHES "riscv(32|64)")
set(BASEARCH "riscv")
set(BASEARCH_RISCV_FOUND TRUE)
elseif("${ARCH}" MATCHES "loongarch64")
set(BASEARCH "loongarch")
set(BASEARCH_LOONGARCH_FOUND TRUE)
elseif("${ARCH}" MATCHES "wasm32")
set(BASEARCH "wasm32")
set(BASEARCH_WASM32_FOUND TRUE)
else()
set(BASEARCH "x86")
set(BASEARCH_X86_FOUND TRUE)
message(STATUS "Basearch '${ARCH}' not recognized, defaulting to 'x86'.")
endif()
message(STATUS "Basearch of '${ARCH}' has been detected as: '${BASEARCH}'")


@ -0,0 +1,46 @@
# detect-coverage.cmake -- Detect supported compiler coverage flags
# Licensed under the Zlib license, see LICENSE.md for details
macro(add_code_coverage)
# Check for -coverage flag support for Clang/GCC
if(CMAKE_VERSION VERSION_LESS 3.14)
set(CMAKE_REQUIRED_LIBRARIES -lgcov)
else()
set(CMAKE_REQUIRED_LINK_OPTIONS -coverage)
endif()
check_c_compiler_flag(-coverage HAVE_COVERAGE)
set(CMAKE_REQUIRED_LIBRARIES)
set(CMAKE_REQUIRED_LINK_OPTIONS)
if(HAVE_COVERAGE)
add_compile_options(-coverage)
add_link_options(-coverage)
message(STATUS "Code coverage enabled using: -coverage")
else()
# Some versions of GCC don't support -coverage shorthand
if(CMAKE_VERSION VERSION_LESS 3.14)
set(CMAKE_REQUIRED_LIBRARIES -lgcov)
else()
set(CMAKE_REQUIRED_LINK_OPTIONS -lgcov -fprofile-arcs)
endif()
check_c_compiler_flag("-ftest-coverage -fprofile-arcs -fprofile-values" HAVE_TEST_COVERAGE)
set(CMAKE_REQUIRED_LIBRARIES)
set(CMAKE_REQUIRED_LINK_OPTIONS)
if(HAVE_TEST_COVERAGE)
add_compile_options(-ftest-coverage -fprofile-arcs -fprofile-values)
add_link_options(-lgcov -fprofile-arcs)
message(STATUS "Code coverage enabled using: -ftest-coverage")
else()
message(WARNING "Compiler does not support code coverage")
set(WITH_CODE_COVERAGE OFF)
endif()
endif()
# Set optimization level to zero for code coverage builds
if (WITH_CODE_COVERAGE)
# Use CMake compiler flag variables due to add_compile_options failure on Windows GCC
set(CMAKE_C_FLAGS "-O0 ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-O0 ${CMAKE_CXX_FLAGS}")
endif()
endmacro()


@ -0,0 +1,43 @@
# detect-install-dirs.cmake -- Detect install directory parameters
# Copyright (C) 2021 Hans Kristian Rosbach
# Licensed under the Zlib license, see LICENSE.md for details
# Determine installation directory for executables
if (DEFINED BIN_INSTALL_DIR)
set(BIN_INSTALL_DIR "${BIN_INSTALL_DIR}" CACHE PATH "Installation directory for executables (Deprecated)" FORCE)
set(CMAKE_INSTALL_BINDIR "${BIN_INSTALL_DIR}")
elseif (DEFINED INSTALL_BIN_DIR)
set(CMAKE_INSTALL_BINDIR "${INSTALL_BIN_DIR}")
endif()
# Determine installation directory for libraries
if (DEFINED LIB_INSTALL_DIR)
set(LIB_INSTALL_DIR "${LIB_INSTALL_DIR}" CACHE PATH "Installation directory for libraries (Deprecated)" FORCE)
set(CMAKE_INSTALL_LIBDIR "${LIB_INSTALL_DIR}")
elseif (DEFINED INSTALL_LIB_DIR)
set(CMAKE_INSTALL_LIBDIR "${INSTALL_LIB_DIR}")
endif()
# Determine installation directory for include files
if (DEFINED INC_INSTALL_DIR)
set(INC_INSTALL_DIR "${INC_INSTALL_DIR}" CACHE PATH "Installation directory for headers (Deprecated)" FORCE)
set(CMAKE_INSTALL_INCLUDEDIR "${INC_INSTALL_DIR}")
elseif (DEFINED INSTALL_INC_DIR)
set(CMAKE_INSTALL_INCLUDEDIR "${INSTALL_INC_DIR}")
endif()
# Define GNU standard installation directories
include(GNUInstallDirs)
# Determine installation directory for pkgconfig files
if (DEFINED PKGCONFIG_INSTALL_DIR)
set(PKGCONFIG_INSTALL_DIR "${PKGCONFIG_INSTALL_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
elseif (DEFINED INSTALL_PKGCONFIG_DIR)
set(PKGCONFIG_INSTALL_DIR "${INSTALL_PKGCONFIG_DIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
elseif (DEFINED CMAKE_INSTALL_PKGCONFIGDIR)
set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
elseif (DEFINED CMAKE_INSTALL_FULL_PKGCONFIGDIR)
set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_FULL_PKGCONFIGDIR}" CACHE PATH "Installation directory for pkgconfig (.pc) files" FORCE)
else()
set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/pkgconfig" CACHE PATH "Installation directory for pkgconfig (.pc) files")
endif()


@ -2,40 +2,39 @@
# Licensed under the Zlib license, see LICENSE.md for details
macro(check_acle_compiler_flag)
if(MSVC)
# Both ARM and ARM64-targeting msvc support intrinsics, but
# ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
set(HAVE_ACLE_FLAG TRUE)
endif()
else()
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
check_c_compiler_flag("-march=armv8-a+crc" HAVE_MARCH_ARMV8_CRC)
if(HAVE_MARCH_ARMV8_CRC)
set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
else()
check_c_compiler_flag("-march=armv8-a+crc+simd" HAVE_MARCH_ARMV8_CRC_SIMD)
if(HAVE_MARCH_ARMV8_CRC_SIMD)
set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support")
endif()
endif()
# Check whether compiler supports ACLE flag
endif()
endif()
# Check whether compiler supports ARMv8 CRC intrinsics
set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"int main() { return 0; }"
HAVE_ACLE_FLAG FAIL_REGEX "not supported")
if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
# Check whether compiler supports ACLE flag
set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
check_c_source_compiles(
"int main() { return 0; }"
HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
endif()
"#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <arm_acle.h>
#endif
unsigned int f(unsigned int a, unsigned int b) {
return __crc32w(a, b);
}
int main(void) { return 0; }"
HAVE_ACLE_FLAG
)
set(CMAKE_REQUIRED_FLAGS)
endif()
endmacro()
macro(check_armv6_compiler_flag)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6)
if(HAVE_MARCH_ARMV6)
set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support")
@ -67,13 +66,14 @@ macro(check_armv6_compiler_flag)
return __uqsub16(a, b);
#endif
}
int main(void) { return 0; }"
int main(void) { return f(1,2); }"
HAVE_ARMV6_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_avx512_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
@ -81,7 +81,6 @@ macro(check_avx512_intrinsics)
set(AVX512FLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
# instruction scheduling unless you specify a reasonable -mtune= target
set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
@ -94,10 +93,10 @@ macro(check_avx512_intrinsics)
endif()
unset(HAVE_CASCADE_LAKE)
endif()
endif()
elseif(MSVC)
set(AVX512FLAG "/arch:AVX512")
endif()
endif()
# Check whether compiler supports AVX512 intrinsics
set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
@ -109,26 +108,17 @@ macro(check_avx512_intrinsics)
int main(void) { return 0; }"
HAVE_AVX512_INTRIN
)
# Evidently both GCC and clang were late to implementing these
check_c_source_compiles(
"#include <immintrin.h>
__mmask16 f(__mmask16 x) { return _knot_mask16(x); }
int main(void) { return 0; }"
HAVE_MASK_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_avx512vnni_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
else()
set(AVX512VNNIFLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
if(NOT MSVC)
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
@ -139,11 +129,10 @@ macro(check_avx512vnni_intrinsics)
endif()
unset(HAVE_CASCADE_LAKE)
endif()
endif()
elseif(MSVC)
set(AVX512VNNIFLAG "/arch:AVX512")
endif()
endif()
# Check whether compiler supports AVX512vnni intrinsics
set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
@ -159,6 +148,7 @@ macro(check_avx512vnni_intrinsics)
endmacro()
macro(check_avx2_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(AVX2FLAG "-mavx2")
@ -166,12 +156,11 @@ macro(check_avx2_intrinsics)
set(AVX2FLAG "/arch:AVX2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(AVX2FLAG "-mavx2")
endif()
elseif(MSVC)
set(AVX2FLAG "/arch:AVX2")
endif()
endif()
# Check whether compiler supports AVX2 intrinsics
set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
@ -187,8 +176,8 @@ macro(check_avx2_intrinsics)
endmacro()
macro(check_neon_compiler_flag)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if("${ARCH}" MATCHES "aarch64")
set(NEONFLAG "-march=armv8-a+simd")
else()
@ -206,12 +195,52 @@ macro(check_neon_compiler_flag)
#endif
int main() { return 0; }"
NEON_AVAILABLE FAIL_REGEX "not supported")
# Check whether compiler native flag is enough for NEON support
# Some GCC versions don't enable FPU (vector unit) when using -march=native
if(NEON_AVAILABLE AND NATIVEFLAG AND (NOT "${ARCH}" MATCHES "aarch64"))
check_c_source_compiles(
"#include <arm_neon.h>
uint8x16_t f(uint8x16_t x, uint8x16_t y) {
return vaddq_u8(x, y);
}
int main(int argc, char* argv[]) {
uint8x16_t a = vdupq_n_u8(argc);
uint8x16_t b = vdupq_n_u8(argc);
uint8x16_t result = f(a, b);
return result[0];
}"
ARM_NEON_SUPPORT_NATIVE
)
if(NOT ARM_NEON_SUPPORT_NATIVE)
set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG} -mfpu=neon ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <arm_neon.h>
uint8x16_t f(uint8x16_t x, uint8x16_t y) {
return vaddq_u8(x, y);
}
int main(int argc, char* argv[]) {
uint8x16_t a = vdupq_n_u8(argc);
uint8x16_t b = vdupq_n_u8(argc);
uint8x16_t result = f(a, b);
return result[0];
}"
ARM_NEON_SUPPORT_NATIVE_MFPU
)
if(ARM_NEON_SUPPORT_NATIVE_MFPU)
set(NEONFLAG "-mfpu=neon")
else()
# Remove local NEON_AVAILABLE variable and overwrite the cache
unset(NEON_AVAILABLE)
set(NEON_AVAILABLE "" CACHE INTERNAL "NEON support available" FORCE)
endif()
endif()
endif()
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_neon_ld4_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if("${ARCH}" MATCHES "aarch64")
set(NEONFLAG "-march=armv8-a+simd")
else()
@ -234,8 +263,8 @@ macro(check_neon_ld4_intrinsics)
endmacro()
macro(check_pclmulqdq_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
set(PCLMULFLAG "-mpclmul")
endif()
endif()
@ -257,8 +286,8 @@ macro(check_pclmulqdq_intrinsics)
endmacro()
macro(check_vpclmulqdq_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
endif()
endif()
@ -341,8 +370,8 @@ macro(check_ppc_intrinsics)
endmacro()
macro(check_power8_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
set(POWER8FLAG "-mcpu=power8")
endif()
endif()
@ -364,12 +393,27 @@ macro(check_power8_intrinsics)
}"
HAVE_POWER8_INTRIN
)
if(NOT HAVE_POWER8_INTRIN AND HAVE_LINUX_AUXVEC_H)
check_c_source_compiles(
"#include <sys/auxv.h>
#include <linux/auxvec.h>
int main() {
return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
}"
HAVE_POWER8_INTRIN2
)
if(HAVE_POWER8_INTRIN2)
set(POWER8_NEED_AUXVEC_H 1)
set(HAVE_POWER8_INTRIN ${HAVE_POWER8_INTRIN2} CACHE INTERNAL "Have POWER8 intrinsics" FORCE)
unset(HAVE_POWER8_INTRIN2 CACHE)
endif()
endif()
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_rvv_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
set(RISCVFLAG "-march=rv64gcv")
endif()
endif()
@ -399,8 +443,8 @@ macro(check_s390_intrinsics)
endmacro()
macro(check_power9_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
set(POWER9FLAG "-mcpu=power9")
endif()
endif()
@ -422,10 +466,26 @@ macro(check_power9_intrinsics)
}"
HAVE_POWER9_INTRIN
)
if(NOT HAVE_POWER9_INTRIN AND HAVE_LINUX_AUXVEC_H)
check_c_source_compiles(
"#include <sys/auxv.h>
#include <linux/auxvec.h>
int main() {
return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_00);
}"
HAVE_POWER9_INTRIN2
)
if(HAVE_POWER9_INTRIN2)
set(POWER9_NEED_AUXVEC_H 1)
set(HAVE_POWER9_INTRIN ${HAVE_POWER9_INTRIN2} CACHE INTERNAL "Have POWER9 intrinsics" FORCE)
unset(HAVE_POWER9_INTRIN2 CACHE)
endif()
endif()
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_sse2_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSE2FLAG "-msse2")
@ -437,7 +497,6 @@ macro(check_sse2_intrinsics)
set(SSE2FLAG "/arch:SSE2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(SSE2FLAG "-msse2")
endif()
endif()
@ -453,6 +512,7 @@ macro(check_sse2_intrinsics)
endmacro()
macro(check_ssse3_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSSE3FLAG "-mssse3")
@ -460,7 +520,6 @@ macro(check_ssse3_intrinsics)
set(SSSE3FLAG "/arch:SSSE3")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(SSSE3FLAG "-mssse3")
endif()
endif()
@ -478,6 +537,7 @@ macro(check_ssse3_intrinsics)
endmacro()
macro(check_sse42_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSE42FLAG "-msse4.2")
@ -485,7 +545,6 @@ macro(check_sse42_intrinsics)
set(SSE42FLAG "/arch:SSE4.2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
set(SSE42FLAG "-msse4.2")
endif()
endif()
@ -526,15 +585,17 @@ macro(check_vgfma_intrinsics)
endmacro()
macro(check_xsave_intrinsics)
if(NOT NATIVEFLAG AND NOT MSVC)
if(NOT NATIVEFLAG AND NOT MSVC AND NOT CMAKE_C_COMPILER_ID MATCHES "Intel")
set(XSAVEFLAG "-mxsave")
endif()
set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#ifdef _MSC_VER
# include <intrin.h>
#elif __GNUC__ == 8 && __GNUC_MINOR__ > 1
# include <xsaveintrin.h>
#else
# include <x86gprintrin.h>
# include <immintrin.h>
#endif
unsigned int f(unsigned int a) { return (int) _xgetbv(a); }
int main(void) { return 0; }"


@ -0,0 +1,166 @@
# detect-sanitizer.cmake -- Detect supported compiler sanitizer flags
# Licensed under the Zlib license, see LICENSE.md for details
macro(add_common_sanitizer_flags)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
add_compile_options(-g3)
endif()
check_c_compiler_flag(-fno-omit-frame-pointer HAVE_NO_OMIT_FRAME_POINTER)
if(HAVE_NO_OMIT_FRAME_POINTER)
add_compile_options(-fno-omit-frame-pointer)
add_link_options(-fno-omit-frame-pointer)
endif()
check_c_compiler_flag(-fno-optimize-sibling-calls HAVE_NO_OPTIMIZE_SIBLING_CALLS)
if(HAVE_NO_OPTIMIZE_SIBLING_CALLS)
add_compile_options(-fno-optimize-sibling-calls)
add_link_options(-fno-optimize-sibling-calls)
endif()
endmacro()
macro(check_sanitizer_support known_checks supported_checks)
set(available_checks "")
# Build list of supported sanitizer flags by incrementally trying compilation with
# known sanitizer checks
foreach(check ${known_checks})
if(available_checks STREQUAL "")
set(compile_checks "${check}")
else()
set(compile_checks "${available_checks},${check}")
endif()
set(CMAKE_REQUIRED_FLAGS -fsanitize=${compile_checks})
check_c_source_compiles("int main() { return 0; }" HAVE_SANITIZER_${check}
FAIL_REGEX "not supported|unrecognized command|unknown option")
set(CMAKE_REQUIRED_FLAGS)
if(HAVE_SANITIZER_${check})
set(available_checks ${compile_checks})
endif()
endforeach()
set(${supported_checks} ${available_checks})
endmacro()
macro(add_address_sanitizer)
set(known_checks
address
pointer-compare
pointer-subtract
)
check_sanitizer_support("${known_checks}" supported_checks)
if(NOT ${supported_checks} STREQUAL "")
message(STATUS "Address sanitizer is enabled: ${supported_checks}")
add_compile_options(-fsanitize=${supported_checks})
add_link_options(-fsanitize=${supported_checks})
add_common_sanitizer_flags()
else()
message(STATUS "Address sanitizer is not supported")
endif()
if(CMAKE_CROSSCOMPILING_EMULATOR)
# Only check for leak sanitizer if not cross-compiling due to qemu crash
message(WARNING "Leak sanitizer is not supported when cross compiling")
else()
# Leak sanitizer requires address sanitizer
check_sanitizer_support("leak" supported_checks)
if(NOT ${supported_checks} STREQUAL "")
message(STATUS "Leak sanitizer is enabled: ${supported_checks}")
add_compile_options(-fsanitize=${supported_checks})
add_link_options(-fsanitize=${supported_checks})
add_common_sanitizer_flags()
else()
message(STATUS "Leak sanitizer is not supported")
endif()
endif()
endmacro()
macro(add_memory_sanitizer)
check_sanitizer_support("memory" supported_checks)
if(NOT ${supported_checks} STREQUAL "")
message(STATUS "Memory sanitizer is enabled: ${supported_checks}")
add_compile_options(-fsanitize=${supported_checks})
add_link_options(-fsanitize=${supported_checks})
add_common_sanitizer_flags()
check_c_compiler_flag(-fsanitize-memory-track-origins HAVE_MEMORY_TRACK_ORIGINS)
if(HAVE_MEMORY_TRACK_ORIGINS)
add_compile_options(-fsanitize-memory-track-origins)
add_link_options(-fsanitize-memory-track-origins)
endif()
else()
message(STATUS "Memory sanitizer is not supported")
endif()
endmacro()
macro(add_thread_sanitizer)
check_sanitizer_support("thread" supported_checks)
if(NOT ${supported_checks} STREQUAL "")
message(STATUS "Thread sanitizer is enabled: ${supported_checks}")
add_compile_options(-fsanitize=${supported_checks})
add_link_options(-fsanitize=${supported_checks})
add_common_sanitizer_flags()
else()
message(STATUS "Thread sanitizer is not supported")
endif()
endmacro()
macro(add_undefined_sanitizer)
set(known_checks
array-bounds
bool
bounds
builtin
enum
float-cast-overflow
float-divide-by-zero
function
integer-divide-by-zero
local-bounds
null
nonnull-attribute
pointer-overflow
return
returns-nonnull-attribute
shift
shift-base
shift-exponent
signed-integer-overflow
undefined
unsigned-integer-overflow
unsigned-shift-base
vla-bound
vptr
)
# Only check for alignment sanitizer flag if unaligned access is not supported
if(NOT WITH_UNALIGNED)
list(APPEND known_checks alignment)
endif()
# Object size sanitizer has no effect at -O0 and produces compiler warning if enabled
if(NOT CMAKE_C_FLAGS MATCHES "-O0")
list(APPEND known_checks object-size)
endif()
check_sanitizer_support("${known_checks}" supported_checks)
if(NOT ${supported_checks} STREQUAL "")
message(STATUS "Undefined behavior sanitizer is enabled: ${supported_checks}")
add_compile_options(-fsanitize=${supported_checks})
add_link_options(-fsanitize=${supported_checks})
# Group sanitizer flag -fsanitize=undefined will automatically add alignment, even if
# it is not in our sanitize flag list, so we need to explicitly disable alignment sanitizing.
if(WITH_UNALIGNED)
add_compile_options(-fno-sanitize=alignment)
endif()
add_common_sanitizer_flags()
else()
message(STATUS "Undefined behavior sanitizer is not supported")
endif()
endmacro()
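For illustration, a tiny program that the signed-integer-overflow check from the list above would flag at runtime when built with the flags this macro assembles, e.g. cc -fsanitize=signed-integer-overflow -fno-omit-frame-pointer demo.c (the file name is hypothetical):

#include <limits.h>
#include <stdio.h>

int main(void) {
    int x = INT_MAX;
    x += 1;              /* UB: UBSan reports "signed integer overflow" here */
    printf("%d\n", x);
    return 0;
}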


@ -6,12 +6,10 @@
#ifndef CPU_FEATURES_H_
#define CPU_FEATURES_H_
#include "adler32_fold.h"
#include "crc32_fold.h"
#ifndef DISABLE_RUNTIME_CPU_DETECTION
#if defined(X86_FEATURES)
# include "arch/x86/x86_features.h"
# include "fallback_builtins.h"
#elif defined(ARM_FEATURES)
# include "arch/arm/arm_features.h"
#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
@ -38,266 +36,8 @@ struct cpu_features {
#endif
};
extern void cpu_check_features(struct cpu_features *features);
void cpu_check_features(struct cpu_features *features);
/* adler32 */
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
#ifdef ARM_NEON
extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef PPC_VMX
extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef RISCV_RVV
extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_SSSE3
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX2
extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512
extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512VNNI
extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef POWER8_VSX
extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
#endif
/* adler32 folding */
#ifdef RISCV_RVV
extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX2
extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512
extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI
extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
/* CRC32 folding */
#ifdef X86_PCLMULQDQ_CRC
extern uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
extern void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
extern void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
extern void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_SSSE3
extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_AVX2
extern uint32_t chunksize_avx2(void);
extern uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef POWER8_VSX
extern uint32_t chunksize_power8(void);
extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef RISCV_RVV
extern uint32_t chunksize_rvv(void);
extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ZLIB_COMPAT
typedef struct z_stream_s z_stream;
#else
typedef struct zng_stream_s zng_stream;
#endif
/* inflate fast loop */
extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
#ifdef X86_SSE2
extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_SSSE3
extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_AVX2
extern void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef ARM_NEON
extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef POWER8_VSX
extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef RISCV_RVV
extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
#endif
/* CRC32 */
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
#ifdef ARM_ACLE
extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
#elif defined(POWER8_VSX)
extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
#elif defined(S390_CRC32_VX)
extern uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
#endif
/* compare256 */
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
#ifdef HAVE_BUILTIN_CTZ
extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef POWER9
extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef RISCV_RVV
extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
#endif
#ifdef DEFLATE_H_
/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42
extern void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif
/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
#ifdef HAVE_BUILTIN_CTZ
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#endif
#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
#endif
#ifdef POWER9
extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef RISCV_RVV
extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
#endif
/* longest_match_slow */
extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
#endif
#ifdef POWER9
extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
#endif
#ifdef RISCV_RVV
extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
#endif
/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42
extern Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif
/* slide_hash */
typedef void (*slide_hash_func)(deflate_state *s);
#ifdef X86_SSE2
extern void slide_hash_sse2(deflate_state *s);
#endif
#if defined(ARM_SIMD)
extern void slide_hash_armv6(deflate_state *s);
#endif
#if defined(ARM_NEON)
extern void slide_hash_neon(deflate_state *s);
#endif
#if defined(PPC_VMX)
extern void slide_hash_vmx(deflate_state *s);
#endif
#if defined(POWER8_VSX)
extern void slide_hash_power8(deflate_state *s);
#endif
#if defined(RISCV_RVV)
extern void slide_hash_rvv(deflate_state *s);
#endif
#ifdef X86_AVX2
extern void slide_hash_avx2(deflate_state *s);
#endif
/* update_hash */
extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
#ifdef X86_SSE42
extern uint32_t update_hash_sse42(deflate_state *const s, uint32_t h, uint32_t val);
#elif defined(ARM_ACLE)
extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
#endif
#endif
#endif

3rdparty/zlib-ng/crc32.c (new file)

@ -0,0 +1,42 @@
/* crc32.c -- compute the CRC-32 of a data stream
* Copyright (C) 1995-2022 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* This interleaved implementation of a CRC makes use of pipelined multiple
* arithmetic-logic units, commonly found in modern CPU cores. It is due to
* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
*/
#include "zbuild.h"
#include "functable.h"
#include "crc32_braid_tbl.h"
/* ========================================================================= */
const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
return (const uint32_t *)crc_table;
}
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
if (buf == NULL) return 0;
return (unsigned long)FUNCTABLE_CALL(crc32)((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
if (buf == NULL) return 0;
return FUNCTABLE_CALL(crc32)(crc, buf, len);
}
#endif
#ifdef ZLIB_COMPAT
unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
}
#else
uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
return PREFIX(crc32_z)(crc, buf, len);
}
#endif
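A hedged usage sketch of the API above: with ZLIB_COMPAT the entry points keep the classic crc32()/crc32_z() names, while a native zlib-ng build exposes them via the PREFIX() macro as zng_crc32()/zng_crc32_z(). Assuming a native build with zlib-ng.h available:

#include <stddef.h>
#include <stdint.h>
#include "zlib-ng.h"

uint32_t buffer_crc(const uint8_t *buf, size_t len) {
    uint32_t crc = zng_crc32_z(0, NULL, 0);  /* NULL buf: returns initial value 0 */
    return zng_crc32_z(crc, buf, len);
}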

3rdparty/zlib-ng/crc32.h (new file)

@ -0,0 +1,16 @@
/* crc32.h -- crc32 folding interface
* Copyright (C) 2021 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CRC32_H_
#define CRC32_H_
#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
/* sizeof(__m128i) * (4 folds) */
typedef struct crc32_fold_s {
uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
uint32_t value;
} crc32_fold;
#endif
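A sketch of how the folding state is meant to be driven, using the pclmulqdq prototypes from earlier in this diff. Calling the arch functions directly like this (rather than through the functable) is for illustration only, and assumes a build with X86_PCLMULQDQ_CRC on a CPU that supports it; ALIGNED_() is the zlib-ng alignment macro seen at the top of this section:

#include <stdint.h>
#include <stddef.h>

uint32_t copy_with_crc(uint8_t *dst, const uint8_t *src, size_t len) {
    crc32_fold ALIGNED_(16) state;
    crc32_fold_pclmulqdq_reset(&state);
    crc32_fold_pclmulqdq_copy(&state, dst, src, len);  /* checksums while copying */
    return crc32_fold_pclmulqdq_final(&state);
}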


@ -7,7 +7,6 @@
* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
*/
#include "zbuild.h"
#include "zutil.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"


@ -1,7 +1,6 @@
#ifndef CRC32_BRAID_P_H_
#define CRC32_BRAID_P_H_
#include "zbuild.h"
#include "zendian.h"
/* Define N */
@ -25,7 +24,7 @@
# endif
#else
# ifndef W
# if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__)
# if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
# define W 8
# else
# define W 4
@ -42,9 +41,24 @@
# endif
#endif
#if BYTE_ORDER == LITTLE_ENDIAN
# define ZSWAPWORD(word) (word)
# define BRAID_TABLE crc_braid_table
#elif BYTE_ORDER == BIG_ENDIAN
# if W == 8
# define ZSWAPWORD(word) ZSWAP64(word)
# elif W == 4
# define ZSWAPWORD(word) ZSWAP32(word)
# endif
# define BRAID_TABLE crc_braid_big_table
#else
# error "No endian defined"
#endif
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
/* CRC polynomial. */
#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
#endif /* CRC32_BRAID_P_H_ */
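DO1 above is the classic reflected table-driven CRC-32 byte update for POLY 0xedb88320; the braid code reaches the same result while keeping several such streams in flight across N*W-byte stripes. A self-contained byte-at-a-time reference for what each DO1 step computes:

#include <stdint.h>
#include <stddef.h>

static uint32_t crc_table_ref[256];

static void crc_table_init(void) {
    for (uint32_t n = 0; n < 256; n++) {
        uint32_t c = n;
        for (int k = 0; k < 8; k++)
            c = (c & 1) ? 0xedb88320 ^ (c >> 1) : c >> 1;
        crc_table_ref[n] = c;
    }
}

static uint32_t crc32_bytewise(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t c = crc ^ 0xffffffff;      /* pre-conditioning */
    while (len--)
        c = crc_table_ref[(c ^ *buf++) & 0xff] ^ (c >> 8);  /* exactly DO1 */
    return c ^ 0xffffffff;              /* post-conditioning */
}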


@ -1,21 +0,0 @@
/* crc32_fold.h -- crc32 folding interface
* Copyright (C) 2021 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CRC32_FOLD_H_
#define CRC32_FOLD_H_
#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
/* sizeof(__m128i) * (4 folds) */
typedef struct crc32_fold_s {
uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
uint32_t value;
} crc32_fold;
Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
#endif


@ -1,5 +1,5 @@
/* deflate.c -- compress data using the deflation algorithm
* Copyright (C) 1995-2023 Jean-loup Gailly and Mark Adler
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
@ -58,7 +58,7 @@
# undef deflateInit2
#endif
const char PREFIX(deflate_copyright)[] = " deflate 1.3.0 Copyright 1995-2023 Jean-loup Gailly and Mark Adler ";
const char PREFIX(deflate_copyright)[] = " deflate 1.3.1 Copyright 1995-2024 Jean-loup Gailly and Mark Adler ";
/*
If you use the zlib library in a product, an acknowledgment is welcome
in the documentation of your product. If for some reason you cannot
@ -71,14 +71,16 @@ const char PREFIX(deflate_copyright)[] = " deflate 1.3.0 Copyright 1995-2023 Jea
*/
#ifdef S390_DFLTCC_DEFLATE
# include "arch/s390/dfltcc_deflate.h"
/* DFLTCC instructions require window to be page-aligned */
# define PAD_WINDOW PAD_4096
# define WINDOW_PAD_SIZE 4096
# define HINT_ALIGNED_WINDOW HINT_ALIGNED_4096
#else
/* Memory management for the deflate state. Useful for allocating arch-specific extension blocks. */
# define ZALLOC_DEFLATE_STATE(strm) ((deflate_state *)ZALLOC(strm, 1, sizeof(deflate_state)))
# define ZFREE_STATE(strm, addr) ZFREE(strm, addr)
# define ZCOPY_DEFLATE_STATE(dst, src) memcpy(dst, src, sizeof(deflate_state))
/* Memory management for the window. Useful for allocation the aligned window. */
# define ZALLOC_WINDOW(strm, items, size) ZALLOC(strm, items, size)
# define TRY_FREE_WINDOW(strm, addr) TRY_FREE(strm, addr)
# define PAD_WINDOW PAD_64
# define WINDOW_PAD_SIZE 64
# define HINT_ALIGNED_WINDOW HINT_ALIGNED_64
/* Adjust the window size for the arch-specific deflate code. */
# define DEFLATE_ADJUST_WINDOW_SIZE(n) (n)
/* Invoked at the beginning of deflateSetDictionary(). Useful for checking arch-specific window data. */
# define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)
/* Invoked at the beginning of deflateGetDictionary(). Useful for adjusting arch-specific window data. */
@ -120,10 +122,6 @@ static void lm_set_level (deflate_state *s, int level);
static void lm_init (deflate_state *s);
Z_INTERNAL unsigned read_buf (PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
extern uint32_t update_hash_roll (deflate_state *const s, uint32_t h, uint32_t val);
extern void insert_string_roll (deflate_state *const s, uint32_t str, uint32_t count);
extern Pos quick_insert_string_roll(deflate_state *const s, uint32_t str);
/* ===========================================================================
* Local data
*/
@ -185,17 +183,111 @@ static const config configuration_table[10] = {
memset((unsigned char *)s->head, 0, HASH_SIZE * sizeof(*s->head)); \
} while (0)
/* ========================================================================= */
/* This function is hidden in ZLIB_COMPAT builds. */
#ifdef DEF_ALLOC_DEBUG
# include <stdio.h>
# define LOGSZ(name,size) fprintf(stderr, "%s is %d bytes\n", name, size)
# define LOGSZP(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %d, padded %d\n", name, size, loc, pad)
# define LOGSZPL(name,size,loc,pad) fprintf(stderr, "%s is %d bytes, offset %ld, padded %d\n", name, size, loc, pad)
#else
# define LOGSZ(name,size)
# define LOGSZP(name,size,loc,pad)
# define LOGSZPL(name,size,loc,pad)
#endif
/* ===========================================================================
* Allocate a big buffer and divide it up into the various buffers deflate needs.
* Handles alignment of allocated buffer and alignment of individual buffers.
*/
Z_INTERNAL deflate_allocs* alloc_deflate(PREFIX3(stream) *strm, int windowBits, int lit_bufsize) {
int curr_size = 0;
/* Define sizes */
int window_size = DEFLATE_ADJUST_WINDOW_SIZE((1 << windowBits) * 2);
int prev_size = (1 << windowBits) * sizeof(Pos);
int head_size = HASH_SIZE * sizeof(Pos);
int pending_size = lit_bufsize * LIT_BUFS;
int state_size = sizeof(deflate_state);
int alloc_size = sizeof(deflate_allocs);
/* Calculate relative buffer positions and paddings */
LOGSZP("window", window_size, PAD_WINDOW(curr_size), PADSZ(curr_size,WINDOW_PAD_SIZE));
int window_pos = PAD_WINDOW(curr_size);
curr_size = window_pos + window_size;
LOGSZP("prev", prev_size, PAD_64(curr_size), PADSZ(curr_size,64));
int prev_pos = PAD_64(curr_size);
curr_size = prev_pos + prev_size;
LOGSZP("head", head_size, PAD_64(curr_size), PADSZ(curr_size,64));
int head_pos = PAD_64(curr_size);
curr_size = head_pos + head_size;
LOGSZP("pending", pending_size, PAD_64(curr_size), PADSZ(curr_size,64));
int pending_pos = PAD_64(curr_size);
curr_size = pending_pos + pending_size;
LOGSZP("state", state_size, PAD_64(curr_size), PADSZ(curr_size,64));
int state_pos = PAD_64(curr_size);
curr_size = state_pos + state_size;
LOGSZP("alloc", alloc_size, PAD_16(curr_size), PADSZ(curr_size,16));
int alloc_pos = PAD_16(curr_size);
curr_size = alloc_pos + alloc_size;
/* Add 64-1 or 4096-1 to allow window alignment, and round size of buffer up to multiple of 64 */
int total_size = PAD_64(curr_size + (WINDOW_PAD_SIZE - 1));
/* Allocate buffer, align to 64-byte cacheline, and zerofill the resulting buffer */
char *original_buf = strm->zalloc(strm->opaque, 1, total_size);
if (original_buf == NULL)
return NULL;
char *buff = (char *)HINT_ALIGNED_WINDOW((char *)PAD_WINDOW(original_buf));
LOGSZPL("Buffer alloc", total_size, PADSZ((uintptr_t)original_buf,WINDOW_PAD_SIZE), PADSZ(curr_size,WINDOW_PAD_SIZE));
/* Initialize alloc_bufs */
deflate_allocs *alloc_bufs = (struct deflate_allocs_s *)(buff + alloc_pos);
alloc_bufs->buf_start = (char *)original_buf;
alloc_bufs->zfree = strm->zfree;
/* Assign buffers */
alloc_bufs->window = (unsigned char *)HINT_ALIGNED_WINDOW(buff + window_pos);
alloc_bufs->prev = (Pos *)HINT_ALIGNED_64(buff + prev_pos);
alloc_bufs->head = (Pos *)HINT_ALIGNED_64(buff + head_pos);
alloc_bufs->pending_buf = (unsigned char *)HINT_ALIGNED_64(buff + pending_pos);
alloc_bufs->state = (deflate_state *)HINT_ALIGNED_16(buff + state_pos);
memset((char *)alloc_bufs->prev, 0, prev_size);
return alloc_bufs;
}
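
The PAD_* and PADSZ helpers this layout arithmetic relies on are defined outside this hunk. As a rough illustration only (the real macros in deflate.h may be written differently, and operate on pointers via a uintptr_t cast as well as on sizes), they boil down to round-up-to-alignment:

/* Illustrative sketch, not the actual zlib-ng definitions. */
#define PADSZ(x, n) (((n) - ((x) % (n))) % (n))   /* bytes needed to reach the next multiple of n */
#define PAD_64(x)   ((x) + PADSZ((x), 64))        /* offset rounded up to a 64-byte boundary      */
#define PAD_16(x)   ((x) + PADSZ((x), 16))        /* offset rounded up to a 16-byte boundary      */

Folding every buffer into one allocation like this is what lets free_deflate() below release everything with a single zfree call; the window, the most alignment-sensitive buffer, is laid out first.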
/* ===========================================================================
* Free all allocated deflate buffers
*/
static inline void free_deflate(PREFIX3(stream) *strm) {
deflate_state *state = (deflate_state *)strm->state;
if (state->alloc_bufs != NULL) {
deflate_allocs *alloc_bufs = state->alloc_bufs;
alloc_bufs->zfree(strm->opaque, alloc_bufs->buf_start);
strm->state = NULL;
}
}
/* ===========================================================================
* Initialize deflate state and buffers.
* This function is hidden in ZLIB_COMPAT builds.
*/
int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level, int32_t method, int32_t windowBits,
int32_t memLevel, int32_t strategy) {
/* Todo: ignore strm->next_in if we use it as window */
uint32_t window_padding = 0;
deflate_state *s;
int wrap = 1;
/* Force initialization functable, because deflate captures function pointers from functable. */
functable.force_init();
/* Initialize functable */
FUNCTABLE_INIT;
if (strm == NULL)
return Z_STREAM_ERROR;
@ -230,9 +322,19 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level
if (windowBits == 8)
windowBits = 9; /* until 256-byte window bug fixed */
s = ZALLOC_DEFLATE_STATE(strm);
if (s == NULL)
/* Allocate buffers */
int lit_bufsize = 1 << (memLevel + 6);
deflate_allocs *alloc_bufs = alloc_deflate(strm, windowBits, lit_bufsize);
if (alloc_bufs == NULL)
return Z_MEM_ERROR;
s = alloc_bufs->state;
s->alloc_bufs = alloc_bufs;
s->window = alloc_bufs->window;
s->prev = alloc_bufs->prev;
s->head = alloc_bufs->head;
s->pending_buf = alloc_bufs->pending_buf;
strm->state = (struct internal_state *)s;
s->strm = strm;
s->status = INIT_STATE; /* to pass state test in deflateReset() */
@ -243,18 +345,9 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level
s->w_size = 1 << s->w_bits;
s->w_mask = s->w_size - 1;
#ifdef X86_PCLMULQDQ_CRC
window_padding = 8;
#endif
s->window = (unsigned char *) ZALLOC_WINDOW(strm, s->w_size + window_padding, 2*sizeof(unsigned char));
s->prev = (Pos *) ZALLOC(strm, s->w_size, sizeof(Pos));
memset(s->prev, 0, s->w_size * sizeof(Pos));
s->head = (Pos *) ZALLOC(strm, HASH_SIZE, sizeof(Pos));
s->high_water = 0; /* nothing written to s->window yet */
s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */
s->lit_bufsize = lit_bufsize; /* 16K elements by default */
/* We overlay pending_buf and sym_buf. This works since the average size
* for length/distance pairs over any compressed block is assured to be 31
@ -295,7 +388,6 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level
* symbols from which it is being constructed.
*/
s->pending_buf = (unsigned char *) ZALLOC(strm, s->lit_bufsize, 4);
s->pending_buf_size = s->lit_bufsize * 4;
if (s->window == NULL || s->prev == NULL || s->head == NULL || s->pending_buf == NULL) {
@ -304,8 +396,15 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level
PREFIX(deflateEnd)(strm);
return Z_MEM_ERROR;
}
#ifdef LIT_MEM
s->d_buf = (uint16_t *)(s->pending_buf + (s->lit_bufsize << 1));
s->l_buf = s->pending_buf + (s->lit_bufsize << 2);
s->sym_end = s->lit_bufsize - 1;
#else
s->sym_buf = s->pending_buf + s->lit_bufsize;
s->sym_end = (s->lit_bufsize - 1) * 3;
#endif
/* We avoid equality with lit_bufsize*3 because of wraparound at 64K
* on 16 bit machines and because stored blocks are restricted to
* 64K-1 bytes.
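
A quick worked example makes the off-by-one concrete (assuming the default memLevel of 8 and the non-LIT_MEM layout):

/* memLevel == 8 (default):
 *   lit_bufsize = 1 << (8 + 6)    = 16384 symbols ("16K elements")
 *   sym_end     = (16384 - 1) * 3 = 49149 bytes
 * The symbol area could hold 3 * 16384 = 49152 bytes, so the block is
 * flushed one symbol early: sym_next never reaches lit_bufsize * 3,
 * which is exactly the equality the comment above is avoiding. */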
@ -348,7 +447,7 @@ static int deflateStateCheck(PREFIX3(stream) *strm) {
if (strm == NULL || strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0)
return 1;
s = strm->state;
if (s == NULL || s->strm != strm || (s->status < INIT_STATE || s->status > MAX_STATE))
if (s == NULL || s->alloc_bufs == NULL || s->strm != strm || (s->status < INIT_STATE || s->status > MAX_STATE))
return 1;
return 0;
}
@ -370,7 +469,7 @@ int32_t Z_EXPORT PREFIX(deflateSetDictionary)(PREFIX3(stream) *strm, const uint8
/* when using zlib wrappers, compute Adler-32 for provided dictionary */
if (wrap == 1)
strm->adler = functable.adler32(strm->adler, dictionary, dictLength);
strm->adler = FUNCTABLE_CALL(adler32)(strm->adler, dictionary, dictLength);
DEFLATE_SET_DICTIONARY_HOOK(strm, dictionary, dictLength); /* hook for IBM Z DFLTCC */
s->wrap = 0; /* avoid computing Adler-32 in read_buf */
@ -457,7 +556,7 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) {
#ifdef GZIP
if (s->wrap == 2) {
strm->adler = functable.crc32_fold_reset(&s->crc_fold);
strm->adler = FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold);
} else
#endif
strm->adler = ADLER32_INITIAL_VALUE;
@ -506,9 +605,17 @@ int32_t Z_EXPORT PREFIX(deflatePrime)(PREFIX3(stream) *strm, int32_t bits, int32
if (deflateStateCheck(strm))
return Z_STREAM_ERROR;
s = strm->state;
#ifdef LIT_MEM
if (bits < 0 || bits > BIT_BUF_SIZE ||
(unsigned char *)s->d_buf < s->pending_out + ((BIT_BUF_SIZE + 7) >> 3))
return Z_BUF_ERROR;
#else
if (bits < 0 || bits > BIT_BUF_SIZE || bits > (int32_t)(sizeof(value) << 3) ||
s->sym_buf < s->pending_out + ((BIT_BUF_SIZE + 7) >> 3))
return Z_BUF_ERROR;
#endif
do {
put = BIT_BUF_SIZE - s->bi_valid;
put = MIN(put, bits);
@ -555,7 +662,7 @@ int32_t Z_EXPORT PREFIX(deflateParams)(PREFIX3(stream) *strm, int32_t level, int
if (s->level != level) {
if (s->level == 0 && s->matches != 0) {
if (s->matches == 1) {
functable.slide_hash(s);
FUNCTABLE_CALL(slide_hash)(s);
} else {
CLEAR_HASH(s);
}
@ -794,7 +901,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
#ifdef GZIP
if (s->status == GZIP_STATE) {
/* gzip header */
functable.crc32_fold_reset(&s->crc_fold);
FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold);
put_byte(s, 31);
put_byte(s, 139);
put_byte(s, 8);
@ -911,7 +1018,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
}
}
put_short(s, (uint16_t)strm->adler);
functable.crc32_fold_reset(&s->crc_fold);
FUNCTABLE_CALL(crc32_fold_reset)(&s->crc_fold);
}
s->status = BUSY_STATE;
@ -982,7 +1089,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
/* Write the trailer */
#ifdef GZIP
if (s->wrap == 2) {
strm->adler = functable.crc32_fold_final(&s->crc_fold);
strm->adler = FUNCTABLE_CALL(crc32_fold_final)(&s->crc_fold);
put_uint32(s, strm->adler);
put_uint32(s, (uint32_t)strm->total_in);
@ -1007,21 +1114,13 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
/* ========================================================================= */
int32_t Z_EXPORT PREFIX(deflateEnd)(PREFIX3(stream) *strm) {
int32_t status;
if (deflateStateCheck(strm))
return Z_STREAM_ERROR;
status = strm->state->status;
int32_t status = strm->state->status;
/* Deallocate in reverse order of allocations: */
TRY_FREE(strm, strm->state->pending_buf);
TRY_FREE(strm, strm->state->head);
TRY_FREE(strm, strm->state->prev);
TRY_FREE_WINDOW(strm, strm->state->window);
ZFREE_STATE(strm, strm->state);
strm->state = NULL;
/* Free allocated buffers */
free_deflate(strm);
return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK;
}
@ -1032,7 +1131,6 @@ int32_t Z_EXPORT PREFIX(deflateEnd)(PREFIX3(stream) *strm) {
int32_t Z_EXPORT PREFIX(deflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *source) {
deflate_state *ds;
deflate_state *ss;
uint32_t window_padding = 0;
if (deflateStateCheck(source) || dest == NULL)
return Z_STREAM_ERROR;
@ -1041,34 +1139,39 @@ int32_t Z_EXPORT PREFIX(deflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *sou
memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream)));
ds = ZALLOC_DEFLATE_STATE(dest);
if (ds == NULL)
deflate_allocs *alloc_bufs = alloc_deflate(dest, ss->w_bits, ss->lit_bufsize);
if (alloc_bufs == NULL)
return Z_MEM_ERROR;
ds = alloc_bufs->state;
dest->state = (struct internal_state *) ds;
ZCOPY_DEFLATE_STATE(ds, ss);
memcpy(ds, ss, sizeof(deflate_state));
ds->strm = dest;
#ifdef X86_PCLMULQDQ_CRC
window_padding = 8;
#endif
ds->window = (unsigned char *) ZALLOC_WINDOW(dest, ds->w_size + window_padding, 2*sizeof(unsigned char));
ds->prev = (Pos *) ZALLOC(dest, ds->w_size, sizeof(Pos));
ds->head = (Pos *) ZALLOC(dest, HASH_SIZE, sizeof(Pos));
ds->pending_buf = (unsigned char *) ZALLOC(dest, ds->lit_bufsize, 4);
ds->alloc_bufs = alloc_bufs;
ds->window = alloc_bufs->window;
ds->prev = alloc_bufs->prev;
ds->head = alloc_bufs->head;
ds->pending_buf = alloc_bufs->pending_buf;
if (ds->window == NULL || ds->prev == NULL || ds->head == NULL || ds->pending_buf == NULL) {
PREFIX(deflateEnd)(dest);
return Z_MEM_ERROR;
}
memcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(unsigned char));
memcpy(ds->window, ss->window, DEFLATE_ADJUST_WINDOW_SIZE(ds->w_size * 2 * sizeof(unsigned char)));
memcpy((void *)ds->prev, (void *)ss->prev, ds->w_size * sizeof(Pos));
memcpy((void *)ds->head, (void *)ss->head, HASH_SIZE * sizeof(Pos));
memcpy(ds->pending_buf, ss->pending_buf, ds->pending_buf_size);
memcpy(ds->pending_buf, ss->pending_buf, ds->lit_bufsize * LIT_BUFS);
ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
#ifdef LIT_MEM
ds->d_buf = (uint16_t *)(ds->pending_buf + (ds->lit_bufsize << 1));
ds->l_buf = ds->pending_buf + (ds->lit_bufsize << 2);
#else
ds->sym_buf = ds->pending_buf + ds->lit_bufsize;
#endif
ds->l_desc.dyn_tree = ds->dyn_ltree;
ds->d_desc.dyn_tree = ds->dyn_dtree;
@ -1095,10 +1198,10 @@ Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf,
memcpy(buf, strm->next_in, len);
#ifdef GZIP
} else if (strm->state->wrap == 2) {
functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len);
FUNCTABLE_CALL(crc32_fold_copy)(&strm->state->crc_fold, buf, strm->next_in, len);
#endif
} else if (strm->state->wrap == 1) {
strm->adler = functable.adler32_fold_copy(strm->adler, buf, strm->next_in, len);
strm->adler = FUNCTABLE_CALL(adler32_fold_copy)(strm->adler, buf, strm->next_in, len);
} else {
memcpy(buf, strm->next_in, len);
}
@ -1125,9 +1228,9 @@ static void lm_set_level(deflate_state *s, int level) {
s->insert_string = &insert_string_roll;
s->quick_insert_string = &quick_insert_string_roll;
} else {
s->update_hash = functable.update_hash;
s->insert_string = functable.insert_string;
s->quick_insert_string = functable.quick_insert_string;
s->update_hash = update_hash;
s->insert_string = insert_string;
s->quick_insert_string = quick_insert_string;
}
s->level = level;
@ -1191,7 +1294,7 @@ void Z_INTERNAL PREFIX(fill_window)(deflate_state *s) {
s->block_start -= (int)wsize;
if (s->insert > s->strstart)
s->insert = s->strstart;
functable.slide_hash(s);
FUNCTABLE_CALL(slide_hash)(s);
more += wsize;
}
if (s->strm->avail_in == 0)
@ -1217,7 +1320,7 @@ void Z_INTERNAL PREFIX(fill_window)(deflate_state *s) {
if (s->lookahead + s->insert >= STD_MIN_MATCH) {
unsigned int str = s->strstart - s->insert;
if (UNLIKELY(s->max_chain_length > 1024)) {
s->ins_h = s->update_hash(s, s->window[str], s->window[str+1]);
s->ins_h = s->update_hash(s->window[str], s->window[str+1]);
} else if (str >= 1) {
s->quick_insert_string(s, str + 2 - STD_MIN_MATCH);
}

View File

@ -12,8 +12,12 @@
#include "zutil.h"
#include "zendian.h"
#include "adler32_fold.h"
#include "crc32_fold.h"
#include "crc32.h"
#ifdef S390_DFLTCC_DEFLATE
# include "arch/s390/dfltcc_common.h"
# define HAVE_ARCH_DEFLATE_STATE
#endif
/* define NO_GZIP when compiling if you want to disable gzip header and
trailer creation by deflate(). NO_GZIP would be used to avoid linking in
@ -23,6 +27,12 @@
# define GZIP
#endif
/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at
the cost of a larger memory footprint */
#ifndef NO_LIT_MEM
# define LIT_MEM
#endif
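
The layout difference behind this speed/memory trade-off can be read off the deflateInit2 hunk earlier in this diff; sketched side by side:

/* Without LIT_MEM (LIT_BUFS == 4): one interleaved byte stream, 3 bytes per symbol:
 *   sym_buf   = pending_buf + lit_bufsize;
 *   sym_buf[] = dist_lo, dist_hi, len, dist_lo, dist_hi, len, ...
 *
 * With LIT_MEM (LIT_BUFS == 5): two parallel arrays, so each distance is
 * written with a single aligned 16-bit store:
 *   d_buf = (uint16_t *)(pending_buf + 2 * lit_bufsize);   // distances
 *   l_buf = pending_buf + 4 * lit_bufsize;                 // literals/lengths
 */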
/* ===========================================================================
* Internal compression state.
*/
@ -108,11 +118,30 @@ typedef uint16_t Pos;
/* Type definitions for hash callbacks */
typedef struct internal_state deflate_state;
typedef uint32_t (* update_hash_cb) (deflate_state *const s, uint32_t h, uint32_t val);
typedef uint32_t (* update_hash_cb) (uint32_t h, uint32_t val);
typedef void (* insert_string_cb) (deflate_state *const s, uint32_t str, uint32_t count);
typedef Pos (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);
struct internal_state {
uint32_t update_hash (uint32_t h, uint32_t val);
void insert_string (deflate_state *const s, uint32_t str, uint32_t count);
Pos quick_insert_string (deflate_state *const s, uint32_t str);
uint32_t update_hash_roll (uint32_t h, uint32_t val);
void insert_string_roll (deflate_state *const s, uint32_t str, uint32_t count);
Pos quick_insert_string_roll(deflate_state *const s, uint32_t str);
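
Note the signature change running through these declarations: update_hash no longer takes the deflate_state pointer, matching the move away from functable dispatch for the hash functions elsewhere in this diff. Purely to illustrate the new state-free shape (a hypothetical example, not zlib-ng's actual hash):

#include <stdint.h>

/* Hypothetical rolling hash matching the new update_hash_cb signature. */
#define EX_HASH_BITS 15
#define EX_HASH_MASK ((1u << EX_HASH_BITS) - 1)

static uint32_t update_hash_example(uint32_t h, uint32_t val) {
    return ((h << 5) ^ val) & EX_HASH_MASK;   /* no deflate_state needed */
}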
/* Struct for memory allocation handling */
typedef struct deflate_allocs_s {
char *buf_start;
free_func zfree;
deflate_state *state;
unsigned char *window;
unsigned char *pending_buf;
Pos *prev;
Pos *head;
} deflate_allocs;
struct ALIGNED_(64) internal_state {
PREFIX3(stream) *strm; /* pointer back to this zlib stream */
unsigned char *pending_buf; /* output still pending */
unsigned char *pending_out; /* next pending byte to output to the stream */
@ -260,8 +289,16 @@ struct internal_state {
* - I can't count above 4
*/
#ifdef LIT_MEM
# define LIT_BUFS 5
uint16_t *d_buf; /* buffer for distances */
unsigned char *l_buf; /* buffer for literals/lengths */
#else
# define LIT_BUFS 4
unsigned char *sym_buf; /* buffer for distances and literals/lengths */
unsigned int sym_next; /* running index in sym_buf */
#endif
unsigned int sym_next; /* running index in symbol buffer */
unsigned int sym_end; /* symbol table full when sym_next reaches this */
unsigned long opt_len; /* bit length of current block with optimal trees */
@ -273,8 +310,11 @@ struct internal_state {
unsigned long compressed_len; /* total bit length of compressed file mod 2^32 */
unsigned long bits_sent; /* bit length of compressed data sent mod 2^32 */
/* Reserved for future use and alignment purposes */
char *reserved_p;
deflate_allocs *alloc_bufs;
#ifdef HAVE_ARCH_DEFLATE_STATE
arch_deflate_state arch; /* architecture-specific extensions */
#endif
uint64_t bi_buf;
/* Output buffer. bits are inserted starting at the bottom (least significant bits). */
@ -284,7 +324,7 @@ struct internal_state {
/* Reserved for future use and alignment purposes */
int32_t reserved[11];
} ALIGNED_(8);
};
typedef enum {
need_more, /* block not completed, need more input or more output */

View File

@ -1,6 +1,6 @@
/* deflate_fast.c -- compress data using the fast strategy of deflation algorithm
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
@ -41,7 +41,7 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
* dictionary, and set hash_head to the head of the hash chain:
*/
if (s->lookahead >= WANT_MIN_MATCH) {
hash_head = functable.quick_insert_string(s, s->strstart);
hash_head = quick_insert_string(s, s->strstart);
dist = (int64_t)s->strstart - hash_head;
/* Find the longest match, discarding those <= prev_length.
@ -52,7 +52,7 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
match_len = functable.longest_match(s, hash_head);
match_len = FUNCTABLE_CALL(longest_match)(s, hash_head);
/* longest_match() sets match_start */
}
}
@ -71,11 +71,11 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
match_len--; /* string at strstart already in table */
s->strstart++;
functable.insert_string(s, s->strstart, match_len);
insert_string(s, s->strstart, match_len);
s->strstart += match_len;
} else {
s->strstart += match_len;
functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);
quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);
/* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not
* matter since it will be recomputed at next deflate call.

View File

@ -1,6 +1,6 @@
/* deflate_huff.c -- compress data using huffman encoding only strategy
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/

View File

@ -45,16 +45,18 @@ static void insert_match(deflate_state *s, struct match match) {
if (UNLIKELY(s->lookahead <= (unsigned int)(match.match_length + WANT_MIN_MATCH)))
return;
/* matches that are not long enough we need to emit as literals */
if (LIKELY(match.match_length < WANT_MIN_MATCH)) {
/* string at strstart already in table */
match.strstart++;
match.match_length--;
/* matches that are not long enough we need to emit as literals */
if (LIKELY(match.match_length < WANT_MIN_MATCH - 1)) {
if (UNLIKELY(match.match_length > 0)) {
if (match.strstart >= match.orgstart) {
if (match.strstart + match.match_length - 1 >= match.orgstart) {
functable.insert_string(s, match.strstart, match.match_length);
insert_string(s, match.strstart, match.match_length);
} else {
functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
}
match.strstart += match.match_length;
match.match_length = 0;
@ -63,35 +65,18 @@ static void insert_match(deflate_state *s, struct match match) {
return;
}
/* Insert new strings in the hash table only if the match length
* is not too large. This saves time but degrades compression.
*/
if (match.match_length <= 16 * s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
match.match_length--; /* string at strstart already in table */
match.strstart++;
/* Insert into hash table. */
if (LIKELY(match.strstart >= match.orgstart)) {
if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) {
functable.insert_string(s, match.strstart, match.match_length);
insert_string(s, match.strstart, match.match_length);
} else {
functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
}
} else if (match.orgstart < match.strstart + match.match_length) {
functable.insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
}
match.strstart += match.match_length;
match.match_length = 0;
} else {
match.strstart += match.match_length;
match.match_length = 0;
if (match.strstart >= (STD_MIN_MATCH - 2))
functable.quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH);
/* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not
* matter since it will be recomputed at next deflate call.
*/
}
}
static void fizzle_matches(deflate_state *s, struct match *current, struct match *next) {
@ -199,7 +184,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
} else {
hash_head = 0;
if (s->lookahead >= WANT_MIN_MATCH) {
hash_head = functable.quick_insert_string(s, s->strstart);
hash_head = quick_insert_string(s, s->strstart);
}
current_match.strstart = (uint16_t)s->strstart;
@ -215,7 +200,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
current_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
current_match.match_length = (uint16_t)FUNCTABLE_CALL(longest_match)(s, hash_head);
current_match.match_start = (uint16_t)s->match_start;
if (UNLIKELY(current_match.match_length < WANT_MIN_MATCH))
current_match.match_length = 1;
@ -235,7 +220,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
/* now, look ahead one */
if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) {
s->strstart = current_match.strstart + current_match.match_length;
hash_head = functable.quick_insert_string(s, s->strstart);
hash_head = quick_insert_string(s, s->strstart);
next_match.strstart = (uint16_t)s->strstart;
next_match.orgstart = next_match.strstart;
@ -250,7 +235,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
next_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
next_match.match_length = (uint16_t)FUNCTABLE_CALL(longest_match)(s, hash_head);
next_match.match_start = (uint16_t)s->match_start;
if (UNLIKELY(next_match.match_start >= next_match.strstart)) {
/* this can happen due to some restarts */

View File

@ -1,7 +1,7 @@
/* deflate_p.h -- Private inline functions and macros shared with more than
* one deflate method
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
@ -60,9 +60,14 @@ extern const unsigned char Z_INTERNAL zng_dist_code[];
static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
/* c is the unmatched char */
#ifdef LIT_MEM
s->d_buf[s->sym_next] = 0;
s->l_buf[s->sym_next++] = c;
#else
s->sym_buf[s->sym_next++] = 0;
s->sym_buf[s->sym_next++] = 0;
s->sym_buf[s->sym_next++] = c;
#endif
s->dyn_ltree[c].Freq++;
Tracevv((stderr, "%c", c));
Assert(c <= (STD_MAX_MATCH-STD_MIN_MATCH), "zng_tr_tally: bad literal");
@ -72,9 +77,14 @@ static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
static inline int zng_tr_tally_dist(deflate_state* s, uint32_t dist, uint32_t len) {
/* dist: distance of matched string */
/* len: match length-STD_MIN_MATCH */
#ifdef LIT_MEM
s->d_buf[s->sym_next] = dist;
s->l_buf[s->sym_next++] = len;
#else
s->sym_buf[s->sym_next++] = (uint8_t)(dist);
s->sym_buf[s->sym_next++] = (uint8_t)(dist >> 8);
s->sym_buf[s->sym_next++] = (uint8_t)len;
#endif
s->matches++;
dist--;
Assert(dist < MAX_DIST(s) && (uint16_t)d_code(dist) < (uint16_t)D_CODES,

View File

@ -86,7 +86,7 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
}
if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
hash_head = functable.quick_insert_string(s, s->strstart);
hash_head = quick_insert_string(s, s->strstart);
dist = (int64_t)s->strstart - hash_head;
if (dist <= MAX_DIST(s) && dist > 0) {
@ -94,7 +94,7 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
const uint8_t *match_start = s->window + hash_head;
if (zng_memcmp_2(str_start, match_start) == 0) {
match_len = functable.compare256(str_start+2, match_start+2) + 2;
match_len = FUNCTABLE_CALL(compare256)(str_start+2, match_start+2) + 2;
if (match_len >= WANT_MIN_MATCH) {
if (UNLIKELY(match_len > s->lookahead))

View File

@ -1,6 +1,6 @@
/* deflate_rle.c -- compress data using RLE strategy of deflation algorithm
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/

View File

@ -1,6 +1,6 @@
/* deflate_slow.c -- compress data using the slow strategy of deflation algorithm
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
@ -19,12 +19,12 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
int bflush; /* set if current block must be flushed */
int64_t dist;
uint32_t match_len;
match_func *longest_match;
match_func longest_match;
if (s->max_chain_length <= 1024)
longest_match = &functable.longest_match;
longest_match = FUNCTABLE_FPTR(longest_match);
else
longest_match = &functable.longest_match_slow;
longest_match = FUNCTABLE_FPTR(longest_match_slow);
/* Process the input block. */
for (;;) {
@ -61,7 +61,7 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
match_len = (*longest_match)(s, hash_head);
match_len = longest_match(s, hash_head);
/* longest_match() sets match_start */
if (match_len <= 5 && (s->strategy == Z_FILTERED)) {
@ -129,7 +129,7 @@ Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
}
Assert(flush != Z_NO_FLUSH, "no flush?");
if (UNLIKELY(s->match_available)) {
(void) zng_tr_tally_lit(s, s->window[s->strstart-1]);
Z_UNUSED(zng_tr_tally_lit(s, s->window[s->strstart-1]));
s->match_available = 0;
}
s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
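
One detail worth calling out in this hunk: the old code held a pointer to the functable slot (match_func *) and dereferenced it on every call, while the new code copies the function pointer itself. FUNCTABLE_FPTR (defined in functable.h later in this diff) makes that snapshot valid in both build modes, since &functable.longest_match does not exist when runtime CPU detection is compiled out. Side by side, using identifiers from the diff (a fragment, not standalone code):

/* before */ match_func *longest_match = &functable.longest_match;
             match_len = (*longest_match)(s, hash_head);
/* after  */ match_func longest_match = FUNCTABLE_FPTR(longest_match);
             match_len = longest_match(s, hash_head);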

View File

@ -1,6 +1,6 @@
/* deflate_stored.c -- store data without compression using deflation algorithm
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
@ -22,7 +22,7 @@
*
* deflate_stored() is written to minimize the number of times an input byte is
* copied. It is most efficient with large input and output buffers, which
* maximizes the opportunites to have a single copy from next_in to next_out.
* maximizes the opportunities to have a single copy from next_in to next_out.
*/
Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush) {
/* Smallest worthy block size when not flushing or finishing. By default

View File

@ -5,9 +5,6 @@
#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
#include <intrin.h>
#ifdef X86_FEATURES
# include "arch/x86/x86_features.h"
#endif
/* This is not a general-purpose replacement for __builtin_ctz. The function expects that value != 0;
 * because of that assumption, trailing_zero is not initialized and the return value is not checked.

View File

@ -2,14 +2,12 @@
* Copyright (C) 2017 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef DISABLE_RUNTIME_CPU_DETECTION
#include "zbuild.h"
#include "zendian.h"
#include "crc32_braid_p.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "cpu_features.h"
#include "arch_functions.h"
#if defined(_MSC_VER)
# include <intrin.h>
@ -61,31 +59,10 @@ static void init_functable(void) {
ft.crc32_fold_final = &crc32_fold_final_c;
ft.crc32_fold_reset = &crc32_fold_reset_c;
ft.inflate_fast = &inflate_fast_c;
ft.insert_string = &insert_string_c;
ft.quick_insert_string = &quick_insert_string_c;
ft.slide_hash = &slide_hash_c;
ft.update_hash = &update_hash_c;
#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
ft.longest_match = &longest_match_unaligned_64;
ft.longest_match_slow = &longest_match_slow_unaligned_64;
ft.compare256 = &compare256_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
ft.longest_match = &longest_match_unaligned_32;
ft.longest_match_slow = &longest_match_slow_unaligned_32;
ft.compare256 = &compare256_unaligned_32;
# else
ft.longest_match = &longest_match_unaligned_16;
ft.longest_match_slow = &longest_match_slow_unaligned_16;
ft.compare256 = &compare256_unaligned_16;
# endif
#else
ft.longest_match = &longest_match_c;
ft.longest_match_slow = &longest_match_slow_c;
ft.compare256 = &compare256_c;
#endif
ft.longest_match = &longest_match_generic;
ft.longest_match_slow = &longest_match_slow_generic;
ft.compare256 = &compare256_generic;
// Select arch-optimized functions
@ -110,19 +87,14 @@ static void init_functable(void) {
#ifdef X86_SSSE3
if (cf.x86.has_ssse3) {
ft.adler32 = &adler32_ssse3;
# ifdef X86_SSE2
ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
ft.inflate_fast = &inflate_fast_ssse3;
# endif
}
#endif
// X86 - SSE4.2
#ifdef X86_SSE42
if (cf.x86.has_sse42) {
ft.adler32_fold_copy = &adler32_fold_copy_sse42;
ft.insert_string = &insert_string_sse42;
ft.quick_insert_string = &quick_insert_string_sse42;
ft.update_hash = &update_hash_sse42;
}
#endif
// X86 - PCLMUL
@ -151,8 +123,9 @@ static void init_functable(void) {
# endif
}
#endif
// X86 - AVX512 (F,DQ,BW,VL)
#ifdef X86_AVX512
if (cf.x86.has_avx512) {
if (cf.x86.has_avx512_common) {
ft.adler32 = &adler32_avx512;
ft.adler32_fold_copy = &adler32_fold_copy_avx512;
}
@ -164,8 +137,8 @@ static void init_functable(void) {
}
#endif
// X86 - VPCLMULQDQ
#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) {
#ifdef X86_VPCLMULQDQ_CRC
if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
ft.crc32 = &crc32_vpclmulqdq;
ft.crc32_fold = &crc32_fold_vpclmulqdq;
ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
@ -206,9 +179,6 @@ static void init_functable(void) {
#ifdef ARM_ACLE
if (cf.arm.has_crc32) {
ft.crc32 = &crc32_acle;
ft.insert_string = &insert_string_acle;
ft.quick_insert_string = &quick_insert_string_acle;
ft.update_hash = &update_hash_acle;
}
#endif
@ -279,12 +249,9 @@ static void init_functable(void) {
FUNCTABLE_ASSIGN(ft, crc32_fold_final);
FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
FUNCTABLE_ASSIGN(ft, inflate_fast);
FUNCTABLE_ASSIGN(ft, insert_string);
FUNCTABLE_ASSIGN(ft, longest_match);
FUNCTABLE_ASSIGN(ft, longest_match_slow);
FUNCTABLE_ASSIGN(ft, quick_insert_string);
FUNCTABLE_ASSIGN(ft, slide_hash);
FUNCTABLE_ASSIGN(ft, update_hash);
// Memory barrier for weak memory order CPUs
FUNCTABLE_BARRIER();
@ -350,11 +317,6 @@ static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
functable.inflate_fast(strm, start);
}
static void insert_string_stub(deflate_state* const s, uint32_t str, uint32_t count) {
init_functable();
functable.insert_string(s, str, count);
}
static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
init_functable();
return functable.longest_match(s, cur_match);
@ -365,21 +327,11 @@ static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
return functable.longest_match_slow(s, cur_match);
}
static Pos quick_insert_string_stub(deflate_state* const s, const uint32_t str) {
init_functable();
return functable.quick_insert_string(s, str);
}
static void slide_hash_stub(deflate_state* s) {
init_functable();
functable.slide_hash(s);
}
static uint32_t update_hash_stub(deflate_state* const s, uint32_t h, uint32_t val) {
init_functable();
return functable.update_hash(s, h, val);
}
/* functable init */
Z_INTERNAL struct functable_s functable = {
force_init_stub,
@ -394,10 +346,9 @@ Z_INTERNAL struct functable_s functable = {
crc32_fold_final_stub,
crc32_fold_reset_stub,
inflate_fast_stub,
insert_string_stub,
longest_match_stub,
longest_match_slow_stub,
quick_insert_string_stub,
slide_hash_stub,
update_hash_stub
};
#endif
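
The table above ships pre-filled with *_stub entries; the first call through any slot runs init_functable(), which overwrites every slot with the best variant for the detected CPU and then forwards the call, so later calls pay only the indirect-call cost. A self-contained miniature of the pattern, with hypothetical names:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*checksum_fn)(uint32_t sum, const uint8_t *buf, size_t len);

/* Stand-in for the portable C implementation. */
static uint32_t checksum_generic(uint32_t sum, const uint8_t *buf, size_t len) {
    while (len--)
        sum += *buf++;
    return sum;
}

static uint32_t checksum_stub(uint32_t sum, const uint8_t *buf, size_t len);

static struct { checksum_fn checksum; } table = { checksum_stub };

static void init_table(void) {
    /* Real code would consult CPU feature flags here and would issue a
     * memory barrier (FUNCTABLE_BARRIER) for weakly ordered CPUs. */
    table.checksum = checksum_generic;
}

static uint32_t checksum_stub(uint32_t sum, const uint8_t *buf, size_t len) {
    init_table();                          /* one-time selection on first use */
    return table.checksum(sum, buf, len);  /* forward to the chosen function  */
}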

View File

@ -7,14 +7,21 @@
#define FUNCTABLE_H_
#include "deflate.h"
#include "crc32_fold.h"
#include "adler32_fold.h"
#include "crc32.h"
#ifdef DISABLE_RUNTIME_CPU_DETECTION
# include "arch_functions.h"
/* When compiling with native instructions it is not necessary to use the functable.
 * Instead, the native_ macros select the best available variant of the arch-specific
 * functions for the current platform.
 */
# define FUNCTABLE_INIT ((void)0)
# define FUNCTABLE_CALL(name) native_ ## name
# define FUNCTABLE_FPTR(name) &native_ ## name
#ifdef ZLIB_COMPAT
typedef struct z_stream_s z_stream;
#else
typedef struct zng_stream_s zng_stream;
#endif
struct functable_s {
void (* force_init) (void);
@ -29,14 +36,20 @@ struct functable_s {
uint32_t (* crc32_fold_final) (struct crc32_fold_s *crc);
uint32_t (* crc32_fold_reset) (struct crc32_fold_s *crc);
void (* inflate_fast) (PREFIX3(stream) *strm, uint32_t start);
void (* insert_string) (deflate_state *const s, uint32_t str, uint32_t count);
uint32_t (* longest_match) (deflate_state *const s, Pos cur_match);
uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match);
Pos (* quick_insert_string)(deflate_state *const s, uint32_t str);
void (* slide_hash) (deflate_state *s);
uint32_t (* update_hash) (deflate_state *const s, uint32_t h, uint32_t val);
};
Z_INTERNAL extern struct functable_s functable;
/* Explicitly indicate functions are conditionally dispatched.
*/
# define FUNCTABLE_INIT functable.force_init()
# define FUNCTABLE_CALL(name) functable.name
# define FUNCTABLE_FPTR(name) functable.name
#endif
#endif
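
Taken together, the two halves of this header let a single call site cover both build modes. For example, the compare256 call in deflate_quick.c shown earlier in this diff:

match_len = FUNCTABLE_CALL(compare256)(str_start + 2, match_start + 2) + 2;
/* -> native_compare256(...)    when DISABLE_RUNTIME_CPU_DETECTION is set */
/* -> functable.compare256(...) otherwise                                 */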

Some files were not shown because too many files have changed in this diff