opencv/3rdparty/zlib-ng/inflate_p.h
Letu Ren 0de26fd78e Add zlib-ng as an alternative zlib implementation
Zlib-ng is zlib replacement with optimizations for "next generation" systems. Its optimization may benifits image library decode and encode speed such as libpng. In our tests, if using zlib-ng and libpng combination on a x86_64 machine with AVX2, the time of `imdecode` amd `imencode` will drop 20% approximately. This patch enables zlib-ng's optimization if `CV_DISABLE_OPTIMIZATION` is OFF. Since Zlib-ng can dispatch intrinsics on the fly, port work is much easier.

Related discussion: https://github.com/opencv/opencv/issues/22573
2024-01-14 14:58:47 +08:00

231 lines
7.5 KiB
C

/* inflate_p.h -- Private inline functions and macros shared with more than one deflate method
*
*/
#ifndef INFLATE_P_H
#define INFLATE_P_H
#include <stdlib.h>
/* Architecture-specific hooks. */
#ifdef S390_DFLTCC_INFLATE
# include "arch/s390/dfltcc_inflate.h"
#else
/* Memory management for the inflate state. Useful for allocating arch-specific extension blocks. */
# define ZALLOC_INFLATE_STATE(strm) ((struct inflate_state *)ZALLOC(strm, 1, sizeof(struct inflate_state)))
# define ZFREE_STATE(strm, addr) ZFREE(strm, addr)
# define ZCOPY_INFLATE_STATE(dst, src) memcpy(dst, src, sizeof(struct inflate_state))
/* Memory management for the window. Useful for allocation the aligned window. */
# define ZALLOC_WINDOW(strm, items, size) ZALLOC(strm, items, size)
# define ZCOPY_WINDOW(dest, src, n) memcpy(dest, src, n)
# define ZFREE_WINDOW(strm, addr) ZFREE(strm, addr)
/* Invoked at the end of inflateResetKeep(). Useful for initializing arch-specific extension blocks. */
# define INFLATE_RESET_KEEP_HOOK(strm) do {} while (0)
/* Invoked at the beginning of inflatePrime(). Useful for updating arch-specific buffers. */
# define INFLATE_PRIME_HOOK(strm, bits, value) do {} while (0)
/* Invoked at the beginning of each block. Useful for plugging arch-specific inflation code. */
# define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0)
/* Returns whether zlib-ng should compute a checksum. Set to 0 if arch-specific inflation code already does that. */
# define INFLATE_NEED_CHECKSUM(strm) 1
/* Returns whether zlib-ng should update a window. Set to 0 if arch-specific inflation code already does that. */
# define INFLATE_NEED_UPDATEWINDOW(strm) 1
/* Invoked at the beginning of inflateMark(). Useful for updating arch-specific pointers and offsets. */
# define INFLATE_MARK_HOOK(strm) do {} while (0)
/* Invoked at the beginning of inflateSyncPoint(). Useful for performing arch-specific state checks. */
# define INFLATE_SYNC_POINT_HOOK(strm) do {} while (0)
/* Invoked at the beginning of inflateSetDictionary(). Useful for checking arch-specific window data. */
# define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)
/* Invoked at the beginning of inflateGetDictionary(). Useful for adjusting arch-specific window data. */
# define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)
#endif
/*
* Macros shared by inflate() and inflateBack()
*/
/* check function to use adler32() for zlib or crc32() for gzip */
#ifdef GUNZIP
# define UPDATE(check, buf, len) \
(state->flags ? PREFIX(crc32)(check, buf, len) : functable.adler32(check, buf, len))
#else
# define UPDATE(check, buf, len) functable.adler32(check, buf, len)
#endif
/* check macros for header crc */
#ifdef GUNZIP
# define CRC2(check, word) \
do { \
hbuf[0] = (unsigned char)(word); \
hbuf[1] = (unsigned char)((word) >> 8); \
check = PREFIX(crc32)(check, hbuf, 2); \
} while (0)
# define CRC4(check, word) \
do { \
hbuf[0] = (unsigned char)(word); \
hbuf[1] = (unsigned char)((word) >> 8); \
hbuf[2] = (unsigned char)((word) >> 16); \
hbuf[3] = (unsigned char)((word) >> 24); \
check = PREFIX(crc32)(check, hbuf, 4); \
} while (0)
#endif
/* Load registers with state in inflate() for speed */
#define LOAD() \
do { \
put = strm->next_out; \
left = strm->avail_out; \
next = strm->next_in; \
have = strm->avail_in; \
hold = state->hold; \
bits = state->bits; \
} while (0)
/* Restore state from registers in inflate() */
#define RESTORE() \
do { \
strm->next_out = put; \
strm->avail_out = left; \
strm->next_in = (z_const unsigned char *)next; \
strm->avail_in = have; \
state->hold = hold; \
state->bits = bits; \
} while (0)
/* Clear the input bit accumulator */
#define INITBITS() \
do { \
hold = 0; \
bits = 0; \
} while (0)
/* Ensure that there is at least n bits in the bit accumulator. If there is
not enough available input to do that, then return from inflate()/inflateBack(). */
#define NEEDBITS(n) \
do { \
while (bits < (unsigned)(n)) \
PULLBYTE(); \
} while (0)
/* Return the low n bits of the bit accumulator (n < 16) */
#define BITS(n) \
(hold & ((1U << (unsigned)(n)) - 1))
/* Remove n bits from the bit accumulator */
#define DROPBITS(n) \
do { \
hold >>= (n); \
bits -= (unsigned)(n); \
} while (0)
/* Remove zero to seven bits as needed to go to a byte boundary */
#define BYTEBITS() \
do { \
hold >>= bits & 7; \
bits -= bits & 7; \
} while (0)
/* Set mode=BAD and prepare error message */
#define SET_BAD(errmsg) \
do { \
state->mode = BAD; \
strm->msg = (char *)errmsg; \
} while (0)
#define INFLATE_FAST_MIN_HAVE 15
#define INFLATE_FAST_MIN_LEFT 260
/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
uint64_t chunk;
memcpy(&chunk, in, sizeof(chunk));
#if BYTE_ORDER == LITTLE_ENDIAN
return chunk << bits;
#else
return ZSWAP64(chunk) << bits;
#endif
}
/* Behave like chunkcopy, but avoid writing beyond of legal output. */
static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe) {
uint64_t safelen = (safe - out) + 1;
len = MIN(len, safelen);
int32_t olap_src = from >= out && from < out + len;
int32_t olap_dst = out >= from && out < from + len;
uint64_t tocopy;
/* For all cases without overlap, memcpy is ideal */
if (!(olap_src || olap_dst)) {
memcpy(out, from, (size_t)len);
return out + len;
}
/* Complete overlap: Source == destination */
if (out == from) {
return out + len;
}
/* We are emulating a self-modifying copy loop here. To do this in a way that doesn't produce undefined behavior,
* we have to get a bit clever. First if the overlap is such that src falls between dst and dst+len, we can do the
* initial bulk memcpy of the nonoverlapping region. Then, we can leverage the size of this to determine the safest
* atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe look
* behind or lookahead distance. */
uint64_t non_olap_size = llabs(from - out); // llabs vs labs for compatibility with windows
memcpy(out, from, (size_t)non_olap_size);
out += non_olap_size;
from += non_olap_size;
len -= non_olap_size;
/* So this doesn't give use a worst case scenario of function calls in a loop,
* we want to instead break this down into copy blocks of fixed lengths */
while (len) {
tocopy = MIN(non_olap_size, len);
len -= tocopy;
while (tocopy >= 32) {
memcpy(out, from, 32);
out += 32;
from += 32;
tocopy -= 32;
}
if (tocopy >= 16) {
memcpy(out, from, 16);
out += 16;
from += 16;
tocopy -= 16;
}
if (tocopy >= 8) {
memcpy(out, from, 8);
out += 8;
from += 8;
tocopy -= 8;
}
if (tocopy >= 4) {
memcpy(out, from, 4);
out += 4;
from += 4;
tocopy -= 4;
}
if (tocopy >= 2) {
memcpy(out, from, 2);
out += 2;
from += 2;
tocopy -= 2;
}
if (tocopy) {
*out++ = *from++;
}
}
return out;
}
#endif