mirror of
https://github.com/opencv/opencv.git
synced 2025-01-07 11:41:48 +08:00
0de26fd78e
Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations if `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573
187 lines
5.8 KiB
C
187 lines
5.8 KiB
C
/*
|
|
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
|
* instruction.
|
|
*
|
|
* A white paper describing this algorithm can be found at:
|
|
* doc/crc-pclmulqdq.pdf
|
|
*
|
|
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
|
* Copyright (C) 2016 Marian Beermann (support for initial value)
|
|
* Authors:
|
|
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
|
* Jim Guilford <james.guilford@intel.com>
|
|
* Vinodh Gopal <vinodh.gopal@intel.com>
|
|
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
|
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
|
*
|
|
* For conditions of distribution and use, see copyright notice in zlib.h
|
|
*/
|
|
|
|
#ifdef COPY
|
|
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
|
|
#else
|
|
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
|
|
#endif
|
|
unsigned long algn_diff;
|
|
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
|
|
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
|
|
__m128i xmm_crc_part = _mm_setzero_si128();
|
|
#ifdef COPY
|
|
char ALIGNED_(16) partial_buf[16] = { 0 };
|
|
#else
|
|
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
|
|
int32_t first = init_crc != 0;
|
|
|
|
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
|
|
* bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
|
|
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
|
|
* by definition can be up to 15 bytes + one full vector load. */
|
|
assert(len >= 31 || first == 0);
|
|
#endif
|
|
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
|
|
|
if (len < 16) {
|
|
#ifdef COPY
|
|
if (len == 0)
|
|
return;
|
|
|
|
memcpy(partial_buf, src, len);
|
|
xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
|
|
memcpy(dst, partial_buf, len);
|
|
#endif
|
|
goto partial;
|
|
}
|
|
|
|
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
|
|
if (algn_diff) {
|
|
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
|
|
#ifdef COPY
|
|
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
|
|
dst += algn_diff;
|
|
#else
|
|
XOR_INITIAL128(xmm_crc_part);
|
|
|
|
if (algn_diff < 4 && init_crc != 0) {
|
|
xmm_t0 = xmm_crc_part;
|
|
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
|
|
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
|
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
|
|
src += 16;
|
|
len -= 16;
|
|
}
|
|
#endif
|
|
|
|
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
|
|
|
src += algn_diff;
|
|
len -= algn_diff;
|
|
}
|
|
|
|
#ifdef X86_VPCLMULQDQ
|
|
if (len >= 256) {
|
|
#ifdef COPY
|
|
size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
|
|
dst += n;
|
|
#else
|
|
size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
|
|
xmm_initial, first);
|
|
first = 0;
|
|
#endif
|
|
len -= n;
|
|
src += n;
|
|
}
|
|
#endif
|
|
|
|
while (len >= 64) {
|
|
len -= 64;
|
|
xmm_t0 = _mm_load_si128((__m128i *)src);
|
|
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
|
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
|
|
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
|
|
src += 64;
|
|
|
|
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
|
#ifdef COPY
|
|
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
|
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
|
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
|
|
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
|
|
dst += 64;
|
|
#else
|
|
XOR_INITIAL128(xmm_t0);
|
|
#endif
|
|
|
|
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
|
|
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
|
|
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
|
|
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
|
|
}
|
|
|
|
/*
|
|
* len = num bytes left - 64
|
|
*/
|
|
if (len >= 48) {
|
|
len -= 48;
|
|
|
|
xmm_t0 = _mm_load_si128((__m128i *)src);
|
|
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
|
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
|
|
src += 48;
|
|
#ifdef COPY
|
|
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
|
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
|
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
|
|
dst += 48;
|
|
#else
|
|
XOR_INITIAL128(xmm_t0);
|
|
#endif
|
|
fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
|
|
|
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
|
|
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
|
|
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
|
|
} else if (len >= 32) {
|
|
len -= 32;
|
|
|
|
xmm_t0 = _mm_load_si128((__m128i *)src);
|
|
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
|
src += 32;
|
|
#ifdef COPY
|
|
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
|
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
|
dst += 32;
|
|
#else
|
|
XOR_INITIAL128(xmm_t0);
|
|
#endif
|
|
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
|
|
|
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
|
|
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
|
|
} else if (len >= 16) {
|
|
len -= 16;
|
|
xmm_t0 = _mm_load_si128((__m128i *)src);
|
|
src += 16;
|
|
#ifdef COPY
|
|
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
|
dst += 16;
|
|
#else
|
|
XOR_INITIAL128(xmm_t0);
|
|
#endif
|
|
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
|
|
|
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
|
|
}
|
|
|
|
partial:
|
|
if (len) {
|
|
memcpy(&xmm_crc_part, src, len);
|
|
#ifdef COPY
|
|
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
|
|
memcpy(dst, partial_buf, len);
|
|
#endif
|
|
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
|
}
|
|
|
|
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
|
}
|