opencv/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h

/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 *     doc/crc-pclmulqdq.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Copyright (C) 2016 Marian Beermann (support for initial value)
 * Authors:
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *     Jim Guilford    <james.guilford@intel.com>
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

/*
 * Fold `len` bytes starting at `src` into the four 128-bit accumulators that
 * are loaded from, and finally stored back to, crc->fold.
 *
 * This template body is compiled in two flavours:
 *   - COPY defined:     CRC32_FOLD_COPY, which also writes every consumed
 *                       input byte to `dst`.
 *   - COPY not defined: CRC32_FOLD, which takes an initial CRC and applies
 *                       XOR_INITIAL128 to the first vector it processes.
 *                       (XOR_INITIAL128 is defined outside this file; it
 *                       presumably XORs xmm_initial in while `first` is set
 *                       and then clears `first` -- confirm at its definition.)
 *
 * Parameters:
 *   crc      - folding state; only crc->fold is accessed here.
 *   dst      - (COPY only) destination for a copy of the `len` input bytes.
 *   src      - input bytes; any alignment is accepted, the head is consumed
 *              with unaligned loads until src is 16-byte aligned.
 *   len      - number of input bytes. len == 0 returns immediately without
 *              writing crc->fold.
 *   init_crc - (non-COPY only) initial CRC value. A non-zero value requires
 *              len >= 16 (asserted below).
 */
#ifdef COPY
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
#else
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
#endif
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
    /* Zero-filled so that a short tail copied into it is implicitly zero padded. */
    __m128i xmm_crc_part = _mm_setzero_si128();
    char ALIGNED_(16) partial_buf[16] = { 0 };
#ifndef COPY
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
    int32_t first = init_crc != 0;

    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
     * for the aligning load that occurs.  If there's an initial CRC, to carry it forward through
     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
     * up to 15 bytes + one full vector load. */
    assert(len >= 16 || first == 0);
#endif
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

    if (len < 16) {
        if (len == 0)
            return;

        /* Fewer than one vector of input: the tail handler at `partial` copies the
         * bytes into the zeroed xmm_crc_part (and to dst when COPY is defined), so
         * no separate staging is needed here. */
        goto partial;
    }

    /* Number of bytes (0..15) needed to bring src up to a 16-byte boundary. */
    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
    if (algn_diff) {
        /* len >= 16 here, so a full unaligned 16-byte load from src is in bounds. */
        xmm_crc_part = _mm_loadu_si128((const __m128i *)src);
#ifdef COPY
        /* Stores a full vector but only advances by algn_diff; the bytes beyond
         * algn_diff are rewritten by the stores that follow. */
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
        dst += algn_diff;
#else
        XOR_INITIAL128(xmm_crc_part);

        /* With a non-zero initial CRC and fewer than 4 alignment bytes, the whole
         * first vector is folded in before aligning (see the size requirement
         * described above the assert). */
        if (algn_diff < 4 && init_crc != 0) {
            xmm_t0 = xmm_crc_part;
            if (len >= 32) {
                /* A second full vector is available: load it directly. */
                xmm_crc_part = _mm_loadu_si128((const __m128i *)src + 1);
            } else {
                /* Only len - 16 (< 16) bytes follow: stage them zero padded. */
                memcpy(partial_buf, src + 16, len - 16);
                xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
            }
            fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
            xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
            src += 16;
            len -= 16;

            /* Less than one vector left (the len < 32 case above): skip the
             * alignment step and let the tail handler consume the remainder. */
            if (len < 16)
                goto partial;
        }
#endif

        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);

        src += algn_diff;
        len -= algn_diff;
    }

    /* From here on src is 16-byte aligned, so aligned loads are used. */
#ifdef X86_VPCLMULQDQ
    if (len >= 256) {
#ifdef COPY
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
        dst += n;
#else
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
            xmm_initial, first);
        first = 0;
#endif
        /* n is the number of bytes the wide helper consumed. */
        len -= n;
        src += n;
    }
#endif

    /* Main loop: 64 bytes (four vectors) per iteration. */
    while (len >= 64) {
        len -= 64;
        xmm_t0 = _mm_load_si128((const __m128i *)src);
        xmm_t1 = _mm_load_si128((const __m128i *)src + 1);
        xmm_t2 = _mm_load_si128((const __m128i *)src + 2);
        xmm_t3 = _mm_load_si128((const __m128i *)src + 3);
        src += 64;

        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
        dst += 64;
#else
        XOR_INITIAL128(xmm_t0);
#endif

        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
    }

    /*
     * Fewer than 64 bytes remain (len < 64).  Consume the remaining whole
     * vectors (three, two or one); after fold_N the N freshly loaded vectors
     * are XORed into the last N accumulators.
     */
    if (len >= 48) {
        len -= 48;

        xmm_t0 = _mm_load_si128((const __m128i *)src);
        xmm_t1 = _mm_load_si128((const __m128i *)src + 1);
        xmm_t2 = _mm_load_si128((const __m128i *)src + 2);
        src += 48;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        dst += 48;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
    } else if (len >= 32) {
        len -= 32;

        xmm_t0 = _mm_load_si128((const __m128i *)src);
        xmm_t1 = _mm_load_si128((const __m128i *)src + 1);
        src += 32;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        dst += 32;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
    } else if (len >= 16) {
        len -= 16;
        xmm_t0 = _mm_load_si128((const __m128i *)src);
        src += 16;
#ifdef COPY
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        dst += 16;
#else
        XOR_INITIAL128(xmm_t0);
#endif
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
    }

partial:
    /* Tail: 1..15 leftover bytes.  Only the first len bytes of xmm_crc_part are
     * overwritten; whatever it held above that is left in place. */
    if (len) {
        memcpy(&xmm_crc_part, src, len);
#ifdef COPY
        /* Bounce through partial_buf so that exactly len bytes reach dst. */
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
        memcpy(dst, partial_buf, len);
#endif
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
    }

    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
}
Add zlib-ng as an alternative zlib implementation. Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations if `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573 2023-12-27 12:06:17 +08:00			`/*`
			`* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ`
			`* instruction.`
			`*`
			`* A white paper describing this algorithm can be found at:`
			`* doc/crc-pclmulqdq.pdf`
			`*`
			`* Copyright (C) 2013 Intel Corporation. All rights reserved.`
			`* Copyright (C) 2016 Marian Beermann (support for initial value)`
			`* Authors:`
			`* Wajdi Feghali <wajdi.k.feghali@intel.com>`
			`* Jim Guilford <james.guilford@intel.com>`
			`* Vinodh Gopal <vinodh.gopal@intel.com>`
			`* Erdinc Ozturk <erdinc.ozturk@intel.com>`
			`* Jim Kukunas <james.t.kukunas@linux.intel.com>`
			`*`
			`* For conditions of distribution and use, see copyright notice in zlib.h`
			`*/`

			`#ifdef COPY`
			`Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold crc, uint8_t dst, const uint8_t *src, size_t len) {`
			`#else`
			`Z_INTERNAL void CRC32_FOLD(crc32_fold crc, const uint8_t src, size_t len, uint32_t init_crc) {`
			`#endif`
			`unsigned long algn_diff;`
			`__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;`
			`__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;`
			`__m128i xmm_crc_part = _mm_setzero_si128();`
			`char ALIGNED_(16) partial_buf[16] = { 0 };`
Merge pull request #26113 from FantasqueX:zlib-ng-2-2-1 Update zlib-ng to 2.2.1 #26113 Release: https://github.com/zlib-ng/zlib-ng/releases/tag/2.2.1 ARM diagnostics patch: https://github.com/zlib-ng/zlib-ng/pull/1774 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake 2024-09-12 21:05:24 +08:00			`#ifndef COPY`
Add zlib-ng as an alternative zlib implementation. Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations if `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573 2023-12-27 12:06:17 +08:00			`__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);`
			`int32_t first = init_crc != 0;`

Merge pull request #26113 from FantasqueX:zlib-ng-2-2-1 Update zlib-ng to 2.2.1 #26113 Release: https://github.com/zlib-ng/zlib-ng/releases/tag/2.2.1 ARM diagnostics patch: https://github.com/zlib-ng/zlib-ng/pull/1774 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake 2024-09-12 21:05:24 +08:00			`/* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed`
			`* for the aligning load that occurs. If there's an initial CRC, to carry it forward through`
			`* the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be`
			`* up to 15 bytes + one full vector load. */`
			`assert(len >= 16 \|\| first == 0);`
Add zlib-ng as an alternative zlib implementation. Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations if `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573 2023-12-27 12:06:17 +08:00			`#endif`
			`crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);`

			`if (len < 16) {`
			`if (len == 0)`
			`return;`

			`memcpy(partial_buf, src, len);`
			`xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);`
Merge pull request #26113 from FantasqueX:zlib-ng-2-2-1 Update zlib-ng to 2.2.1 #26113 Release: https://github.com/zlib-ng/zlib-ng/releases/tag/2.2.1 ARM diagnostics patch: https://github.com/zlib-ng/zlib-ng/pull/1774 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake 2024-09-12 21:05:24 +08:00			`#ifdef COPY`
Add zlib-ng as an alternative zlib implementation. Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations if `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573 2023-12-27 12:06:17 +08:00			`memcpy(dst, partial_buf, len);`
			`#endif`
			`goto partial;`
			`}`

			`algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;`
			`if (algn_diff) {`
			`xmm_crc_part = _mm_loadu_si128((__m128i *)src);`
			`#ifdef COPY`
			`_mm_storeu_si128((__m128i *)dst, xmm_crc_part);`
			`dst += algn_diff;`
			`#else`
			`XOR_INITIAL128(xmm_crc_part);`

			`if (algn_diff < 4 && init_crc != 0) {`
			`xmm_t0 = xmm_crc_part;`
Merge pull request #26113 from FantasqueX:zlib-ng-2-2-1 Update zlib-ng to 2.2.1 #26113 Release: https://github.com/zlib-ng/zlib-ng/releases/tag/2.2.1 ARM diagnostics patch: https://github.com/zlib-ng/zlib-ng/pull/1774 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake 2024-09-12 21:05:24 +08:00			`if (len >= 32) {`
			`xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);`
			`fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);`
			`xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);`
			`} else {`
			`memcpy(partial_buf, src + 16, len - 16);`
			`xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);`
			`fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);`
			`xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);`
			`src += 16;`
			`len -= 16;`
			`#ifdef COPY`
			`dst -= algn_diff;`
			`#endif`
			`goto partial;`
			`}`

Add zlib-ng as an alternative zlib implementation. Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations if `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573 2023-12-27 12:06:17 +08:00			`src += 16;`
			`len -= 16;`
			`}`
			`#endif`

			`partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);`

			`src += algn_diff;`
			`len -= algn_diff;`
			`}`

			`#ifdef X86_VPCLMULQDQ`
			`if (len >= 256) {`
			`#ifdef COPY`
			`size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);`
			`dst += n;`
			`#else`
			`size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,`
			`xmm_initial, first);`
			`first = 0;`
			`#endif`
			`len -= n;`
			`src += n;`
			`}`
			`#endif`

			`while (len >= 64) {`
			`len -= 64;`
			`xmm_t0 = _mm_load_si128((__m128i *)src);`
			`xmm_t1 = _mm_load_si128((__m128i *)src + 1);`
			`xmm_t2 = _mm_load_si128((__m128i *)src + 2);`
			`xmm_t3 = _mm_load_si128((__m128i *)src + 3);`
			`src += 64;`

			`fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);`
			`#ifdef COPY`
			`_mm_storeu_si128((__m128i *)dst, xmm_t0);`
			`_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);`
			`_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);`
			`_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);`
			`dst += 64;`
			`#else`
			`XOR_INITIAL128(xmm_t0);`
			`#endif`

			`xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);`
			`xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);`
			`xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);`
			`xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);`
			`}`

			`/*`
			`* len = num bytes left - 64`
			`*/`
			`if (len >= 48) {`
			`len -= 48;`

			`xmm_t0 = _mm_load_si128((__m128i *)src);`
			`xmm_t1 = _mm_load_si128((__m128i *)src + 1);`
			`xmm_t2 = _mm_load_si128((__m128i *)src + 2);`
			`src += 48;`
			`#ifdef COPY`
			`_mm_storeu_si128((__m128i *)dst, xmm_t0);`
			`_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);`
			`_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);`
			`dst += 48;`
			`#else`
			`XOR_INITIAL128(xmm_t0);`
			`#endif`
			`fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);`

			`xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);`
			`xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);`
			`xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);`
			`} else if (len >= 32) {`
			`len -= 32;`

			`xmm_t0 = _mm_load_si128((__m128i *)src);`
			`xmm_t1 = _mm_load_si128((__m128i *)src + 1);`
			`src += 32;`
			`#ifdef COPY`
			`_mm_storeu_si128((__m128i *)dst, xmm_t0);`
			`_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);`
			`dst += 32;`
			`#else`
			`XOR_INITIAL128(xmm_t0);`
			`#endif`
			`fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);`

			`xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);`
			`xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);`
			`} else if (len >= 16) {`
			`len -= 16;`
			`xmm_t0 = _mm_load_si128((__m128i *)src);`
			`src += 16;`
			`#ifdef COPY`
			`_mm_storeu_si128((__m128i *)dst, xmm_t0);`
			`dst += 16;`
			`#else`
			`XOR_INITIAL128(xmm_t0);`
			`#endif`
			`fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);`

			`xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);`
			`}`

			`partial:`
			`if (len) {`
			`memcpy(&xmm_crc_part, src, len);`
			`#ifdef COPY`
			`_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);`
			`memcpy(dst, partial_buf, len);`
			`#endif`
			`partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);`
			`}`

			`crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);`
			`}`