opencv/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c

/* adler32_rvv.c - RVV version of adler32
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef RISCV_RVV

#include <riscv_vector.h>
#include <stdint.h>

#include "zbuild.h"
#include "adler32_p.h"

/*
 * Adler-32 over `len` bytes at `src`, continuing from the running checksum
 * `adler`, using RISC-V Vector intrinsics. When COPY is non-zero every byte
 * that is summed is also stored to `dst`.
 *
 *   adler  previous checksum: low 16 bits are the byte sum, high 16 bits the
 *          sum of the running byte sums
 *   dst    copy destination, only written when COPY is non-zero
 *   src    input bytes; a NULL src (with len != 1) returns the initial value 1
 *   len    number of input bytes
 *   COPY   non-zero to copy src to dst while checksumming
 *
 * Returns adler | (sum2 << 16) with both halves reduced modulo BASE.
 *
 * The input is viewed as rows of vl bytes. For lane j the code keeps
 *   S_j = sum of the bytes seen in lane j                  (v_buf32_accu)
 *   P_j = sum over rows of S_j as it was before that row   (v_adler32_prev_accu)
 * so that, for n = rows * vl bytes,
 *   adler' = adler + sum_j S_j
 *   sum2'  = sum2 + n * adler + sum_j ((vl - j) * S_j + vl * P_j)
 * Only the residues modulo BASE matter, so every accumulator may be reduced
 * modulo BASE at any point; this is what keeps the 32-bit arithmetic below
 * from wrapping on long or high-valued inputs.
 */
static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (len == 1) {
        if (COPY) memcpy(dst, src, 1);
        return adler32_len_1(adler, src, sum2);
    }

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (src == NULL)
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16) {
        if (COPY) memcpy(dst, src, len);
        return adler32_len_16(adler, src, len, sum2);
    }

    size_t left = len;
    size_t vl = __riscv_vsetvlmax_e8m1();
    /* cap at 256 lanes so a block is never more than 256 bytes */
    vl = vl > 256 ? 256 : vl;
    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint16m2_t v_buf16_accu;

    /*
     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
     * accumulators to boost performance.
     *
     * The block_size is the largest multiple of vl that is <= 256, so a 16-bit lane never
     * receives more than 256 bytes before it is flushed (255 * 256 <= UINT16_MAX).
     *
     * We accumulate 8-bit data into a 16-bit accumulator and then
     * move the data into the 32-bit accumulator at the end of each block.
     */
    size_t block_size = (256 / vl) * vl;
    size_t nmax_limit = (NMAX / block_size);
    size_t cnt = 0;
    while (left >= block_size) {
        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
        size_t subprob = block_size;
        while (subprob > 0) {
            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
            if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
            /* P += in-block part of S (before this row), then S16 += row */
            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
            src += vl;
            if (COPY) dst += vl;
            subprob -= vl;
        }
        /* P also owes the pre-block S once for each row of this block */
        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
        left -= block_size;
        /*
         * Do modulo once each block of NMAX size. Both accumulators are reduced:
         * P grows by multiples of S, so leaving S unreduced would let P wrap
         * 32 bits between two reductions once enough data has been consumed.
         */
        if (++cnt >= nmax_limit) {
            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
            v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl);
            cnt = 0;
        }
    }
    /* the left len < block_size <= 256 now, we can use 16-bit accum safely */
    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
    size_t res = left;
    while (left >= vl) {
        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
        if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
        src += vl;
        if (COPY) dst += vl;
        left -= vl;
    }
    /* res / vl is the number of full rows handled by the loop above */
    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
    /* bring S below BASE too, so the products and reductions below stay in 32 bits */
    v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl);

    /* per-lane weight (vl - j) for S_j */
    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);

    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
    /* each lane is < 2 * 256 * BASE here; reduce so that summing up to 256 lanes cannot wrap */
    v_sum32_accu = __riscv_vremu_vx_u32m4(v_sum32_accu, BASE, vl);

    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum) % BASE;

    /*
     * The incoming adler contributes once per vector-processed byte. Reduce the
     * byte count first: adler < 2^16 and the count < BASE, so the product plus
     * sum2 (< 2^16) and sum2_sum (< BASE) stays below 2^32.
     */
    sum2 += sum2_sum + adler * (uint32_t)((len - left) % BASE);

    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);

    adler += adler_sum;

    /* reduce before the scalar tail so its (< vl) additions start from small values */
    sum2 %= BASE;
    adler %= BASE;

    /* fewer than vl bytes remain; finish them one at a time */
    while (left--) {
        if (COPY) *dst++ = *src;
        adler += *src++;
        sum2 += adler;
    }

    sum2 %= BASE;
    adler %= BASE;

    return adler | (sum2 << 16);
}

/* Checksum `len` bytes of `src` starting from `adler` and copy them to `dst`:
 * forwards to adler32_rvv_impl with its COPY argument set to 1. */
Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    const int copy_bytes = 1;
    uint32_t checksum = adler32_rvv_impl(adler, dst, src, len, copy_bytes);
    return checksum;
}

/* Checksum `len` bytes of `buf` starting from `adler` without copying:
 * forwards to adler32_rvv_impl with a NULL destination and COPY set to 0. */
Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
    const int copy_bytes = 0;
    uint32_t checksum = adler32_rvv_impl(adler, NULL, buf, len, copy_bytes);
    return checksum;
}

#endif // RISCV_RVV
Add zlib-ng as an alternative zlib implementation Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, when using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations if `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573 2023-12-27 12:06:17 +08:00			`/* adler32_rvv.c - RVV version of adler32`
			`* Copyright (C) 2023 SiFive, Inc. All rights reserved.`
			`* Contributed by Alex Chiang <alex.chiang@sifive.com>`
			`* For conditions of distribution and use, see copyright notice in zlib.h`
			`*/`

			`#ifdef RISCV_RVV`

			`#include <riscv_vector.h>`
			`#include <stdint.h>`

Merge pull request #26113 from FantasqueX:zlib-ng-2-2-1 Update zlib-ng to 2.2.1 #26113 Release: https://github.com/zlib-ng/zlib-ng/releases/tag/2.2.1 ARM diagnostics patch: https://github.com/zlib-ng/zlib-ng/pull/1774 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake 2024-09-12 21:05:24 +08:00			`#include "zbuild.h"`
			`#include "adler32_p.h"`
Add zlib-ng as an alternative zlib implementation Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, when using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations if `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573 2023-12-27 12:06:17 +08:00
			`static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {`
			`/* split Adler-32 into component sums */`
			`uint32_t sum2 = (adler >> 16) & 0xffff;`
			`adler &= 0xffff;`

			`/* in case user likes doing a byte at a time, keep it fast */`
			`if (len == 1) {`
			`if (COPY) memcpy(dst, src, 1);`
			`return adler32_len_1(adler, src, sum2);`
			`}`

			`/* initial Adler-32 value (deferred check for len == 1 speed) */`
			`if (src == NULL)`
			`return 1L;`

			`/* in case short lengths are provided, keep it somewhat fast */`
			`if (len < 16) {`
			`if (COPY) memcpy(dst, src, len);`
			`return adler32_len_16(adler, src, len, sum2);`
			`}`

			`size_t left = len;`
			`size_t vl = __riscv_vsetvlmax_e8m1();`
			`vl = vl > 256 ? 256 : vl;`
			`vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);`
			`vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);`
			`vuint16m2_t v_buf16_accu;`

			`/*`
			`* We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.`
			`* However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit`
			`* accumulators to boost performance.`
			`*`
			`* The block_size is the largest multiple of vl that <= 256, because overflow would occur when`
			`* vl > 256 (255 * 256 <= UINT16_MAX).`
			`*`
			`* We accumulate 8-bit data into a 16-bit accumulator and then`
			`* move the data into the 32-bit accumulator at the last iteration.`
			`*/`
			`size_t block_size = (256 / vl) * vl;`
			`size_t nmax_limit = (NMAX / block_size);`
			`size_t cnt = 0;`
			`while (left >= block_size) {`
			`v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);`
			`size_t subprob = block_size;`
			`while (subprob > 0) {`
			`vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);`
			`if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);`
			`v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);`
			`v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);`
			`src += vl;`
			`if (COPY) dst += vl;`
			`subprob -= vl;`
			`}`
			`v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);`
			`v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);`
			`left -= block_size;`
			`/* do modulo once each block of NMAX size */`
			`if (++cnt >= nmax_limit) {`
			`v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);`
			`cnt = 0;`
			`}`
			`}`
			`/* the left len <= 256 now, we can use 16-bit accum safely */`
			`v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);`
			`size_t res = left;`
			`while (left >= vl) {`
			`vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);`
			`if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);`
			`v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);`
			`v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);`
			`src += vl;`
			`if (COPY) dst += vl;`
			`left -= vl;`
			`}`
			`v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);`
			`v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);`
			`v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);`

			`vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);`
			`vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);`
			`vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);`

			`v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);`

			`vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);`
			`v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);`
			`uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);`

			`sum2 += (sum2_sum + adler * (len - left));`

			`vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);`
			`v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);`
			`uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);`

			`adler += adler_sum;`

			`while (left--) {`
			`if (COPY) *dst++ = *src;`
			`adler += *src++;`
			`sum2 += adler;`
			`}`

			`sum2 %= BASE;`
			`adler %= BASE;`

			`return adler \| (sum2 << 16);`
			`}`

			`Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {`
			`return adler32_rvv_impl(adler, dst, src, len, 1);`
			`}`

			`Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {`
			`return adler32_rvv_impl(adler, NULL, buf, len, 0);`
			`}`

			`#endif // RISCV_RVV`