opencv/3rdparty/zlib-ng/arch/power/crc32_power8.c

/* crc32 for POWER8 using VSX instructions
 * Copyright (C) 2021 IBM Corporation
 *
 * Author: Rogerio Alves <rogealve@br.ibm.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * This code uses gcc vector builtins instead of using assembly directly.
 */

#include <altivec.h>
#include "zendian.h"
#include "zbuild.h"

#include "crc32_constants.h"
#include "crc32_braid_tbl.h"

#if defined (__clang__)
#include "fallback_builtins.h"
#endif

/* Largest number of bytes folded in one pass of the main vector loop. */
#define MAX_SIZE    32768
/* Alignment, in bytes, that the buffer is brought to before the vector code runs. */
#define VMX_ALIGN	16
/* Mask selecting the low address/length bits that fall below VMX_ALIGN. */
#define VMX_ALIGN_MASK	(VMX_ALIGN-1)

/*
 * Table-driven, byte-at-a-time CRC update.
 *
 * crc: running CRC state (no inversion is applied here).
 * p:   bytes to fold into the state.
 * len: number of bytes to read from p.
 *
 * Returns the updated state. Used for the unaligned head and the short tail
 * that the vector routine does not handle.
 */
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
    unsigned long pos;

    for (pos = 0; pos < len; pos++) {
        unsigned int slot = (crc ^ p[pos]) & 0xff;

        crc = crc_table[slot] ^ (crc >> 8);
    }

    return crc;
}

/* Forward declaration of the vector CRC kernel defined further down in this file. */
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);

/*
 * CRC-32 entry point for POWER8.
 *
 * crc:  CRC value to continue from.
 * p:    input bytes; a null pointer yields 0.
 * _len: number of input bytes.
 *
 * The state is inverted on entry and again on exit. Inputs shorter than
 * VMX_ALIGN + VMX_ALIGN_MASK bytes are handled entirely by crc32_align().
 * Otherwise the bytes before the first 16-byte boundary go through
 * crc32_align(), the largest multiple of 16 bytes that follows goes through
 * __crc32_vpmsum(), and whatever is left over goes through crc32_align().
 */
Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
    unsigned long remaining = (unsigned long) _len;
    unsigned long misalign;
    unsigned long vec_bytes;
    unsigned int leftover;

    if (p == (const unsigned char *) 0x0)
        return 0;

    crc ^= 0xffffffff;

    /* Too short to be sure a whole aligned 16-byte block exists. */
    if (remaining < VMX_ALIGN + VMX_ALIGN_MASK)
        return crc32_align(crc, p, remaining) ^ 0xffffffff;

    /* Consume bytes up to the next 16-byte boundary. */
    misalign = (unsigned long)p & VMX_ALIGN_MASK;
    if (misalign != 0) {
        unsigned int head = VMX_ALIGN - misalign;

        crc = crc32_align(crc, p, head);
        p += head;
        remaining -= head;
    }

    /* Aligned body: the largest multiple of 16 bytes. */
    vec_bytes = remaining & ~VMX_ALIGN_MASK;
    crc = __crc32_vpmsum(crc, p, vec_bytes);

    /* Fewer than 16 bytes may remain after the vector part. */
    leftover = remaining & VMX_ALIGN_MASK;
    if (leftover != 0)
        crc = crc32_align(crc, p + vec_bytes, leftover);

    return crc ^ 0xffffffff;
}

/* When we have a load-store in a single-dispatch group and address overlap
 * such that forward is not allowed (load-hit-store) the group must be flushed.
 * A group ending NOP prevents the flush.
 *
 * The asm statement also carries a "memory" clobber, so the compiler will not
 * move memory accesses across it.
 */
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")

/* Loaded data is byte-permuted only on big-endian builds; elsewhere VEC_PERM
 * expands to nothing and the loaded vectors are used unchanged. */
#if BYTE_ORDER == BIG_ENDIAN
#define BYTESWAP_DATA
#endif

#ifdef BYTESWAP_DATA
/* vr = vec_perm(va, vb, vc): pick bytes from va/vb according to control vector vc. */
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
/* Byte reverse permute constant LE.
 * NOTE(review): with the guard above, BYTESWAP_DATA is only defined for
 * big-endian, so this branch is reached only if BYTESWAP_DATA is defined from
 * outside this file -- confirm whether that is intended. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
/* Byte reverse permute constant BE. */
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
#endif
#else
/* No byte swap requested: VEC_PERM is an empty statement. */
#define VEC_PERM(vr, va, vb, vc)
#endif

/*
 * Vector CRC kernel built on the vpmsumw/vpmsumd (polynomial multiply-sum)
 * builtins.
 *
 * crc: incoming CRC state; it is xor-ed into the first 16 bytes of data.
 * p:   input data, read with vec_ld in 16-byte units. The caller in this file
 *      advances p to a 16-byte boundary before calling.
 * len: number of bytes to process. The caller in this file passes a non-zero
 *      multiple of 16; the first 16 bytes are loaded unconditionally.
 *
 * Inputs shorter than 256 bytes take the "short" path (one vpmsumw per 16
 * bytes against vcrc_short_const). Longer inputs are folded 128 bytes at a
 * time in eight parallel accumulators (v0-v7), in blocks of at most MAX_SIZE
 * bytes, then the remaining 1024 bits plus any 16-byte tail blocks are reduced
 * with vcrc_short_const. Both paths end with the Barrett reduction.
 *
 * Returns the reduced value taken from one 64-bit element of v0 (element 0 on
 * big-endian builds, element 1 otherwise), truncated to unsigned int.
 */
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {

    const __vector unsigned long long vzero = {0,0};
    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};

    /* Masks built by shifting 4 (resp. 8) bytes of ones into a zero vector. */
    const __vector unsigned long long vmask_32bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);

    const __vector unsigned long long vmask_64bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);

    /* Incoming crc placed in a vector so it can be xor-ed into the data. */
    __vector unsigned long long vcrc;

    /* Constants loaded from the tables in crc32_constants.h. */
    __vector unsigned long long vconst1, vconst2;

    /* vdata0-vdata7 will contain our data (p). */
    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;

    /* v0-v7 will contain our checksums */
    __vector unsigned long long v0 = {0,0};
    __vector unsigned long long v1 = {0,0};
    __vector unsigned long long v2 = {0,0};
    __vector unsigned long long v3 = {0,0};
    __vector unsigned long long v4 = {0,0};
    __vector unsigned long long v5 = {0,0};
    __vector unsigned long long v6 = {0,0};
    __vector unsigned long long v7 = {0,0};


    /* Vector auxiliary variables. */
    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;

    unsigned int offset; /* Constant table offset. */

    unsigned long i; /* Counter. */
    unsigned long chunks; /* 128-byte chunks in the current block, minus the final one. */

    unsigned long block_size; /* Bytes folded in the current pass (at most MAX_SIZE). */
    int next_block = 0; /* Non-zero while another block remains to be folded. */

    /* Round down to a multiple of 128 bytes. The remaining (len & 127) bytes
     * are processed at the end. */
    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;

    vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);

    /* Short version. */
    if (len < 256) {
        /* Calculate where in the constant table we need to start.
         * The highest byte offset used below is offset + len - 16 = 240;
         * presumably vcrc_short_const is 256 bytes long -- confirm in
         * crc32_constants.h. */
        offset = 256 - len;

        vconst1 = vec_ld(offset, vcrc_short_const);
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);

        /* xor initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
        v0 = vec_xor(v0, vdata0);

        /* Remaining 16-byte blocks: multiply each by its constant and
         * accumulate into v0. */
        for (i = 16; i < len; i += 16) {
            vconst1 = vec_ld(offset + i, vcrc_short_const);
            vdata0 = vec_ld(i, (__vector unsigned long long*) p);
            VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
            vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
            v0 = vec_xor(v0, vdata0);
        }
    } else {

        /* Load initial values. */
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        vdata1 = vec_ld(16, (__vector unsigned long long*) p);

        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

        vdata2 = vec_ld(32, (__vector unsigned long long*) p);
        vdata3 = vec_ld(48, (__vector unsigned long long*) p);

        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

        vdata4 = vec_ld(64, (__vector unsigned long long*) p);
        vdata5 = vec_ld(80, (__vector unsigned long long*) p);

        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

        vdata6 = vec_ld(96, (__vector unsigned long long*) p);
        vdata7 = vec_ld(112, (__vector unsigned long long*) p);

        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

        /* xor in initial value */
        vdata0 = vec_xor(vdata0, vcrc);

        /* The first 128 bytes are now held in vdata0-vdata7. */
        p = (char *)p + 128;

        do {
            /* Checksum in blocks of MAX_SIZE. */
            block_size = length;
            if (block_size > MAX_SIZE) {
                block_size = MAX_SIZE;
            }

            length = length - block_size;

            /*
             * Work out the offset into the constants table to start at. Each
             * constant is 16 bytes, and it is used against 128 bytes of input
             * data - 128 / 16 = 8
             */
            offset = (MAX_SIZE/8) - (block_size/8);
            /* We reduce our final 128 bytes in a separate step */
            chunks = (block_size/128)-1;

            vconst1 = vec_ld(offset, vcrc_const);

            /* Warm up: multiply the 128 bytes currently held in vdata0-7. */
            va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                                           (__vector unsigned long long)vconst1);
            va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                                           (__vector unsigned long long)vconst1);
            va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                                           (__vector unsigned long long)vconst1);
            va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                                           (__vector unsigned long long)vconst1);
            va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                                           (__vector unsigned long long)vconst1);
            va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                                           (__vector unsigned long long)vconst1);
            va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                                           (__vector unsigned long long)vconst1);
            va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                                           (__vector unsigned long long)vconst1);

            /* With chunks == 1 (block_size == 256) only the warm up and the
             * second cool down run. */
            if (chunks > 1) {
                offset += 16;
                vconst2 = vec_ld(offset, vcrc_const);
                GROUP_ENDING_NOP;

                /* Load the next 128 bytes while the products above are pending. */
                vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                VEC_PERM(vdata0, vdata0, vdata0, vperm_const);

                vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

                vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                VEC_PERM(vdata2, vdata2, vdata2, vperm_const);

                vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                VEC_PERM(vdata4, vdata4, vdata4, vperm_const);

                vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

                vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                VEC_PERM(vdata6, vdata6, vdata6, vperm_const);

                vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                p = (char *)p + 128;

                /*
                 * main loop. Each iteration calculates the CRC for a 128-byte
                 * block.
                 *
                 * Within one iteration vconst1 and vconst2 hold the same
                 * table entry: vconst2 was loaded from this offset on the
                 * previous pass and feeds lanes 0-3, vconst1 is loaded here
                 * and feeds lanes 4-7, and vconst2 is reloaded halfway
                 * through for the next pass.
                 */
                for (i = 0; i < chunks-2; i++) {
                    vconst1 = vec_ld(offset, vcrc_const);
                    offset += 16;
                    GROUP_ENDING_NOP;

                    /* Per lane: accumulate the previous product, start the
                     * next product, then load the following 16 bytes. */
                    v0 = vec_xor(v0, va0);
                    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                                                   (__vector unsigned long long)vconst2);
                    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
                    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
                    GROUP_ENDING_NOP;

                    v1 = vec_xor(v1, va1);
                    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                                                   (__vector unsigned long long)vconst2);
                    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
                    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
                    GROUP_ENDING_NOP;

                    v2 = vec_xor(v2, va2);
                    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
                                                   vdata2, (__vector unsigned long long)vconst2);
                    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
                    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
                    GROUP_ENDING_NOP;

                    v3 = vec_xor(v3, va3);
                    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                                                   (__vector unsigned long long)vconst2);
                    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
                    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

                    /* Constant for lanes 0-3 of the next pass. */
                    vconst2 = vec_ld(offset, vcrc_const);
                    GROUP_ENDING_NOP;

                    v4 = vec_xor(v4, va4);
                    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                                                   (__vector unsigned long long)vconst1);
                    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
                    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
                    GROUP_ENDING_NOP;

                    v5 = vec_xor(v5, va5);
                    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                                                   (__vector unsigned long long)vconst1);
                    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
                    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
                    GROUP_ENDING_NOP;

                    v6 = vec_xor(v6, va6);
                    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                                                   (__vector unsigned long long)vconst1);
                    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
                    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
                    GROUP_ENDING_NOP;

                    v7 = vec_xor(v7, va7);
                    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                                                   (__vector unsigned long long)vconst1);
                    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
                    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

                    p = (char *)p + 128;
                }

                /* First cool down: accumulate the pending products and
                 * multiply the last loaded data; no further loads here. */
                vconst1 = vec_ld(offset, vcrc_const);
                offset += 16;

                v0 = vec_xor(v0, va0);
                va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v1 = vec_xor(v1, va1);
                va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v2 = vec_xor(v2, va2);
                va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v3 = vec_xor(v3, va3);
                va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v4 = vec_xor(v4, va4);
                va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v5 = vec_xor(v5, va5);
                va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v6 = vec_xor(v6, va6);
                va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                                               (__vector unsigned long long)vconst1);
                GROUP_ENDING_NOP;

                v7 = vec_xor(v7, va7);
                va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                                               (__vector unsigned long long)vconst1);
            }/* else */

            /* Second cool down: accumulate the last outstanding products. */
            v0 = vec_xor(v0, va0);
            v1 = vec_xor(v1, va1);
            v2 = vec_xor(v2, va2);
            v3 = vec_xor(v3, va3);
            v4 = vec_xor(v4, va4);
            v5 = vec_xor(v5, va5);
            v6 = vec_xor(v6, va6);
            v7 = vec_xor(v7, va7);

            /*
             * vpmsumd produces a 96 bit result in the least significant bits
             * of the register. Since we are bit reflected we have to shift it
             * left 32 bits so it occupies the least significant bits in the
             * bit reflected domain.
             */
            v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                                                      (__vector unsigned char)vzero, 4);
            v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
                                                      (__vector unsigned char)vzero, 4);
            v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
                                                      (__vector unsigned char)vzero, 4);
            v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
                                                      (__vector unsigned char)vzero, 4);
            v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
                                                      (__vector unsigned char)vzero, 4);
            v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
                                                      (__vector unsigned char)vzero, 4);
            v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
                                                      (__vector unsigned char)vzero, 4);
            v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
                                                      (__vector unsigned char)vzero, 4);

            /* xor with the last 1024 bits. */
            va0 = vec_ld(0, (__vector unsigned long long*) p);
            VEC_PERM(va0, va0, va0, vperm_const);

            va1 = vec_ld(16, (__vector unsigned long long*) p);
            VEC_PERM(va1, va1, va1, vperm_const);

            va2 = vec_ld(32, (__vector unsigned long long*) p);
            VEC_PERM(va2, va2, va2, vperm_const);

            va3 = vec_ld(48, (__vector unsigned long long*) p);
            VEC_PERM(va3, va3, va3, vperm_const);

            va4 = vec_ld(64, (__vector unsigned long long*) p);
            VEC_PERM(va4, va4, va4, vperm_const);

            va5 = vec_ld(80, (__vector unsigned long long*) p);
            VEC_PERM(va5, va5, va5, vperm_const);

            va6 = vec_ld(96, (__vector unsigned long long*) p);
            VEC_PERM(va6, va6, va6, vperm_const);

            va7 = vec_ld(112, (__vector unsigned long long*) p);
            VEC_PERM(va7, va7, va7, vperm_const);

            p = (char *)p + 128;

            /* vdata0-7 now carry the folded state into the next block or the
             * final reduction. */
            vdata0 = vec_xor(v0, va0);
            vdata1 = vec_xor(v1, va1);
            vdata2 = vec_xor(v2, va2);
            vdata3 = vec_xor(v3, va3);
            vdata4 = vec_xor(v4, va4);
            vdata5 = vec_xor(v5, va5);
            vdata6 = vec_xor(v6, va6);
            vdata7 = vec_xor(v7, va7);

            /* Check if we have more blocks to process */
            next_block = 0;
            if (length != 0) {
                next_block = 1;

                /* zero v0-v7 */
                v0 = vec_xor(v0, v0);
                v1 = vec_xor(v1, v1);
                v2 = vec_xor(v2, v2);
                v3 = vec_xor(v3, v3);
                v4 = vec_xor(v4, v4);
                v5 = vec_xor(v5, v5);
                v6 = vec_xor(v6, v6);
                v7 = vec_xor(v7, v7);
            }
            /* The 128 bytes of state held in vdata0-7 appear to be counted as
             * the first chunk of the next block, hence the extra 128. */
            length = length + 128;

        } while (next_block);

        /* Calculate how many bytes we have left. */
        length = (len & 127);

        /* Calculate where in (short) constant table we need to start. */
        offset = 128 - length;

        v0 = vec_ld(offset, vcrc_short_const);
        v1 = vec_ld(offset + 16, vcrc_short_const);
        v2 = vec_ld(offset + 32, vcrc_short_const);
        v3 = vec_ld(offset + 48, vcrc_short_const);
        v4 = vec_ld(offset + 64, vcrc_short_const);
        v5 = vec_ld(offset + 80, vcrc_short_const);
        v6 = vec_ld(offset + 96, vcrc_short_const);
        v7 = vec_ld(offset + 112, vcrc_short_const);

        /* Constants for the tail blocks follow the eight loaded above. */
        offset += 128;

        /* Reduce the 1024 bits of state: one vpmsumw per lane. */
        v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)v0);
        v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata1, (__vector unsigned int)v1);
        v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata2, (__vector unsigned int)v2);
        v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata3, (__vector unsigned int)v3);
        v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata4, (__vector unsigned int)v4);
        v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata5, (__vector unsigned int)v5);
        v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata6, (__vector unsigned int)v6);
        v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata7, (__vector unsigned int)v7);

        /* Now reduce the tail (0-112 bytes). */
        for (i = 0; i < length; i+=16) {
            vdata0 = vec_ld(i,(__vector unsigned long long*)p);
            VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
            va0 = vec_ld(offset + i,vcrc_short_const);
            va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)va0);
            v0 = vec_xor(v0, va0);
        }

        /* xor all parallel chunks together. */
        v0 = vec_xor(v0, v1);
        v2 = vec_xor(v2, v3);
        v4 = vec_xor(v4, v5);
        v6 = vec_xor(v6, v7);

        v0 = vec_xor(v0, v2);
        v4 = vec_xor(v4, v6);

        v0 = vec_xor(v0, v4);
    }

    /* Barrett Reduction */
    vconst1 = vec_ld(0, v_Barrett_const);
    vconst2 = vec_ld(16, v_Barrett_const);

    /* Fold the two 64-bit halves of v0 together: rotate by 8 bytes and xor. */
    v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                                              (__vector unsigned char)v0, 8);
    v0 = vec_xor(v1,v0);

    /* shift left one bit */
    __vector unsigned char vsht_splat = vec_splat_u8 (1);
    v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);

    v0 = vec_and(v0, vmask_64bit);

    /*
     * The reflected version of Barrett reduction. Instead of bit
     * reflecting our data (which is expensive to do), we bit reflect our
     * constants and our algorithm, which means the intermediate data in
     * our vector registers goes from 0-63 instead of 63-0. We can reflect
     * the algorithm because we don't carry in mod 2 arithmetic.
     */

    /* bottom 32 bits of a */
    v1 = vec_and(v0, vmask_32bit);

    /* ma */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
                                  (__vector unsigned long long)vconst1);

    /* bottom 32bits of ma */
    v1 = vec_and(v1, vmask_32bit);
    /* qn */
    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
                                  (__vector unsigned long long)vconst2);
    /* a - qn, subtraction is xor in GF(2) */
    v0 = vec_xor (v0, v1);

    /*
     * Since we are bit reflected, the result (ie the low 32 bits) is in
     * the high 32 bits. We just need to shift it left 4 bytes
     * V0 [ 0 1 X 3 ]
     * V0 [ 0 X 2 3 ]
     */

    /* shift result into the top 64 bits of v0 */
    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
                                              (__vector unsigned char)vzero, 4);

    /* Which 64-bit element holds the result depends on the build's byte order. */
#if BYTE_ORDER == BIG_ENDIAN
    return v0[0];
#else
    return v0[1];
#endif
}
Add zlib-ng as an alternative zlib implementation Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimization if `CV_DISABLE_OPTIMIZATION` is OFF. Since Zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573 2023-12-27 12:06:17 +08:00			`/* crc32 for POWER8 using VSX instructions`
			`* Copyright (C) 2021 IBM Corporation`
			`*`
			`* Author: Rogerio Alves <rogealve@br.ibm.com>`
			`*`
			`* For conditions of distribution and use, see copyright notice in zlib.h`
			`*`
			`* Calculate the checksum of data that is 16 byte aligned and a multiple of`
			`* 16 bytes.`
			`*`
			`* The first step is to reduce it to 1024 bits. We do this in 8 parallel`
			`* chunks in order to mask the latency of the vpmsum instructions. If we`
			`* have more than 32 kB of data to checksum we repeat this step multiple`
			`* times, passing in the previous 1024 bits.`
			`*`
			`* The next step is to reduce the 1024 bits to 64 bits. This step adds`
			`* 32 bits of 0s to the end - this matches what a CRC does. We just`
			`* calculate constants that land the data in this 32 bits.`
			`*`
			`* We then use fixed point Barrett reduction to compute a mod n over GF(2)`
			`* for n = CRC using POWER8 instructions. We use x = 32.`
			`*`
			`* http://en.wikipedia.org/wiki/Barrett_reduction`
			`*`
			`* This code uses gcc vector builtins instead using assembly directly.`
			`*/`

			`#include <altivec.h>`
			`#include "zendian.h"`
			`#include "zbuild.h"`

			`#include "crc32_constants.h"`
			`#include "crc32_braid_tbl.h"`

			`#if defined (__clang__)`
			`#include "fallback_builtins.h"`
			`#endif`

			`#define MAX_SIZE 32768`
			`#define VMX_ALIGN 16`
			`#define VMX_ALIGN_MASK (VMX_ALIGN-1)`

			`static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {`
			`while (len--)`
			`crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);`
			`return crc;`
			`}`

			`static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);`

			`Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {`
			`unsigned int prealign;`
			`unsigned int tail;`

			`unsigned long len = (unsigned long) _len;`

			`if (p == (const unsigned char *) 0x0)`
			`return 0;`

			`crc ^= 0xffffffff;`

			`if (len < VMX_ALIGN + VMX_ALIGN_MASK) {`
			`crc = crc32_align(crc, p, len);`
			`goto out;`
			`}`

			`if ((unsigned long)p & VMX_ALIGN_MASK) {`
			`prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);`
			`crc = crc32_align(crc, p, prealign);`
			`len -= prealign;`
			`p += prealign;`
			`}`

			`crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);`

			`tail = len & VMX_ALIGN_MASK;`
			`if (tail) {`
			`p += len & ~VMX_ALIGN_MASK;`
			`crc = crc32_align(crc, p, tail);`
			`}`

			`out:`
			`crc ^= 0xffffffff;`

			`return crc;`
			`}`

			`/* When we have a load-store in a single-dispatch group and address overlap`
			`* such that forward is not allowed (load-hit-store) the group must be flushed.`
			`* A group ending NOP prevents the flush.`
			`*/`
			`#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")`

			`#if BYTE_ORDER == BIG_ENDIAN`
			`#define BYTESWAP_DATA`
			`#endif`

			`#ifdef BYTESWAP_DATA`
			`#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)`
			`#if BYTE_ORDER == LITTLE_ENDIAN`
			`/* Byte reverse permute constant LE. */`
			`static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };`
			`#else`
			`static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };`
			`#endif`
			`#else`
			`#define VEC_PERM(vr, va, vb, vc)`
			`#endif`

			`static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {`

			`const __vector unsigned long long vzero = {0,0};`
			`const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};`

			`const __vector unsigned long long vmask_32bit =`
			`(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);`

			`const __vector unsigned long long vmask_64bit =`
			`(__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);`

			`__vector unsigned long long vcrc;`

			`__vector unsigned long long vconst1, vconst2;`

			`/* vdata0-vdata7 will contain our data (p). */`
			`__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;`

			`/* v0-v7 will contain our checksums */`
			`__vector unsigned long long v0 = {0,0};`
			`__vector unsigned long long v1 = {0,0};`
			`__vector unsigned long long v2 = {0,0};`
			`__vector unsigned long long v3 = {0,0};`
			`__vector unsigned long long v4 = {0,0};`
			`__vector unsigned long long v5 = {0,0};`
			`__vector unsigned long long v6 = {0,0};`
			`__vector unsigned long long v7 = {0,0};`


			`/* Vector auxiliary variables. */`
			`__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;`

			`unsigned int offset; /* Constant table offset. */`

			`unsigned long i; /* Counter. */`
			`unsigned long chunks;`

			`unsigned long block_size;`
			`int next_block = 0;`

			`/* Align by 128 bits. The last 128 bit block will be processed at end. */`
			`unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;`

			`vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);`

			`/* Short version. */`
			`if (len < 256) {`
			`/* Calculate where in the constant table we need to start. */`
			`offset = 256 - len;`

			`vconst1 = vec_ld(offset, vcrc_short_const);`
			`vdata0 = vec_ld(0, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata0, vdata0, vconst1, vperm_const);`

			`/* xor initial value */`
			`vdata0 = vec_xor(vdata0, vcrc);`

			`vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);`
			`v0 = vec_xor(v0, vdata0);`

			`for (i = 16; i < len; i += 16) {`
			`vconst1 = vec_ld(offset + i, vcrc_short_const);`
			`vdata0 = vec_ld(i, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata0, vdata0, vconst1, vperm_const);`
			`vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata0, (__vector unsigned int)vconst1);`
			`v0 = vec_xor(v0, vdata0);`
			`}`
			`} else {`

			`/* Load initial values. */`
			`vdata0 = vec_ld(0, (__vector unsigned long long*) p);`
			`vdata1 = vec_ld(16, (__vector unsigned long long*) p);`

			`VEC_PERM(vdata0, vdata0, vdata0, vperm_const);`
			`VEC_PERM(vdata1, vdata1, vdata1, vperm_const);`

			`vdata2 = vec_ld(32, (__vector unsigned long long*) p);`
			`vdata3 = vec_ld(48, (__vector unsigned long long*) p);`

			`VEC_PERM(vdata2, vdata2, vdata2, vperm_const);`
			`VEC_PERM(vdata3, vdata3, vdata3, vperm_const);`

			`vdata4 = vec_ld(64, (__vector unsigned long long*) p);`
			`vdata5 = vec_ld(80, (__vector unsigned long long*) p);`

			`VEC_PERM(vdata4, vdata4, vdata4, vperm_const);`
			`VEC_PERM(vdata5, vdata5, vdata5, vperm_const);`

			`vdata6 = vec_ld(96, (__vector unsigned long long*) p);`
			`vdata7 = vec_ld(112, (__vector unsigned long long*) p);`

			`VEC_PERM(vdata6, vdata6, vdata6, vperm_const);`
			`VEC_PERM(vdata7, vdata7, vdata7, vperm_const);`

			`/* xor in initial value */`
			`vdata0 = vec_xor(vdata0, vcrc);`

			`p = (char *)p + 128;`

			`do {`
			`/* Checksum in blocks of MAX_SIZE. */`
			`block_size = length;`
			`if (block_size > MAX_SIZE) {`
			`block_size = MAX_SIZE;`
			`}`

			`length = length - block_size;`

			`/*`
			`* Work out the offset into the constants table to start at. Each`
			`* constant is 16 bytes, and it is used against 128 bytes of input`
			`* data - 128 / 16 = 8`
			`*/`
			`offset = (MAX_SIZE/8) - (block_size/8);`
			`/* We reduce our final 128 bytes in a separate step */`
			`chunks = (block_size/128)-1;`

			`vconst1 = vec_ld(offset, vcrc_const);`

			`va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,`
			`(__vector unsigned long long)vconst1);`
			`va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,`
			`(__vector unsigned long long)vconst1);`
			`va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,`
			`(__vector unsigned long long)vconst1);`
			`va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,`
			`(__vector unsigned long long)vconst1);`
			`va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,`
			`(__vector unsigned long long)vconst1);`
			`va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,`
			`(__vector unsigned long long)vconst1);`
			`va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,`
			`(__vector unsigned long long)vconst1);`
			`va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,`
			`(__vector unsigned long long)vconst1);`

			`if (chunks > 1) {`
			`offset += 16;`
			`vconst2 = vec_ld(offset, vcrc_const);`
			`GROUP_ENDING_NOP;`

			`vdata0 = vec_ld(0, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata0, vdata0, vdata0, vperm_const);`

			`vdata1 = vec_ld(16, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata1, vdata1, vdata1, vperm_const);`

			`vdata2 = vec_ld(32, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata2, vdata2, vdata2, vperm_const);`

			`vdata3 = vec_ld(48, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata3, vdata3, vdata3, vperm_const);`

			`vdata4 = vec_ld(64, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata4, vdata4, vdata4, vperm_const);`

			`vdata5 = vec_ld(80, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata5, vdata5, vdata5, vperm_const);`

			`vdata6 = vec_ld(96, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata6, vdata6, vdata6, vperm_const);`

			`vdata7 = vec_ld(112, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata7, vdata7, vdata7, vperm_const);`

			`p = (char *)p + 128;`

			`/*`
			`* main loop. Each iteration calculates the CRC for a 128-byte`
			`* block.`
			`*/`
			`for (i = 0; i < chunks-2; i++) {`
			`vconst1 = vec_ld(offset, vcrc_const);`
			`offset += 16;`
			`GROUP_ENDING_NOP;`

			`v0 = vec_xor(v0, va0);`
			`va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,`
			`(__vector unsigned long long)vconst2);`
			`vdata0 = vec_ld(0, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata0, vdata0, vdata0, vperm_const);`
			`GROUP_ENDING_NOP;`

			`v1 = vec_xor(v1, va1);`
			`va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,`
			`(__vector unsigned long long)vconst2);`
			`vdata1 = vec_ld(16, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata1, vdata1, vdata1, vperm_const);`
			`GROUP_ENDING_NOP;`

			`v2 = vec_xor(v2, va2);`
			`va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)`
			`vdata2, (__vector unsigned long long)vconst2);`
			`vdata2 = vec_ld(32, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata2, vdata2, vdata2, vperm_const);`
			`GROUP_ENDING_NOP;`

			`v3 = vec_xor(v3, va3);`
			`va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,`
			`(__vector unsigned long long)vconst2);`
			`vdata3 = vec_ld(48, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata3, vdata3, vdata3, vperm_const);`

			`vconst2 = vec_ld(offset, vcrc_const);`
			`GROUP_ENDING_NOP;`

			`v4 = vec_xor(v4, va4);`
			`va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,`
			`(__vector unsigned long long)vconst1);`
			`vdata4 = vec_ld(64, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata4, vdata4, vdata4, vperm_const);`
			`GROUP_ENDING_NOP;`

			`v5 = vec_xor(v5, va5);`
			`va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,`
			`(__vector unsigned long long)vconst1);`
			`vdata5 = vec_ld(80, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata5, vdata5, vdata5, vperm_const);`
			`GROUP_ENDING_NOP;`

			`v6 = vec_xor(v6, va6);`
			`va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,`
			`(__vector unsigned long long)vconst1);`
			`vdata6 = vec_ld(96, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata6, vdata6, vdata6, vperm_const);`
			`GROUP_ENDING_NOP;`

			`v7 = vec_xor(v7, va7);`
			`va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,`
			`(__vector unsigned long long)vconst1);`
			`vdata7 = vec_ld(112, (__vector unsigned long long*) p);`
			`VEC_PERM(vdata7, vdata7, vdata7, vperm_const);`

			`p = (char *)p + 128;`
			`}`

			`/* First cool down */`
			`vconst1 = vec_ld(offset, vcrc_const);`
			`offset += 16;`

			`v0 = vec_xor(v0, va0);`
			`va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,`
			`(__vector unsigned long long)vconst1);`
			`GROUP_ENDING_NOP;`

			`v1 = vec_xor(v1, va1);`
			`va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,`
			`(__vector unsigned long long)vconst1);`
			`GROUP_ENDING_NOP;`

			`v2 = vec_xor(v2, va2);`
			`va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,`
			`(__vector unsigned long long)vconst1);`
			`GROUP_ENDING_NOP;`

			`v3 = vec_xor(v3, va3);`
			`va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,`
			`(__vector unsigned long long)vconst1);`
			`GROUP_ENDING_NOP;`

			`v4 = vec_xor(v4, va4);`
			`va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,`
			`(__vector unsigned long long)vconst1);`
			`GROUP_ENDING_NOP;`

			`v5 = vec_xor(v5, va5);`
			`va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,`
			`(__vector unsigned long long)vconst1);`
			`GROUP_ENDING_NOP;`

			`v6 = vec_xor(v6, va6);`
			`va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,`
			`(__vector unsigned long long)vconst1);`
			`GROUP_ENDING_NOP;`

			`v7 = vec_xor(v7, va7);`
			`va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,`
			`(__vector unsigned long long)vconst1);`
			`}/* else */`

			`/* Second cool down. */`
			`v0 = vec_xor(v0, va0);`
			`v1 = vec_xor(v1, va1);`
			`v2 = vec_xor(v2, va2);`
			`v3 = vec_xor(v3, va3);`
			`v4 = vec_xor(v4, va4);`
			`v5 = vec_xor(v5, va5);`
			`v6 = vec_xor(v6, va6);`
			`v7 = vec_xor(v7, va7);`

			`/*`
			`* vpmsumd produces a 96 bit result in the least significant bits`
			`* of the register. Since we are bit reflected we have to shift it`
			`* left 32 bits so it occupies the least significant bits in the`
			`* bit reflected domain.`
			`*/`
			`v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,`
			`(__vector unsigned char)vzero, 4);`
			`v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,`
			`(__vector unsigned char)vzero, 4);`
			`v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,`
			`(__vector unsigned char)vzero, 4);`
			`v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,`
			`(__vector unsigned char)vzero, 4);`
			`v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,`
			`(__vector unsigned char)vzero, 4);`
			`v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,`
			`(__vector unsigned char)vzero, 4);`
			`v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,`
			`(__vector unsigned char)vzero, 4);`
			`v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,`
			`(__vector unsigned char)vzero, 4);`

			`/* xor with the last 1024 bits. */`
			`va0 = vec_ld(0, (__vector unsigned long long*) p);`
			`VEC_PERM(va0, va0, va0, vperm_const);`

			`va1 = vec_ld(16, (__vector unsigned long long*) p);`
			`VEC_PERM(va1, va1, va1, vperm_const);`

			`va2 = vec_ld(32, (__vector unsigned long long*) p);`
			`VEC_PERM(va2, va2, va2, vperm_const);`

			`va3 = vec_ld(48, (__vector unsigned long long*) p);`
			`VEC_PERM(va3, va3, va3, vperm_const);`

			`va4 = vec_ld(64, (__vector unsigned long long*) p);`
			`VEC_PERM(va4, va4, va4, vperm_const);`

			`va5 = vec_ld(80, (__vector unsigned long long*) p);`
			`VEC_PERM(va5, va5, va5, vperm_const);`

			`va6 = vec_ld(96, (__vector unsigned long long*) p);`
			`VEC_PERM(va6, va6, va6, vperm_const);`

			`va7 = vec_ld(112, (__vector unsigned long long*) p);`
			`VEC_PERM(va7, va7, va7, vperm_const);`

			`p = (char *)p + 128;`

			`vdata0 = vec_xor(v0, va0);`
			`vdata1 = vec_xor(v1, va1);`
			`vdata2 = vec_xor(v2, va2);`
			`vdata3 = vec_xor(v3, va3);`
			`vdata4 = vec_xor(v4, va4);`
			`vdata5 = vec_xor(v5, va5);`
			`vdata6 = vec_xor(v6, va6);`
			`vdata7 = vec_xor(v7, va7);`

			`/* Check if we have more blocks to process */`
			`next_block = 0;`
			`if (length != 0) {`
			`next_block = 1;`

			`/* zero v0-v7 */`
			`v0 = vec_xor(v0, v0);`
			`v1 = vec_xor(v1, v1);`
			`v2 = vec_xor(v2, v2);`
			`v3 = vec_xor(v3, v3);`
			`v4 = vec_xor(v4, v4);`
			`v5 = vec_xor(v5, v5);`
			`v6 = vec_xor(v6, v6);`
			`v7 = vec_xor(v7, v7);`
			`}`
			`length = length + 128;`

			`} while (next_block);`

			`/* Calculate how many bytes we have left. */`
			`length = (len & 127);`

			`/* Calculate where in (short) constant table we need to start. */`
			`offset = 128 - length;`

			`v0 = vec_ld(offset, vcrc_short_const);`
			`v1 = vec_ld(offset + 16, vcrc_short_const);`
			`v2 = vec_ld(offset + 32, vcrc_short_const);`
			`v3 = vec_ld(offset + 48, vcrc_short_const);`
			`v4 = vec_ld(offset + 64, vcrc_short_const);`
			`v5 = vec_ld(offset + 80, vcrc_short_const);`
			`v6 = vec_ld(offset + 96, vcrc_short_const);`
			`v7 = vec_ld(offset + 112, vcrc_short_const);`

			`offset += 128;`

			`v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata0, (__vector unsigned int)v0);`
			`v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata1, (__vector unsigned int)v1);`
			`v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata2, (__vector unsigned int)v2);`
			`v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata3, (__vector unsigned int)v3);`
			`v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata4, (__vector unsigned int)v4);`
			`v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata5, (__vector unsigned int)v5);`
			`v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata6, (__vector unsigned int)v6);`
			`v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata7, (__vector unsigned int)v7);`

			`/* Now reduce the tail (0-112 bytes). */`
			`for (i = 0; i < length; i+=16) {`
			`vdata0 = vec_ld(i,(__vector unsigned long long*)p);`
			`VEC_PERM(vdata0, vdata0, vdata0, vperm_const);`
			`va0 = vec_ld(offset + i,vcrc_short_const);`
			`va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(`
			`(__vector unsigned int)vdata0, (__vector unsigned int)va0);`
			`v0 = vec_xor(v0, va0);`
			`}`

			`/* xor all parallel chunks together. */`
			`v0 = vec_xor(v0, v1);`
			`v2 = vec_xor(v2, v3);`
			`v4 = vec_xor(v4, v5);`
			`v6 = vec_xor(v6, v7);`

			`v0 = vec_xor(v0, v2);`
			`v4 = vec_xor(v4, v6);`

			`v0 = vec_xor(v0, v4);`
			`}`

			`/* Barrett Reduction */`
			`vconst1 = vec_ld(0, v_Barrett_const);`
			`vconst2 = vec_ld(16, v_Barrett_const);`

			`v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,`
			`(__vector unsigned char)v0, 8);`
			`v0 = vec_xor(v1,v0);`

			`/* shift left one bit */`
			`__vector unsigned char vsht_splat = vec_splat_u8 (1);`
			`v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);`

			`v0 = vec_and(v0, vmask_64bit);`

			`/*`
			`* The reflected version of Barrett reduction. Instead of bit`
			`* reflecting our data (which is expensive to do), we bit reflect our`
			`* constants and our algorithm, which means the intermediate data in`
			`* our vector registers goes from 0-63 instead of 63-0. We can reflect`
			`* the algorithm because we don't carry in mod 2 arithmetic.`
			`*/`

			`/* bottom 32 bits of a */`
			`v1 = vec_and(v0, vmask_32bit);`

			`/* ma */`
			`v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,`
			`(__vector unsigned long long)vconst1);`

			`/* bottom 32bits of ma */`
			`v1 = vec_and(v1, vmask_32bit);`
			`/* qn */`
			`v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,`
			`(__vector unsigned long long)vconst2);`
			`/* a - qn, subtraction is xor in GF(2) */`
			`v0 = vec_xor (v0, v1);`

			`/*`
			`* Since we are bit reflected, the result (ie the low 32 bits) is in`
			`* the high 32 bits. We just need to shift it left 4 bytes`
			`* V0 [ 0 1 X 3 ]`
			`* V0 [ 0 X 2 3 ]`
			`*/`

			`/* shift result into the top 64 bits of v0 */`
			`v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,`
			`(__vector unsigned char)vzero, 4);`

			`#if BYTE_ORDER == BIG_ENDIAN`
			`return v0[0];`
			`#else`
			`return v0[1];`
			`#endif`
			`}`