opencv/3rdparty/zlib-ng/match_tpl.h

/* match_tpl.h -- find longest match template for compare256 variants
 *
 * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Portions copyright (C) 2014-2021 Konstantin Nosov
 *  Fast-zlib optimized longest_match
 *  https://github.com/gildor2/fast_zlib
 */

#include "zbuild.h"
#include "zutil_p.h"
#include "deflate.h"
#include "functable.h"

/* The include guard protects only the shared constant below. The function
 * template that follows sits outside the guard (presumably so this file can be
 * included once per LONGEST_MATCH / COMPARE256 variant -- confirm against the
 * including sources).
 */
#ifndef MATCH_TPL_H
#define MATCH_TPL_H

/* Compression-level threshold for the early exit in the non-LONGEST_MATCH_SLOW
 * search loop: the template sets early_exit = (s->level < EARLY_EXIT_TRIGGER_LEVEL)
 * and stops walking the hash chain as soon as a candidate fails to improve on
 * the best length.
 */
#define EARLY_EXIT_TRIGGER_LEVEL 5

#endif

/* Set match_start to the longest match starting at the given string and
 * return its length. Matches shorter or equal to prev_length are discarded,
 * in which case the result is equal to prev_length and match_start is garbage.
 * (When s->prev_length is 0 the starting best length is STD_MIN_MATCH-1.)
 *
 * Template parameters (macros the including file must provide):
 *   LONGEST_MATCH       name of the generated function
 *   COMPARE256          routine used to measure the run of equal bytes
 *   LONGEST_MATCH_SLOW  optional; when defined, enables the offset search that
 *                       switches to more distant hash chains, and disables the
 *                       low-level early exit
 *
 * Parameters:
 *   s          deflate state. Read here: strstart, w_mask, window, prev,
 *              lookahead, prev_length, max_chain_length, good_match,
 *              nice_match, plus level (fast variant) or head/update_hash
 *              (slow variant). Written here: match_start.
 *   cur_match  first candidate position, taken from the hash chain.
 *
 * Returns: the best match length found; lookahead if a longer-than-lookahead
 * match was found; in the slow variant's break_matching exit, the smaller of
 * best_len and s->lookahead.
 *
 * IN assertions: cur_match is the head of the hash chain for the current
 * string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
 * OUT assertion: the match length is not greater than s->lookahead
 */
Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
    unsigned int strstart = s->strstart;
    const unsigned wmask = s->w_mask;
    unsigned char *window = s->window;
    /* scan: the string we are trying to match (window + strstart). */
    unsigned char *scan = window + strstart;
    /* mbase_start + cur_match addresses the candidate byte compared against scan[0];
     * mbase_end + cur_match addresses the candidate bytes compared against scan_end.
     * Both bases are shifted by match_offset in the slow variant.
     */
    Z_REGISTER unsigned char *mbase_start = window;
    Z_REGISTER unsigned char *mbase_end;
    const Pos *prev = s->prev;
    Pos limit;
#ifdef LONGEST_MATCH_SLOW
    /* limit before any match_offset adjustment. */
    Pos limit_base;
#else
    int32_t early_exit;
#endif
    uint32_t chain_length, nice_match, best_len, offset;
    uint32_t lookahead = s->lookahead;
    /* Distance from the start of scan to the sub-string whose hash chain is
     * being walked; stays 0 unless LONGEST_MATCH_SLOW changes it.
     */
    Pos match_offset = 0;
#ifdef UNALIGNED_OK
    uint8_t scan_start[8];
#endif
    uint8_t scan_end[8];

/* Step to the next chain entry: while the chain budget is not exhausted and the
 * next position is still above limit, restart the enclosing loop; otherwise
 * return the best length so far. Expands to bare statements containing
 * continue/return, so it is only usable inside a loop of this function.
 */
#define GOTO_NEXT_CHAIN \
    if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \
        continue; \
    return best_len;

    /* The code is optimized for STD_MAX_MATCH-2 multiple of 16. */
    Assert(STD_MAX_MATCH == 258, "Code too clever");

    /* Start from the previous match length, or STD_MIN_MATCH-1 when there is none. */
    best_len = s->prev_length ? s->prev_length : STD_MIN_MATCH-1;

    /* Calculate read offset which should only extend an extra byte
     * to find the next best match length.
     */
    offset = best_len-1;
#ifdef UNALIGNED_OK
    if (best_len >= sizeof(uint32_t)) {
        offset -= 2;
#ifdef UNALIGNED64_OK
        if (best_len >= sizeof(uint64_t))
            offset -= 4;
#endif
    }
#endif

    /* Snapshot the leading bytes of scan (scan_start, unaligned builds only) and
     * the bytes at scan+offset (scan_end); these are what the quick-reject
     * comparisons in the main loop test candidates against.
     */
#ifdef UNALIGNED64_OK
    memcpy(scan_start, scan, sizeof(uint64_t));
    memcpy(scan_end, scan+offset, sizeof(uint64_t));
#elif defined(UNALIGNED_OK)
    memcpy(scan_start, scan, sizeof(uint32_t));
    memcpy(scan_end, scan+offset, sizeof(uint32_t));
#else
    scan_end[0] = *(scan+offset);
    scan_end[1] = *(scan+offset+1);
#endif
    mbase_end  = (mbase_start+offset);

    /* Do not waste too much time if we already have a good match */
    chain_length = s->max_chain_length;
    if (best_len >= s->good_match)
        chain_length >>= 2;
    nice_match = (uint32_t)s->nice_match;

    /* Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0
     */
    limit = strstart > MAX_DIST(s) ? (Pos)(strstart - MAX_DIST(s)) : 0;
#ifdef LONGEST_MATCH_SLOW
    limit_base = limit;
    if (best_len >= STD_MIN_MATCH) {
        /* We're continuing search (lazy evaluation). */
        uint32_t i, hash;
        Pos pos;

        /* Find a most distant chain starting from scan with index=1 (index=0 corresponds
         * to cur_match). We cannot use s->prev[strstart+1,...] immediately, because
         * these strings are not yet inserted into the hash table.
         */
        hash = s->update_hash(s, 0, scan[1]);
        hash = s->update_hash(s, hash, scan[2]);

        for (i = 3; i <= best_len; i++) {
            hash = s->update_hash(s, hash, scan[i]);

            /* If we're starting with best_len >= 3, we can use offset search. */
            pos = s->head[hash];
            if (pos < cur_match) {
                /* The hash just completed covers scan[i-2..i], hence offset i-2. */
                match_offset = (Pos)(i - 2);
                cur_match = pos;
            }
        }

        /* Update offset-dependent variables */
        limit = limit_base+match_offset;
        if (cur_match <= limit)
            goto break_matching;
        /* Rebase so mbase_*+cur_match still points at the candidate's byte 0 / end bytes. */
        mbase_start -= match_offset;
        mbase_end -= match_offset;
    }
#else
    /* Below EARLY_EXIT_TRIGGER_LEVEL, give up on the first non-improving candidate. */
    early_exit = s->level < EARLY_EXIT_TRIGGER_LEVEL;
#endif
    Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead");
    for (;;) {
        /* A candidate at or beyond strstart is not usable; stop searching. */
        if (cur_match >= strstart)
            break;

        /* Skip to next match if the match length cannot increase or if the match length is
         * less than 2. Note that the checks below for insufficient lookahead only occur
         * occasionally for performance reasons.
         * Therefore uninitialized memory will be accessed and conditional jumps will be made
         * that depend on those values. However the length of the match is limited to the
         * lookahead, so the output of deflate is not affected by the uninitialized values.
         */
        /* Quick reject: each inner loop walks the chain (via GOTO_NEXT_CHAIN) until a
         * candidate agrees with scan at both the end window and the start window. The
         * compare width (2, 4 or 8 bytes) is chosen from best_len, mirroring how offset
         * was computed.
         */
#ifdef UNALIGNED_OK
        if (best_len < sizeof(uint32_t)) {
            for (;;) {
                if (zng_memcmp_2(mbase_end+cur_match, scan_end) == 0 &&
                    zng_memcmp_2(mbase_start+cur_match, scan_start) == 0)
                    break;
                GOTO_NEXT_CHAIN;
            }
#  ifdef UNALIGNED64_OK
        } else if (best_len >= sizeof(uint64_t)) {
            for (;;) {
                if (zng_memcmp_8(mbase_end+cur_match, scan_end) == 0 &&
                    zng_memcmp_8(mbase_start+cur_match, scan_start) == 0)
                    break;
                GOTO_NEXT_CHAIN;
            }
#  endif
        } else {
            for (;;) {
                if (zng_memcmp_4(mbase_end+cur_match, scan_end) == 0 &&
                    zng_memcmp_4(mbase_start+cur_match, scan_start) == 0)
                    break;
                GOTO_NEXT_CHAIN;
            }
        }
#else
        for (;;) {
            if (mbase_end[cur_match] == scan_end[0] && mbase_end[cur_match+1] == scan_end[1] &&
                mbase_start[cur_match] == scan[0] && mbase_start[cur_match+1] == scan[1])
                break;
            GOTO_NEXT_CHAIN;
        }
#endif
        /* The first two bytes are already known equal, so compare from byte 2 and add 2.
         * NOTE(review): COMPARE256 presumably returns the count of equal leading bytes
         * (at most 256, consistent with STD_MAX_MATCH == 258) -- confirm in its definition.
         */
        uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2;
        Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan");

        if (len > best_len) {
            /* Undo the chain offset to obtain the window position where the match begins. */
            uint32_t match_start = cur_match - match_offset;
            s->match_start = match_start;

            /* Do not look for matches beyond the end of the input. */
            if (len > lookahead)
                return lookahead;
            best_len = len;
            if (best_len >= nice_match)
                return best_len;

            /* Recompute the end-window offset and snapshot for the new best length
             * (same rule as the initial computation above).
             */
            offset = best_len-1;
#ifdef UNALIGNED_OK
            if (best_len >= sizeof(uint32_t)) {
                offset -= 2;
#ifdef UNALIGNED64_OK
                if (best_len >= sizeof(uint64_t))
                    offset -= 4;
#endif
            }
#endif

#ifdef UNALIGNED64_OK
            memcpy(scan_end, scan+offset, sizeof(uint64_t));
#elif defined(UNALIGNED_OK)
            memcpy(scan_end, scan+offset, sizeof(uint32_t));
#else
            scan_end[0] = *(scan+offset);
            scan_end[1] = *(scan+offset+1);
#endif

#ifdef LONGEST_MATCH_SLOW
            /* Look for a better string offset */
            if (UNLIKELY(len > STD_MIN_MATCH && match_start + len < strstart)) {
                Pos pos, next_pos;
                uint32_t i, hash;
                unsigned char *scan_endstr;

                /* Go back to offset 0 */
                cur_match -= match_offset;
                match_offset = 0;
                next_pos = cur_match;
                /* Among the chains of the matched positions, pick the one whose
                 * predecessor is farthest back (smallest position).
                 */
                for (i = 0; i <= len - STD_MIN_MATCH; i++) {
                    pos = prev[(cur_match + i) & wmask];
                    if (pos < next_pos) {
                        /* Hash chain is more distant, use it */
                        if (pos <= limit_base + i)
                            goto break_matching;
                        next_pos = pos;
                        match_offset = (Pos)i;
                    }
                }
                /* Switch cur_match to next_pos chain */
                cur_match = next_pos;

                /* Try hash head at len-(STD_MIN_MATCH-1) position to see if we could get
                 * a better cur_match at the end of string. Using (STD_MIN_MATCH-1) lets
                 * us include one more byte into hash - the byte which will be checked
                 * in main loop now, and which allows to grow match by 1.
                 */
                scan_endstr = scan + len - (STD_MIN_MATCH+1);

                hash = s->update_hash(s, 0, scan_endstr[0]);
                hash = s->update_hash(s, hash, scan_endstr[1]);
                hash = s->update_hash(s, hash, scan_endstr[2]);

                pos = s->head[hash];
                if (pos < cur_match) {
                    match_offset = (Pos)(len - (STD_MIN_MATCH+1));
                    if (pos <= limit_base + match_offset)
                        goto break_matching;
                    cur_match = pos;
                }

                /* Update offset-dependent variables */
                limit = limit_base+match_offset;
                mbase_start = window-match_offset;
                mbase_end = (mbase_start+offset);
                /* cur_match already names the next candidate, so skip GOTO_NEXT_CHAIN. */
                continue;
            }
#endif
            /* Point the end-window base at the new offset. */
            mbase_end = (mbase_start+offset);
        }
#ifndef LONGEST_MATCH_SLOW
        else if (UNLIKELY(early_exit)) {
            /* The probability of finding a match later if we here is pretty low, so for
             * performance it's best to outright stop here for the lower compression levels
             */
            break;
        }
#endif
        GOTO_NEXT_CHAIN;
    }
    return best_len;

#ifdef LONGEST_MATCH_SLOW
/* Exit used when the offset search runs past the distance limit: return
 * best_len clamped to s->lookahead.
 */
break_matching:

    if (best_len < s->lookahead)
        return best_len;

    return s->lookahead;
#endif
}

/* Clear the template parameters consumed above (presumably so that a later
 * inclusion of this file can define its own variant -- confirm against the
 * including sources).
 */
#undef LONGEST_MATCH_SLOW
#undef LONGEST_MATCH
#undef COMPARE256
Add zlib-ng as an alternative zlib implementation Zlib-ng is a zlib replacement with optimizations for "next generation" systems. Its optimizations may benefit the decode and encode speed of image libraries such as libpng. In our tests, when using the zlib-ng and libpng combination on an x86_64 machine with AVX2, the time of `imdecode` and `imencode` drops by approximately 20%. This patch enables zlib-ng's optimizations if `CV_DISABLE_OPTIMIZATION` is OFF. Since zlib-ng can dispatch intrinsics on the fly, the porting work is much easier. Related discussion: https://github.com/opencv/opencv/issues/22573 2023-12-27 12:06:17 +08:00			`/* match_tpl.h -- find longest match template for compare256 variants`
			`*`
			`* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler`
			`* For conditions of distribution and use, see copyright notice in zlib.h`
			`*`
			`* Portions copyright (C) 2014-2021 Konstantin Nosov`
			`* Fast-zlib optimized longest_match`
			`* https://github.com/gildor2/fast_zlib`
			`*/`

			`#include "zbuild.h"`
			`#include "zutil_p.h"`
			`#include "deflate.h"`
			`#include "functable.h"`

			`#ifndef MATCH_TPL_H`
			`#define MATCH_TPL_H`

			`#define EARLY_EXIT_TRIGGER_LEVEL 5`

			`#endif`

			`/* Set match_start to the longest match starting at the given string and`
			`* return its length. Matches shorter or equal to prev_length are discarded,`
			`* in which case the result is equal to prev_length and match_start is garbage.`
			`*`
			`* IN assertions: cur_match is the head of the hash chain for the current`
			`* string (strstart) and its distance is <= MAX_DIST, and prev_length >=1`
			`* OUT assertion: the match length is not greater than s->lookahead`
			`*/`
			`Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {`
			`unsigned int strstart = s->strstart;`
			`const unsigned wmask = s->w_mask;`
			`unsigned char *window = s->window;`
			`unsigned char *scan = window + strstart;`
			`Z_REGISTER unsigned char *mbase_start = window;`
			`Z_REGISTER unsigned char *mbase_end;`
			`const Pos *prev = s->prev;`
			`Pos limit;`
			`#ifdef LONGEST_MATCH_SLOW`
			`Pos limit_base;`
			`#else`
			`int32_t early_exit;`
			`#endif`
			`uint32_t chain_length, nice_match, best_len, offset;`
			`uint32_t lookahead = s->lookahead;`
			`Pos match_offset = 0;`
			`#ifdef UNALIGNED_OK`
			`uint8_t scan_start[8];`
			`#endif`
			`uint8_t scan_end[8];`

			`#define GOTO_NEXT_CHAIN \`
			`if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \`
			`continue; \`
			`return best_len;`

			`/* The code is optimized for STD_MAX_MATCH-2 multiple of 16. */`
			`Assert(STD_MAX_MATCH == 258, "Code too clever");`

			`best_len = s->prev_length ? s->prev_length : STD_MIN_MATCH-1;`

			`/* Calculate read offset which should only extend an extra byte`
			`* to find the next best match length.`
			`*/`
			`offset = best_len-1;`
			`#ifdef UNALIGNED_OK`
			`if (best_len >= sizeof(uint32_t)) {`
			`offset -= 2;`
			`#ifdef UNALIGNED64_OK`
			`if (best_len >= sizeof(uint64_t))`
			`offset -= 4;`
			`#endif`
			`}`
			`#endif`

			`#ifdef UNALIGNED64_OK`
			`memcpy(scan_start, scan, sizeof(uint64_t));`
			`memcpy(scan_end, scan+offset, sizeof(uint64_t));`
			`#elif defined(UNALIGNED_OK)`
			`memcpy(scan_start, scan, sizeof(uint32_t));`
			`memcpy(scan_end, scan+offset, sizeof(uint32_t));`
			`#else`
			`scan_end[0] = *(scan+offset);`
			`scan_end[1] = *(scan+offset+1);`
			`#endif`
			`mbase_end = (mbase_start+offset);`

			`/* Do not waste too much time if we already have a good match */`
			`chain_length = s->max_chain_length;`
			`if (best_len >= s->good_match)`
			`chain_length >>= 2;`
			`nice_match = (uint32_t)s->nice_match;`

			`/* Stop when cur_match becomes <= limit. To simplify the code,`
			`* we prevent matches with the string of window index 0`
			`*/`
			`limit = strstart > MAX_DIST(s) ? (Pos)(strstart - MAX_DIST(s)) : 0;`
			`#ifdef LONGEST_MATCH_SLOW`
			`limit_base = limit;`
			`if (best_len >= STD_MIN_MATCH) {`
			`/* We're continuing search (lazy evaluation). */`
			`uint32_t i, hash;`
			`Pos pos;`

			`/* Find a most distant chain starting from scan with index=1 (index=0 corresponds`
			`* to cur_match). We cannot use s->prev[strstart+1,...] immediately, because`
			`* these strings are not yet inserted into the hash table.`
			`*/`
			`hash = s->update_hash(s, 0, scan[1]);`
			`hash = s->update_hash(s, hash, scan[2]);`

			`for (i = 3; i <= best_len; i++) {`
			`hash = s->update_hash(s, hash, scan[i]);`

			`/* If we're starting with best_len >= 3, we can use offset search. */`
			`pos = s->head[hash];`
			`if (pos < cur_match) {`
			`match_offset = (Pos)(i - 2);`
			`cur_match = pos;`
			`}`
			`}`

			`/* Update offset-dependent variables */`
			`limit = limit_base+match_offset;`
			`if (cur_match <= limit)`
			`goto break_matching;`
			`mbase_start -= match_offset;`
			`mbase_end -= match_offset;`
			`}`
			`#else`
			`early_exit = s->level < EARLY_EXIT_TRIGGER_LEVEL;`
			`#endif`
			`Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead");`
			`for (;;) {`
			`if (cur_match >= strstart)`
			`break;`

			`/* Skip to next match if the match length cannot increase or if the match length is`
			`* less than 2. Note that the checks below for insufficient lookahead only occur`
			`* occasionally for performance reasons.`
			`* Therefore uninitialized memory will be accessed and conditional jumps will be made`
			`* that depend on those values. However the length of the match is limited to the`
			`* lookahead, so the output of deflate is not affected by the uninitialized values.`
			`*/`
			`#ifdef UNALIGNED_OK`
			`if (best_len < sizeof(uint32_t)) {`
			`for (;;) {`
			`if (zng_memcmp_2(mbase_end+cur_match, scan_end) == 0 &&`
			`zng_memcmp_2(mbase_start+cur_match, scan_start) == 0)`
			`break;`
			`GOTO_NEXT_CHAIN;`
			`}`
			`# ifdef UNALIGNED64_OK`
			`} else if (best_len >= sizeof(uint64_t)) {`
			`for (;;) {`
			`if (zng_memcmp_8(mbase_end+cur_match, scan_end) == 0 &&`
			`zng_memcmp_8(mbase_start+cur_match, scan_start) == 0)`
			`break;`
			`GOTO_NEXT_CHAIN;`
			`}`
			`# endif`
			`} else {`
			`for (;;) {`
			`if (zng_memcmp_4(mbase_end+cur_match, scan_end) == 0 &&`
			`zng_memcmp_4(mbase_start+cur_match, scan_start) == 0)`
			`break;`
			`GOTO_NEXT_CHAIN;`
			`}`
			`}`
			`#else`
			`for (;;) {`
			`if (mbase_end[cur_match] == scan_end[0] && mbase_end[cur_match+1] == scan_end[1] &&`
			`mbase_start[cur_match] == scan[0] && mbase_start[cur_match+1] == scan[1])`
			`break;`
			`GOTO_NEXT_CHAIN;`
			`}`
			`#endif`
			`uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2;`
			`Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan");`

			`if (len > best_len) {`
			`uint32_t match_start = cur_match - match_offset;`
			`s->match_start = match_start;`

			`/* Do not look for matches beyond the end of the input. */`
			`if (len > lookahead)`
			`return lookahead;`
			`best_len = len;`
			`if (best_len >= nice_match)`
			`return best_len;`

			`offset = best_len-1;`
			`#ifdef UNALIGNED_OK`
			`if (best_len >= sizeof(uint32_t)) {`
			`offset -= 2;`
			`#ifdef UNALIGNED64_OK`
			`if (best_len >= sizeof(uint64_t))`
			`offset -= 4;`
			`#endif`
			`}`
			`#endif`

			`#ifdef UNALIGNED64_OK`
			`memcpy(scan_end, scan+offset, sizeof(uint64_t));`
			`#elif defined(UNALIGNED_OK)`
			`memcpy(scan_end, scan+offset, sizeof(uint32_t));`
			`#else`
			`scan_end[0] = *(scan+offset);`
			`scan_end[1] = *(scan+offset+1);`
			`#endif`

			`#ifdef LONGEST_MATCH_SLOW`
			`/* Look for a better string offset */`
			`if (UNLIKELY(len > STD_MIN_MATCH && match_start + len < strstart)) {`
			`Pos pos, next_pos;`
			`uint32_t i, hash;`
			`unsigned char *scan_endstr;`

			`/* Go back to offset 0 */`
			`cur_match -= match_offset;`
			`match_offset = 0;`
			`next_pos = cur_match;`
			`for (i = 0; i <= len - STD_MIN_MATCH; i++) {`
			`pos = prev[(cur_match + i) & wmask];`
			`if (pos < next_pos) {`
			`/* Hash chain is more distant, use it */`
			`if (pos <= limit_base + i)`
			`goto break_matching;`
			`next_pos = pos;`
			`match_offset = (Pos)i;`
			`}`
			`}`
			`/* Switch cur_match to next_pos chain */`
			`cur_match = next_pos;`

			`/* Try hash head at len-(STD_MIN_MATCH-1) position to see if we could get`
			`* a better cur_match at the end of string. Using (STD_MIN_MATCH-1) lets`
			`* us include one more byte into hash - the byte which will be checked`
			`* in main loop now, and which allows to grow match by 1.`
			`*/`
			`scan_endstr = scan + len - (STD_MIN_MATCH+1);`

			`hash = s->update_hash(s, 0, scan_endstr[0]);`
			`hash = s->update_hash(s, hash, scan_endstr[1]);`
			`hash = s->update_hash(s, hash, scan_endstr[2]);`

			`pos = s->head[hash];`
			`if (pos < cur_match) {`
			`match_offset = (Pos)(len - (STD_MIN_MATCH+1));`
			`if (pos <= limit_base + match_offset)`
			`goto break_matching;`
			`cur_match = pos;`
			`}`

			`/* Update offset-dependent variables */`
			`limit = limit_base+match_offset;`
			`mbase_start = window-match_offset;`
			`mbase_end = (mbase_start+offset);`
			`continue;`
			`}`
			`#endif`
			`mbase_end = (mbase_start+offset);`
			`}`
			`#ifndef LONGEST_MATCH_SLOW`
			`else if (UNLIKELY(early_exit)) {`
			`/* The probability of finding a match later if we here is pretty low, so for`
			`* performance it's best to outright stop here for the lower compression levels`
			`*/`
			`break;`
			`}`
			`#endif`
			`GOTO_NEXT_CHAIN;`
			`}`
			`return best_len;`

			`#ifdef LONGEST_MATCH_SLOW`
			`break_matching:`

			`if (best_len < s->lookahead)`
			`return best_len;`

			`return s->lookahead;`
			`#endif`
			`}`

			`#undef LONGEST_MATCH_SLOW`
			`#undef LONGEST_MATCH`
			`#undef COMPARE256`