diff --git a/3rdparty/quirc/CMakeLists.txt b/3rdparty/quirc/CMakeLists.txt new file mode 100644 index 0000000000..7a6b2bb222 --- /dev/null +++ b/3rdparty/quirc/CMakeLists.txt @@ -0,0 +1,30 @@ +project(quirc) + +set(CURR_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/include") + +set_property(GLOBAL PROPERTY QUIRC_INCLUDE_DIR ${CURR_INCLUDE_DIR}) +ocv_include_directories(${CURR_INCLUDE_DIR}) + +file(GLOB_RECURSE quirc_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "include/*.h") +file(GLOB_RECURSE quirc_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "src/*.c") + +add_library(${PROJECT_NAME} STATIC ${quirc_headers} ${quirc_sources}) +ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-variable -Wshadow) + +set_target_properties(${PROJECT_NAME} + PROPERTIES OUTPUT_NAME ${PROJECT_NAME} + DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" + COMPILE_PDB_NAME ${PROJECT_NAME} + COMPILE_PDB_NAME_DEBUG "${PROJECT_NAME}${OPENCV_DEBUG_POSTFIX}" + ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH} + ) + +if(ENABLE_SOLUTION_FOLDERS) + set_target_properties(${PROJECT_NAME} PROPERTIES FOLDER "3rdparty") +endif() + +if(NOT BUILD_SHARED_LIBS) + ocv_install_target(${PROJECT_NAME} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) +endif() + +ocv_install_3rdparty_licenses(${PROJECT_NAME} LICENSE) diff --git a/3rdparty/quirc/LICENSE b/3rdparty/quirc/LICENSE new file mode 100644 index 0000000000..d47c0262c2 --- /dev/null +++ b/3rdparty/quirc/LICENSE @@ -0,0 +1,16 @@ +quirc -- QR-code recognition library +Copyright (C) 2010-2012 Daniel Beer + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that the +above copyright notice and this permission notice appear in all +copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE +AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL +DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR +PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. diff --git a/3rdparty/quirc/include/quirc.h b/3rdparty/quirc/include/quirc.h new file mode 100644 index 0000000000..0e7cb94d1c --- /dev/null +++ b/3rdparty/quirc/include/quirc.h @@ -0,0 +1,173 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef QUIRC_H_ +#define QUIRC_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct quirc; + +/* Obtain the library version string. */ +const char *quirc_version(void); + +/* Construct a new QR-code recognizer. 
This function will return NULL + * if sufficient memory could not be allocated. + */ +struct quirc *quirc_new(void); + +/* Destroy a QR-code recognizer. */ +void quirc_destroy(struct quirc *q); + +/* Resize the QR-code recognizer. The size of an image must be + * specified before codes can be analyzed. + * + * This function returns 0 on success, or -1 if sufficient memory could + * not be allocated. + */ +int quirc_resize(struct quirc *q, int w, int h); + +/* These functions are used to process images for QR-code recognition. + * quirc_begin() must first be called to obtain access to a buffer into + * which the input image should be placed. Optionally, the current + * width and height may be returned. + * + * After filling the buffer, quirc_end() should be called to process + * the image for QR-code recognition. The locations and content of each + * code may be obtained using accessor functions described below. + */ +uint8_t *quirc_begin(struct quirc *q, int *w, int *h); +void quirc_end(struct quirc *q); + +/* This structure describes a location in the input image buffer. */ +struct quirc_point { + int x; + int y; +}; + +/* This enum describes the various decoder errors which may occur. */ +typedef enum { + QUIRC_SUCCESS = 0, + QUIRC_ERROR_INVALID_GRID_SIZE, + QUIRC_ERROR_INVALID_VERSION, + QUIRC_ERROR_FORMAT_ECC, + QUIRC_ERROR_DATA_ECC, + QUIRC_ERROR_UNKNOWN_DATA_TYPE, + QUIRC_ERROR_DATA_OVERFLOW, + QUIRC_ERROR_DATA_UNDERFLOW +} quirc_decode_error_t; + +/* Return a string error message for an error code. */ +const char *quirc_strerror(quirc_decode_error_t err); + +/* Limits on the maximum size of QR-codes and their content. */ +#define QUIRC_MAX_BITMAP 3917 +#define QUIRC_MAX_PAYLOAD 8896 + +/* QR-code ECC types. */ +#define QUIRC_ECC_LEVEL_M 0 +#define QUIRC_ECC_LEVEL_L 1 +#define QUIRC_ECC_LEVEL_H 2 +#define QUIRC_ECC_LEVEL_Q 3 + +/* QR-code data types. */ +#define QUIRC_DATA_TYPE_NUMERIC 1 +#define QUIRC_DATA_TYPE_ALPHA 2 +#define QUIRC_DATA_TYPE_BYTE 4 +#define QUIRC_DATA_TYPE_KANJI 8 + +/* Common character encodings */ +#define QUIRC_ECI_ISO_8859_1 1 +#define QUIRC_ECI_IBM437 2 +#define QUIRC_ECI_ISO_8859_2 4 +#define QUIRC_ECI_ISO_8859_3 5 +#define QUIRC_ECI_ISO_8859_4 6 +#define QUIRC_ECI_ISO_8859_5 7 +#define QUIRC_ECI_ISO_8859_6 8 +#define QUIRC_ECI_ISO_8859_7 9 +#define QUIRC_ECI_ISO_8859_8 10 +#define QUIRC_ECI_ISO_8859_9 11 +#define QUIRC_ECI_WINDOWS_874 13 +#define QUIRC_ECI_ISO_8859_13 15 +#define QUIRC_ECI_ISO_8859_15 17 +#define QUIRC_ECI_SHIFT_JIS 20 +#define QUIRC_ECI_UTF_8 26 + +/* This structure is used to return information about detected QR codes + * in the input image. + */ +struct quirc_code { + /* The four corners of the QR-code, from top left, clockwise */ + struct quirc_point corners[4]; + + /* The number of cells across in the QR-code. The cell bitmap + * is a bitmask giving the actual values of cells. If the cell + * at (x, y) is black, then the following bit is set: + * + * cell_bitmap[i >> 3] & (1 << (i & 7)) + * + * where i = (y * size) + x. + */ + int size; + uint8_t cell_bitmap[QUIRC_MAX_BITMAP]; +}; + +/* This structure holds the decoded QR-code data */ +struct quirc_data { + /* Various parameters of the QR-code. These can mostly be + * ignored if you only care about the data. + */ + int version; + int ecc_level; + int mask; + + /* This field is the highest-valued data type found in the QR + * code. + */ + int data_type; + + /* Data payload. For the Kanji datatype, payload is encoded as + * Shift-JIS. For all other datatypes, payload is ASCII text. 
+ */ + uint8_t payload[QUIRC_MAX_PAYLOAD]; + int payload_len; + + /* ECI assignment number */ + uint32_t eci; +}; + +/* Return the number of QR-codes identified in the last processed + * image. + */ +int quirc_count(const struct quirc *q); + +/* Extract the QR-code specified by the given index. */ +void quirc_extract(const struct quirc *q, int index, + struct quirc_code *code); + +/* Decode a QR-code, returning the payload data. */ +quirc_decode_error_t quirc_decode(const struct quirc_code *code, + struct quirc_data *data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/3rdparty/quirc/include/quirc_internal.h b/3rdparty/quirc/include/quirc_internal.h new file mode 100644 index 0000000000..56f5d28bfa --- /dev/null +++ b/3rdparty/quirc/include/quirc_internal.h @@ -0,0 +1,115 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef QUIRC_INTERNAL_H_ +#define QUIRC_INTERNAL_H_ + +#include + +#define QUIRC_PIXEL_WHITE 0 +#define QUIRC_PIXEL_BLACK 1 +#define QUIRC_PIXEL_REGION 2 + +#ifndef QUIRC_MAX_REGIONS +#define QUIRC_MAX_REGIONS 254 +#endif +#define QUIRC_MAX_CAPSTONES 32 +#define QUIRC_MAX_GRIDS 8 + +#define QUIRC_PERSPECTIVE_PARAMS 8 + +#if QUIRC_MAX_REGIONS < UINT8_MAX +typedef uint8_t quirc_pixel_t; +#elif QUIRC_MAX_REGIONS < UINT16_MAX +typedef uint16_t quirc_pixel_t; +#else +#error "QUIRC_MAX_REGIONS > 65534 is not supported" +#endif + +struct quirc_region { + struct quirc_point seed; + int count; + int capstone; +}; + +struct quirc_capstone { + int ring; + int stone; + + struct quirc_point corners[4]; + struct quirc_point center; + double c[QUIRC_PERSPECTIVE_PARAMS]; + + int qr_grid; +}; + +struct quirc_grid { + /* Capstone indices */ + int caps[3]; + + /* Alignment pattern region and corner */ + int align_region; + struct quirc_point align; + + /* Timing pattern endpoints */ + struct quirc_point tpep[3]; + int hscan; + int vscan; + + /* Grid size and perspective transform */ + int grid_size; + double c[QUIRC_PERSPECTIVE_PARAMS]; +}; + +struct quirc { + uint8_t *image; + quirc_pixel_t *pixels; + int *row_average; /* used by threshold() */ + int w; + int h; + + int num_regions; + struct quirc_region regions[QUIRC_MAX_REGIONS]; + + int num_capstones; + struct quirc_capstone capstones[QUIRC_MAX_CAPSTONES]; + + int num_grids; + struct quirc_grid grids[QUIRC_MAX_GRIDS]; +}; + +/************************************************************************ + * QR-code version information database + */ + +#define QUIRC_MAX_VERSION 40 +#define QUIRC_MAX_ALIGNMENT 7 + +struct quirc_rs_params { + int bs; /* Small block size */ + int dw; /* Small data words */ + int ns; /* Number of small blocks */ +}; + +struct quirc_version_info { + int data_bytes; + int apat[QUIRC_MAX_ALIGNMENT]; + struct quirc_rs_params ecc[4]; +}; 
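For reference, the public API declared in quirc.h above is driven in a fixed sequence: allocate a recognizer, size it to the image, copy one byte per pixel into the buffer returned by quirc_begin(), call quirc_end() to run detection, then extract and decode each identified code. The following is an editorial usage sketch, not part of this patch; the helper name scan_image and the assumption of an existing 8-bit grayscale buffer are illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "quirc.h"

/* Editorial sketch: decode every QR code found in a w x h grayscale image. */
static void scan_image(const uint8_t *gray, int w, int h)
{
    struct quirc *q = quirc_new();
    if (!q)
        return;                              /* out of memory */
    if (quirc_resize(q, w, h) < 0) {         /* returns -1 on allocation failure */
        quirc_destroy(q);
        return;
    }

    int qw = 0, qh = 0;
    uint8_t *buf = quirc_begin(q, &qw, &qh); /* qw/qh report the current size */
    memcpy(buf, gray, (size_t)qw * (size_t)qh);
    quirc_end(q);                            /* run detection on the filled buffer */

    for (int i = 0; i < quirc_count(q); i++) {
        struct quirc_code code;
        struct quirc_data data;
        quirc_extract(q, i, &code);
        quirc_decode_error_t err = quirc_decode(&code, &data);
        if (err)
            printf("decode failed: %s\n", quirc_strerror(err));
        else
            printf("payload: %s\n", (const char *)data.payload); /* NUL-terminated */
    }
    quirc_destroy(q);
}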
+ +extern const struct quirc_version_info quirc_version_db[QUIRC_MAX_VERSION + 1]; + +#endif diff --git a/3rdparty/quirc/src/decode.c b/3rdparty/quirc/src/decode.c new file mode 100644 index 0000000000..f556097b65 --- /dev/null +++ b/3rdparty/quirc/src/decode.c @@ -0,0 +1,919 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include +#include + +#define MAX_POLY 64 + +/************************************************************************ + * Galois fields + */ + +struct galois_field { + int p; + const uint8_t *log; + const uint8_t *exp; +}; + +static const uint8_t gf16_exp[16] = { + 0x01, 0x02, 0x04, 0x08, 0x03, 0x06, 0x0c, 0x0b, + 0x05, 0x0a, 0x07, 0x0e, 0x0f, 0x0d, 0x09, 0x01 +}; + +static const uint8_t gf16_log[16] = { + 0x00, 0x0f, 0x01, 0x04, 0x02, 0x08, 0x05, 0x0a, + 0x03, 0x0e, 0x09, 0x07, 0x06, 0x0d, 0x0b, 0x0c +}; + +static const struct galois_field gf16 = { + .p = 15, + .log = gf16_log, + .exp = gf16_exp +}; + +static const uint8_t gf256_exp[256] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, + 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, + 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, + 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, + 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, + 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, + 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, + 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, + 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, + 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, + 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, + 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, + 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, + 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, + 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, + 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, + 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, + 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, + 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, + 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, + 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, + 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, + 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, + 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, + 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, + 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, + 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, + 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, + 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, + 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, + 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 +}; + +static const uint8_t gf256_log[256] = { + 0x00, 0xff, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, + 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, + 0x04, 0x64, 0xe0, 0x0e, 0x34, 
0x8d, 0xef, 0x81, + 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, + 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, + 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, + 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, + 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, + 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, + 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, + 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, + 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, + 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, + 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, + 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, + 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, + 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, + 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, + 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, + 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, + 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, + 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, + 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, + 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, + 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, + 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, + 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, + 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, + 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, + 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, + 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, + 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf +}; + +static const struct galois_field gf256 = { + .p = 255, + .log = gf256_log, + .exp = gf256_exp +}; + +/************************************************************************ + * Polynomial operations + */ + +static void poly_add(uint8_t *dst, const uint8_t *src, uint8_t c, + int shift, const struct galois_field *gf) +{ + int i; + int log_c = gf->log[c]; + + if (!c) + return; + + for (i = 0; i < MAX_POLY; i++) { + int p = i + shift; + uint8_t v = src[i]; + + if (p < 0 || p >= MAX_POLY) + continue; + if (!v) + continue; + + dst[p] ^= gf->exp[(gf->log[v] + log_c) % gf->p]; + } +} + +static uint8_t poly_eval(const uint8_t *s, uint8_t x, + const struct galois_field *gf) +{ + int i; + uint8_t sum = 0; + uint8_t log_x = gf->log[x]; + + if (!x) + return s[0]; + + for (i = 0; i < MAX_POLY; i++) { + uint8_t c = s[i]; + + if (!c) + continue; + + sum ^= gf->exp[(gf->log[c] + log_x * i) % gf->p]; + } + + return sum; +} + +/************************************************************************ + * Berlekamp-Massey algorithm for finding error locator polynomials. 
+ */ + +static void berlekamp_massey(const uint8_t *s, int N, + const struct galois_field *gf, + uint8_t *sigma) +{ + uint8_t C[MAX_POLY]; + uint8_t B[MAX_POLY]; + int L = 0; + int m = 1; + uint8_t b = 1; + int n; + + memset(B, 0, sizeof(B)); + memset(C, 0, sizeof(C)); + B[0] = 1; + C[0] = 1; + + for (n = 0; n < N; n++) { + uint8_t d = s[n]; + uint8_t mult; + int i; + + for (i = 1; i <= L; i++) { + if (!(C[i] && s[n - i])) + continue; + + d ^= gf->exp[(gf->log[C[i]] + + gf->log[s[n - i]]) % + gf->p]; + } + + mult = gf->exp[(gf->p - gf->log[b] + gf->log[d]) % gf->p]; + + if (!d) { + m++; + } else if (L * 2 <= n) { + uint8_t T[MAX_POLY]; + + memcpy(T, C, sizeof(T)); + poly_add(C, B, mult, m, gf); + memcpy(B, T, sizeof(B)); + L = n + 1 - L; + b = d; + m = 1; + } else { + poly_add(C, B, mult, m, gf); + m++; + } + } + + memcpy(sigma, C, MAX_POLY); +} + +/************************************************************************ + * Code stream error correction + * + * Generator polynomial for GF(2^8) is x^8 + x^4 + x^3 + x^2 + 1 + */ + +static int block_syndromes(const uint8_t *data, int bs, int npar, uint8_t *s) +{ + int nonzero = 0; + int i; + + memset(s, 0, MAX_POLY); + + for (i = 0; i < npar; i++) { + int j; + + for (j = 0; j < bs; j++) { + uint8_t c = data[bs - j - 1]; + + if (!c) + continue; + + s[i] ^= gf256_exp[((int)gf256_log[c] + + i * j) % 255]; + } + + if (s[i]) + nonzero = 1; + } + + return nonzero; +} + +static void eloc_poly(uint8_t *omega, + const uint8_t *s, const uint8_t *sigma, + int npar) +{ + int i; + + memset(omega, 0, MAX_POLY); + + for (i = 0; i < npar; i++) { + const uint8_t a = sigma[i]; + const uint8_t log_a = gf256_log[a]; + int j; + + if (!a) + continue; + + for (j = 0; j + 1 < MAX_POLY; j++) { + const uint8_t b = s[j + 1]; + + if (i + j >= npar) + break; + + if (!b) + continue; + + omega[i + j] ^= + gf256_exp[(log_a + gf256_log[b]) % 255]; + } + } +} + +static quirc_decode_error_t correct_block(uint8_t *data, + const struct quirc_rs_params *ecc) +{ + int npar = ecc->bs - ecc->dw; + uint8_t s[MAX_POLY]; + uint8_t sigma[MAX_POLY]; + uint8_t sigma_deriv[MAX_POLY]; + uint8_t omega[MAX_POLY]; + int i; + + /* Compute syndrome vector */ + if (!block_syndromes(data, ecc->bs, npar, s)) + return QUIRC_SUCCESS; + + berlekamp_massey(s, npar, &gf256, sigma); + + /* Compute derivative of sigma */ + memset(sigma_deriv, 0, MAX_POLY); + for (i = 0; i + 1 < MAX_POLY; i += 2) + sigma_deriv[i] = sigma[i + 1]; + + /* Compute error evaluator polynomial */ + eloc_poly(omega, s, sigma, npar - 1); + + /* Find error locations and magnitudes */ + for (i = 0; i < ecc->bs; i++) { + uint8_t xinv = gf256_exp[255 - i]; + + if (!poly_eval(sigma, xinv, &gf256)) { + uint8_t sd_x = poly_eval(sigma_deriv, xinv, &gf256); + uint8_t omega_x = poly_eval(omega, xinv, &gf256); + uint8_t error = gf256_exp[(255 - gf256_log[sd_x] + + gf256_log[omega_x]) % 255]; + + data[ecc->bs - i - 1] ^= error; + } + } + + if (block_syndromes(data, ecc->bs, npar, s)) + return QUIRC_ERROR_DATA_ECC; + + return QUIRC_SUCCESS; +} + +/************************************************************************ + * Format value error correction + * + * Generator polynomial for GF(2^4) is x^4 + x + 1 + */ + +#define FORMAT_MAX_ERROR 3 +#define FORMAT_SYNDROMES (FORMAT_MAX_ERROR * 2) +#define FORMAT_BITS 15 + +static int format_syndromes(uint16_t u, uint8_t *s) +{ + int i; + int nonzero = 0; + + memset(s, 0, MAX_POLY); + + for (i = 0; i < FORMAT_SYNDROMES; i++) { + int j; + + s[i] = 0; + for (j = 0; j < FORMAT_BITS; j++) + if (u & (1 << 
j)) + s[i] ^= gf16_exp[((i + 1) * j) % 15]; + + if (s[i]) + nonzero = 1; + } + + return nonzero; +} + +static quirc_decode_error_t correct_format(uint16_t *f_ret) +{ + uint16_t u = *f_ret; + int i; + uint8_t s[MAX_POLY]; + uint8_t sigma[MAX_POLY]; + + /* Evaluate U (received codeword) at each of alpha_1 .. alpha_6 + * to get S_1 .. S_6 (but we index them from 0). + */ + if (!format_syndromes(u, s)) + return QUIRC_SUCCESS; + + berlekamp_massey(s, FORMAT_SYNDROMES, &gf16, sigma); + + /* Now, find the roots of the polynomial */ + for (i = 0; i < 15; i++) + if (!poly_eval(sigma, gf16_exp[15 - i], &gf16)) + u ^= (1 << i); + + if (format_syndromes(u, s)) + return QUIRC_ERROR_FORMAT_ECC; + + *f_ret = u; + return QUIRC_SUCCESS; +} + +/************************************************************************ + * Decoder algorithm + */ + +struct datastream { + uint8_t raw[QUIRC_MAX_PAYLOAD]; + int data_bits; + int ptr; + + uint8_t data[QUIRC_MAX_PAYLOAD]; +}; + +static inline int grid_bit(const struct quirc_code *code, int x, int y) +{ + int p = y * code->size + x; + + return (code->cell_bitmap[p >> 3] >> (p & 7)) & 1; +} + +static quirc_decode_error_t read_format(const struct quirc_code *code, + struct quirc_data *data, int which) +{ + int i; + uint16_t format = 0; + uint16_t fdata; + quirc_decode_error_t err; + + if (which) { + for (i = 0; i < 7; i++) + format = (format << 1) | + grid_bit(code, 8, code->size - 1 - i); + for (i = 0; i < 8; i++) + format = (format << 1) | + grid_bit(code, code->size - 8 + i, 8); + } else { + static const int xs[15] = { + 8, 8, 8, 8, 8, 8, 8, 8, 7, 5, 4, 3, 2, 1, 0 + }; + static const int ys[15] = { + 0, 1, 2, 3, 4, 5, 7, 8, 8, 8, 8, 8, 8, 8, 8 + }; + + for (i = 14; i >= 0; i--) + format = (format << 1) | grid_bit(code, xs[i], ys[i]); + } + + format ^= 0x5412; + + err = correct_format(&format); + if (err) + return err; + + fdata = format >> 10; + data->ecc_level = fdata >> 3; + data->mask = fdata & 7; + + return QUIRC_SUCCESS; +} + +static int mask_bit(int mask, int i, int j) +{ + switch (mask) { + case 0: return !((i + j) % 2); + case 1: return !(i % 2); + case 2: return !(j % 3); + case 3: return !((i + j) % 3); + case 4: return !(((i / 2) + (j / 3)) % 2); + case 5: return !((i * j) % 2 + (i * j) % 3); + case 6: return !(((i * j) % 2 + (i * j) % 3) % 2); + case 7: return !(((i * j) % 3 + (i + j) % 2) % 2); + } + + return 0; +} + +static int reserved_cell(int version, int i, int j) +{ + const struct quirc_version_info *ver = &quirc_version_db[version]; + int size = version * 4 + 17; + int ai = -1, aj = -1, a; + + /* Finder + format: top left */ + if (i < 9 && j < 9) + return 1; + + /* Finder + format: bottom left */ + if (i + 8 >= size && j < 9) + return 1; + + /* Finder + format: top right */ + if (i < 9 && j + 8 >= size) + return 1; + + /* Exclude timing patterns */ + if (i == 6 || j == 6) + return 1; + + /* Exclude version info, if it exists. Version info sits adjacent to + * the top-right and bottom-left finders in three rows, bounded by + * the timing pattern. 
+ */ + if (version >= 7) { + if (i < 6 && j + 11 >= size) + return 1; + if (i + 11 >= size && j < 6) + return 1; + } + + /* Exclude alignment patterns */ + for (a = 0; a < QUIRC_MAX_ALIGNMENT && ver->apat[a]; a++) { + int p = ver->apat[a]; + + if (abs(p - i) < 3) + ai = a; + if (abs(p - j) < 3) + aj = a; + } + + if (ai >= 0 && aj >= 0) { + a--; + if (ai > 0 && ai < a) + return 1; + if (aj > 0 && aj < a) + return 1; + if (aj == a && ai == a) + return 1; + } + + return 0; +} + +static void read_bit(const struct quirc_code *code, + struct quirc_data *data, + struct datastream *ds, int i, int j) +{ + int bitpos = ds->data_bits & 7; + int bytepos = ds->data_bits >> 3; + int v = grid_bit(code, j, i); + + if (mask_bit(data->mask, i, j)) + v ^= 1; + + if (v) + ds->raw[bytepos] |= (0x80 >> bitpos); + + ds->data_bits++; +} + +static void read_data(const struct quirc_code *code, + struct quirc_data *data, + struct datastream *ds) +{ + int y = code->size - 1; + int x = code->size - 1; + int dir = -1; + + while (x > 0) { + if (x == 6) + x--; + + if (!reserved_cell(data->version, y, x)) + read_bit(code, data, ds, y, x); + + if (!reserved_cell(data->version, y, x - 1)) + read_bit(code, data, ds, y, x - 1); + + y += dir; + if (y < 0 || y >= code->size) { + dir = -dir; + x -= 2; + y += dir; + } + } +} + +static quirc_decode_error_t codestream_ecc(struct quirc_data *data, + struct datastream *ds) +{ + const struct quirc_version_info *ver = + &quirc_version_db[data->version]; + const struct quirc_rs_params *sb_ecc = &ver->ecc[data->ecc_level]; + struct quirc_rs_params lb_ecc; + const int lb_count = + (ver->data_bytes - sb_ecc->bs * sb_ecc->ns) / (sb_ecc->bs + 1); + const int bc = lb_count + sb_ecc->ns; + const int ecc_offset = sb_ecc->dw * bc + lb_count; + int dst_offset = 0; + int i; + + memcpy(&lb_ecc, sb_ecc, sizeof(lb_ecc)); + lb_ecc.dw++; + lb_ecc.bs++; + + for (i = 0; i < bc; i++) { + uint8_t *dst = ds->data + dst_offset; + const struct quirc_rs_params *ecc = + (i < sb_ecc->ns) ? 
sb_ecc : &lb_ecc; + const int num_ec = ecc->bs - ecc->dw; + quirc_decode_error_t err; + int j; + + for (j = 0; j < ecc->dw; j++) + dst[j] = ds->raw[j * bc + i]; + for (j = 0; j < num_ec; j++) + dst[ecc->dw + j] = ds->raw[ecc_offset + j * bc + i]; + + err = correct_block(dst, ecc); + if (err) + return err; + + dst_offset += ecc->dw; + } + + ds->data_bits = dst_offset * 8; + + return QUIRC_SUCCESS; +} + +static inline int bits_remaining(const struct datastream *ds) +{ + return ds->data_bits - ds->ptr; +} + +static int take_bits(struct datastream *ds, int len) +{ + int ret = 0; + + while (len && (ds->ptr < ds->data_bits)) { + uint8_t b = ds->data[ds->ptr >> 3]; + int bitpos = ds->ptr & 7; + + ret <<= 1; + if ((b << bitpos) & 0x80) + ret |= 1; + + ds->ptr++; + len--; + } + + return ret; +} + +static int numeric_tuple(struct quirc_data *data, + struct datastream *ds, + int bits, int digits) +{ + int tuple; + int i; + + if (bits_remaining(ds) < bits) + return -1; + + tuple = take_bits(ds, bits); + + for (i = digits - 1; i >= 0; i--) { + data->payload[data->payload_len + i] = tuple % 10 + '0'; + tuple /= 10; + } + + data->payload_len += digits; + return 0; +} + +static quirc_decode_error_t decode_numeric(struct quirc_data *data, + struct datastream *ds) +{ + int bits = 14; + int count; + + if (data->version < 10) + bits = 10; + else if (data->version < 27) + bits = 12; + + count = take_bits(ds, bits); + if (data->payload_len + count + 1 > QUIRC_MAX_PAYLOAD) + return QUIRC_ERROR_DATA_OVERFLOW; + + while (count >= 3) { + if (numeric_tuple(data, ds, 10, 3) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count -= 3; + } + + if (count >= 2) { + if (numeric_tuple(data, ds, 7, 2) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count -= 2; + } + + if (count) { + if (numeric_tuple(data, ds, 4, 1) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count--; + } + + return QUIRC_SUCCESS; +} + +static int alpha_tuple(struct quirc_data *data, + struct datastream *ds, + int bits, int digits) +{ + int tuple; + int i; + + if (bits_remaining(ds) < bits) + return -1; + + tuple = take_bits(ds, bits); + + for (i = 0; i < digits; i++) { + static const char *alpha_map = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ $%*+-./:"; + + data->payload[data->payload_len + digits - i - 1] = + alpha_map[tuple % 45]; + tuple /= 45; + } + + data->payload_len += digits; + return 0; +} + +static quirc_decode_error_t decode_alpha(struct quirc_data *data, + struct datastream *ds) +{ + int bits = 13; + int count; + + if (data->version < 10) + bits = 9; + else if (data->version < 27) + bits = 11; + + count = take_bits(ds, bits); + if (data->payload_len + count + 1 > QUIRC_MAX_PAYLOAD) + return QUIRC_ERROR_DATA_OVERFLOW; + + while (count >= 2) { + if (alpha_tuple(data, ds, 11, 2) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count -= 2; + } + + if (count) { + if (alpha_tuple(data, ds, 6, 1) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count--; + } + + return QUIRC_SUCCESS; +} + +static quirc_decode_error_t decode_byte(struct quirc_data *data, + struct datastream *ds) +{ + int bits = 16; + int count; + int i; + + if (data->version < 10) + bits = 8; + + count = take_bits(ds, bits); + if (data->payload_len + count + 1 > QUIRC_MAX_PAYLOAD) + return QUIRC_ERROR_DATA_OVERFLOW; + if (bits_remaining(ds) < count * 8) + return QUIRC_ERROR_DATA_UNDERFLOW; + + for (i = 0; i < count; i++) + data->payload[data->payload_len++] = take_bits(ds, 8); + + return QUIRC_SUCCESS; +} + +static quirc_decode_error_t decode_kanji(struct quirc_data *data, + struct datastream *ds) +{ + int 
bits = 12; + int count; + int i; + + if (data->version < 10) + bits = 8; + else if (data->version < 27) + bits = 10; + + count = take_bits(ds, bits); + if (data->payload_len + count * 2 + 1 > QUIRC_MAX_PAYLOAD) + return QUIRC_ERROR_DATA_OVERFLOW; + if (bits_remaining(ds) < count * 13) + return QUIRC_ERROR_DATA_UNDERFLOW; + + for (i = 0; i < count; i++) { + int d = take_bits(ds, 13); + int msB = d / 0xc0; + int lsB = d % 0xc0; + int intermediate = (msB << 8) | lsB; + uint16_t sjw; + + if (intermediate + 0x8140 <= 0x9ffc) { + /* bytes are in the range 0x8140 to 0x9FFC */ + sjw = intermediate + 0x8140; + } else { + /* bytes are in the range 0xE040 to 0xEBBF */ + sjw = intermediate + 0xc140; + } + + data->payload[data->payload_len++] = sjw >> 8; + data->payload[data->payload_len++] = sjw & 0xff; + } + + return QUIRC_SUCCESS; +} + +static quirc_decode_error_t decode_eci(struct quirc_data *data, + struct datastream *ds) +{ + if (bits_remaining(ds) < 8) + return QUIRC_ERROR_DATA_UNDERFLOW; + + data->eci = take_bits(ds, 8); + + if ((data->eci & 0xc0) == 0x80) { + if (bits_remaining(ds) < 8) + return QUIRC_ERROR_DATA_UNDERFLOW; + + data->eci = (data->eci << 8) | take_bits(ds, 8); + } else if ((data->eci & 0xe0) == 0xc0) { + if (bits_remaining(ds) < 16) + return QUIRC_ERROR_DATA_UNDERFLOW; + + data->eci = (data->eci << 16) | take_bits(ds, 16); + } + + return QUIRC_SUCCESS; +} + +static quirc_decode_error_t decode_payload(struct quirc_data *data, + struct datastream *ds) +{ + while (bits_remaining(ds) >= 4) { + quirc_decode_error_t err = QUIRC_SUCCESS; + int type = take_bits(ds, 4); + + switch (type) { + case QUIRC_DATA_TYPE_NUMERIC: + err = decode_numeric(data, ds); + break; + + case QUIRC_DATA_TYPE_ALPHA: + err = decode_alpha(data, ds); + break; + + case QUIRC_DATA_TYPE_BYTE: + err = decode_byte(data, ds); + break; + + case QUIRC_DATA_TYPE_KANJI: + err = decode_kanji(data, ds); + break; + + case 7: + err = decode_eci(data, ds); + break; + + default: + goto done; + } + + if (err) + return err; + + if (!(type & (type - 1)) && (type > data->data_type)) + data->data_type = type; + } +done: + + /* Add nul terminator to all payloads */ + if ((unsigned)data->payload_len >= sizeof(data->payload)) + data->payload_len--; + data->payload[data->payload_len] = 0; + + return QUIRC_SUCCESS; +} + +quirc_decode_error_t quirc_decode(const struct quirc_code *code, + struct quirc_data *data) +{ + quirc_decode_error_t err; + struct datastream ds; + + if ((code->size - 17) % 4) + return QUIRC_ERROR_INVALID_GRID_SIZE; + + memset(data, 0, sizeof(*data)); + memset(&ds, 0, sizeof(ds)); + + data->version = (code->size - 17) / 4; + + if (data->version < 1 || + data->version > QUIRC_MAX_VERSION) + return QUIRC_ERROR_INVALID_VERSION; + + /* Read format information -- try both locations */ + err = read_format(code, data, 0); + if (err) + err = read_format(code, data, 1); + if (err) + return err; + + read_data(code, data, &ds); + err = codestream_ecc(data, &ds); + if (err) + return err; + + err = decode_payload(data, &ds); + if (err) + return err; + + return QUIRC_SUCCESS; +} diff --git a/3rdparty/quirc/src/quirc.c b/3rdparty/quirc/src/quirc.c new file mode 100644 index 0000000000..a1418b2b26 --- /dev/null +++ b/3rdparty/quirc/src/quirc.c @@ -0,0 +1,138 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission 
notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +const char *quirc_version(void) +{ + return "1.0"; +} + +struct quirc *quirc_new(void) +{ + struct quirc *q = malloc(sizeof(*q)); + + if (!q) + return NULL; + + memset(q, 0, sizeof(*q)); + return q; +} + +void quirc_destroy(struct quirc *q) +{ + free(q->image); + /* q->pixels may alias q->image when their type representation is of the + same size, so we need to be careful here to avoid a double free */ + if (sizeof(*q->image) != sizeof(*q->pixels)) + free(q->pixels); + free(q->row_average); + free(q); +} + +int quirc_resize(struct quirc *q, int w, int h) +{ + uint8_t *image = NULL; + quirc_pixel_t *pixels = NULL; + int *row_average = NULL; + + /* + * XXX: w and h should be size_t (or at least unsigned) as negatives + * values would not make much sense. The downside is that it would break + * both the API and ABI. Thus, at the moment, let's just do a sanity + * check. + */ + if (w < 0 || h < 0) + goto fail; + + /* + * alloc a new buffer for q->image. We avoid realloc(3) because we want + * on failure to be leave `q` in a consistant, unmodified state. + */ + image = calloc(w, h); + if (!image) + goto fail; + + /* compute the "old" (i.e. currently allocated) and the "new" + (i.e. requested) image dimensions */ + size_t olddim = q->w * q->h; + size_t newdim = w * h; + size_t min = (olddim < newdim ? olddim : newdim); + + /* + * copy the data into the new buffer, avoiding (a) to read beyond the + * old buffer when the new size is greater and (b) to write beyond the + * new buffer when the new size is smaller, hence the min computation. 
+ */ + (void)memcpy(image, q->image, min); + + /* alloc a new buffer for q->pixels if needed */ + if (sizeof(*q->image) != sizeof(*q->pixels)) { + pixels = calloc(newdim, sizeof(quirc_pixel_t)); + if (!pixels) + goto fail; + } + + /* alloc a new buffer for q->row_average */ + row_average = calloc(w, sizeof(int)); + if (!row_average) + goto fail; + + /* alloc succeeded, update `q` with the new size and buffers */ + q->w = w; + q->h = h; + free(q->image); + q->image = image; + if (sizeof(*q->image) != sizeof(*q->pixels)) { + free(q->pixels); + q->pixels = pixels; + } + free(q->row_average); + q->row_average = row_average; + + return 0; + /* NOTREACHED */ +fail: + free(image); + free(pixels); + free(row_average); + + return -1; +} + +int quirc_count(const struct quirc *q) +{ + return q->num_grids; +} + +static const char *const error_table[] = { + [QUIRC_SUCCESS] = "Success", + [QUIRC_ERROR_INVALID_GRID_SIZE] = "Invalid grid size", + [QUIRC_ERROR_INVALID_VERSION] = "Invalid version", + [QUIRC_ERROR_FORMAT_ECC] = "Format data ECC failure", + [QUIRC_ERROR_DATA_ECC] = "ECC failure", + [QUIRC_ERROR_UNKNOWN_DATA_TYPE] = "Unknown data type", + [QUIRC_ERROR_DATA_OVERFLOW] = "Data overflow", + [QUIRC_ERROR_DATA_UNDERFLOW] = "Data underflow" +}; + +const char *quirc_strerror(quirc_decode_error_t err) +{ + if ((int)err < 8) { return error_table[err]; } + else { return "Unknown error"; } +} diff --git a/3rdparty/quirc/src/version_db.c b/3rdparty/quirc/src/version_db.c new file mode 100644 index 0000000000..9c77e63d47 --- /dev/null +++ b/3rdparty/quirc/src/version_db.c @@ -0,0 +1,430 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include + +const struct quirc_version_info quirc_version_db[QUIRC_MAX_VERSION + 1] = { + { /* 0 */ + .data_bytes = 0, + .apat = {0}, + .ecc = { + {.bs = 0, .dw = 0, .ns = 0}, + {.bs = 0, .dw = 0, .ns = 0}, + {.bs = 0, .dw = 0, .ns = 0}, + {.bs = 0, .dw = 0, .ns = 0} + } + }, + { /* Version 1 */ + .data_bytes = 26, + .apat = {0}, + .ecc = { + {.bs = 26, .dw = 16, .ns = 1}, + {.bs = 26, .dw = 19, .ns = 1}, + {.bs = 26, .dw = 9, .ns = 1}, + {.bs = 26, .dw = 13, .ns = 1} + } + }, + { /* Version 2 */ + .data_bytes = 44, + .apat = {6, 18, 0}, + .ecc = { + {.bs = 44, .dw = 28, .ns = 1}, + {.bs = 44, .dw = 34, .ns = 1}, + {.bs = 44, .dw = 16, .ns = 1}, + {.bs = 44, .dw = 22, .ns = 1} + } + }, + { /* Version 3 */ + .data_bytes = 70, + .apat = {6, 22, 0}, + .ecc = { + {.bs = 70, .dw = 44, .ns = 1}, + {.bs = 70, .dw = 55, .ns = 1}, + {.bs = 35, .dw = 13, .ns = 2}, + {.bs = 35, .dw = 17, .ns = 2} + } + }, + { /* Version 4 */ + .data_bytes = 100, + .apat = {6, 26, 0}, + .ecc = { + {.bs = 50, .dw = 32, .ns = 2}, + {.bs = 100, .dw = 80, .ns = 1}, + {.bs = 25, .dw = 9, .ns = 4}, + {.bs = 50, .dw = 24, .ns = 2} + } + }, + { /* Version 5 */ + .data_bytes = 134, + .apat = {6, 30, 0}, + .ecc = { + {.bs = 67, .dw = 43, .ns = 2}, + {.bs = 134, .dw = 108, .ns = 1}, + {.bs = 33, .dw = 11, .ns = 2}, + {.bs = 33, .dw = 15, .ns = 2} + } + }, + { /* Version 6 */ + .data_bytes = 172, + .apat = {6, 34, 0}, + .ecc = { + {.bs = 43, .dw = 27, .ns = 4}, + {.bs = 86, .dw = 68, .ns = 2}, + {.bs = 43, .dw = 15, .ns = 4}, + {.bs = 43, .dw = 19, .ns = 4} + } + }, + { /* Version 7 */ + .data_bytes = 196, + .apat = {6, 22, 38, 0}, + .ecc = { + {.bs = 49, .dw = 31, .ns = 4}, + {.bs = 98, .dw = 78, .ns = 2}, + {.bs = 39, .dw = 13, .ns = 4}, + {.bs = 32, .dw = 14, .ns = 2} + } + }, + { /* Version 8 */ + .data_bytes = 242, + .apat = {6, 24, 42, 0}, + .ecc = { + {.bs = 60, .dw = 38, .ns = 2}, + {.bs = 121, .dw = 97, .ns = 2}, + {.bs = 40, .dw = 14, .ns = 4}, + {.bs = 40, .dw = 18, .ns = 4} + } + }, + { /* Version 9 */ + .data_bytes = 292, + .apat = {6, 26, 46, 0}, + .ecc = { + {.bs = 58, .dw = 36, .ns = 3}, + {.bs = 146, .dw = 116, .ns = 2}, + {.bs = 36, .dw = 12, .ns = 4}, + {.bs = 36, .dw = 16, .ns = 4} + } + }, + { /* Version 10 */ + .data_bytes = 346, + .apat = {6, 28, 50, 0}, + .ecc = { + {.bs = 69, .dw = 43, .ns = 4}, + {.bs = 86, .dw = 68, .ns = 2}, + {.bs = 43, .dw = 15, .ns = 6}, + {.bs = 43, .dw = 19, .ns = 6} + } + }, + { /* Version 11 */ + .data_bytes = 404, + .apat = {6, 30, 54, 0}, + .ecc = { + {.bs = 80, .dw = 50, .ns = 1}, + {.bs = 101, .dw = 81, .ns = 4}, + {.bs = 36, .dw = 12, .ns = 3}, + {.bs = 50, .dw = 22, .ns = 4} + } + }, + { /* Version 12 */ + .data_bytes = 466, + .apat = {6, 32, 58, 0}, + .ecc = { + {.bs = 58, .dw = 36, .ns = 6}, + {.bs = 116, .dw = 92, .ns = 2}, + {.bs = 42, .dw = 14, .ns = 7}, + {.bs = 46, .dw = 20, .ns = 4} + } + }, + { /* Version 13 */ + .data_bytes = 532, + .apat = {6, 34, 62, 0}, + .ecc = { + {.bs = 59, .dw = 37, .ns = 8}, + {.bs = 133, .dw = 107, .ns = 4}, + {.bs = 33, .dw = 11, .ns = 12}, + {.bs = 44, .dw = 20, .ns = 8} + } + }, + { /* Version 14 */ + .data_bytes = 581, + .apat = {6, 26, 46, 66, 0}, + .ecc = { + {.bs = 64, .dw = 40, .ns = 4}, + {.bs = 145, .dw = 115, .ns = 3}, + {.bs = 36, .dw = 12, .ns = 11}, + {.bs = 36, .dw = 16, .ns = 11} + } + }, + { /* Version 15 */ + .data_bytes = 655, + .apat = {6, 26, 48, 70, 0}, + .ecc = { + {.bs = 65, .dw = 41, .ns = 5}, + {.bs = 109, .dw = 87, .ns = 5}, + {.bs = 36, .dw = 12, .ns = 11}, + {.bs = 54, .dw = 24, .ns = 5} + } + }, + { 
/* Version 16 */ + .data_bytes = 733, + .apat = {6, 26, 50, 74, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 7}, + {.bs = 122, .dw = 98, .ns = 5}, + {.bs = 45, .dw = 15, .ns = 3}, + {.bs = 43, .dw = 19, .ns = 15} + } + }, + { /* Version 17 */ + .data_bytes = 815, + .apat = {6, 30, 54, 78, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 10}, + {.bs = 135, .dw = 107, .ns = 1}, + {.bs = 42, .dw = 14, .ns = 2}, + {.bs = 50, .dw = 22, .ns = 1} + } + }, + { /* Version 18 */ + .data_bytes = 901, + .apat = {6, 30, 56, 82, 0}, + .ecc = { + {.bs = 69, .dw = 43, .ns = 9}, + {.bs = 150, .dw = 120, .ns = 5}, + {.bs = 42, .dw = 14, .ns = 2}, + {.bs = 50, .dw = 22, .ns = 17} + } + }, + { /* Version 19 */ + .data_bytes = 991, + .apat = {6, 30, 58, 86, 0}, + .ecc = { + {.bs = 70, .dw = 44, .ns = 3}, + {.bs = 141, .dw = 113, .ns = 3}, + {.bs = 39, .dw = 13, .ns = 9}, + {.bs = 47, .dw = 21, .ns = 17} + } + }, + { /* Version 20 */ + .data_bytes = 1085, + .apat = {6, 34, 62, 90, 0}, + .ecc = { + {.bs = 67, .dw = 41, .ns = 3}, + {.bs = 135, .dw = 107, .ns = 3}, + {.bs = 43, .dw = 15, .ns = 15}, + {.bs = 54, .dw = 24, .ns = 15} + } + }, + { /* Version 21 */ + .data_bytes = 1156, + .apat = {6, 28, 50, 72, 92, 0}, + .ecc = { + {.bs = 68, .dw = 42, .ns = 17}, + {.bs = 144, .dw = 116, .ns = 4}, + {.bs = 46, .dw = 16, .ns = 19}, + {.bs = 50, .dw = 22, .ns = 17} + } + }, + { /* Version 22 */ + .data_bytes = 1258, + .apat = {6, 26, 50, 74, 98, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 17}, + {.bs = 139, .dw = 111, .ns = 2}, + {.bs = 37, .dw = 13, .ns = 34}, + {.bs = 54, .dw = 24, .ns = 7} + } + }, + { /* Version 23 */ + .data_bytes = 1364, + .apat = {6, 30, 54, 78, 102, 0}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 4}, + {.bs = 151, .dw = 121, .ns = 4}, + {.bs = 45, .dw = 15, .ns = 16}, + {.bs = 54, .dw = 24, .ns = 11} + } + }, + { /* Version 24 */ + .data_bytes = 1474, + .apat = {6, 28, 54, 80, 106, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 6}, + {.bs = 147, .dw = 117, .ns = 6}, + {.bs = 46, .dw = 16, .ns = 30}, + {.bs = 54, .dw = 24, .ns = 11} + } + }, + { /* Version 25 */ + .data_bytes = 1588, + .apat = {6, 32, 58, 84, 110, 0}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 8}, + {.bs = 132, .dw = 106, .ns = 8}, + {.bs = 45, .dw = 15, .ns = 22}, + {.bs = 54, .dw = 24, .ns = 7} + } + }, + { /* Version 26 */ + .data_bytes = 1706, + .apat = {6, 30, 58, 86, 114, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 19}, + {.bs = 142, .dw = 114, .ns = 10}, + {.bs = 46, .dw = 16, .ns = 33}, + {.bs = 50, .dw = 22, .ns = 28} + } + }, + { /* Version 27 */ + .data_bytes = 1828, + .apat = {6, 34, 62, 90, 118, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 22}, + {.bs = 152, .dw = 122, .ns = 8}, + {.bs = 45, .dw = 15, .ns = 12}, + {.bs = 53, .dw = 23, .ns = 8} + } + }, + { /* Version 28 */ + .data_bytes = 1921, + .apat = {6, 26, 50, 74, 98, 122, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 3}, + {.bs = 147, .dw = 117, .ns = 3}, + {.bs = 45, .dw = 15, .ns = 11}, + {.bs = 54, .dw = 24, .ns = 4} + } + }, + { /* Version 29 */ + .data_bytes = 2051, + .apat = {6, 30, 54, 78, 102, 126, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 21}, + {.bs = 146, .dw = 116, .ns = 7}, + {.bs = 45, .dw = 15, .ns = 19}, + {.bs = 53, .dw = 23, .ns = 1} + } + }, + { /* Version 30 */ + .data_bytes = 2185, + .apat = {6, 26, 52, 78, 104, 130, 0}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 19}, + {.bs = 145, .dw = 115, .ns = 5}, + {.bs = 45, .dw = 15, .ns = 23}, + {.bs = 54, .dw = 24, .ns = 15} + } + }, + { /* Version 31 */ + .data_bytes = 2323, + .apat = {6, 30, 56, 82, 108, 134, 0}, 
+ .ecc = { + {.bs = 74, .dw = 46, .ns = 2}, + {.bs = 145, .dw = 115, .ns = 13}, + {.bs = 45, .dw = 15, .ns = 23}, + {.bs = 54, .dw = 24, .ns = 42} + } + }, + { /* Version 32 */ + .data_bytes = 2465, + .apat = {6, 34, 60, 86, 112, 138, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 10}, + {.bs = 145, .dw = 115, .ns = 17}, + {.bs = 45, .dw = 15, .ns = 19}, + {.bs = 54, .dw = 24, .ns = 10} + } + }, + { /* Version 33 */ + .data_bytes = 2611, + .apat = {6, 30, 58, 86, 114, 142, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 14}, + {.bs = 145, .dw = 115, .ns = 17}, + {.bs = 45, .dw = 15, .ns = 11}, + {.bs = 54, .dw = 24, .ns = 29} + } + }, + { /* Version 34 */ + .data_bytes = 2761, + .apat = {6, 34, 62, 90, 118, 146, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 14}, + {.bs = 145, .dw = 115, .ns = 13}, + {.bs = 46, .dw = 16, .ns = 59}, + {.bs = 54, .dw = 24, .ns = 44} + } + }, + { /* Version 35 */ + .data_bytes = 2876, + .apat = {6, 30, 54, 78, 102, 126, 150}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 12}, + {.bs = 151, .dw = 121, .ns = 12}, + {.bs = 45, .dw = 15, .ns = 22}, + {.bs = 54, .dw = 24, .ns = 39} + } + }, + { /* Version 36 */ + .data_bytes = 3034, + .apat = {6, 24, 50, 76, 102, 128, 154}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 6}, + {.bs = 151, .dw = 121, .ns = 6}, + {.bs = 45, .dw = 15, .ns = 2}, + {.bs = 54, .dw = 24, .ns = 46} + } + }, + { /* Version 37 */ + .data_bytes = 3196, + .apat = {6, 28, 54, 80, 106, 132, 158}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 29}, + {.bs = 152, .dw = 122, .ns = 17}, + {.bs = 45, .dw = 15, .ns = 24}, + {.bs = 54, .dw = 24, .ns = 49} + } + }, + { /* Version 38 */ + .data_bytes = 3362, + .apat = {6, 32, 58, 84, 110, 136, 162}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 13}, + {.bs = 152, .dw = 122, .ns = 4}, + {.bs = 45, .dw = 15, .ns = 42}, + {.bs = 54, .dw = 24, .ns = 48} + } + }, + { /* Version 39 */ + .data_bytes = 3532, + .apat = {6, 26, 54, 82, 110, 138, 166}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 40}, + {.bs = 147, .dw = 117, .ns = 20}, + {.bs = 45, .dw = 15, .ns = 10}, + {.bs = 54, .dw = 24, .ns = 43} + } + }, + { /* Version 40 */ + .data_bytes = 3706, + .apat = {6, 30, 58, 86, 114, 142, 170}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 18}, + {.bs = 148, .dw = 118, .ns = 19}, + {.bs = 45, .dw = 15, .ns = 20}, + {.bs = 54, .dw = 24, .ns = 34} + } + } +}; diff --git a/CMakeLists.txt b/CMakeLists.txt index e54b77ec2f..46b64ab451 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -283,6 +283,7 @@ OCV_OPTION(WITH_IMGCODEC_HDR "Include HDR support" ON) OCV_OPTION(WITH_IMGCODEC_SUNRASTER "Include SUNRASTER support" ON) OCV_OPTION(WITH_IMGCODEC_PXM "Include PNM (PBM,PGM,PPM) and PAM formats support" ON) OCV_OPTION(WITH_IMGCODEC_PFM "Include PFM formats support" ON) +OCV_OPTION(WITH_QUIRC "Include library QR-code decoding" ON) # OpenCV build components # =================================================== @@ -696,6 +697,10 @@ if(WITH_OPENVX) include(cmake/FindOpenVX.cmake) endif() +if(WITH_QUIRC) + add_subdirectory(3rdparty/quirc) + set(HAVE_QUIRC TRUE) +endif() # ---------------------------------------------------------------------------- # OpenCV HAL # ---------------------------------------------------------------------------- diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index 6a60648359..373ac9b9c2 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -909,6 +909,13 @@ macro(_ocv_create_module) source_group("Src" FILES "${_VS_VERSION_FILE}") endif() endif() + if(WIN32 AND NOT ("${the_module}" STREQUAL "opencv_core" OR 
"${the_module}" STREQUAL "opencv_world") + AND (BUILD_SHARED_LIBS AND NOT "x${OPENCV_MODULE_TYPE}" STREQUAL "xSTATIC") + AND NOT OPENCV_SKIP_DLLMAIN_GENERATION + ) + set(_DLLMAIN_FILE "${CMAKE_CURRENT_BINARY_DIR}/${the_module}_main.cpp") + configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/dllmain.cpp.in" "${_DLLMAIN_FILE}" @ONLY) + endif() source_group("Include" FILES "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp") source_group("Src" FILES "${${the_module}_pch}") @@ -918,6 +925,7 @@ macro(_ocv_create_module) "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp" ${${the_module}_pch} ${_VS_VERSION_FILE} + ${_DLLMAIN_FILE} ) set_target_properties(${the_module} PROPERTIES LABELS "${OPENCV_MODULE_${the_module}_LABEL};Module") set_source_files_properties(${OPENCV_MODULE_${the_module}_HEADERS} ${OPENCV_MODULE_${the_module}_SOURCES} ${${the_module}_pch} diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in index 0f63651c38..a208f7ef74 100644 --- a/cmake/templates/cvconfig.h.in +++ b/cmake/templates/cvconfig.h.in @@ -244,5 +244,7 @@ /* OpenCV trace utilities */ #cmakedefine OPENCV_TRACE +/* Library QR-code decoding */ +#cmakedefine HAVE_QUIRC #endif // OPENCV_CVCONFIG_H_INCLUDED diff --git a/cmake/templates/dllmain.cpp.in b/cmake/templates/dllmain.cpp.in new file mode 100644 index 0000000000..6b3005f654 --- /dev/null +++ b/cmake/templates/dllmain.cpp.in @@ -0,0 +1,36 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef _WIN32 +#error "Build configuration error" +#endif +#ifndef CVAPI_EXPORTS +#error "Build configuration error" +#endif + +#define WIN32_LEAN_AND_MEAN +#include + +#define OPENCV_MODULE_S "@the_module@" + +namespace cv { +extern __declspec(dllimport) bool __termination; // Details: #12750 +} + +extern "C" +BOOL WINAPI DllMain(HINSTANCE, DWORD fdwReason, LPVOID lpReserved); + +extern "C" +BOOL WINAPI DllMain(HINSTANCE, DWORD fdwReason, LPVOID lpReserved) +{ + if (fdwReason == DLL_THREAD_DETACH || fdwReason == DLL_PROCESS_DETACH) + { + if (lpReserved != NULL) // called after ExitProcess() call + { + //printf("OpenCV: terminating: " OPENCV_MODULE_S "\n"); + cv::__termination = true; + } + } + return TRUE; +} diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 29c4f646ec..18bdf46f90 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -472,6 +472,9 @@ void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) { +#if CV_SSE4_1 + return v_uint16x8(_mm_packus_epi32(a.val, b.val)); +#else __m128i delta32 = _mm_set1_epi32(32768); // preliminary saturate negative values to zero @@ -480,34 +483,51 @@ inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32)); return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); +#endif } inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) { +#if CV_SSE4_1 + _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val)); +#else __m128i delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(a.val, delta32); __m128i r = 
_mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, r); +#endif } template inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) { +#if CV_SSE4_1 + __m128i delta = _mm_set1_epi32(1 << (n - 1)); + return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), + _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); +#else __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32); __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768)); return v_uint16x8(_mm_unpacklo_epi64(a2, b2)); +#endif } template inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) { +#if CV_SSE4_1 + __m128i delta = _mm_set1_epi32(1 << (n - 1)); + __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); + _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1)); +#else __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, a2); +#endif } inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index f0deb2d067..1dc06bc633 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -4776,6 +4776,10 @@ public: void deallocate_(UMatData* u) const { +#ifdef _WIN32 + if (cv::__termination) // process is not in consistent state (after ExitProcess call) and terminating + return; // avoid any OpenCL calls +#endif if(u->tempUMat()) { CV_Assert(u->origdata); diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp index 93c8c26cd0..d74e377494 100644 --- a/modules/core/src/parallel.cpp +++ b/modules/core/src/parallel.cpp @@ -447,7 +447,16 @@ static int numThreads = -1; #elif defined HAVE_HPX // nothing for HPX #elif defined HAVE_OPENMP -static int numThreadsMax = omp_get_max_threads(); +static inline int _initMaxThreads() +{ + int maxThreads = omp_get_max_threads(); + if (!utils::getConfigurationParameterBool("OPENCV_FOR_OPENMP_DYNAMIC_DISABLE", false)) + { + omp_set_dynamic(maxThreads); + } + return numThreads; +} +static int numThreadsMax = _initMaxThreads(); #elif defined HAVE_GCD // nothing for GCD #elif defined WINRT diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 796edb98ec..3aee8486b5 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -298,8 +298,9 @@ TLSData& getCoreTlsData(); #define CL_RUNTIME_EXPORT #endif -extern bool __termination; // skip some cleanups, because process is terminating - // (for example, if ExitProcess() was already called) +extern CV_EXPORTS +bool __termination; // skip some cleanups, because process is terminating + // (for example, if ExitProcess() was already called) cv::Mutex& getInitializationMutex(); diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 09078965ff..ab91855a15 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -43,6 +43,7 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" +#include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" 
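The hunks that follow replace the hand-written SSE2/SSE4.1 code in the PyrDown row filters with OpenCV's universal intrinsics (CV_SIMD, vx_load, v_pack, v_rshr_pack), so the same source now targets whatever SIMD width is enabled at build time. As a reading aid, the vector expression r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2) used below is the vertical 1-4-6-4-1 binomial kernel, r0 + r4 + 6*r2 + 4*(r1 + r3), and v_rshr_pack<8> performs the rounding shift by 8 that the removed code expressed as add-128-then-shift. A scalar sketch of what each output lane computes (editorial only, not part of the patch; the helper name is illustrative):

#include <opencv2/core.hpp>

// Scalar equivalent of one output row of PyrDownVec_32s8u: the input rows are
// already horizontally filtered (scaled by 16), so the vertical 1-4-6-4-1
// combine is followed by a rounding shift by 8 (total scale 16 * 16 = 256).
static void pyr_down_row_scalar(const int* row0, const int* row1, const int* row2,
                                const int* row3, const int* row4,
                                unsigned char* dst, int width)
{
    for (int x = 0; x < width; x++)
    {
        int t = row0[x] + row4[x] + 6 * row2[x] + 4 * (row1[x] + row3[x]);
        dst[x] = cv::saturate_cast<unsigned char>((t + 128) >> 8); // same rounding as v_rshr_pack<8>
    }
}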
@@ -73,69 +74,55 @@ template struct PyrUpNoVec int operator()(T1**, T2**, int, int) const { return 0; } }; -#if CV_SSE2 +#if CV_SIMD struct PyrDownVec_32s8u { int operator()(int** src, uchar* dst, int, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - __m128i delta = _mm_set1_epi16(128); - for( ; x <= width - 16; x += 16 ) + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes ) { - __m128i r0, r1, r2, r3, r4, t0, t1; - r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), - _mm_load_si128((const __m128i*)(row0 + x + 4))); - r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), - _mm_load_si128((const __m128i*)(row1 + x + 4))); - r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), - _mm_load_si128((const __m128i*)(row2 + x + 4))); - r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), - _mm_load_si128((const __m128i*)(row3 + x + 4))); - r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), - _mm_load_si128((const __m128i*)(row4 + x + 4))); - r0 = _mm_add_epi16(r0, r4); - r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); - r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); - t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); - r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)), - _mm_load_si128((const __m128i*)(row0 + x + 12))); - r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)), - _mm_load_si128((const __m128i*)(row1 + x + 12))); - r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)), - _mm_load_si128((const __m128i*)(row2 + x + 12))); - r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)), - _mm_load_si128((const __m128i*)(row3 + x + 12))); - r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)), - _mm_load_si128((const __m128i*)(row4 + x + 12))); - r0 = _mm_add_epi16(r0, r4); - r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); - r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); - t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); - t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8); - t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8); - _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1)); + v_uint16 r0, r1, r2, r3, r4, t0, t1; + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); + t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes))); + t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + v_store(dst + x, v_rshr_pack<8>(t0, t1)); } - - for( ; x <= width - 4; x += 4 ) + 
if (x <= width - v_int16::nlanes) { - __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128(); - r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z); - r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z); - r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z); - r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z); - r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z); - r0 = _mm_add_epi16(r0, r4); - r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); - r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); - r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); - r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8); - *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0)); + v_uint16 r0, r1, r2, r3, r4, t0; + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); + t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + v_rshr_pack_store<8>(dst + x, t0); + x += v_uint16::nlanes; + } + for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) + { + v_int32x4 r0, r1, r2, r3, r4, t0; + r0 = v_load(row0 + x); + r1 = v_load(row1 + x); + r2 = v_load(row2 + x); + r3 = v_load(row3 + x); + r4 = v_load(row4 + x); + t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + + *(int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); } return x; @@ -146,152 +133,105 @@ struct PyrDownVec_32f { int operator()(float** src, float* dst, int, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - int x = 0; const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - __m128 _4 = _mm_set1_ps(4.f), _scale = _mm_set1_ps(1.f/256); - for( ; x <= width - 8; x += 8 ) + + v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256); + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) { - __m128 r0, r1, r2, r3, r4, t0, t1; - r0 = _mm_load_ps(row0 + x); - r1 = _mm_load_ps(row1 + x); - r2 = _mm_load_ps(row2 + x); - r3 = _mm_load_ps(row3 + x); - r4 = _mm_load_ps(row4 + x); - r0 = _mm_add_ps(r0, r4); - r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2); - r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2)); - t0 = _mm_add_ps(r0, _mm_mul_ps(r1, _4)); - - r0 = _mm_load_ps(row0 + x + 4); - r1 = _mm_load_ps(row1 + x + 4); - r2 = _mm_load_ps(row2 + x + 4); - r3 = _mm_load_ps(row3 + x + 4); - r4 = _mm_load_ps(row4 + x + 4); - r0 = _mm_add_ps(r0, r4); - r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2); - r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2)); - t1 = _mm_add_ps(r0, _mm_mul_ps(r1, _4)); - - t0 = _mm_mul_ps(t0, _scale); - t1 = _mm_mul_ps(t1, _scale); - - _mm_storeu_ps(dst + x, t0); - _mm_storeu_ps(dst + x + 4, t1); + v_float32 r0, r1, r2, r3, r4; + r0 = vx_load(row0 + x); + r1 = vx_load(row1 + x); + r2 = vx_load(row2 + x); + r3 = vx_load(row3 + x); + r4 = vx_load(row4 + x); + v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale); } return x; } }; -#if CV_SSE4_1 +#if CV_SSE4_1 || CV_NEON struct PyrDownVec_32s16u { - PyrDownVec_32s16u() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); - } - int operator()(int** src, ushort* dst, int, int width) const { int x = 0; - - if 
(!haveSSE) - return x; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - __m128i v_delta = _mm_set1_epi32(128); - for( ; x <= width - 8; x += 8 ) + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) { - __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); - __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); - __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)), - v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); - __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)), - v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4)); - __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)), - v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4)); - - v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20)); - v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30); - - v_r10 = _mm_slli_epi32(v_r10, 2); - __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8); - - v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21)); - v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31); - v_r11 = _mm_slli_epi32(v_r11, 2); - __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1)); + v_int32 r00 = vx_load(row0 + x), + r01 = vx_load(row0 + x + v_int32::nlanes), + r10 = vx_load(row1 + x), + r11 = vx_load(row1 + x + v_int32::nlanes), + r20 = vx_load(row2 + x), + r21 = vx_load(row2 + x + v_int32::nlanes), + r30 = vx_load(row3 + x), + r31 = vx_load(row3 + x + v_int32::nlanes), + r40 = vx_load(row4 + x), + r41 = vx_load(row4 + x + v_int32::nlanes); + v_store(dst + x, v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), + r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); + } + if (x <= width - v_int32::nlanes) + { + v_int32 r00 = vx_load(row0 + x), + r10 = vx_load(row1 + x), + r20 = vx_load(row2 + x), + r30 = vx_load(row3 + x), + r40 = vx_load(row4 + x); + v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); + x += v_int32::nlanes; } return x; } - - bool haveSSE; }; #else typedef PyrDownNoVec PyrDownVec_32s16u; -#endif // CV_SSE4_1 +#endif struct PyrDownVec_32s16s { - PyrDownVec_32s16s() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - int operator()(int** src, short* dst, int, int width) const { int x = 0; - - if (!haveSSE) - return x; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - __m128i v_delta = _mm_set1_epi32(128); - for( ; x <= width - 8; x += 8 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) { - __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); - __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); - __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)), - v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); - __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)), - v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4)); - __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)), - v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4)); - - v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), 
_mm_add_epi32(v_r20, v_r20)); - v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30); - - v_r10 = _mm_slli_epi32(v_r10, 2); - __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8); - - v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21)); - v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31); - v_r11 = _mm_slli_epi32(v_r11, 2); - __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1)); + v_int32 r00 = vx_load(row0 + x), + r01 = vx_load(row0 + x + v_int32::nlanes), + r10 = vx_load(row1 + x), + r11 = vx_load(row1 + x + v_int32::nlanes), + r20 = vx_load(row2 + x), + r21 = vx_load(row2 + x + v_int32::nlanes), + r30 = vx_load(row3 + x), + r31 = vx_load(row3 + x + v_int32::nlanes), + r40 = vx_load(row4 + x), + r41 = vx_load(row4 + x + v_int32::nlanes); + v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), + r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); + } + if (x <= width - v_int32::nlanes) + { + v_int32 r00 = vx_load(row0 + x), + r10 = vx_load(row1 + x), + r20 = vx_load(row2 + x), + r30 = vx_load(row3 + x), + r40 = vx_load(row4 + x); + v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); + x += v_int32::nlanes; } return x; } - - bool haveSSE; }; struct PyrUpVec_32s8u @@ -299,59 +239,40 @@ struct PyrUpVec_32s8u int operator()(int** src, uchar** dst, int, int width) const { int x = 0; - - if (!checkHardwareSupport(CV_CPU_SSE2)) - return x; - uchar *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128(); + const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - 16; x += 16 ) + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) { - __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)), - _mm_loadu_si128((__m128i const *)(row0 + x + 4))); - __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)), - _mm_loadu_si128((__m128i const *)(row1 + x + 4))); - __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)), - _mm_loadu_si128((__m128i const *)(row2 + x + 4))); - - __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); - __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); - __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); - - v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)), - _mm_loadu_si128((__m128i const *)(row0 + x + 12))); - v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)), - _mm_loadu_si128((__m128i const *)(row1 + x + 12))); - v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)), - _mm_loadu_si128((__m128i const *)(row2 + x + 12))); - - v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); - __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); - __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); - - _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6), - _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6), - 
_mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6))); + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), + v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), + v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)), + v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes)); + v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11); + v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11))); + v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); } - - for( ; x <= width - 8; x += 8 ) + if(x <= width - v_uint16::nlanes) { - __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)), - _mm_loadu_si128((__m128i const *)(row0 + x + 4))); - __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)), - _mm_loadu_si128((__m128i const *)(row1 + x + 4))); - __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)), - _mm_loadu_si128((__m128i const *)(row2 + x + 4))); - - __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); - __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); - __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); - - _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero)); - _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero)); + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)); + v_int16 v_2r10 = v_r10 + v_r10; + v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10)); + v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); + x += v_uint16::nlanes; + } + for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) + { + v_int32 v_r00 = vx_load(row0 + x), + v_r10 = vx_load(row1 + x), + v_r20 = vx_load(row2 + x); + v_int32 v_2r10 = v_r10 + v_r10; + v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2); + *(int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); + *(int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0(); } return x; @@ -363,113 +284,63 @@ struct PyrUpVec_32s16s int operator()(int** src, short** dst, int, int width) const { int x = 0; - - if (!checkHardwareSupport(CV_CPU_SSE2)) - return x; - short *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128(); + const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - 8; x += 8 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); - __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); - __m128i v_dst00 = 
_mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); - v_2r1 = _mm_slli_epi32(v_r1, 1); - v_4r1 = _mm_slli_epi32(v_r1, 2); - __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - _mm_storeu_si128((__m128i *)(dst0 + x), - _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6), - _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), - _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6), - _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); + v_int32 v_r00 = vx_load(row0 + x), + v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r10 = vx_load(row1 + x), + v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r20 = vx_load(row2 + x), + v_r21 = vx_load(row2 + x + v_int32::nlanes); + v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); } - - for( ; x <= width - 4; x += 4 ) + if(x <= width - v_int32::nlanes) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); - __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); - - __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - _mm_storel_epi64((__m128i *)(dst0 + x), - _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero)); - _mm_storel_epi64((__m128i *)(dst1 + x), - _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero)); + v_int32 v_r00 = vx_load(row0 + x), + v_r10 = vx_load(row1 + x), + v_r20 = vx_load(row2 + x); + v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); + v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2); + x += v_int32::nlanes; } return x; } }; -#if CV_SSE4_1 +#if CV_SSE4_1 || CV_NEON struct PyrUpVec_32s16u { int operator()(int** src, ushort** dst, int, int width) const { int x = 0; - - if (!checkHardwareSupport(CV_CPU_SSE4_1)) - return x; - ushort *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128(); + const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - 8; x += 8 ) + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); - __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); - __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); - v_2r1 = _mm_slli_epi32(v_r1, 1); - v_4r1 = _mm_slli_epi32(v_r1, 2); - __m128i v_dst01 
= _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - _mm_storeu_si128((__m128i *)(dst0 + x), - _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6), - _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), - _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6), - _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); + v_int32 v_r00 = vx_load(row0 + x), + v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r10 = vx_load(row1 + x), + v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r20 = vx_load(row2 + x), + v_r21 = vx_load(row2 + x + v_int32::nlanes); + v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); } - - for( ; x <= width - 4; x += 4 ) + if(x <= width - v_int32::nlanes) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); - __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); - - __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - _mm_storel_epi64((__m128i *)(dst0 + x), - _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero)); - _mm_storel_epi64((__m128i *)(dst1 + x), - _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero)); + v_int32 v_r00 = vx_load(row0 + x), + v_r10 = vx_load(row1 + x), + v_r20 = vx_load(row2 + x); + v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); + v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); + x += v_int32::nlanes; } return x; @@ -487,347 +358,17 @@ struct PyrUpVec_32f int operator()(float** src, float** dst, int, int width) const { int x = 0; - - if (!checkHardwareSupport(CV_CPU_SSE2)) - return x; - const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; float *dst0 = dst[0], *dst1 = dst[1]; - __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f), - v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f)); - for( ; x <= width - 8; x += 8 ) + v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f); + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) { - __m128 v_r0 = _mm_loadu_ps(row0 + x); - __m128 v_r1 = _mm_loadu_ps(row1 + x); - __m128 v_r2 = _mm_loadu_ps(row2 + x); - - _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2))); - _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2))); - - v_r0 = _mm_loadu_ps(row0 + x + 4); - v_r1 = _mm_loadu_ps(row1 + x + 4); - v_r2 = _mm_loadu_ps(row2 + x + 4); - - _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2))); - _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2))); - } - - return x; - } -}; - -#elif CV_NEON - -struct PyrDownVec_32s8u -{ - int operator()(int** src, uchar* dst, int, int width) const - { - int x = 0; - const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1], - *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3], - *row4 = (unsigned int*)src[4]; - uint16x8_t v_delta = vdupq_n_u16(128); - - for( ; x <= width - 16; x += 16 ) - 
{ - uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4))); - uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4))); - uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4))); - uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4))); - uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4))); - - v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2)); - v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3); - uint16x8_t v_dst0 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2)); - - v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12))); - v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12))); - v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12))); - v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12))); - v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12))); - - v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2)); - v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3); - uint16x8_t v_dst1 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2)); - - vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)), - vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8)))); - } - - return x; - } -}; - -struct PyrDownVec_32s16u -{ - int operator()(int** src, ushort* dst, int, int width) const - { - int x = 0; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - int32x4_t v_delta = vdupq_n_s32(128); - - for( ; x <= width - 8; x += 8 ) - { - int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4); - int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4); - int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4); - int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4); - int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4); - - v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20)); - v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30); - - v_r10 = vshlq_n_s32(v_r10, 2); - int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8); - - v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21)); - v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31); - v_r11 = vshlq_n_s32(v_r11, 2); - int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8); - - vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1))); - } - - return x; - } -}; - -struct PyrDownVec_32s16s -{ - int operator()(int** src, short* dst, int, int width) const - { - int x = 0; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - int32x4_t v_delta = vdupq_n_s32(128); - - for( ; x <= width - 8; x += 8 ) - { - int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4); - int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4); - int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4); - int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4); - int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4); - - v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20)); - v_r10 = vaddq_s32(vaddq_s32(v_r10, 
v_r20), v_r30); - v_r10 = vshlq_n_s32(v_r10, 2); - int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8); - - v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21)); - v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31); - v_r11 = vshlq_n_s32(v_r11, 2); - int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1))); - } - - return x; - } -}; - -struct PyrDownVec_32f -{ - int operator()(float** src, float* dst, int, int width) const - { - int x = 0; - const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_r0 = vld1q_f32(row0 + x); - float32x4_t v_r1 = vld1q_f32(row1 + x); - float32x4_t v_r2 = vld1q_f32(row2 + x); - float32x4_t v_r3 = vld1q_f32(row3 + x); - float32x4_t v_r4 = vld1q_f32(row4 + x); - - v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2)); - v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3); - vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale)); - - v_r0 = vld1q_f32(row0 + x + 4); - v_r1 = vld1q_f32(row1 + x + 4); - v_r2 = vld1q_f32(row2 + x + 4); - v_r3 = vld1q_f32(row3 + x + 4); - v_r4 = vld1q_f32(row4 + x + 4); - - v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2)); - v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3); - vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale)); - } - - return x; - } -}; - -struct PyrUpVec_32s8u -{ - int operator()(int** src, uchar** dst, int, int width) const - { - int x = 0; - uchar *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - uint16x8_t v_delta = vdupq_n_u16(32); - - for( ; x <= width - 16; x += 16 ) - { - uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4))); - uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4))); - uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4))); - - uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1); - uint16x8_t v_dst00 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1)); - uint16x8_t v_dst10 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2); - - v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12))); - v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12))); - v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12))); - - v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1); - uint16x8_t v_dst01 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1)); - uint16x8_t v_dst11 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2); - - vst1q_u8(dst0 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst00, v_delta), 6)), - vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst01, v_delta), 6)))); - vst1q_u8(dst1 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst10, v_delta), 6)), - vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst11, v_delta), 6)))); - } - - for( ; x <= width - 8; x += 8 ) - { - uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4))); - uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4))); - uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + 
x)), vqmovn_u32(vld1q_u32(row2 + x + 4))); - - uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1); - uint16x8_t v_dst0 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1)); - uint16x8_t v_dst1 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2); - - vst1_u8(dst0 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 6))); - vst1_u8(dst1 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 6))); - } - - return x; - } -}; - -struct PyrUpVec_32s16u -{ - int operator()(int** src, ushort** dst, int, int width) const - { - int x = 0; - ushort *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - uint32x4_t v_delta = vdupq_n_u32(32); - - for( ; x <= width - 8; x += 8 ) - { - uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x); - uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2); - uint32x4_t v_dst00 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1)); - uint32x4_t v_dst10 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2); - - v_r0 = vld1q_u32(row0 + x + 4); - v_r1 = vld1q_u32(row1 + x + 4); - v_r2 = vld1q_u32(row2 + x + 4); - v_2r1 = vshlq_n_u32(v_r1, 1); - v_4r1 = vshlq_n_u32(v_r1, 2); - uint32x4_t v_dst01 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1)); - uint32x4_t v_dst11 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2); - - vst1q_u16(dst0 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst00, v_delta), 6)), - vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst01, v_delta), 6)))); - vst1q_u16(dst1 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst10, v_delta), 6)), - vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst11, v_delta), 6)))); - } - - for( ; x <= width - 4; x += 4 ) - { - uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x); - uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2); - - uint32x4_t v_dst0 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1)); - uint32x4_t v_dst1 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2); - - vst1_u16(dst0 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0, v_delta), 6))); - vst1_u16(dst1 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1, v_delta), 6))); - } - - return x; - } -}; - -struct PyrUpVec_32s16s -{ - int operator()(int** src, short** dst, int, int width) const - { - int x = 0; - short *dst0 = dst[0], *dst1 = dst[1]; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - int32x4_t v_delta = vdupq_n_s32(32); - - for( ; x <= width - 8; x += 8 ) - { - int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x); - int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2); - int32x4_t v_dst00 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1)); - int32x4_t v_dst10 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2); - - v_r0 = vld1q_s32(row0 + x + 4); - v_r1 = vld1q_s32(row1 + x + 4); - v_r2 = vld1q_s32(row2 + x + 4); - v_2r1 = vshlq_n_s32(v_r1, 1); - v_4r1 = vshlq_n_s32(v_r1, 2); - int32x4_t v_dst01 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1)); - int32x4_t v_dst11 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2); - - vst1q_s16(dst0 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst00, v_delta), 6)), - vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst01, v_delta), 6)))); - vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst10, v_delta), 6)), - vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst11, v_delta), 6)))); - } - - for( ; x <= width - 4; x += 4 ) - { - int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 
+ x), v_r2 = vld1q_s32(row2 + x); - int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2); - - int32x4_t v_dst0 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1)); - int32x4_t v_dst1 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2); - - vst1_s16(dst0 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst0, v_delta), 6))); - vst1_s16(dst1 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst1, v_delta), 6))); - } - - return x; - } -}; - -struct PyrUpVec_32f -{ - int operator()(float** src, float** dst, int, int width) const - { - int x = 0; - const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; - float *dst0 = dst[0], *dst1 = dst[1]; - float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_r0 = vld1q_f32(row0 + x); - float32x4_t v_r1 = vld1q_f32(row1 + x); - float32x4_t v_r2 = vld1q_f32(row2 + x); - - vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2))); - vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2))); - - v_r0 = vld1q_f32(row0 + x + 4); - v_r1 = vld1q_f32(row1 + x + 4); - v_r2 = vld1q_f32(row2 + x + 4); - - vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2))); - vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2))); + v_float32 v_r0 = vx_load(row0 + x), + v_r1 = vx_load(row1 + x), + v_r2 = vx_load(row2 + x); + v_store(dst1 + x, v_scale4 * (v_r1 + v_r2)); + v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2)); } return x; diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 683e4dee5c..5436a78ab5 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -1181,583 +1181,34 @@ struct HResizeNoVec const uchar*, int, int, int, int, int) const { return 0; } }; -#if CV_SSE2 +#if CV_SIMD struct VResizeLinearVec_32s8u { int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - const int** src = (const int**)_src; const short* beta = (const short*)_beta; const int *S0 = src[0], *S1 = src[1]; int x = 0; - __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); - __m128i delta = _mm_set1_epi16(2); + v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]); - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } + if( 
(((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x ) >> 4, vx_load_aligned(S0 + x + v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load_aligned(S1 + x ) >> 4, vx_load_aligned(S1 + x + v_int32::nlanes) >> 4), b1), + v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); else - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } - - for( ; x < width - 4; x += 4 ) - { - __m128i x0, y0; - x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); - y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); - x0 = _mm_packs_epi32(x0, x0); - y0 = _mm_packs_epi32(y0, y0); - x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x0 = _mm_packus_epi16(x0, x0); - *(int*)(dst + x) = _mm_cvtsi128_si32(x0); - } - - return x; - } -}; - - -template struct VResizeLinearVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - ushort* dst = (ushort*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_load_ps(S0 + x + 8); - x1 = _mm_load_ps(S0 + x + 12); - y0 = _mm_load_ps(S1 + x + 8); - y1 = _mm_load_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = 
_mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_loadu_ps(S0 + x + 8); - x1 = _mm_loadu_ps(S0 + x + 12); - y0 = _mm_loadu_ps(S1 + x + 8); - y1 = _mm_loadu_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - - for( ; x < width - 4; x += 4 ) - { - __m128 x0, y0; - __m128i t0; - x0 = _mm_loadu_ps(S0 + x); - y0 = _mm_loadu_ps(S1 + x); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); - _mm_storel_epi64( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeLinearVec_32f16 VResizeLinearVec_32f16u; -typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; - -struct VResizeLinearVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - float* dst = (float*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - - return x; - } -}; - - -struct VResizeCubicVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - int x = 0; - float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); - __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), - b2 = _mm_set1_ps(beta[2]*scale), b3 = 
_mm_set1_ps(beta[3]*scale); - - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_load_si128((const __m128i*)(S2 + x)); - x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S3 + x)); - y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - - return x; - } -}; - - -template struct VResizeCubicVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - ushort* dst = (ushort*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - __m128i t0, t1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - 
y0 = _mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); - t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); - - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); - _mm_storeu_si128( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeCubicVec_32f16 VResizeCubicVec_32f16u; -typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; - -struct VResizeCubicVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - float* dst = (float*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - y0 = _mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - _mm_storeu_ps( dst + x, s0); - _mm_storeu_ps( dst + x + 4, s1); - } - - return x; - } -}; - -#if CV_TRY_SSE4_1 - -struct VResizeLanczos4Vec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); - else return 0; - } -}; - -#else - -typedef VResizeNoVec VResizeLanczos4Vec_32f16u; - -#endif - -struct VResizeLanczos4Vec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - __m128 v_dst1 = _mm_mul_ps(v_b0, 
_mm_loadu_ps(S0 + x + 4)); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); - - __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); - __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1)); - } - - return x; - } -}; - - -struct VResizeLanczos4Vec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - float* dst = (float*)_dst; - int x = 0; - - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 4; x += 4 ) - { - __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - _mm_storeu_ps(dst + x, v_dst); - } - - return x; - } -}; - - -#elif CV_NEON - -struct VResizeLinearVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; - const short* beta = (const short*)_beta; - int x = 0; - int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); - - for( ; x <= width - 16; x += 16) - { - int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); - int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); - - int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); - - v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); - v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); - v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); - v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); - - v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); - - vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), 
vqmovun_s16(v_dst1))); - } + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x ) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load(S1 + x ) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1), + v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); + + for( ; x < width - v_int16::nlanes; x += v_int16::nlanes) + v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1)); return x; } @@ -1773,18 +1224,20 @@ struct VResizeLinearVec_32f16u ushort* dst = (ushort*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - for( ; x <= width - 8; x += 8 ) + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), + v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + else + for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); + for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_store_low(dst + x, v_pack_u(t0, t0)); } return x; @@ -1801,18 +1254,20 @@ struct VResizeLinearVec_32f16s short* dst = (short*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - for( ; x <= width - 8; x += 8 ) + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), + v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + else + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); + for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), 
v_src11, v_b1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_store_low(dst + x, v_pack(t0, t0)); } return x; @@ -1829,22 +1284,56 @@ struct VResizeLinearVec_32f float* dst = (float*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); - vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); - } + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1)); + else + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); return x; } }; -typedef VResizeNoVec VResizeCubicVec_32s8u; + +struct VResizeCubicVec_32s8u +{ + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const + { + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int x = 0; + float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); + + v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale), + b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x )), b0, + v_muladd(v_cvt_f32(vx_load_aligned(S1 + x )), b1, + v_muladd(v_cvt_f32(vx_load_aligned(S2 + x )), b2, + v_cvt_f32(vx_load_aligned(S3 + x )) * b3)))), + v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)), b0, + v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)), b1, + v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)), b2, + v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3)))))); + else + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x )), b0, + v_muladd(v_cvt_f32(vx_load(S1 + x )), b1, + v_muladd(v_cvt_f32(vx_load(S2 + x )), b2, + v_cvt_f32(vx_load(S3 + x )) * b3)))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)), b0, + v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)), b1, + v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)), b2, + v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3)))))); + return x; + } +}; struct VResizeCubicVec_32f16u { @@ -1855,23 +1344,18 @@ struct VResizeCubicVec_32f16u const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; ushort* dst = (ushort*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = 
vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } + for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + vx_load(S3 + x ) * b3)))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + vx_load(S3 + x + v_float32::nlanes) * b3)))))); return x; } @@ -1886,23 +1370,18 @@ struct VResizeCubicVec_32f16s const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; short* dst = (short*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + vx_load(S3 + x ) * b3)))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + vx_load(S3 + x + v_float32::nlanes) * b3)))))); return x; } @@ -1917,25 +1396,33 @@ struct VResizeCubicVec_32f const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; float* dst = (float*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - 8; x += 8 ) - { - vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x))); - vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4))); - } + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, + v_muladd(vx_load(S1 + x), b1, + v_muladd(vx_load(S2 + x), b2, + vx_load(S3 + x) * b3)))); return x; } }; + +#if CV_TRY_SSE4_1 + +struct VResizeLanczos4Vec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if (CV_CPU_HAS_SUPPORT_SSE4_1) return 
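The CV_TRY_SSE4_1 wrapper above keeps the dispatch contract of these functors: the call operator returns how many elements it processed, and returning 0 sends the caller to its scalar loop. A self-contained sketch of that idiom with placeholder names (checkFast/fastCopy stand in for CV_CPU_HAS_SUPPORT_SSE4_1 and the opt_SSE4_1 implementation; they are not OpenCV symbols):

```cpp
static bool checkFast() { return false; }                 // pretend the CPU flag is absent
static int  fastCopy(const float*, float*, int) { return 0; }

struct VCopy
{
    int operator()(const float* src, float* dst, int width) const
    {
        if (checkFast())
            return fastCopy(src, dst, width);             // optimized path handles the row
        return 0;                                         // nothing handled: caller runs scalar code
    }
};

static void copyRow(const float* src, float* dst, int width)
{
    int x = VCopy()(src, dst, width);                     // vector part (possibly 0 elements)
    for (; x < width; x++)                                // scalar tail / full fallback
        dst[x] = src[x];
}
```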
opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); + else return 0; + } +}; + +#else + struct VResizeLanczos4Vec_32f16u { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const @@ -1946,41 +1433,35 @@ struct VResizeLanczos4Vec_32f16u *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; ushort * dst = (ushort*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]), + b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), + b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + v_muladd(vx_load(S3 + x ), b3, + v_muladd(vx_load(S4 + x ), b4, + v_muladd(vx_load(S5 + x ), b5, + v_muladd(vx_load(S6 + x ), b6, + vx_load(S7 + x ) * b7)))))))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, + v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, + v_muladd(vx_load(S5 + x + v_float32::nlanes), b5, + v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, + vx_load(S7 + x + v_float32::nlanes) * b7)))))))))); return x; } }; +#endif + struct VResizeLanczos4Vec_32f16s { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const @@ -1991,36 +1472,28 @@ struct VResizeLanczos4Vec_32f16s *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; short * dst = (short*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]), + b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), + b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, 
vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + v_muladd(vx_load(S3 + x ), b3, + v_muladd(vx_load(S4 + x ), b4, + v_muladd(vx_load(S5 + x ), b5, + v_muladd(vx_load(S6 + x ), b6, + vx_load(S7 + x ) * b7)))))))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, + v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, + v_muladd(vx_load(S5 + x + v_float32::nlanes), b5, + v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, + vx_load(S7 + x + v_float32::nlanes) * b7)))))))))); return x; } @@ -2036,23 +1509,21 @@ struct VResizeLanczos4Vec_32f *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; float* dst = (float*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); - for( ; x <= width - 4; x += 4 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); - } + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]), + b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), + b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); + + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, + v_muladd(vx_load(S1 + x), b1, + v_muladd(vx_load(S2 + x), b2, + v_muladd(vx_load(S3 + x), b3, + v_muladd(vx_load(S4 + x), b4, + v_muladd(vx_load(S5 + x), b5, + v_muladd(vx_load(S6 + x), b6, + vx_load(S7 + x) * b7)))))))); return x; } @@ -2695,95 +2166,94 @@ private: int step; }; -#elif CV_SSE2 +#elif CV_SIMD class ResizeAreaFastVec_SIMD_8u { public: ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } + cn(_cn), step(_step) {} int operator() (const uchar* S, uchar* D, int w) const { - if (!use_simd) - return 0; - int dx = 0; const uchar* S0 = S; const uchar* S1 = S0 + step; - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi16(2); if (cn == 1) 
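ResizeAreaFastVec_SIMD_8u above now builds the 2x downscale from universal intrinsics; per output pixel the result is still the rounded mean of a 2x2 block, which is exactly what v_rshr_pack<2>/v_rshr_pack_store<2> produce after the row sums. The scalar reference (illustrative helper, not part of the patch):

```cpp
// One output pixel of the 2x area-fast kernel: (a + b + c + d + 2) >> 2.
static void area2x2_u8_c1(const unsigned char* S0, const unsigned char* S1,
                          unsigned char* D, int w)
{
    for (int dx = 0; dx < w; dx++, S0 += 2, S1 += 2)
        D[dx] = (unsigned char)((S0[0] + S0[1] + S1[0] + S1[1] + 2) >> 2);
}
```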
{ - __m128i masklow = _mm_set1_epi16(0x00ff); - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + v_uint16 masklow = vx_setall_u16(0x00ff); + for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); - __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - - _mm_storel_epi64((__m128i*)D, s0); + v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0)); + v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1)); + v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow)); } } else if (cn == 3) - for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) + { + if (CV_SIMD_WIDTH > 64) + return 0; + for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); - __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)(D+3), s0); + v_uint16 t0, t1, t2, t3, t4, t5; + v_uint16 s0, s1, s2, s3, s4, s5; + s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + s1 = vx_load_expand(S0 + v_uint16::nlanes) + vx_load_expand(S1 + v_uint16::nlanes); + s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes); + s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes); + s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes); + s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_uint16 bl, gl, rl; +#if CV_SIMD_WIDTH == 16 + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#elif CV_SIMD_WIDTH == 32 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; +#elif CV_SIMD_WIDTH == 64 + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#endif + s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes); + s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes); + s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes); + s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes); + s4 = 
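The single-channel path above relies on a reinterpret trick: viewing the packed 8-bit pixels as 16-bit lanes makes (r >> 8) the second pixel of each pair and (r & 0x00ff) the first, so one shift, mask and add replaces the old unpack sequence. A standalone sketch of that horizontal pair-sum, assuming a CV_SIMD-enabled OpenCV build (function name and tail loop are illustrative):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// dst[i] = src[2*i] + src[2*i + 1] for npairs adjacent pixel pairs.
static void hpair_sum_u8(const unsigned char* src, unsigned short* dst, int npairs)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD
    v_uint16 masklow = vx_setall_u16(0x00ff);
    for (; i <= npairs - v_uint16::nlanes; i += v_uint16::nlanes, src += v_uint8::nlanes)
    {
        v_uint16 r = v_reinterpret_as_u16(vx_load(src));
        v_store(dst + i, (r >> 8) + (r & masklow));   // high byte + low byte of each lane
    }
#endif
    for (; i < npairs; i++, src += 2)
        dst[i] = (unsigned short)(src[0] + src[1]);
}
```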
vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes); + s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_uint16 bh, gh, rh; +#if CV_SIMD_WIDTH == 16 + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#elif CV_SIMD_WIDTH == 32 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; +#elif CV_SIMD_WIDTH == 64 + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#endif + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); } + } else { CV_Assert(cn == 4); - int v[] = { 0, 0, -1, -1 }; - __m128i mask = _mm_loadu_si128((const __m128i*)v); - - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + v_uint32 r00, r01, r10, r11; + v_load_deinterleave((uint32_t*)S0, r00, r01); + v_load_deinterleave((uint32_t*)S1, r10, r11); - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpackhi_epi8(r0, zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i r1_16h = _mm_unpackhi_epi8(r1, zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); - __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res0 = _mm_srli_epi16(s0, 2); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res1 = _mm_srli_epi16(s0, 2); - s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), - _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); - _mm_storel_epi64((__m128i*)(D), s0); + v_uint16 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h; + v_expand(v_reinterpret_as_u8(r00), r00l, r00h); + v_expand(v_reinterpret_as_u8(r01), r01l, r01h); + v_expand(v_reinterpret_as_u8(r10), r10l, r10h); + v_expand(v_reinterpret_as_u8(r11), r11l, r11h); + v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); } } @@ -2792,7 +2262,6 @@ public: private: int cn; - bool use_simd; int step; }; @@ -2800,164 +2269,258 @@ class ResizeAreaFastVec_SIMD_16u { public: ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } + cn(_cn), step(_step) {} int operator() (const ushort* S, ushort* D, int w) const { - if (!use_simd) - return 0; - int dx = 0; const ushort* S0 = (const ushort*)S; const ushort* S1 = (const ushort*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); - -#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero) if (cn == 1) { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + v_uint32 masklow = vx_setall_u32(0x0000ffff); + for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) { - __m128i r0 = 
_mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); - __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srli_epi32(s0, 2); - s0 = _mm_packus_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); + v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0)); + v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1)); + v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow)); } } else if (cn == 3) + { +#if CV_SIMD_WIDTH == 16 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) +#if CV_SSE4_1 { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_uint32 r0, r1, r2, r3; + v_expand(vx_load(S0), r0, r1); + v_expand(vx_load(S1), r2, r3); + r0 += r2; r1 += r3; + v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0)); } +#else + v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); +#endif +#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 + for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + { + v_uint32 t0, t1, t2, t3, t4, t5; + v_uint32 s0, s1, s2, s3, s4, s5; + s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + s1 = vx_load_expand(S0 + v_uint32::nlanes) + vx_load_expand(S1 + v_uint32::nlanes); + s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes); + s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + vx_load_expand(S1 + 3*v_uint32::nlanes); + s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes); + s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_uint32 bl, gl, rl; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; +#endif + s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes); + s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes); + s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes); + s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes); + s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes); + s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_uint32 bh, gh, rh; + v_zip(s0, s3, t0, 
t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; +#endif + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); + } +#elif CV_SIMD_WIDTH >= 64 + v_uint32 masklow = vx_setall_u32(0x0000ffff); + for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + { + v_uint16 b0, g0, r0, b1, g1, r1; + v_load_deinterleave(S0, b0, g0, r0); + v_load_deinterleave(S1, b1, g1, r1); + v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); + v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); + v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow); + v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0); + v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1); + v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); + v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); + v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow); + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); + } +#endif + } else { CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) +#if CV_SIMD_WIDTH >= 64 + for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + v_uint64 r00, r01, r10, r11; + v_load_deinterleave((uint64_t*)S0, r00, r01); + v_load_deinterleave((uint64_t*)S1, r10, r11); - __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); - __m128i r1_32l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_32h = _mm_unpackhi_epi16(r1, zero); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_uint32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h; + v_expand(v_reinterpret_as_u16(r00), r00l, r00h); + v_expand(v_reinterpret_as_u16(r01), r01l, r01h); + v_expand(v_reinterpret_as_u16(r10), r10l, r10h); + v_expand(v_reinterpret_as_u16(r11), r11l, r11h); + v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); } +#else + for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) + { + v_uint32 r0, r1, r2, r3; + v_expand(vx_load(S0), r0, r1); + v_expand(vx_load(S1), r2, r3); + r0 += r2; r1 += r3; + v_uint32 v_d; +#if CV_SIMD_WIDTH == 16 + v_d = r0 + r1; +#elif CV_SIMD_WIDTH == 32 + v_uint32 t0, t1; + 
v_recombine(r0, r1, t0, t1); + v_d = t0 + t1; +#endif + v_rshr_pack_store<2>(D, v_d); + } +#endif } -#undef _mm_packus_epi32 - return dx; } private: int cn; int step; - bool use_simd; }; class ResizeAreaFastVec_SIMD_16s { public: ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } + cn(_cn), step(_step) {} int operator() (const short* S, short* D, int w) const { - if (!use_simd) - return 0; - int dx = 0; const short* S0 = (const short*)S; const short* S1 = (const short*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); if (cn == 1) { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + v_int32 masklow = vx_setall_s32(0x0000ffff); + for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); - __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srai_epi32(s0, 2); - s0 = _mm_packs_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); + v_int32 r0 = v_reinterpret_as_s32(vx_load(S0)); + v_int32 r1 = v_reinterpret_as_s32(vx_load(S1)); + v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16)); } } else if (cn == 3) + { +#if CV_SIMD_WIDTH == 16 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) + v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); +#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 + for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); - __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_int32 t0, t1, t2, t3, t4, t5; + v_int32 s0, s1, s2, s3, s4, s5; + s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + s1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); + s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); + s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); + s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes); + s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_int32 bl, gl, rl; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 
+ bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; +#endif + s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes); + s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes); + s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes); + s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes); + s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes); + s5 = vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_int32 bh, gh, rh; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; +#endif + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); } +#elif CV_SIMD_WIDTH >= 64 + for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) + { + v_int16 b0, g0, r0, b1, g1, r1; + v_load_deinterleave(S0, b0, g0, r0); + v_load_deinterleave(S1, b1, g1, r1); + v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); + v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); + v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); + v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0); + v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1); + v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); + v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); + v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); + } +#endif + } else { CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); +#if CV_SIMD_WIDTH >= 64 + v_int64 r00, r01, r10, r11; + v_load_deinterleave((int64_t*)S0, r00, r01); + v_load_deinterleave((int64_t*)S1, r10, r11); - __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); - __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, 
r1), 16); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_int32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h; + v_expand(v_reinterpret_as_s16(r00), r00l, r00h); + v_expand(v_reinterpret_as_s16(r01), r01l, r01h); + v_expand(v_reinterpret_as_s16(r10), r10l, r10h); + v_expand(v_reinterpret_as_s16(r11), r11l, r11h); + v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); +#else + v_int32 r0, r1, r2, r3; + r0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + r1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); + r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); + r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); + v_int32 dl, dh; +#if CV_SIMD_WIDTH == 16 + dl = r0 + r1; dh = r2 + r3; +#elif CV_SIMD_WIDTH == 32 + v_int32 t0, t1, t2, t3; + v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3); + dl = t0 + t1; dh = t2 + t3; +#endif + v_store(D, v_rshr_pack<2>(dl, dh)); +#endif } } @@ -2967,7 +2530,6 @@ public: private: int cn; int step; - bool use_simd; }; struct ResizeAreaFastVec_SIMD_32f @@ -2976,7 +2538,6 @@ struct ResizeAreaFastVec_SIMD_32f cn(_cn), step(_step) { fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); - fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); } int operator() (const float * S, float * D, int w) const @@ -2987,33 +2548,32 @@ struct ResizeAreaFastVec_SIMD_32f const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); int dx = 0; - __m128 v_025 = _mm_set1_ps(0.25f); - if (cn == 1) { - const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + v_float32 v_025 = vx_setall_f32(0.25f); + for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) { - __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), - v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); - - __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), - _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); - __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), - _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); - - _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + v_float32 v_row00, v_row01, v_row10, v_row11; + v_load_deinterleave(S0, v_row00, v_row01); + v_load_deinterleave(S1, v_row10, v_row11); + v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025); } } else if (cn == 4) { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) +#if CV_SIMD_WIDTH == 16 + v_float32 v_025 = vx_setall_f32(0.25f); + for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) + v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025); +#elif CV_SIMD256 + v_float32x8 v_025 = v256_setall_f32(0.25f); + for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes) { - __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); - __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); - - 
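For floats, the cn == 1 path above uses v_load_deinterleave as an even/odd column split before averaging the 2x2 block with a 0.25 factor. A minimal sketch under the same assumption of a CV_SIMD-enabled build (helper name is illustrative):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

static void area2x2_f32_c1(const float* S0, const float* S1, float* D, int w)
{
    using namespace cv;
    int dx = 0;
#if CV_SIMD
    v_float32 quarter = vx_setall_f32(0.25f);
    for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes)
    {
        v_float32 e0, o0, e1, o1;
        v_load_deinterleave(S0 + 2*dx, e0, o0);   // even/odd columns of the top row
        v_load_deinterleave(S1 + 2*dx, e1, o1);   // even/odd columns of the bottom row
        v_store(D + dx, ((e0 + o0) + (e1 + o1)) * quarter);
    }
#endif
    for (; dx < w; dx++)
        D[dx] = (S0[2*dx] + S0[2*dx + 1] + S1[2*dx] + S1[2*dx + 1]) * 0.25f;
}
```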
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + v_float32x8 dst0, dst1; + v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1); + v_store(D, (dst0 + dst1) * v_025); } +#endif } return dx; diff --git a/modules/objdetect/CMakeLists.txt b/modules/objdetect/CMakeLists.txt index 4e330af6c0..a51740c280 100644 --- a/modules/objdetect/CMakeLists.txt +++ b/modules/objdetect/CMakeLists.txt @@ -1,2 +1,8 @@ set(the_description "Object Detection") -ocv_define_module(objdetect opencv_core opencv_imgproc WRAP java python js) +ocv_define_module(objdetect opencv_core opencv_imgproc opencv_calib3d WRAP java python js) + +if(HAVE_QUIRC) + get_property(QUIRC_INCLUDE GLOBAL PROPERTY QUIRC_INCLUDE_DIR) + ocv_include_directories(${QUIRC_INCLUDE}) + ocv_target_link_libraries(${PROJECT_NAME} quirc) +endif() diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp index 3ccdfe60a3..34f58cdf79 100644 --- a/modules/objdetect/include/opencv2/objdetect.hpp +++ b/modules/objdetect/include/opencv2/objdetect.hpp @@ -690,6 +690,13 @@ protected: */ CV_EXPORTS bool detectQRCode(InputArray in, std::vector &points, double eps_x = 0.2, double eps_y = 0.1); +/** @brief Decode a QR code in an image and return the text encoded in it. + @param in Matrix of the type CV_8UC1 containing an image where the QR code is detected. + @param points Input vector of vertices of a quadrangle of minimal area that describes the QR code. + @param decoded_info String information that is encoded in the QR code. + @param straight_qrcode Matrix of the type CV_8UC1 containing a binary straight QR code. + */ +CV_EXPORTS bool decodeQRCode(InputArray in, InputArray points, std::string &decoded_info, OutputArray straight_qrcode = noArray()); //!
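A caller-side example of the new API declared above, combining the existing detectQRCode() with decodeQRCode(). Sketch only: the image path is a placeholder, and the corner container is assumed to be std::vector<cv::Point2f>, the point type used by the detector.

```cpp
#include <opencv2/objdetect.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    cv::Mat src = cv::imread("image.jpg", cv::IMREAD_GRAYSCALE);   // placeholder path
    if (src.empty()) return 1;

    std::vector<cv::Point2f> corners;
    if (!cv::detectQRCode(src, corners))        // locate the quadrangle first
        return 2;

    std::string decoded_info;
    cv::Mat straight;                           // optional rectified binary code
    if (cv::decodeQRCode(src, corners, decoded_info, straight))
        std::cout << decoded_info << std::endl;
    return 0;
}
```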
@} objdetect } diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp index 5633c31037..aa269d9e4a 100644 --- a/modules/objdetect/src/qrcode.cpp +++ b/modules/objdetect/src/qrcode.cpp @@ -7,10 +7,16 @@ #include "precomp.hpp" #include "opencv2/objdetect.hpp" +#include "opencv2/calib3d.hpp" + +#ifdef HAVE_QUIRC +#include "quirc.h" +#endif #include #include #include +#include namespace cv { @@ -25,11 +31,11 @@ public: Mat getBinBarcode() { return bin_barcode; } Mat getStraightBarcode() { return straight_barcode; } vector getTransformationPoints() { return transformation_points; } + static Point2f intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2); protected: vector searchHorizontalLines(); vector separateVerticalLines(const vector &list_lines); void fixationPoints(vector &local_point); - Point2f intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2); vector getQuadrilateral(vector angle_list); bool testBypassRoute(vector hull, int start, int finish); inline double getCosVectors(Point2f a, Point2f b, Point2f c); @@ -61,6 +67,7 @@ void QRDetect::init(const Mat& src, double eps_vertical_, double eps_horizontal_ eps_vertical = eps_vertical_; eps_horizontal = eps_horizontal_; adaptiveThreshold(barcode, bin_barcode, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 83, 2); + } vector QRDetect::searchHorizontalLines() @@ -538,7 +545,7 @@ vector QRDetect::getQuadrilateral(vector angle_list) vector locations; Mat mask_roi = mask(Range(1, bin_barcode.rows - 1), Range(1, bin_barcode.cols - 1)); - cv::findNonZero(mask_roi, locations); + findNonZero(mask_roi, locations); for (size_t i = 0; i < angle_list.size(); i++) { @@ -783,7 +790,7 @@ bool QRCodeDetector::detect(InputArray in, OutputArray points) const return true; } -CV_EXPORTS bool detectQRCode(InputArray in, std::vector &points, double eps_x, double eps_y) +CV_EXPORTS bool detectQRCode(InputArray in, vector &points, double eps_x, double eps_y) { QRCodeDetector qrdetector; qrdetector.setEpsX(eps_x); @@ -792,4 +799,276 @@ CV_EXPORTS bool detectQRCode(InputArray in, std::vector &points, double e return qrdetector.detect(in, points); } +class QRDecode +{ +public: + void init(const Mat &src, const vector &points); + Mat getIntermediateBarcode() { return intermediate; } + Mat getStraightBarcode() { return straight; } + size_t getVersion() { return version; } + std::string getDecodeInformation() { return result_info; } + bool fullDecodingProcess(); +protected: + bool updatePerspective(); + bool versionDefinition(); + bool samplingForVersion(); + bool decodingProcess(); + Mat original, no_border_intermediate, intermediate, straight; + vector original_points; + std::string result_info; + uint8_t version, version_size; + float test_perspective_size; +}; + +void QRDecode::init(const Mat &src, const vector &points) +{ + original = src.clone(); + intermediate = Mat::zeros(src.size(), CV_8UC1); + original_points = points; + version = 0; + version_size = 0; + test_perspective_size = 251; + result_info = ""; +} + +bool QRDecode::updatePerspective() +{ + const Size temporary_size(cvRound(test_perspective_size), cvRound(test_perspective_size)); + + vector perspective_points; + perspective_points.push_back(Point2f(0.f, 0.f)); + perspective_points.push_back(Point2f(test_perspective_size, 0.f)); + + perspective_points.push_back(Point2f(static_cast(test_perspective_size * 0.5), + static_cast(test_perspective_size * 0.5))); + original_points.insert(original_points.begin() + 2, + QRDetect::intersectionLines( + 
original_points[0], original_points[2], + original_points[1], original_points[3])); + + perspective_points.push_back(Point2f(test_perspective_size, test_perspective_size)); + perspective_points.push_back(Point2f(0.f, test_perspective_size)); + + Mat H = findHomography(original_points, perspective_points); + Mat bin_original = Mat::zeros(original.size(), CV_8UC1); + adaptiveThreshold(original, bin_original, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 83, 2); + Mat temp_intermediate = Mat::zeros(temporary_size, CV_8UC1); + warpPerspective(bin_original, temp_intermediate, H, temporary_size, INTER_NEAREST); + no_border_intermediate = temp_intermediate(Range(1, temp_intermediate.rows), Range(1, temp_intermediate.cols)); + + const int border = cvRound(0.1 * test_perspective_size); + const int borderType = BORDER_CONSTANT; + copyMakeBorder(no_border_intermediate, intermediate, border, border, border, border, borderType, Scalar(255)); + return true; +} + +bool QRDecode::versionDefinition() +{ + LineIterator line_iter(intermediate, Point2f(0, 0), Point2f(test_perspective_size, test_perspective_size)); + Point black_point = Point(0, 0); + for(int j = 0; j < line_iter.count; j++, ++line_iter) + { + const uint8_t value = intermediate.at(line_iter.pos()); + if (value == 0) { black_point = line_iter.pos(); break; } + } + + Mat mask = Mat::zeros(intermediate.rows + 2, intermediate.cols + 2, CV_8UC1); + floodFill(intermediate, mask, black_point, 255, 0, Scalar(), Scalar(), FLOODFILL_MASK_ONLY); + + vector locations, non_zero_elem; + Mat mask_roi = mask(Range(1, intermediate.rows - 1), Range(1, intermediate.cols - 1)); + findNonZero(mask_roi, non_zero_elem); + convexHull(Mat(non_zero_elem), locations); + + Point temp_remote = locations[0], remote_point; + const Point delta_diff = Point(4, 4); + for (size_t i = 0; i < locations.size(); i++) + { + if (norm(black_point - temp_remote) < norm(black_point - locations[i])) + { + const uint8_t value = intermediate.at(temp_remote - delta_diff); + if (value == 0) { remote_point = temp_remote - delta_diff; } + else { remote_point = temp_remote; } + temp_remote = locations[i]; + } + } + + size_t transition_x = 0 , transition_y = 0; + + uint8_t future_pixel = 255; + const uint8_t *intermediate_row = intermediate.ptr(remote_point.y); + for(int i = remote_point.x; i < intermediate.cols; i++) + { + if (intermediate_row[i] == future_pixel) + { + future_pixel = 255 - future_pixel; + transition_x++; + } + } + + future_pixel = 255; + for(int j = remote_point.y; j < intermediate.rows; j++) + { + const uint8_t value = intermediate.at(Point(j, remote_point.x)); + if (value == future_pixel) + { + future_pixel = 255 - future_pixel; + transition_y++; + } + } + + version = saturate_cast((std::min(transition_x, transition_y) - 1) * 0.25 - 1); + if ( !( 0 < version && version <= 40 ) ) { return false; } + version_size = 21 + (version - 1) * 4; + return true; +} + +bool QRDecode::samplingForVersion() +{ + const double multiplyingFactor = (version < 3) ? 1 : + (version == 3) ? 
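updatePerspective() above inserts the crossing point of the quadrangle's diagonals as an extra correspondence before findHomography(). The QRDetect::intersectionLines() it calls is defined earlier in qrcode.cpp and is not shown in this hunk; a standard closed-form line-line intersection of that kind looks like the sketch below (not the OpenCV implementation; the degenerate parallel case is not handled):

```cpp
#include <opencv2/core.hpp>

// Intersection of the line through a1,a2 with the line through b1,b2
// via the usual 2x2 determinant (cross-product) formula.
static cv::Point2f intersect(cv::Point2f a1, cv::Point2f a2,
                             cv::Point2f b1, cv::Point2f b2)
{
    float d = (a1.x - a2.x) * (b1.y - b2.y) - (a1.y - a2.y) * (b1.x - b2.x);
    float p = a1.x * a2.y - a1.y * a2.x;
    float q = b1.x * b2.y - b1.y * b2.x;
    return cv::Point2f((p * (b1.x - b2.x) - (a1.x - a2.x) * q) / d,
                       (p * (b1.y - b2.y) - (a1.y - a2.y) * q) / d);
}
```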
1.5 : + version * (5 + version - 4); + const Size newFactorSize( + cvRound(no_border_intermediate.size().width * multiplyingFactor), + cvRound(no_border_intermediate.size().height * multiplyingFactor)); + Mat postIntermediate(newFactorSize, CV_8UC1); + resize(no_border_intermediate, postIntermediate, newFactorSize, 0, 0, INTER_AREA); + + const int no_inter_rows = postIntermediate.rows; + const int no_inter_cols = postIntermediate.cols; + const int delta_rows = cvRound((no_inter_rows * 1.0) / version_size); + const int delta_cols = cvRound((no_inter_cols * 1.0) / version_size); + + vector listFrequencyElem; + for (int r = 0; r < no_inter_rows; r += delta_rows) + { + for (int c = 0; c < no_inter_cols; c += delta_cols) + { + Mat tile = postIntermediate( + Range(r, min(r + delta_rows, no_inter_rows)), + Range(c, min(c + delta_cols, no_inter_cols))); + const double frequencyElem = (countNonZero(tile) * 1.0) / tile.total(); + listFrequencyElem.push_back(frequencyElem); + } + } + + double dispersionEFE = std::numeric_limits::max(); + double experimentalFrequencyElem = 0; + for (double expVal = 0; expVal < 1; expVal+=0.001) + { + double testDispersionEFE = 0.0; + for (size_t i = 0; i < listFrequencyElem.size(); i++) + { + testDispersionEFE += (listFrequencyElem[i] - expVal) * + (listFrequencyElem[i] - expVal); + } + testDispersionEFE /= (listFrequencyElem.size() - 1); + if (dispersionEFE > testDispersionEFE) + { + dispersionEFE = testDispersionEFE; + experimentalFrequencyElem = expVal; + } + } + + straight = Mat(Size(version_size, version_size), CV_8UC1, Scalar(0)); + size_t k = 0; + for (int r = 0; r < no_inter_rows && + k < listFrequencyElem.size() && + floor((r * 1.0) / delta_rows) < version_size; r += delta_rows) + { + for (int c = 0; c < no_inter_cols && + k < listFrequencyElem.size() && + floor((c * 1.0) / delta_cols) < version_size; c += delta_cols, k++) + { + Mat tile = postIntermediate( + Range(r, min(r + delta_rows, no_inter_rows)), + Range(c, min(c + delta_cols, no_inter_cols))); + + if (listFrequencyElem[k] < experimentalFrequencyElem) { tile.setTo(0); } + else + { + tile.setTo(255); + straight.at(cvRound(floor((r * 1.0) / delta_rows)), + cvRound(floor((c * 1.0) / delta_cols))) = 255; + } + } + } + return true; +} + +bool QRDecode::decodingProcess() +{ +#ifdef HAVE_QUIRC + if (straight.empty()) { return false; } + + quirc_code qr_code; + memset(&qr_code, 0, sizeof(qr_code)); + + qr_code.size = straight.size().width; + for (int x = 0; x < qr_code.size; x++) + { + for (int y = 0; y < qr_code.size; y++) + { + int position = y * qr_code.size + x; + qr_code.cell_bitmap[position >> 3] + |= straight.at(y, x) ? 0 : (1 << (position & 7)); + } + } + + quirc_data qr_code_data; + quirc_decode_error_t errorCode = quirc_decode(&qr_code, &qr_code_data); + if (errorCode != 0) { return false; } + + for (int i = 0; i < qr_code_data.payload_len; i++) + { + result_info += qr_code_data.payload[i]; + } + return true; +#else + return false; +#endif + +} + +bool QRDecode::fullDecodingProcess() +{ +#ifdef HAVE_QUIRC + if (!updatePerspective()) { return false; } + if (!versionDefinition()) { return false; } + if (!samplingForVersion()) { return false; } + if (!decodingProcess()) { return false; } + return true; +#else + std::cout << "Library QUIRC is not linked. No decoding is performed. Take it to the OpenCV repository." 
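decodingProcess() above hands the sampled grid to quirc by packing it into a flat bitmask: module (x, y) maps to bit index i = y*size + x, stored at bit (i & 7) of byte (i >> 3), with a set bit marking a dark module (value 0 in `straight`). A compact equivalent of that double loop (sketch; `grid` stands in for `straight`):

```cpp
#include <opencv2/core.hpp>
#include <vector>

// Pack a square CV_8UC1 module grid (0 = dark module) into a quirc-style bitmask.
static std::vector<unsigned char> packModules(const cv::Mat& grid)
{
    const int size = grid.cols;                          // version_size x version_size
    std::vector<unsigned char> bitmap((size * size + 7) / 8, 0);
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
        {
            const int i = y * size + x;
            if (grid.at<unsigned char>(y, x) == 0)       // dark module -> set the bit
                bitmap[i >> 3] |= (unsigned char)(1 << (i & 7));
        }
    return bitmap;
}
```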
<< std::endl; + return false; +#endif +} + +CV_EXPORTS bool decodeQRCode(InputArray in, InputArray points, std::string &decoded_info, OutputArray straight_qrcode) +{ + Mat inarr = in.getMat(); + CV_Assert(!inarr.empty()); + inarr.convertTo(inarr, CV_8UC1); + + CV_Assert(points.isVector()); + vector src_points; + points.copyTo(src_points); + CV_Assert(src_points.size() == 4); + + QRDecode qrdec; + qrdec.init(inarr, src_points); + bool exit_flag = qrdec.fullDecodingProcess(); + + decoded_info = qrdec.getDecodeInformation(); + + if (straight_qrcode.needed()) + { + qrdec.getStraightBarcode().convertTo(straight_qrcode, + straight_qrcode.fixedType() ? + straight_qrcode.type() : CV_32FC2); + } + + return exit_flag; +} + } diff --git a/modules/objdetect/test/test_qrcode.cpp b/modules/objdetect/test/test_qrcode.cpp index c0cea50428..0f4b4852c5 100644 --- a/modules/objdetect/test/test_qrcode.cpp +++ b/modules/objdetect/test/test_qrcode.cpp @@ -4,19 +4,15 @@ #include "test_precomp.hpp" - namespace opencv_test { namespace { std::string qrcode_images_name[] = { - // "20110817_030.jpg", - "20110817_048.jpg", - "img_20120226_161648.jpg", - "img_2714.jpg", - "img_2716.jpg", - "img_3011.jpg", - "img_3029.jpg", - "img_3070.jpg", - "qr_test_030.jpg" + "version_1_down.jpg", "version_1_left.jpg", "version_1_right.jpg", "version_1_up.jpg", "version_1_top.jpg", + "version_2_down.jpg", "version_2_left.jpg", "version_2_right.jpg", "version_2_up.jpg", "version_2_top.jpg", + "version_3_down.jpg", "version_3_left.jpg", "version_3_right.jpg", "version_3_up.jpg", "version_3_top.jpg", + "version_4_down.jpg", "version_4_left.jpg", "version_4_right.jpg", "version_4_up.jpg", "version_4_top.jpg", + "version_5_down.jpg", "version_5_left.jpg", "version_5_right.jpg", "version_5_up.jpg", "version_5_top.jpg", + "russian.jpg", "kanji.jpg", "link_github_ocv.jpg", "link_ocv.jpg", "link_wiki_cv.jpg" }; // #define UPDATE_QRCODE_TEST_DATA @@ -35,15 +31,21 @@ TEST(Objdetect_QRCode, generate_test_data) file_config << "{:" << "image_name" << qrcode_images_name[i]; std::string image_path = findDataFile(root + qrcode_images_name[i]); std::vector corners; - Mat src = imread(image_path, IMREAD_GRAYSCALE); + Mat src = imread(image_path, IMREAD_GRAYSCALE), straight_barcode; + std::string decoded_info; ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path; EXPECT_TRUE(detectQRCode(src, corners)); +#ifdef HAVE_QUIRC + EXPECT_TRUE(decodeQRCode(src, corners, decoded_info, straight_barcode)); +#endif file_config << "x" << "[:"; for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].x; } file_config << "]"; file_config << "y" << "[:"; for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].y; } - file_config << "]" << "}"; + file_config << "]"; + file_config << "info" << decoded_info; + file_config << "}"; } file_config << "]"; file_config.release(); @@ -59,11 +61,15 @@ TEST_P(Objdetect_QRCode, regression) const int pixels_error = 3; std::string image_path = findDataFile(root + name_current_image); - Mat src = imread(image_path, IMREAD_GRAYSCALE); + Mat src = imread(image_path, IMREAD_GRAYSCALE), straight_barcode; ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path; std::vector corners; + std::string decoded_info; ASSERT_TRUE(detectQRCode(src, corners)); +#ifdef HAVE_QUIRC + ASSERT_TRUE(decodeQRCode(src, corners, decoded_info, straight_barcode)); +#endif const std::string dataset_config = findDataFile(root + "dataset_config.json", false); FileStorage file_config(dataset_config, 
FileStorage::READ); @@ -86,6 +92,12 @@ TEST_P(Objdetect_QRCode, regression) EXPECT_NEAR(x, corners[i].x, pixels_error); EXPECT_NEAR(y, corners[i].y, pixels_error); } + +#ifdef HAVE_QUIRC + std::string original_info = config["info"]; + EXPECT_EQ(decoded_info, original_info); +#endif + return; // done } } @@ -103,9 +115,14 @@ INSTANTIATE_TEST_CASE_P(/**/, Objdetect_QRCode, testing::ValuesIn(qrcode_images_ TEST(Objdetect_QRCode_basic, not_found_qrcode) { - std::vector corners; + std::vector corners, straight_barcode; + std::string decoded_info; Mat zero_image = Mat::zeros(256, 256, CV_8UC1); EXPECT_FALSE(detectQRCode(zero_image, corners)); +#ifdef HAVE_QUIRC + corners = std::vector(4); + EXPECT_FALSE(decodeQRCode(zero_image, corners, decoded_info, straight_barcode)); +#endif } diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp index 6d583c8a6a..795d51adc8 100644 --- a/modules/stitching/src/matchers.cpp +++ b/modules/stitching/src/matchers.cpp @@ -52,6 +52,10 @@ using namespace cv::cuda; #include "opencv2/xfeatures2d.hpp" using xfeatures2d::SURF; using xfeatures2d::SIFT; +#else +# if defined(_MSC_VER) +# pragma warning(disable:4702) // unreachable code +# endif #endif #ifdef HAVE_OPENCV_CUDAIMGPROC diff --git a/platforms/js/build_js.py b/platforms/js/build_js.py index 2e55b7b308..9b7776cd45 100644 --- a/platforms/js/build_js.py +++ b/platforms/js/build_js.py @@ -113,10 +113,10 @@ class Builder: "-DWITH_ITT=OFF", "-DBUILD_ZLIB=ON", "-DBUILD_opencv_apps=OFF", - "-DBUILD_opencv_calib3d=ON", + "-DBUILD_opencv_calib3d=ON", # No bindings provided. This module is used as a dependency for other modules. "-DBUILD_opencv_dnn=ON", "-DBUILD_opencv_features2d=ON", - "-DBUILD_opencv_flann=OFF", + "-DBUILD_opencv_flann=ON", # No bindings provided. This module is used as a dependency for other modules. "-DBUILD_opencv_gapi=OFF", "-DBUILD_opencv_ml=OFF", "-DBUILD_opencv_photo=OFF", diff --git a/samples/_winpack_build_sample.cmd b/samples/_winpack_build_sample.cmd index c671d140fa..84a3b56428 100644 --- a/samples/_winpack_build_sample.cmd +++ b/samples/_winpack_build_sample.cmd @@ -38,6 +38,35 @@ echo =========================================================================== :: Path to FFMPEG binary files set "PATH=!PATH!;!SCRIPTDIR!\..\..\build\bin\" +:: Detect compiler +cl /? >NUL 2>NUL NUL 2>NUL NUL 2>NUL if !ERRORLEVEL! EQU 0 ( @@ -55,32 +84,10 @@ if NOT DEFINED CMAKE_FOUND ( set "MSG=CMake is required to build OpenCV samples. Download it from here: https://cmake.org/download/ and install into 'C:\Program Files\CMake'" goto die ) else ( + call :execute cmake --version echo CMake is detected ) -:: Detect compiler -cl /? 
>NUL 2>NUL NUL 2>NUL transform; cap >> frame; if(frame.empty()) { break; } @@ -97,6 +98,11 @@ int liveQRCodeDetect() total.start(); bool result_detection = detectQRCode(src, transform); + if (result_detection) + { + bool result_decode = decodeQRCode(src, transform, decode_info, straight_barcode); + if (result_decode) { cout << decode_info << '\n'; } + } total.stop(); double fps = 1 / total.getTimeSec(); total.reset(); @@ -112,11 +118,12 @@ int liveQRCodeDetect() int showImageQRCodeDetect(string in, string out) { - Mat src = imread(in, IMREAD_GRAYSCALE); + Mat src = imread(in, IMREAD_GRAYSCALE), straight_barcode; + string decode_info; vector transform; const int count_experiments = 10; double transform_time = 0.0; - bool result_detection = false; + bool result_detection = false, result_decode = false; TickMeter total; for (size_t i = 0; i < count_experiments; i++) { @@ -125,12 +132,20 @@ int showImageQRCodeDetect(string in, string out) result_detection = detectQRCode(src, transform); total.stop(); transform_time += total.getTimeSec(); - if (!result_detection) { break; } total.reset(); + if (!result_detection) { break; } + + total.start(); + result_decode = decodeQRCode(src, transform, decode_info, straight_barcode); + total.stop(); + transform_time += total.getTimeSec(); + total.reset(); + if (!result_decode) { break; } } double fps = count_experiments / transform_time; if (!result_detection) { cout << "Not find QR-code." << '\n'; return -2; } + if (!result_decode) { cout << "Not decode QR-code." << '\n'; return -3; } Mat color_src = imread(in); getMatWithQRCodeContour(color_src, transform); @@ -151,6 +166,7 @@ int showImageQRCodeDetect(string in, string out) cout << "Output image file path: " << out << '\n'; cout << "Size: " << color_src.size() << '\n'; cout << "FPS: " << fps << '\n'; + cout << "Decode info: " << decode_info << '\n'; vector compression_params; compression_params.push_back(IMWRITE_PNG_COMPRESSION);
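The updated sample averages detection and decoding time over several runs with cv::TickMeter and reports an FPS figure; the same measurement pattern in isolation (sketch; the timed body is left as a placeholder comment):

```cpp
#include <opencv2/core/utility.hpp>
#include <iostream>

int main()
{
    const int count_experiments = 10;
    cv::TickMeter tm;
    double total_sec = 0.0;
    for (int i = 0; i < count_experiments; i++)
    {
        tm.start();
        // ... run detectQRCode()/decodeQRCode() on the test image here ...
        tm.stop();
        total_sec += tm.getTimeSec();
        tm.reset();                          // time each run independently
    }
    if (total_sec > 0.0)
        std::cout << "FPS: " << count_experiments / total_sec << std::endl;
    return 0;
}
```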