diff --git a/3rdparty/quirc/CMakeLists.txt b/3rdparty/quirc/CMakeLists.txt new file mode 100644 index 0000000000..7a6b2bb222 --- /dev/null +++ b/3rdparty/quirc/CMakeLists.txt @@ -0,0 +1,30 @@ +project(quirc) + +set(CURR_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/include") + +set_property(GLOBAL PROPERTY QUIRC_INCLUDE_DIR ${CURR_INCLUDE_DIR}) +ocv_include_directories(${CURR_INCLUDE_DIR}) + +file(GLOB_RECURSE quirc_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "include/*.h") +file(GLOB_RECURSE quirc_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "src/*.c") + +add_library(${PROJECT_NAME} STATIC ${quirc_headers} ${quirc_sources}) +ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-variable -Wshadow) + +set_target_properties(${PROJECT_NAME} + PROPERTIES OUTPUT_NAME ${PROJECT_NAME} + DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" + COMPILE_PDB_NAME ${PROJECT_NAME} + COMPILE_PDB_NAME_DEBUG "${PROJECT_NAME}${OPENCV_DEBUG_POSTFIX}" + ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH} + ) + +if(ENABLE_SOLUTION_FOLDERS) + set_target_properties(${PROJECT_NAME} PROPERTIES FOLDER "3rdparty") +endif() + +if(NOT BUILD_SHARED_LIBS) + ocv_install_target(${PROJECT_NAME} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) +endif() + +ocv_install_3rdparty_licenses(${PROJECT_NAME} LICENSE) diff --git a/3rdparty/quirc/LICENSE b/3rdparty/quirc/LICENSE new file mode 100644 index 0000000000..d47c0262c2 --- /dev/null +++ b/3rdparty/quirc/LICENSE @@ -0,0 +1,16 @@ +quirc -- QR-code recognition library +Copyright (C) 2010-2012 Daniel Beer + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that the +above copyright notice and this permission notice appear in all +copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE +AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL +DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR +PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. diff --git a/3rdparty/quirc/include/quirc.h b/3rdparty/quirc/include/quirc.h new file mode 100644 index 0000000000..0e7cb94d1c --- /dev/null +++ b/3rdparty/quirc/include/quirc.h @@ -0,0 +1,173 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef QUIRC_H_ +#define QUIRC_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct quirc; + +/* Obtain the library version string. */ +const char *quirc_version(void); + +/* Construct a new QR-code recognizer. 
This function will return NULL + * if sufficient memory could not be allocated. + */ +struct quirc *quirc_new(void); + +/* Destroy a QR-code recognizer. */ +void quirc_destroy(struct quirc *q); + +/* Resize the QR-code recognizer. The size of an image must be + * specified before codes can be analyzed. + * + * This function returns 0 on success, or -1 if sufficient memory could + * not be allocated. + */ +int quirc_resize(struct quirc *q, int w, int h); + +/* These functions are used to process images for QR-code recognition. + * quirc_begin() must first be called to obtain access to a buffer into + * which the input image should be placed. Optionally, the current + * width and height may be returned. + * + * After filling the buffer, quirc_end() should be called to process + * the image for QR-code recognition. The locations and content of each + * code may be obtained using accessor functions described below. + */ +uint8_t *quirc_begin(struct quirc *q, int *w, int *h); +void quirc_end(struct quirc *q); + +/* This structure describes a location in the input image buffer. */ +struct quirc_point { + int x; + int y; +}; + +/* This enum describes the various decoder errors which may occur. */ +typedef enum { + QUIRC_SUCCESS = 0, + QUIRC_ERROR_INVALID_GRID_SIZE, + QUIRC_ERROR_INVALID_VERSION, + QUIRC_ERROR_FORMAT_ECC, + QUIRC_ERROR_DATA_ECC, + QUIRC_ERROR_UNKNOWN_DATA_TYPE, + QUIRC_ERROR_DATA_OVERFLOW, + QUIRC_ERROR_DATA_UNDERFLOW +} quirc_decode_error_t; + +/* Return a string error message for an error code. */ +const char *quirc_strerror(quirc_decode_error_t err); + +/* Limits on the maximum size of QR-codes and their content. */ +#define QUIRC_MAX_BITMAP 3917 +#define QUIRC_MAX_PAYLOAD 8896 + +/* QR-code ECC types. */ +#define QUIRC_ECC_LEVEL_M 0 +#define QUIRC_ECC_LEVEL_L 1 +#define QUIRC_ECC_LEVEL_H 2 +#define QUIRC_ECC_LEVEL_Q 3 + +/* QR-code data types. */ +#define QUIRC_DATA_TYPE_NUMERIC 1 +#define QUIRC_DATA_TYPE_ALPHA 2 +#define QUIRC_DATA_TYPE_BYTE 4 +#define QUIRC_DATA_TYPE_KANJI 8 + +/* Common character encodings */ +#define QUIRC_ECI_ISO_8859_1 1 +#define QUIRC_ECI_IBM437 2 +#define QUIRC_ECI_ISO_8859_2 4 +#define QUIRC_ECI_ISO_8859_3 5 +#define QUIRC_ECI_ISO_8859_4 6 +#define QUIRC_ECI_ISO_8859_5 7 +#define QUIRC_ECI_ISO_8859_6 8 +#define QUIRC_ECI_ISO_8859_7 9 +#define QUIRC_ECI_ISO_8859_8 10 +#define QUIRC_ECI_ISO_8859_9 11 +#define QUIRC_ECI_WINDOWS_874 13 +#define QUIRC_ECI_ISO_8859_13 15 +#define QUIRC_ECI_ISO_8859_15 17 +#define QUIRC_ECI_SHIFT_JIS 20 +#define QUIRC_ECI_UTF_8 26 + +/* This structure is used to return information about detected QR codes + * in the input image. + */ +struct quirc_code { + /* The four corners of the QR-code, from top left, clockwise */ + struct quirc_point corners[4]; + + /* The number of cells across in the QR-code. The cell bitmap + * is a bitmask giving the actual values of cells. If the cell + * at (x, y) is black, then the following bit is set: + * + * cell_bitmap[i >> 3] & (1 << (i & 7)) + * + * where i = (y * size) + x. + */ + int size; + uint8_t cell_bitmap[QUIRC_MAX_BITMAP]; +}; + +/* This structure holds the decoded QR-code data */ +struct quirc_data { + /* Various parameters of the QR-code. These can mostly be + * ignored if you only care about the data. + */ + int version; + int ecc_level; + int mask; + + /* This field is the highest-valued data type found in the QR + * code. + */ + int data_type; + + /* Data payload. For the Kanji datatype, payload is encoded as + * Shift-JIS. For all other datatypes, payload is ASCII text. 
+ */ + uint8_t payload[QUIRC_MAX_PAYLOAD]; + int payload_len; + + /* ECI assignment number */ + uint32_t eci; +}; + +/* Return the number of QR-codes identified in the last processed + * image. + */ +int quirc_count(const struct quirc *q); + +/* Extract the QR-code specified by the given index. */ +void quirc_extract(const struct quirc *q, int index, + struct quirc_code *code); + +/* Decode a QR-code, returning the payload data. */ +quirc_decode_error_t quirc_decode(const struct quirc_code *code, + struct quirc_data *data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/3rdparty/quirc/include/quirc_internal.h b/3rdparty/quirc/include/quirc_internal.h new file mode 100644 index 0000000000..56f5d28bfa --- /dev/null +++ b/3rdparty/quirc/include/quirc_internal.h @@ -0,0 +1,115 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef QUIRC_INTERNAL_H_ +#define QUIRC_INTERNAL_H_ + +#include + +#define QUIRC_PIXEL_WHITE 0 +#define QUIRC_PIXEL_BLACK 1 +#define QUIRC_PIXEL_REGION 2 + +#ifndef QUIRC_MAX_REGIONS +#define QUIRC_MAX_REGIONS 254 +#endif +#define QUIRC_MAX_CAPSTONES 32 +#define QUIRC_MAX_GRIDS 8 + +#define QUIRC_PERSPECTIVE_PARAMS 8 + +#if QUIRC_MAX_REGIONS < UINT8_MAX +typedef uint8_t quirc_pixel_t; +#elif QUIRC_MAX_REGIONS < UINT16_MAX +typedef uint16_t quirc_pixel_t; +#else +#error "QUIRC_MAX_REGIONS > 65534 is not supported" +#endif + +struct quirc_region { + struct quirc_point seed; + int count; + int capstone; +}; + +struct quirc_capstone { + int ring; + int stone; + + struct quirc_point corners[4]; + struct quirc_point center; + double c[QUIRC_PERSPECTIVE_PARAMS]; + + int qr_grid; +}; + +struct quirc_grid { + /* Capstone indices */ + int caps[3]; + + /* Alignment pattern region and corner */ + int align_region; + struct quirc_point align; + + /* Timing pattern endpoints */ + struct quirc_point tpep[3]; + int hscan; + int vscan; + + /* Grid size and perspective transform */ + int grid_size; + double c[QUIRC_PERSPECTIVE_PARAMS]; +}; + +struct quirc { + uint8_t *image; + quirc_pixel_t *pixels; + int *row_average; /* used by threshold() */ + int w; + int h; + + int num_regions; + struct quirc_region regions[QUIRC_MAX_REGIONS]; + + int num_capstones; + struct quirc_capstone capstones[QUIRC_MAX_CAPSTONES]; + + int num_grids; + struct quirc_grid grids[QUIRC_MAX_GRIDS]; +}; + +/************************************************************************ + * QR-code version information database + */ + +#define QUIRC_MAX_VERSION 40 +#define QUIRC_MAX_ALIGNMENT 7 + +struct quirc_rs_params { + int bs; /* Small block size */ + int dw; /* Small data words */ + int ns; /* Number of small blocks */ +}; + +struct quirc_version_info { + int data_bytes; + int apat[QUIRC_MAX_ALIGNMENT]; + struct quirc_rs_params ecc[4]; +}; 
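For reference, the public API declared in quirc.h above is driven in a fixed sequence: allocate a recognizer, size it to the image, copy one byte per pixel into the buffer returned by quirc_begin(), call quirc_end() to run detection, then extract and decode each identified code. The following is an editorial usage sketch, not part of this patch; the helper name scan_image and the assumption of an existing 8-bit grayscale buffer are illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "quirc.h"

/* Editorial sketch: decode every QR code found in a w x h grayscale image. */
static void scan_image(const uint8_t *gray, int w, int h)
{
    struct quirc *q = quirc_new();
    if (!q)
        return;                              /* out of memory */
    if (quirc_resize(q, w, h) < 0) {         /* returns -1 on allocation failure */
        quirc_destroy(q);
        return;
    }

    int qw = 0, qh = 0;
    uint8_t *buf = quirc_begin(q, &qw, &qh); /* qw/qh report the current size */
    memcpy(buf, gray, (size_t)qw * (size_t)qh);
    quirc_end(q);                            /* run detection on the filled buffer */

    for (int i = 0; i < quirc_count(q); i++) {
        struct quirc_code code;
        struct quirc_data data;
        quirc_extract(q, i, &code);
        quirc_decode_error_t err = quirc_decode(&code, &data);
        if (err)
            printf("decode failed: %s\n", quirc_strerror(err));
        else
            printf("payload: %s\n", (const char *)data.payload); /* NUL-terminated */
    }
    quirc_destroy(q);
}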
+ +extern const struct quirc_version_info quirc_version_db[QUIRC_MAX_VERSION + 1]; + +#endif diff --git a/3rdparty/quirc/src/decode.c b/3rdparty/quirc/src/decode.c new file mode 100644 index 0000000000..f556097b65 --- /dev/null +++ b/3rdparty/quirc/src/decode.c @@ -0,0 +1,919 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include +#include + +#define MAX_POLY 64 + +/************************************************************************ + * Galois fields + */ + +struct galois_field { + int p; + const uint8_t *log; + const uint8_t *exp; +}; + +static const uint8_t gf16_exp[16] = { + 0x01, 0x02, 0x04, 0x08, 0x03, 0x06, 0x0c, 0x0b, + 0x05, 0x0a, 0x07, 0x0e, 0x0f, 0x0d, 0x09, 0x01 +}; + +static const uint8_t gf16_log[16] = { + 0x00, 0x0f, 0x01, 0x04, 0x02, 0x08, 0x05, 0x0a, + 0x03, 0x0e, 0x09, 0x07, 0x06, 0x0d, 0x0b, 0x0c +}; + +static const struct galois_field gf16 = { + .p = 15, + .log = gf16_log, + .exp = gf16_exp +}; + +static const uint8_t gf256_exp[256] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, + 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, + 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, + 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, + 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, + 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, + 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, + 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, + 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, + 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, + 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, + 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, + 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, + 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, + 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, + 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, + 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, + 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, + 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, + 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, + 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, + 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, + 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, + 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, + 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, + 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, + 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, + 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, + 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, + 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, + 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 +}; + +static const uint8_t gf256_log[256] = { + 0x00, 0xff, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, + 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, + 0x04, 0x64, 0xe0, 0x0e, 0x34, 
0x8d, 0xef, 0x81, + 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, + 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, + 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, + 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, + 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, + 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, + 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, + 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, + 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, + 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, + 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, + 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, + 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, + 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, + 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, + 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, + 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, + 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, + 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, + 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, + 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, + 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, + 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, + 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, + 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, + 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, + 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, + 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, + 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf +}; + +static const struct galois_field gf256 = { + .p = 255, + .log = gf256_log, + .exp = gf256_exp +}; + +/************************************************************************ + * Polynomial operations + */ + +static void poly_add(uint8_t *dst, const uint8_t *src, uint8_t c, + int shift, const struct galois_field *gf) +{ + int i; + int log_c = gf->log[c]; + + if (!c) + return; + + for (i = 0; i < MAX_POLY; i++) { + int p = i + shift; + uint8_t v = src[i]; + + if (p < 0 || p >= MAX_POLY) + continue; + if (!v) + continue; + + dst[p] ^= gf->exp[(gf->log[v] + log_c) % gf->p]; + } +} + +static uint8_t poly_eval(const uint8_t *s, uint8_t x, + const struct galois_field *gf) +{ + int i; + uint8_t sum = 0; + uint8_t log_x = gf->log[x]; + + if (!x) + return s[0]; + + for (i = 0; i < MAX_POLY; i++) { + uint8_t c = s[i]; + + if (!c) + continue; + + sum ^= gf->exp[(gf->log[c] + log_x * i) % gf->p]; + } + + return sum; +} + +/************************************************************************ + * Berlekamp-Massey algorithm for finding error locator polynomials. 
+ */ + +static void berlekamp_massey(const uint8_t *s, int N, + const struct galois_field *gf, + uint8_t *sigma) +{ + uint8_t C[MAX_POLY]; + uint8_t B[MAX_POLY]; + int L = 0; + int m = 1; + uint8_t b = 1; + int n; + + memset(B, 0, sizeof(B)); + memset(C, 0, sizeof(C)); + B[0] = 1; + C[0] = 1; + + for (n = 0; n < N; n++) { + uint8_t d = s[n]; + uint8_t mult; + int i; + + for (i = 1; i <= L; i++) { + if (!(C[i] && s[n - i])) + continue; + + d ^= gf->exp[(gf->log[C[i]] + + gf->log[s[n - i]]) % + gf->p]; + } + + mult = gf->exp[(gf->p - gf->log[b] + gf->log[d]) % gf->p]; + + if (!d) { + m++; + } else if (L * 2 <= n) { + uint8_t T[MAX_POLY]; + + memcpy(T, C, sizeof(T)); + poly_add(C, B, mult, m, gf); + memcpy(B, T, sizeof(B)); + L = n + 1 - L; + b = d; + m = 1; + } else { + poly_add(C, B, mult, m, gf); + m++; + } + } + + memcpy(sigma, C, MAX_POLY); +} + +/************************************************************************ + * Code stream error correction + * + * Generator polynomial for GF(2^8) is x^8 + x^4 + x^3 + x^2 + 1 + */ + +static int block_syndromes(const uint8_t *data, int bs, int npar, uint8_t *s) +{ + int nonzero = 0; + int i; + + memset(s, 0, MAX_POLY); + + for (i = 0; i < npar; i++) { + int j; + + for (j = 0; j < bs; j++) { + uint8_t c = data[bs - j - 1]; + + if (!c) + continue; + + s[i] ^= gf256_exp[((int)gf256_log[c] + + i * j) % 255]; + } + + if (s[i]) + nonzero = 1; + } + + return nonzero; +} + +static void eloc_poly(uint8_t *omega, + const uint8_t *s, const uint8_t *sigma, + int npar) +{ + int i; + + memset(omega, 0, MAX_POLY); + + for (i = 0; i < npar; i++) { + const uint8_t a = sigma[i]; + const uint8_t log_a = gf256_log[a]; + int j; + + if (!a) + continue; + + for (j = 0; j + 1 < MAX_POLY; j++) { + const uint8_t b = s[j + 1]; + + if (i + j >= npar) + break; + + if (!b) + continue; + + omega[i + j] ^= + gf256_exp[(log_a + gf256_log[b]) % 255]; + } + } +} + +static quirc_decode_error_t correct_block(uint8_t *data, + const struct quirc_rs_params *ecc) +{ + int npar = ecc->bs - ecc->dw; + uint8_t s[MAX_POLY]; + uint8_t sigma[MAX_POLY]; + uint8_t sigma_deriv[MAX_POLY]; + uint8_t omega[MAX_POLY]; + int i; + + /* Compute syndrome vector */ + if (!block_syndromes(data, ecc->bs, npar, s)) + return QUIRC_SUCCESS; + + berlekamp_massey(s, npar, &gf256, sigma); + + /* Compute derivative of sigma */ + memset(sigma_deriv, 0, MAX_POLY); + for (i = 0; i + 1 < MAX_POLY; i += 2) + sigma_deriv[i] = sigma[i + 1]; + + /* Compute error evaluator polynomial */ + eloc_poly(omega, s, sigma, npar - 1); + + /* Find error locations and magnitudes */ + for (i = 0; i < ecc->bs; i++) { + uint8_t xinv = gf256_exp[255 - i]; + + if (!poly_eval(sigma, xinv, &gf256)) { + uint8_t sd_x = poly_eval(sigma_deriv, xinv, &gf256); + uint8_t omega_x = poly_eval(omega, xinv, &gf256); + uint8_t error = gf256_exp[(255 - gf256_log[sd_x] + + gf256_log[omega_x]) % 255]; + + data[ecc->bs - i - 1] ^= error; + } + } + + if (block_syndromes(data, ecc->bs, npar, s)) + return QUIRC_ERROR_DATA_ECC; + + return QUIRC_SUCCESS; +} + +/************************************************************************ + * Format value error correction + * + * Generator polynomial for GF(2^4) is x^4 + x + 1 + */ + +#define FORMAT_MAX_ERROR 3 +#define FORMAT_SYNDROMES (FORMAT_MAX_ERROR * 2) +#define FORMAT_BITS 15 + +static int format_syndromes(uint16_t u, uint8_t *s) +{ + int i; + int nonzero = 0; + + memset(s, 0, MAX_POLY); + + for (i = 0; i < FORMAT_SYNDROMES; i++) { + int j; + + s[i] = 0; + for (j = 0; j < FORMAT_BITS; j++) + if (u & (1 << 
j)) + s[i] ^= gf16_exp[((i + 1) * j) % 15]; + + if (s[i]) + nonzero = 1; + } + + return nonzero; +} + +static quirc_decode_error_t correct_format(uint16_t *f_ret) +{ + uint16_t u = *f_ret; + int i; + uint8_t s[MAX_POLY]; + uint8_t sigma[MAX_POLY]; + + /* Evaluate U (received codeword) at each of alpha_1 .. alpha_6 + * to get S_1 .. S_6 (but we index them from 0). + */ + if (!format_syndromes(u, s)) + return QUIRC_SUCCESS; + + berlekamp_massey(s, FORMAT_SYNDROMES, &gf16, sigma); + + /* Now, find the roots of the polynomial */ + for (i = 0; i < 15; i++) + if (!poly_eval(sigma, gf16_exp[15 - i], &gf16)) + u ^= (1 << i); + + if (format_syndromes(u, s)) + return QUIRC_ERROR_FORMAT_ECC; + + *f_ret = u; + return QUIRC_SUCCESS; +} + +/************************************************************************ + * Decoder algorithm + */ + +struct datastream { + uint8_t raw[QUIRC_MAX_PAYLOAD]; + int data_bits; + int ptr; + + uint8_t data[QUIRC_MAX_PAYLOAD]; +}; + +static inline int grid_bit(const struct quirc_code *code, int x, int y) +{ + int p = y * code->size + x; + + return (code->cell_bitmap[p >> 3] >> (p & 7)) & 1; +} + +static quirc_decode_error_t read_format(const struct quirc_code *code, + struct quirc_data *data, int which) +{ + int i; + uint16_t format = 0; + uint16_t fdata; + quirc_decode_error_t err; + + if (which) { + for (i = 0; i < 7; i++) + format = (format << 1) | + grid_bit(code, 8, code->size - 1 - i); + for (i = 0; i < 8; i++) + format = (format << 1) | + grid_bit(code, code->size - 8 + i, 8); + } else { + static const int xs[15] = { + 8, 8, 8, 8, 8, 8, 8, 8, 7, 5, 4, 3, 2, 1, 0 + }; + static const int ys[15] = { + 0, 1, 2, 3, 4, 5, 7, 8, 8, 8, 8, 8, 8, 8, 8 + }; + + for (i = 14; i >= 0; i--) + format = (format << 1) | grid_bit(code, xs[i], ys[i]); + } + + format ^= 0x5412; + + err = correct_format(&format); + if (err) + return err; + + fdata = format >> 10; + data->ecc_level = fdata >> 3; + data->mask = fdata & 7; + + return QUIRC_SUCCESS; +} + +static int mask_bit(int mask, int i, int j) +{ + switch (mask) { + case 0: return !((i + j) % 2); + case 1: return !(i % 2); + case 2: return !(j % 3); + case 3: return !((i + j) % 3); + case 4: return !(((i / 2) + (j / 3)) % 2); + case 5: return !((i * j) % 2 + (i * j) % 3); + case 6: return !(((i * j) % 2 + (i * j) % 3) % 2); + case 7: return !(((i * j) % 3 + (i + j) % 2) % 2); + } + + return 0; +} + +static int reserved_cell(int version, int i, int j) +{ + const struct quirc_version_info *ver = &quirc_version_db[version]; + int size = version * 4 + 17; + int ai = -1, aj = -1, a; + + /* Finder + format: top left */ + if (i < 9 && j < 9) + return 1; + + /* Finder + format: bottom left */ + if (i + 8 >= size && j < 9) + return 1; + + /* Finder + format: top right */ + if (i < 9 && j + 8 >= size) + return 1; + + /* Exclude timing patterns */ + if (i == 6 || j == 6) + return 1; + + /* Exclude version info, if it exists. Version info sits adjacent to + * the top-right and bottom-left finders in three rows, bounded by + * the timing pattern. 
+ */ + if (version >= 7) { + if (i < 6 && j + 11 >= size) + return 1; + if (i + 11 >= size && j < 6) + return 1; + } + + /* Exclude alignment patterns */ + for (a = 0; a < QUIRC_MAX_ALIGNMENT && ver->apat[a]; a++) { + int p = ver->apat[a]; + + if (abs(p - i) < 3) + ai = a; + if (abs(p - j) < 3) + aj = a; + } + + if (ai >= 0 && aj >= 0) { + a--; + if (ai > 0 && ai < a) + return 1; + if (aj > 0 && aj < a) + return 1; + if (aj == a && ai == a) + return 1; + } + + return 0; +} + +static void read_bit(const struct quirc_code *code, + struct quirc_data *data, + struct datastream *ds, int i, int j) +{ + int bitpos = ds->data_bits & 7; + int bytepos = ds->data_bits >> 3; + int v = grid_bit(code, j, i); + + if (mask_bit(data->mask, i, j)) + v ^= 1; + + if (v) + ds->raw[bytepos] |= (0x80 >> bitpos); + + ds->data_bits++; +} + +static void read_data(const struct quirc_code *code, + struct quirc_data *data, + struct datastream *ds) +{ + int y = code->size - 1; + int x = code->size - 1; + int dir = -1; + + while (x > 0) { + if (x == 6) + x--; + + if (!reserved_cell(data->version, y, x)) + read_bit(code, data, ds, y, x); + + if (!reserved_cell(data->version, y, x - 1)) + read_bit(code, data, ds, y, x - 1); + + y += dir; + if (y < 0 || y >= code->size) { + dir = -dir; + x -= 2; + y += dir; + } + } +} + +static quirc_decode_error_t codestream_ecc(struct quirc_data *data, + struct datastream *ds) +{ + const struct quirc_version_info *ver = + &quirc_version_db[data->version]; + const struct quirc_rs_params *sb_ecc = &ver->ecc[data->ecc_level]; + struct quirc_rs_params lb_ecc; + const int lb_count = + (ver->data_bytes - sb_ecc->bs * sb_ecc->ns) / (sb_ecc->bs + 1); + const int bc = lb_count + sb_ecc->ns; + const int ecc_offset = sb_ecc->dw * bc + lb_count; + int dst_offset = 0; + int i; + + memcpy(&lb_ecc, sb_ecc, sizeof(lb_ecc)); + lb_ecc.dw++; + lb_ecc.bs++; + + for (i = 0; i < bc; i++) { + uint8_t *dst = ds->data + dst_offset; + const struct quirc_rs_params *ecc = + (i < sb_ecc->ns) ? 
sb_ecc : &lb_ecc; + const int num_ec = ecc->bs - ecc->dw; + quirc_decode_error_t err; + int j; + + for (j = 0; j < ecc->dw; j++) + dst[j] = ds->raw[j * bc + i]; + for (j = 0; j < num_ec; j++) + dst[ecc->dw + j] = ds->raw[ecc_offset + j * bc + i]; + + err = correct_block(dst, ecc); + if (err) + return err; + + dst_offset += ecc->dw; + } + + ds->data_bits = dst_offset * 8; + + return QUIRC_SUCCESS; +} + +static inline int bits_remaining(const struct datastream *ds) +{ + return ds->data_bits - ds->ptr; +} + +static int take_bits(struct datastream *ds, int len) +{ + int ret = 0; + + while (len && (ds->ptr < ds->data_bits)) { + uint8_t b = ds->data[ds->ptr >> 3]; + int bitpos = ds->ptr & 7; + + ret <<= 1; + if ((b << bitpos) & 0x80) + ret |= 1; + + ds->ptr++; + len--; + } + + return ret; +} + +static int numeric_tuple(struct quirc_data *data, + struct datastream *ds, + int bits, int digits) +{ + int tuple; + int i; + + if (bits_remaining(ds) < bits) + return -1; + + tuple = take_bits(ds, bits); + + for (i = digits - 1; i >= 0; i--) { + data->payload[data->payload_len + i] = tuple % 10 + '0'; + tuple /= 10; + } + + data->payload_len += digits; + return 0; +} + +static quirc_decode_error_t decode_numeric(struct quirc_data *data, + struct datastream *ds) +{ + int bits = 14; + int count; + + if (data->version < 10) + bits = 10; + else if (data->version < 27) + bits = 12; + + count = take_bits(ds, bits); + if (data->payload_len + count + 1 > QUIRC_MAX_PAYLOAD) + return QUIRC_ERROR_DATA_OVERFLOW; + + while (count >= 3) { + if (numeric_tuple(data, ds, 10, 3) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count -= 3; + } + + if (count >= 2) { + if (numeric_tuple(data, ds, 7, 2) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count -= 2; + } + + if (count) { + if (numeric_tuple(data, ds, 4, 1) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count--; + } + + return QUIRC_SUCCESS; +} + +static int alpha_tuple(struct quirc_data *data, + struct datastream *ds, + int bits, int digits) +{ + int tuple; + int i; + + if (bits_remaining(ds) < bits) + return -1; + + tuple = take_bits(ds, bits); + + for (i = 0; i < digits; i++) { + static const char *alpha_map = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ $%*+-./:"; + + data->payload[data->payload_len + digits - i - 1] = + alpha_map[tuple % 45]; + tuple /= 45; + } + + data->payload_len += digits; + return 0; +} + +static quirc_decode_error_t decode_alpha(struct quirc_data *data, + struct datastream *ds) +{ + int bits = 13; + int count; + + if (data->version < 10) + bits = 9; + else if (data->version < 27) + bits = 11; + + count = take_bits(ds, bits); + if (data->payload_len + count + 1 > QUIRC_MAX_PAYLOAD) + return QUIRC_ERROR_DATA_OVERFLOW; + + while (count >= 2) { + if (alpha_tuple(data, ds, 11, 2) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count -= 2; + } + + if (count) { + if (alpha_tuple(data, ds, 6, 1) < 0) + return QUIRC_ERROR_DATA_UNDERFLOW; + count--; + } + + return QUIRC_SUCCESS; +} + +static quirc_decode_error_t decode_byte(struct quirc_data *data, + struct datastream *ds) +{ + int bits = 16; + int count; + int i; + + if (data->version < 10) + bits = 8; + + count = take_bits(ds, bits); + if (data->payload_len + count + 1 > QUIRC_MAX_PAYLOAD) + return QUIRC_ERROR_DATA_OVERFLOW; + if (bits_remaining(ds) < count * 8) + return QUIRC_ERROR_DATA_UNDERFLOW; + + for (i = 0; i < count; i++) + data->payload[data->payload_len++] = take_bits(ds, 8); + + return QUIRC_SUCCESS; +} + +static quirc_decode_error_t decode_kanji(struct quirc_data *data, + struct datastream *ds) +{ + int 
bits = 12; + int count; + int i; + + if (data->version < 10) + bits = 8; + else if (data->version < 27) + bits = 10; + + count = take_bits(ds, bits); + if (data->payload_len + count * 2 + 1 > QUIRC_MAX_PAYLOAD) + return QUIRC_ERROR_DATA_OVERFLOW; + if (bits_remaining(ds) < count * 13) + return QUIRC_ERROR_DATA_UNDERFLOW; + + for (i = 0; i < count; i++) { + int d = take_bits(ds, 13); + int msB = d / 0xc0; + int lsB = d % 0xc0; + int intermediate = (msB << 8) | lsB; + uint16_t sjw; + + if (intermediate + 0x8140 <= 0x9ffc) { + /* bytes are in the range 0x8140 to 0x9FFC */ + sjw = intermediate + 0x8140; + } else { + /* bytes are in the range 0xE040 to 0xEBBF */ + sjw = intermediate + 0xc140; + } + + data->payload[data->payload_len++] = sjw >> 8; + data->payload[data->payload_len++] = sjw & 0xff; + } + + return QUIRC_SUCCESS; +} + +static quirc_decode_error_t decode_eci(struct quirc_data *data, + struct datastream *ds) +{ + if (bits_remaining(ds) < 8) + return QUIRC_ERROR_DATA_UNDERFLOW; + + data->eci = take_bits(ds, 8); + + if ((data->eci & 0xc0) == 0x80) { + if (bits_remaining(ds) < 8) + return QUIRC_ERROR_DATA_UNDERFLOW; + + data->eci = (data->eci << 8) | take_bits(ds, 8); + } else if ((data->eci & 0xe0) == 0xc0) { + if (bits_remaining(ds) < 16) + return QUIRC_ERROR_DATA_UNDERFLOW; + + data->eci = (data->eci << 16) | take_bits(ds, 16); + } + + return QUIRC_SUCCESS; +} + +static quirc_decode_error_t decode_payload(struct quirc_data *data, + struct datastream *ds) +{ + while (bits_remaining(ds) >= 4) { + quirc_decode_error_t err = QUIRC_SUCCESS; + int type = take_bits(ds, 4); + + switch (type) { + case QUIRC_DATA_TYPE_NUMERIC: + err = decode_numeric(data, ds); + break; + + case QUIRC_DATA_TYPE_ALPHA: + err = decode_alpha(data, ds); + break; + + case QUIRC_DATA_TYPE_BYTE: + err = decode_byte(data, ds); + break; + + case QUIRC_DATA_TYPE_KANJI: + err = decode_kanji(data, ds); + break; + + case 7: + err = decode_eci(data, ds); + break; + + default: + goto done; + } + + if (err) + return err; + + if (!(type & (type - 1)) && (type > data->data_type)) + data->data_type = type; + } +done: + + /* Add nul terminator to all payloads */ + if ((unsigned)data->payload_len >= sizeof(data->payload)) + data->payload_len--; + data->payload[data->payload_len] = 0; + + return QUIRC_SUCCESS; +} + +quirc_decode_error_t quirc_decode(const struct quirc_code *code, + struct quirc_data *data) +{ + quirc_decode_error_t err; + struct datastream ds; + + if ((code->size - 17) % 4) + return QUIRC_ERROR_INVALID_GRID_SIZE; + + memset(data, 0, sizeof(*data)); + memset(&ds, 0, sizeof(ds)); + + data->version = (code->size - 17) / 4; + + if (data->version < 1 || + data->version > QUIRC_MAX_VERSION) + return QUIRC_ERROR_INVALID_VERSION; + + /* Read format information -- try both locations */ + err = read_format(code, data, 0); + if (err) + err = read_format(code, data, 1); + if (err) + return err; + + read_data(code, data, &ds); + err = codestream_ecc(data, &ds); + if (err) + return err; + + err = decode_payload(data, &ds); + if (err) + return err; + + return QUIRC_SUCCESS; +} diff --git a/3rdparty/quirc/src/quirc.c b/3rdparty/quirc/src/quirc.c new file mode 100644 index 0000000000..a1418b2b26 --- /dev/null +++ b/3rdparty/quirc/src/quirc.c @@ -0,0 +1,138 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission 
notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +const char *quirc_version(void) +{ + return "1.0"; +} + +struct quirc *quirc_new(void) +{ + struct quirc *q = malloc(sizeof(*q)); + + if (!q) + return NULL; + + memset(q, 0, sizeof(*q)); + return q; +} + +void quirc_destroy(struct quirc *q) +{ + free(q->image); + /* q->pixels may alias q->image when their type representation is of the + same size, so we need to be careful here to avoid a double free */ + if (sizeof(*q->image) != sizeof(*q->pixels)) + free(q->pixels); + free(q->row_average); + free(q); +} + +int quirc_resize(struct quirc *q, int w, int h) +{ + uint8_t *image = NULL; + quirc_pixel_t *pixels = NULL; + int *row_average = NULL; + + /* + * XXX: w and h should be size_t (or at least unsigned) as negatives + * values would not make much sense. The downside is that it would break + * both the API and ABI. Thus, at the moment, let's just do a sanity + * check. + */ + if (w < 0 || h < 0) + goto fail; + + /* + * alloc a new buffer for q->image. We avoid realloc(3) because we want + * on failure to be leave `q` in a consistant, unmodified state. + */ + image = calloc(w, h); + if (!image) + goto fail; + + /* compute the "old" (i.e. currently allocated) and the "new" + (i.e. requested) image dimensions */ + size_t olddim = q->w * q->h; + size_t newdim = w * h; + size_t min = (olddim < newdim ? olddim : newdim); + + /* + * copy the data into the new buffer, avoiding (a) to read beyond the + * old buffer when the new size is greater and (b) to write beyond the + * new buffer when the new size is smaller, hence the min computation. 
+ */ + (void)memcpy(image, q->image, min); + + /* alloc a new buffer for q->pixels if needed */ + if (sizeof(*q->image) != sizeof(*q->pixels)) { + pixels = calloc(newdim, sizeof(quirc_pixel_t)); + if (!pixels) + goto fail; + } + + /* alloc a new buffer for q->row_average */ + row_average = calloc(w, sizeof(int)); + if (!row_average) + goto fail; + + /* alloc succeeded, update `q` with the new size and buffers */ + q->w = w; + q->h = h; + free(q->image); + q->image = image; + if (sizeof(*q->image) != sizeof(*q->pixels)) { + free(q->pixels); + q->pixels = pixels; + } + free(q->row_average); + q->row_average = row_average; + + return 0; + /* NOTREACHED */ +fail: + free(image); + free(pixels); + free(row_average); + + return -1; +} + +int quirc_count(const struct quirc *q) +{ + return q->num_grids; +} + +static const char *const error_table[] = { + [QUIRC_SUCCESS] = "Success", + [QUIRC_ERROR_INVALID_GRID_SIZE] = "Invalid grid size", + [QUIRC_ERROR_INVALID_VERSION] = "Invalid version", + [QUIRC_ERROR_FORMAT_ECC] = "Format data ECC failure", + [QUIRC_ERROR_DATA_ECC] = "ECC failure", + [QUIRC_ERROR_UNKNOWN_DATA_TYPE] = "Unknown data type", + [QUIRC_ERROR_DATA_OVERFLOW] = "Data overflow", + [QUIRC_ERROR_DATA_UNDERFLOW] = "Data underflow" +}; + +const char *quirc_strerror(quirc_decode_error_t err) +{ + if ((int)err < 8) { return error_table[err]; } + else { return "Unknown error"; } +} diff --git a/3rdparty/quirc/src/version_db.c b/3rdparty/quirc/src/version_db.c new file mode 100644 index 0000000000..9c77e63d47 --- /dev/null +++ b/3rdparty/quirc/src/version_db.c @@ -0,0 +1,430 @@ +/* quirc -- QR-code recognition library + * Copyright (C) 2010-2012 Daniel Beer + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include + +const struct quirc_version_info quirc_version_db[QUIRC_MAX_VERSION + 1] = { + { /* 0 */ + .data_bytes = 0, + .apat = {0}, + .ecc = { + {.bs = 0, .dw = 0, .ns = 0}, + {.bs = 0, .dw = 0, .ns = 0}, + {.bs = 0, .dw = 0, .ns = 0}, + {.bs = 0, .dw = 0, .ns = 0} + } + }, + { /* Version 1 */ + .data_bytes = 26, + .apat = {0}, + .ecc = { + {.bs = 26, .dw = 16, .ns = 1}, + {.bs = 26, .dw = 19, .ns = 1}, + {.bs = 26, .dw = 9, .ns = 1}, + {.bs = 26, .dw = 13, .ns = 1} + } + }, + { /* Version 2 */ + .data_bytes = 44, + .apat = {6, 18, 0}, + .ecc = { + {.bs = 44, .dw = 28, .ns = 1}, + {.bs = 44, .dw = 34, .ns = 1}, + {.bs = 44, .dw = 16, .ns = 1}, + {.bs = 44, .dw = 22, .ns = 1} + } + }, + { /* Version 3 */ + .data_bytes = 70, + .apat = {6, 22, 0}, + .ecc = { + {.bs = 70, .dw = 44, .ns = 1}, + {.bs = 70, .dw = 55, .ns = 1}, + {.bs = 35, .dw = 13, .ns = 2}, + {.bs = 35, .dw = 17, .ns = 2} + } + }, + { /* Version 4 */ + .data_bytes = 100, + .apat = {6, 26, 0}, + .ecc = { + {.bs = 50, .dw = 32, .ns = 2}, + {.bs = 100, .dw = 80, .ns = 1}, + {.bs = 25, .dw = 9, .ns = 4}, + {.bs = 50, .dw = 24, .ns = 2} + } + }, + { /* Version 5 */ + .data_bytes = 134, + .apat = {6, 30, 0}, + .ecc = { + {.bs = 67, .dw = 43, .ns = 2}, + {.bs = 134, .dw = 108, .ns = 1}, + {.bs = 33, .dw = 11, .ns = 2}, + {.bs = 33, .dw = 15, .ns = 2} + } + }, + { /* Version 6 */ + .data_bytes = 172, + .apat = {6, 34, 0}, + .ecc = { + {.bs = 43, .dw = 27, .ns = 4}, + {.bs = 86, .dw = 68, .ns = 2}, + {.bs = 43, .dw = 15, .ns = 4}, + {.bs = 43, .dw = 19, .ns = 4} + } + }, + { /* Version 7 */ + .data_bytes = 196, + .apat = {6, 22, 38, 0}, + .ecc = { + {.bs = 49, .dw = 31, .ns = 4}, + {.bs = 98, .dw = 78, .ns = 2}, + {.bs = 39, .dw = 13, .ns = 4}, + {.bs = 32, .dw = 14, .ns = 2} + } + }, + { /* Version 8 */ + .data_bytes = 242, + .apat = {6, 24, 42, 0}, + .ecc = { + {.bs = 60, .dw = 38, .ns = 2}, + {.bs = 121, .dw = 97, .ns = 2}, + {.bs = 40, .dw = 14, .ns = 4}, + {.bs = 40, .dw = 18, .ns = 4} + } + }, + { /* Version 9 */ + .data_bytes = 292, + .apat = {6, 26, 46, 0}, + .ecc = { + {.bs = 58, .dw = 36, .ns = 3}, + {.bs = 146, .dw = 116, .ns = 2}, + {.bs = 36, .dw = 12, .ns = 4}, + {.bs = 36, .dw = 16, .ns = 4} + } + }, + { /* Version 10 */ + .data_bytes = 346, + .apat = {6, 28, 50, 0}, + .ecc = { + {.bs = 69, .dw = 43, .ns = 4}, + {.bs = 86, .dw = 68, .ns = 2}, + {.bs = 43, .dw = 15, .ns = 6}, + {.bs = 43, .dw = 19, .ns = 6} + } + }, + { /* Version 11 */ + .data_bytes = 404, + .apat = {6, 30, 54, 0}, + .ecc = { + {.bs = 80, .dw = 50, .ns = 1}, + {.bs = 101, .dw = 81, .ns = 4}, + {.bs = 36, .dw = 12, .ns = 3}, + {.bs = 50, .dw = 22, .ns = 4} + } + }, + { /* Version 12 */ + .data_bytes = 466, + .apat = {6, 32, 58, 0}, + .ecc = { + {.bs = 58, .dw = 36, .ns = 6}, + {.bs = 116, .dw = 92, .ns = 2}, + {.bs = 42, .dw = 14, .ns = 7}, + {.bs = 46, .dw = 20, .ns = 4} + } + }, + { /* Version 13 */ + .data_bytes = 532, + .apat = {6, 34, 62, 0}, + .ecc = { + {.bs = 59, .dw = 37, .ns = 8}, + {.bs = 133, .dw = 107, .ns = 4}, + {.bs = 33, .dw = 11, .ns = 12}, + {.bs = 44, .dw = 20, .ns = 8} + } + }, + { /* Version 14 */ + .data_bytes = 581, + .apat = {6, 26, 46, 66, 0}, + .ecc = { + {.bs = 64, .dw = 40, .ns = 4}, + {.bs = 145, .dw = 115, .ns = 3}, + {.bs = 36, .dw = 12, .ns = 11}, + {.bs = 36, .dw = 16, .ns = 11} + } + }, + { /* Version 15 */ + .data_bytes = 655, + .apat = {6, 26, 48, 70, 0}, + .ecc = { + {.bs = 65, .dw = 41, .ns = 5}, + {.bs = 109, .dw = 87, .ns = 5}, + {.bs = 36, .dw = 12, .ns = 11}, + {.bs = 54, .dw = 24, .ns = 5} + } + }, + { 
/* Version 16 */ + .data_bytes = 733, + .apat = {6, 26, 50, 74, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 7}, + {.bs = 122, .dw = 98, .ns = 5}, + {.bs = 45, .dw = 15, .ns = 3}, + {.bs = 43, .dw = 19, .ns = 15} + } + }, + { /* Version 17 */ + .data_bytes = 815, + .apat = {6, 30, 54, 78, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 10}, + {.bs = 135, .dw = 107, .ns = 1}, + {.bs = 42, .dw = 14, .ns = 2}, + {.bs = 50, .dw = 22, .ns = 1} + } + }, + { /* Version 18 */ + .data_bytes = 901, + .apat = {6, 30, 56, 82, 0}, + .ecc = { + {.bs = 69, .dw = 43, .ns = 9}, + {.bs = 150, .dw = 120, .ns = 5}, + {.bs = 42, .dw = 14, .ns = 2}, + {.bs = 50, .dw = 22, .ns = 17} + } + }, + { /* Version 19 */ + .data_bytes = 991, + .apat = {6, 30, 58, 86, 0}, + .ecc = { + {.bs = 70, .dw = 44, .ns = 3}, + {.bs = 141, .dw = 113, .ns = 3}, + {.bs = 39, .dw = 13, .ns = 9}, + {.bs = 47, .dw = 21, .ns = 17} + } + }, + { /* Version 20 */ + .data_bytes = 1085, + .apat = {6, 34, 62, 90, 0}, + .ecc = { + {.bs = 67, .dw = 41, .ns = 3}, + {.bs = 135, .dw = 107, .ns = 3}, + {.bs = 43, .dw = 15, .ns = 15}, + {.bs = 54, .dw = 24, .ns = 15} + } + }, + { /* Version 21 */ + .data_bytes = 1156, + .apat = {6, 28, 50, 72, 92, 0}, + .ecc = { + {.bs = 68, .dw = 42, .ns = 17}, + {.bs = 144, .dw = 116, .ns = 4}, + {.bs = 46, .dw = 16, .ns = 19}, + {.bs = 50, .dw = 22, .ns = 17} + } + }, + { /* Version 22 */ + .data_bytes = 1258, + .apat = {6, 26, 50, 74, 98, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 17}, + {.bs = 139, .dw = 111, .ns = 2}, + {.bs = 37, .dw = 13, .ns = 34}, + {.bs = 54, .dw = 24, .ns = 7} + } + }, + { /* Version 23 */ + .data_bytes = 1364, + .apat = {6, 30, 54, 78, 102, 0}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 4}, + {.bs = 151, .dw = 121, .ns = 4}, + {.bs = 45, .dw = 15, .ns = 16}, + {.bs = 54, .dw = 24, .ns = 11} + } + }, + { /* Version 24 */ + .data_bytes = 1474, + .apat = {6, 28, 54, 80, 106, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 6}, + {.bs = 147, .dw = 117, .ns = 6}, + {.bs = 46, .dw = 16, .ns = 30}, + {.bs = 54, .dw = 24, .ns = 11} + } + }, + { /* Version 25 */ + .data_bytes = 1588, + .apat = {6, 32, 58, 84, 110, 0}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 8}, + {.bs = 132, .dw = 106, .ns = 8}, + {.bs = 45, .dw = 15, .ns = 22}, + {.bs = 54, .dw = 24, .ns = 7} + } + }, + { /* Version 26 */ + .data_bytes = 1706, + .apat = {6, 30, 58, 86, 114, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 19}, + {.bs = 142, .dw = 114, .ns = 10}, + {.bs = 46, .dw = 16, .ns = 33}, + {.bs = 50, .dw = 22, .ns = 28} + } + }, + { /* Version 27 */ + .data_bytes = 1828, + .apat = {6, 34, 62, 90, 118, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 22}, + {.bs = 152, .dw = 122, .ns = 8}, + {.bs = 45, .dw = 15, .ns = 12}, + {.bs = 53, .dw = 23, .ns = 8} + } + }, + { /* Version 28 */ + .data_bytes = 1921, + .apat = {6, 26, 50, 74, 98, 122, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 3}, + {.bs = 147, .dw = 117, .ns = 3}, + {.bs = 45, .dw = 15, .ns = 11}, + {.bs = 54, .dw = 24, .ns = 4} + } + }, + { /* Version 29 */ + .data_bytes = 2051, + .apat = {6, 30, 54, 78, 102, 126, 0}, + .ecc = { + {.bs = 73, .dw = 45, .ns = 21}, + {.bs = 146, .dw = 116, .ns = 7}, + {.bs = 45, .dw = 15, .ns = 19}, + {.bs = 53, .dw = 23, .ns = 1} + } + }, + { /* Version 30 */ + .data_bytes = 2185, + .apat = {6, 26, 52, 78, 104, 130, 0}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 19}, + {.bs = 145, .dw = 115, .ns = 5}, + {.bs = 45, .dw = 15, .ns = 23}, + {.bs = 54, .dw = 24, .ns = 15} + } + }, + { /* Version 31 */ + .data_bytes = 2323, + .apat = {6, 30, 56, 82, 108, 134, 0}, 
+ .ecc = { + {.bs = 74, .dw = 46, .ns = 2}, + {.bs = 145, .dw = 115, .ns = 13}, + {.bs = 45, .dw = 15, .ns = 23}, + {.bs = 54, .dw = 24, .ns = 42} + } + }, + { /* Version 32 */ + .data_bytes = 2465, + .apat = {6, 34, 60, 86, 112, 138, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 10}, + {.bs = 145, .dw = 115, .ns = 17}, + {.bs = 45, .dw = 15, .ns = 19}, + {.bs = 54, .dw = 24, .ns = 10} + } + }, + { /* Version 33 */ + .data_bytes = 2611, + .apat = {6, 30, 58, 86, 114, 142, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 14}, + {.bs = 145, .dw = 115, .ns = 17}, + {.bs = 45, .dw = 15, .ns = 11}, + {.bs = 54, .dw = 24, .ns = 29} + } + }, + { /* Version 34 */ + .data_bytes = 2761, + .apat = {6, 34, 62, 90, 118, 146, 0}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 14}, + {.bs = 145, .dw = 115, .ns = 13}, + {.bs = 46, .dw = 16, .ns = 59}, + {.bs = 54, .dw = 24, .ns = 44} + } + }, + { /* Version 35 */ + .data_bytes = 2876, + .apat = {6, 30, 54, 78, 102, 126, 150}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 12}, + {.bs = 151, .dw = 121, .ns = 12}, + {.bs = 45, .dw = 15, .ns = 22}, + {.bs = 54, .dw = 24, .ns = 39} + } + }, + { /* Version 36 */ + .data_bytes = 3034, + .apat = {6, 24, 50, 76, 102, 128, 154}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 6}, + {.bs = 151, .dw = 121, .ns = 6}, + {.bs = 45, .dw = 15, .ns = 2}, + {.bs = 54, .dw = 24, .ns = 46} + } + }, + { /* Version 37 */ + .data_bytes = 3196, + .apat = {6, 28, 54, 80, 106, 132, 158}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 29}, + {.bs = 152, .dw = 122, .ns = 17}, + {.bs = 45, .dw = 15, .ns = 24}, + {.bs = 54, .dw = 24, .ns = 49} + } + }, + { /* Version 38 */ + .data_bytes = 3362, + .apat = {6, 32, 58, 84, 110, 136, 162}, + .ecc = { + {.bs = 74, .dw = 46, .ns = 13}, + {.bs = 152, .dw = 122, .ns = 4}, + {.bs = 45, .dw = 15, .ns = 42}, + {.bs = 54, .dw = 24, .ns = 48} + } + }, + { /* Version 39 */ + .data_bytes = 3532, + .apat = {6, 26, 54, 82, 110, 138, 166}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 40}, + {.bs = 147, .dw = 117, .ns = 20}, + {.bs = 45, .dw = 15, .ns = 10}, + {.bs = 54, .dw = 24, .ns = 43} + } + }, + { /* Version 40 */ + .data_bytes = 3706, + .apat = {6, 30, 58, 86, 114, 142, 170}, + .ecc = { + {.bs = 75, .dw = 47, .ns = 18}, + {.bs = 148, .dw = 118, .ns = 19}, + {.bs = 45, .dw = 15, .ns = 20}, + {.bs = 54, .dw = 24, .ns = 34} + } + } +}; diff --git a/CMakeLists.txt b/CMakeLists.txt index e54b77ec2f..46b64ab451 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -283,6 +283,7 @@ OCV_OPTION(WITH_IMGCODEC_HDR "Include HDR support" ON) OCV_OPTION(WITH_IMGCODEC_SUNRASTER "Include SUNRASTER support" ON) OCV_OPTION(WITH_IMGCODEC_PXM "Include PNM (PBM,PGM,PPM) and PAM formats support" ON) OCV_OPTION(WITH_IMGCODEC_PFM "Include PFM formats support" ON) +OCV_OPTION(WITH_QUIRC "Include library QR-code decoding" ON) # OpenCV build components # =================================================== @@ -696,6 +697,10 @@ if(WITH_OPENVX) include(cmake/FindOpenVX.cmake) endif() +if(WITH_QUIRC) + add_subdirectory(3rdparty/quirc) + set(HAVE_QUIRC TRUE) +endif() # ---------------------------------------------------------------------------- # OpenCV HAL # ---------------------------------------------------------------------------- diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index 6a60648359..373ac9b9c2 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -909,6 +909,13 @@ macro(_ocv_create_module) source_group("Src" FILES "${_VS_VERSION_FILE}") endif() endif() + if(WIN32 AND NOT ("${the_module}" STREQUAL "opencv_core" OR 
"${the_module}" STREQUAL "opencv_world") + AND (BUILD_SHARED_LIBS AND NOT "x${OPENCV_MODULE_TYPE}" STREQUAL "xSTATIC") + AND NOT OPENCV_SKIP_DLLMAIN_GENERATION + ) + set(_DLLMAIN_FILE "${CMAKE_CURRENT_BINARY_DIR}/${the_module}_main.cpp") + configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/dllmain.cpp.in" "${_DLLMAIN_FILE}" @ONLY) + endif() source_group("Include" FILES "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp") source_group("Src" FILES "${${the_module}_pch}") @@ -918,6 +925,7 @@ macro(_ocv_create_module) "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp" ${${the_module}_pch} ${_VS_VERSION_FILE} + ${_DLLMAIN_FILE} ) set_target_properties(${the_module} PROPERTIES LABELS "${OPENCV_MODULE_${the_module}_LABEL};Module") set_source_files_properties(${OPENCV_MODULE_${the_module}_HEADERS} ${OPENCV_MODULE_${the_module}_SOURCES} ${${the_module}_pch} diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in index 0f63651c38..a208f7ef74 100644 --- a/cmake/templates/cvconfig.h.in +++ b/cmake/templates/cvconfig.h.in @@ -244,5 +244,7 @@ /* OpenCV trace utilities */ #cmakedefine OPENCV_TRACE +/* Library QR-code decoding */ +#cmakedefine HAVE_QUIRC #endif // OPENCV_CVCONFIG_H_INCLUDED diff --git a/cmake/templates/dllmain.cpp.in b/cmake/templates/dllmain.cpp.in new file mode 100644 index 0000000000..6b3005f654 --- /dev/null +++ b/cmake/templates/dllmain.cpp.in @@ -0,0 +1,36 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef _WIN32 +#error "Build configuration error" +#endif +#ifndef CVAPI_EXPORTS +#error "Build configuration error" +#endif + +#define WIN32_LEAN_AND_MEAN +#include + +#define OPENCV_MODULE_S "@the_module@" + +namespace cv { +extern __declspec(dllimport) bool __termination; // Details: #12750 +} + +extern "C" +BOOL WINAPI DllMain(HINSTANCE, DWORD fdwReason, LPVOID lpReserved); + +extern "C" +BOOL WINAPI DllMain(HINSTANCE, DWORD fdwReason, LPVOID lpReserved) +{ + if (fdwReason == DLL_THREAD_DETACH || fdwReason == DLL_PROCESS_DETACH) + { + if (lpReserved != NULL) // called after ExitProcess() call + { + //printf("OpenCV: terminating: " OPENCV_MODULE_S "\n"); + cv::__termination = true; + } + } + return TRUE; +} diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 29c4f646ec..18bdf46f90 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -472,6 +472,9 @@ void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) { +#if CV_SSE4_1 + return v_uint16x8(_mm_packus_epi32(a.val, b.val)); +#else __m128i delta32 = _mm_set1_epi32(32768); // preliminary saturate negative values to zero @@ -480,34 +483,51 @@ inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32)); return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); +#endif } inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) { +#if CV_SSE4_1 + _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val)); +#else __m128i delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(a.val, delta32); __m128i r = 
_mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, r); +#endif } template inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) { +#if CV_SSE4_1 + __m128i delta = _mm_set1_epi32(1 << (n - 1)); + return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), + _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); +#else __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32); __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768)); return v_uint16x8(_mm_unpacklo_epi64(a2, b2)); +#endif } template inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) { +#if CV_SSE4_1 + __m128i delta = _mm_set1_epi32(1 << (n - 1)); + __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); + _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1)); +#else __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, a2); +#endif } inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index f0deb2d067..1dc06bc633 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -4776,6 +4776,10 @@ public: void deallocate_(UMatData* u) const { +#ifdef _WIN32 + if (cv::__termination) // process is not in consistent state (after ExitProcess call) and terminating + return; // avoid any OpenCL calls +#endif if(u->tempUMat()) { CV_Assert(u->origdata); diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp index 93c8c26cd0..d74e377494 100644 --- a/modules/core/src/parallel.cpp +++ b/modules/core/src/parallel.cpp @@ -447,7 +447,16 @@ static int numThreads = -1; #elif defined HAVE_HPX // nothing for HPX #elif defined HAVE_OPENMP -static int numThreadsMax = omp_get_max_threads(); +static inline int _initMaxThreads() +{ + int maxThreads = omp_get_max_threads(); + if (!utils::getConfigurationParameterBool("OPENCV_FOR_OPENMP_DYNAMIC_DISABLE", false)) + { + omp_set_dynamic(maxThreads); + } + return numThreads; +} +static int numThreadsMax = _initMaxThreads(); #elif defined HAVE_GCD // nothing for GCD #elif defined WINRT diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 796edb98ec..3aee8486b5 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -298,8 +298,9 @@ TLSData& getCoreTlsData(); #define CL_RUNTIME_EXPORT #endif -extern bool __termination; // skip some cleanups, because process is terminating - // (for example, if ExitProcess() was already called) +extern CV_EXPORTS +bool __termination; // skip some cleanups, because process is terminating + // (for example, if ExitProcess() was already called) cv::Mutex& getInitializationMutex(); diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 09078965ff..ab91855a15 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -43,6 +43,7 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" +#include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" 
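The hunks that follow replace the hand-written SSE2/SSE4.1 code in the PyrDown row filters with OpenCV's universal intrinsics (CV_SIMD, vx_load, v_pack, v_rshr_pack), so the same source now targets whatever SIMD width is enabled at build time. As a reading aid, the vector expression r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2) used below is the vertical 1-4-6-4-1 binomial kernel, r0 + r4 + 6*r2 + 4*(r1 + r3), and v_rshr_pack<8> performs the rounding shift by 8 that the removed code expressed as add-128-then-shift. A scalar sketch of what each output lane computes (editorial only, not part of the patch; the helper name is illustrative):

#include <opencv2/core.hpp>

// Scalar equivalent of one output row of PyrDownVec_32s8u: the input rows are
// already horizontally filtered (scaled by 16), so the vertical 1-4-6-4-1
// combine is followed by a rounding shift by 8 (total scale 16 * 16 = 256).
static void pyr_down_row_scalar(const int* row0, const int* row1, const int* row2,
                                const int* row3, const int* row4,
                                unsigned char* dst, int width)
{
    for (int x = 0; x < width; x++)
    {
        int t = row0[x] + row4[x] + 6 * row2[x] + 4 * (row1[x] + row3[x]);
        dst[x] = cv::saturate_cast<unsigned char>((t + 128) >> 8); // same rounding as v_rshr_pack<8>
    }
}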
@@ -73,69 +74,55 @@ template struct PyrUpNoVec int operator()(T1**, T2**, int, int) const { return 0; } }; -#if CV_SSE2 +#if CV_SIMD struct PyrDownVec_32s8u { int operator()(int** src, uchar* dst, int, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - __m128i delta = _mm_set1_epi16(128); - for( ; x <= width - 16; x += 16 ) + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes ) { - __m128i r0, r1, r2, r3, r4, t0, t1; - r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), - _mm_load_si128((const __m128i*)(row0 + x + 4))); - r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), - _mm_load_si128((const __m128i*)(row1 + x + 4))); - r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), - _mm_load_si128((const __m128i*)(row2 + x + 4))); - r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), - _mm_load_si128((const __m128i*)(row3 + x + 4))); - r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), - _mm_load_si128((const __m128i*)(row4 + x + 4))); - r0 = _mm_add_epi16(r0, r4); - r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); - r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); - t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); - r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)), - _mm_load_si128((const __m128i*)(row0 + x + 12))); - r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)), - _mm_load_si128((const __m128i*)(row1 + x + 12))); - r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)), - _mm_load_si128((const __m128i*)(row2 + x + 12))); - r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)), - _mm_load_si128((const __m128i*)(row3 + x + 12))); - r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)), - _mm_load_si128((const __m128i*)(row4 + x + 12))); - r0 = _mm_add_epi16(r0, r4); - r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); - r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); - t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); - t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8); - t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8); - _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1)); + v_uint16 r0, r1, r2, r3, r4, t0, t1; + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); + t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes))); + t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + v_store(dst + x, v_rshr_pack<8>(t0, t1)); } - - for( ; x <= width - 4; x += 4 ) + 
if (x <= width - v_int16::nlanes) { - __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128(); - r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z); - r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z); - r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z); - r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z); - r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z); - r0 = _mm_add_epi16(r0, r4); - r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); - r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); - r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); - r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8); - *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0)); + v_uint16 r0, r1, r2, r3, r4, t0; + r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes))); + r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes))); + r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes))); + r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes))); + r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes))); + t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + v_rshr_pack_store<8>(dst + x, t0); + x += v_uint16::nlanes; + } + for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) + { + v_int32x4 r0, r1, r2, r3, r4, t0; + r0 = v_load(row0 + x); + r1 = v_load(row1 + x); + r2 = v_load(row2 + x); + r3 = v_load(row3 + x); + r4 = v_load(row4 + x); + t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); + + *(int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); } return x; @@ -146,152 +133,105 @@ struct PyrDownVec_32f { int operator()(float** src, float* dst, int, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - int x = 0; const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - __m128 _4 = _mm_set1_ps(4.f), _scale = _mm_set1_ps(1.f/256); - for( ; x <= width - 8; x += 8 ) + + v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256); + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) { - __m128 r0, r1, r2, r3, r4, t0, t1; - r0 = _mm_load_ps(row0 + x); - r1 = _mm_load_ps(row1 + x); - r2 = _mm_load_ps(row2 + x); - r3 = _mm_load_ps(row3 + x); - r4 = _mm_load_ps(row4 + x); - r0 = _mm_add_ps(r0, r4); - r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2); - r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2)); - t0 = _mm_add_ps(r0, _mm_mul_ps(r1, _4)); - - r0 = _mm_load_ps(row0 + x + 4); - r1 = _mm_load_ps(row1 + x + 4); - r2 = _mm_load_ps(row2 + x + 4); - r3 = _mm_load_ps(row3 + x + 4); - r4 = _mm_load_ps(row4 + x + 4); - r0 = _mm_add_ps(r0, r4); - r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2); - r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2)); - t1 = _mm_add_ps(r0, _mm_mul_ps(r1, _4)); - - t0 = _mm_mul_ps(t0, _scale); - t1 = _mm_mul_ps(t1, _scale); - - _mm_storeu_ps(dst + x, t0); - _mm_storeu_ps(dst + x + 4, t1); + v_float32 r0, r1, r2, r3, r4; + r0 = vx_load(row0 + x); + r1 = vx_load(row1 + x); + r2 = vx_load(row2 + x); + r3 = vx_load(row3 + x); + r4 = vx_load(row4 + x); + v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale); } return x; } }; -#if CV_SSE4_1 +#if CV_SSE4_1 || CV_NEON struct PyrDownVec_32s16u { - PyrDownVec_32s16u() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); - } - int operator()(int** src, ushort* dst, int, int width) const { int x = 0; - - if 
(!haveSSE) - return x; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - __m128i v_delta = _mm_set1_epi32(128); - for( ; x <= width - 8; x += 8 ) + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) { - __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); - __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); - __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)), - v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); - __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)), - v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4)); - __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)), - v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4)); - - v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20)); - v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30); - - v_r10 = _mm_slli_epi32(v_r10, 2); - __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8); - - v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21)); - v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31); - v_r11 = _mm_slli_epi32(v_r11, 2); - __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1)); + v_int32 r00 = vx_load(row0 + x), + r01 = vx_load(row0 + x + v_int32::nlanes), + r10 = vx_load(row1 + x), + r11 = vx_load(row1 + x + v_int32::nlanes), + r20 = vx_load(row2 + x), + r21 = vx_load(row2 + x + v_int32::nlanes), + r30 = vx_load(row3 + x), + r31 = vx_load(row3 + x + v_int32::nlanes), + r40 = vx_load(row4 + x), + r41 = vx_load(row4 + x + v_int32::nlanes); + v_store(dst + x, v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), + r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); + } + if (x <= width - v_int32::nlanes) + { + v_int32 r00 = vx_load(row0 + x), + r10 = vx_load(row1 + x), + r20 = vx_load(row2 + x), + r30 = vx_load(row3 + x), + r40 = vx_load(row4 + x); + v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); + x += v_int32::nlanes; } return x; } - - bool haveSSE; }; #else typedef PyrDownNoVec PyrDownVec_32s16u; -#endif // CV_SSE4_1 +#endif struct PyrDownVec_32s16s { - PyrDownVec_32s16s() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - int operator()(int** src, short* dst, int, int width) const { int x = 0; - - if (!haveSSE) - return x; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - __m128i v_delta = _mm_set1_epi32(128); - for( ; x <= width - 8; x += 8 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) { - __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); - __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); - __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)), - v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); - __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)), - v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4)); - __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)), - v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4)); - - v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), 
_mm_add_epi32(v_r20, v_r20)); - v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30); - - v_r10 = _mm_slli_epi32(v_r10, 2); - __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8); - - v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21)); - v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31); - v_r11 = _mm_slli_epi32(v_r11, 2); - __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1)); + v_int32 r00 = vx_load(row0 + x), + r01 = vx_load(row0 + x + v_int32::nlanes), + r10 = vx_load(row1 + x), + r11 = vx_load(row1 + x + v_int32::nlanes), + r20 = vx_load(row2 + x), + r21 = vx_load(row2 + x + v_int32::nlanes), + r30 = vx_load(row3 + x), + r31 = vx_load(row3 + x + v_int32::nlanes), + r40 = vx_load(row4 + x), + r41 = vx_load(row4 + x + v_int32::nlanes); + v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2), + r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2))); + } + if (x <= width - v_int32::nlanes) + { + v_int32 r00 = vx_load(row0 + x), + r10 = vx_load(row1 + x), + r20 = vx_load(row2 + x), + r30 = vx_load(row3 + x), + r40 = vx_load(row4 + x); + v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2)); + x += v_int32::nlanes; } return x; } - - bool haveSSE; }; struct PyrUpVec_32s8u @@ -299,59 +239,40 @@ struct PyrUpVec_32s8u int operator()(int** src, uchar** dst, int, int width) const { int x = 0; - - if (!checkHardwareSupport(CV_CPU_SSE2)) - return x; - uchar *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128(); + const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - 16; x += 16 ) + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) { - __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)), - _mm_loadu_si128((__m128i const *)(row0 + x + 4))); - __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)), - _mm_loadu_si128((__m128i const *)(row1 + x + 4))); - __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)), - _mm_loadu_si128((__m128i const *)(row2 + x + 4))); - - __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); - __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); - __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); - - v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)), - _mm_loadu_si128((__m128i const *)(row0 + x + 12))); - v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)), - _mm_loadu_si128((__m128i const *)(row1 + x + 12))); - v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)), - _mm_loadu_si128((__m128i const *)(row2 + x + 12))); - - v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); - __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); - __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); - - _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6), - _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6), - 
_mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6))); + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), + v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), + v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)), + v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes)); + v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11); + v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11))); + v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); } - - for( ; x <= width - 8; x += 8 ) + if(x <= width - v_uint16::nlanes) { - __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)), - _mm_loadu_si128((__m128i const *)(row0 + x + 4))); - __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)), - _mm_loadu_si128((__m128i const *)(row1 + x + 4))); - __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)), - _mm_loadu_si128((__m128i const *)(row2 + x + 4))); - - __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); - __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); - __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); - - _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero)); - _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero)); + v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)), + v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)), + v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)); + v_int16 v_2r10 = v_r10 + v_r10; + v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10)); + v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); + x += v_uint16::nlanes; + } + for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) + { + v_int32 v_r00 = vx_load(row0 + x), + v_r10 = vx_load(row1 + x), + v_r20 = vx_load(row2 + x); + v_int32 v_2r10 = v_r10 + v_r10; + v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2); + *(int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); + *(int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0(); } return x; @@ -363,113 +284,63 @@ struct PyrUpVec_32s16s int operator()(int** src, short** dst, int, int width) const { int x = 0; - - if (!checkHardwareSupport(CV_CPU_SSE2)) - return x; - short *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128(); + const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - 8; x += 8 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); - __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); - __m128i v_dst00 = 
_mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); - v_2r1 = _mm_slli_epi32(v_r1, 1); - v_4r1 = _mm_slli_epi32(v_r1, 2); - __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - _mm_storeu_si128((__m128i *)(dst0 + x), - _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6), - _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), - _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6), - _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); + v_int32 v_r00 = vx_load(row0 + x), + v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r10 = vx_load(row1 + x), + v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r20 = vx_load(row2 + x), + v_r21 = vx_load(row2 + x + v_int32::nlanes); + v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); } - - for( ; x <= width - 4; x += 4 ) + if(x <= width - v_int32::nlanes) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); - __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); - - __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - _mm_storel_epi64((__m128i *)(dst0 + x), - _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero)); - _mm_storel_epi64((__m128i *)(dst1 + x), - _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero)); + v_int32 v_r00 = vx_load(row0 + x), + v_r10 = vx_load(row1 + x), + v_r20 = vx_load(row2 + x); + v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); + v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2); + x += v_int32::nlanes; } return x; } }; -#if CV_SSE4_1 +#if CV_SSE4_1 || CV_NEON struct PyrUpVec_32s16u { int operator()(int** src, ushort** dst, int, int width) const { int x = 0; - - if (!checkHardwareSupport(CV_CPU_SSE4_1)) - return x; - ushort *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128(); + const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - for( ; x <= width - 8; x += 8 ) + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); - __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); - __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); - v_2r1 = _mm_slli_epi32(v_r1, 1); - v_4r1 = _mm_slli_epi32(v_r1, 2); - __m128i v_dst01 
= _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - _mm_storeu_si128((__m128i *)(dst0 + x), - _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6), - _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); - _mm_storeu_si128((__m128i *)(dst1 + x), - _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6), - _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); + v_int32 v_r00 = vx_load(row0 + x), + v_r01 = vx_load(row0 + x + v_int32::nlanes), + v_r10 = vx_load(row1 + x), + v_r11 = vx_load(row1 + x + v_int32::nlanes), + v_r20 = vx_load(row2 + x), + v_r21 = vx_load(row2 + x + v_int32::nlanes); + v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2)))); + v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2)); } - - for( ; x <= width - 4; x += 4 ) + if(x <= width - v_int32::nlanes) { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), - v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), - v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); - __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); - - __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); - __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); - - _mm_storel_epi64((__m128i *)(dst0 + x), - _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero)); - _mm_storel_epi64((__m128i *)(dst1 + x), - _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero)); + v_int32 v_r00 = vx_load(row0 + x), + v_r10 = vx_load(row1 + x), + v_r20 = vx_load(row2 + x); + v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2))); + v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2); + x += v_int32::nlanes; } return x; @@ -487,347 +358,17 @@ struct PyrUpVec_32f int operator()(float** src, float** dst, int, int width) const { int x = 0; - - if (!checkHardwareSupport(CV_CPU_SSE2)) - return x; - const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; float *dst0 = dst[0], *dst1 = dst[1]; - __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f), - v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f)); - for( ; x <= width - 8; x += 8 ) + v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f); + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) { - __m128 v_r0 = _mm_loadu_ps(row0 + x); - __m128 v_r1 = _mm_loadu_ps(row1 + x); - __m128 v_r2 = _mm_loadu_ps(row2 + x); - - _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2))); - _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2))); - - v_r0 = _mm_loadu_ps(row0 + x + 4); - v_r1 = _mm_loadu_ps(row1 + x + 4); - v_r2 = _mm_loadu_ps(row2 + x + 4); - - _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2))); - _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2))); - } - - return x; - } -}; - -#elif CV_NEON - -struct PyrDownVec_32s8u -{ - int operator()(int** src, uchar* dst, int, int width) const - { - int x = 0; - const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1], - *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3], - *row4 = (unsigned int*)src[4]; - uint16x8_t v_delta = vdupq_n_u16(128); - - for( ; x <= width - 16; x += 16 ) - 
{ - uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4))); - uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4))); - uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4))); - uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4))); - uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4))); - - v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2)); - v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3); - uint16x8_t v_dst0 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2)); - - v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12))); - v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12))); - v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12))); - v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12))); - v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12))); - - v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2)); - v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3); - uint16x8_t v_dst1 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2)); - - vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)), - vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8)))); - } - - return x; - } -}; - -struct PyrDownVec_32s16u -{ - int operator()(int** src, ushort* dst, int, int width) const - { - int x = 0; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - int32x4_t v_delta = vdupq_n_s32(128); - - for( ; x <= width - 8; x += 8 ) - { - int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4); - int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4); - int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4); - int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4); - int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4); - - v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20)); - v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30); - - v_r10 = vshlq_n_s32(v_r10, 2); - int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8); - - v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21)); - v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31); - v_r11 = vshlq_n_s32(v_r11, 2); - int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8); - - vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1))); - } - - return x; - } -}; - -struct PyrDownVec_32s16s -{ - int operator()(int** src, short* dst, int, int width) const - { - int x = 0; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - int32x4_t v_delta = vdupq_n_s32(128); - - for( ; x <= width - 8; x += 8 ) - { - int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4); - int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4); - int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4); - int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4); - int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4); - - v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20)); - v_r10 = vaddq_s32(vaddq_s32(v_r10, 
v_r20), v_r30); - v_r10 = vshlq_n_s32(v_r10, 2); - int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8); - - v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21)); - v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31); - v_r11 = vshlq_n_s32(v_r11, 2); - int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1))); - } - - return x; - } -}; - -struct PyrDownVec_32f -{ - int operator()(float** src, float* dst, int, int width) const - { - int x = 0; - const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; - float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_r0 = vld1q_f32(row0 + x); - float32x4_t v_r1 = vld1q_f32(row1 + x); - float32x4_t v_r2 = vld1q_f32(row2 + x); - float32x4_t v_r3 = vld1q_f32(row3 + x); - float32x4_t v_r4 = vld1q_f32(row4 + x); - - v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2)); - v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3); - vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale)); - - v_r0 = vld1q_f32(row0 + x + 4); - v_r1 = vld1q_f32(row1 + x + 4); - v_r2 = vld1q_f32(row2 + x + 4); - v_r3 = vld1q_f32(row3 + x + 4); - v_r4 = vld1q_f32(row4 + x + 4); - - v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2)); - v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3); - vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale)); - } - - return x; - } -}; - -struct PyrUpVec_32s8u -{ - int operator()(int** src, uchar** dst, int, int width) const - { - int x = 0; - uchar *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - uint16x8_t v_delta = vdupq_n_u16(32); - - for( ; x <= width - 16; x += 16 ) - { - uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4))); - uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4))); - uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4))); - - uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1); - uint16x8_t v_dst00 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1)); - uint16x8_t v_dst10 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2); - - v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12))); - v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12))); - v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12))); - - v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1); - uint16x8_t v_dst01 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1)); - uint16x8_t v_dst11 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2); - - vst1q_u8(dst0 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst00, v_delta), 6)), - vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst01, v_delta), 6)))); - vst1q_u8(dst1 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst10, v_delta), 6)), - vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst11, v_delta), 6)))); - } - - for( ; x <= width - 8; x += 8 ) - { - uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4))); - uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4))); - uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + 
x)), vqmovn_u32(vld1q_u32(row2 + x + 4))); - - uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1); - uint16x8_t v_dst0 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1)); - uint16x8_t v_dst1 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2); - - vst1_u8(dst0 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 6))); - vst1_u8(dst1 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 6))); - } - - return x; - } -}; - -struct PyrUpVec_32s16u -{ - int operator()(int** src, ushort** dst, int, int width) const - { - int x = 0; - ushort *dst0 = dst[0], *dst1 = dst[1]; - const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; - uint32x4_t v_delta = vdupq_n_u32(32); - - for( ; x <= width - 8; x += 8 ) - { - uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x); - uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2); - uint32x4_t v_dst00 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1)); - uint32x4_t v_dst10 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2); - - v_r0 = vld1q_u32(row0 + x + 4); - v_r1 = vld1q_u32(row1 + x + 4); - v_r2 = vld1q_u32(row2 + x + 4); - v_2r1 = vshlq_n_u32(v_r1, 1); - v_4r1 = vshlq_n_u32(v_r1, 2); - uint32x4_t v_dst01 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1)); - uint32x4_t v_dst11 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2); - - vst1q_u16(dst0 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst00, v_delta), 6)), - vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst01, v_delta), 6)))); - vst1q_u16(dst1 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst10, v_delta), 6)), - vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst11, v_delta), 6)))); - } - - for( ; x <= width - 4; x += 4 ) - { - uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x); - uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2); - - uint32x4_t v_dst0 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1)); - uint32x4_t v_dst1 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2); - - vst1_u16(dst0 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0, v_delta), 6))); - vst1_u16(dst1 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1, v_delta), 6))); - } - - return x; - } -}; - -struct PyrUpVec_32s16s -{ - int operator()(int** src, short** dst, int, int width) const - { - int x = 0; - short *dst0 = dst[0], *dst1 = dst[1]; - const int *row0 = src[0], *row1 = src[1], *row2 = src[2]; - int32x4_t v_delta = vdupq_n_s32(32); - - for( ; x <= width - 8; x += 8 ) - { - int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x); - int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2); - int32x4_t v_dst00 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1)); - int32x4_t v_dst10 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2); - - v_r0 = vld1q_s32(row0 + x + 4); - v_r1 = vld1q_s32(row1 + x + 4); - v_r2 = vld1q_s32(row2 + x + 4); - v_2r1 = vshlq_n_s32(v_r1, 1); - v_4r1 = vshlq_n_s32(v_r1, 2); - int32x4_t v_dst01 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1)); - int32x4_t v_dst11 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2); - - vst1q_s16(dst0 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst00, v_delta), 6)), - vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst01, v_delta), 6)))); - vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst10, v_delta), 6)), - vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst11, v_delta), 6)))); - } - - for( ; x <= width - 4; x += 4 ) - { - int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 
+ x), v_r2 = vld1q_s32(row2 + x); - int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2); - - int32x4_t v_dst0 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1)); - int32x4_t v_dst1 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2); - - vst1_s16(dst0 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst0, v_delta), 6))); - vst1_s16(dst1 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst1, v_delta), 6))); - } - - return x; - } -}; - -struct PyrUpVec_32f -{ - int operator()(float** src, float** dst, int, int width) const - { - int x = 0; - const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; - float *dst0 = dst[0], *dst1 = dst[1]; - float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_r0 = vld1q_f32(row0 + x); - float32x4_t v_r1 = vld1q_f32(row1 + x); - float32x4_t v_r2 = vld1q_f32(row2 + x); - - vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2))); - vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2))); - - v_r0 = vld1q_f32(row0 + x + 4); - v_r1 = vld1q_f32(row1 + x + 4); - v_r2 = vld1q_f32(row2 + x + 4); - - vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2))); - vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2))); + v_float32 v_r0 = vx_load(row0 + x), + v_r1 = vx_load(row1 + x), + v_r2 = vx_load(row2 + x); + v_store(dst1 + x, v_scale4 * (v_r1 + v_r2)); + v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2)); } return x; diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 683e4dee5c..5436a78ab5 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -1181,583 +1181,34 @@ struct HResizeNoVec const uchar*, int, int, int, int, int) const { return 0; } }; -#if CV_SSE2 +#if CV_SIMD struct VResizeLinearVec_32s8u { int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - const int** src = (const int**)_src; const short* beta = (const short*)_beta; const int *S0 = src[0], *S1 = src[1]; int x = 0; - __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); - __m128i delta = _mm_set1_epi16(2); + v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]); - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } + if( 
(((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x ) >> 4, vx_load_aligned(S0 + x + v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load_aligned(S1 + x ) >> 4, vx_load_aligned(S1 + x + v_int32::nlanes) >> 4), b1), + v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); else - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } - - for( ; x < width - 4; x += 4 ) - { - __m128i x0, y0; - x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); - y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); - x0 = _mm_packs_epi32(x0, x0); - y0 = _mm_packs_epi32(y0, y0); - x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x0 = _mm_packus_epi16(x0, x0); - *(int*)(dst + x) = _mm_cvtsi128_si32(x0); - } - - return x; - } -}; - - -template struct VResizeLinearVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - ushort* dst = (ushort*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_load_ps(S0 + x + 8); - x1 = _mm_load_ps(S0 + x + 12); - y0 = _mm_load_ps(S1 + x + 8); - y1 = _mm_load_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = 
_mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_loadu_ps(S0 + x + 8); - x1 = _mm_loadu_ps(S0 + x + 12); - y0 = _mm_loadu_ps(S1 + x + 8); - y1 = _mm_loadu_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - - for( ; x < width - 4; x += 4 ) - { - __m128 x0, y0; - __m128i t0; - x0 = _mm_loadu_ps(S0 + x); - y0 = _mm_loadu_ps(S1 + x); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); - _mm_storel_epi64( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeLinearVec_32f16 VResizeLinearVec_32f16u; -typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; - -struct VResizeLinearVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - float* dst = (float*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - - return x; - } -}; - - -struct VResizeCubicVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - int x = 0; - float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); - __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), - b2 = _mm_set1_ps(beta[2]*scale), b3 = 
_mm_set1_ps(beta[3]*scale); - - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_load_si128((const __m128i*)(S2 + x)); - x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S3 + x)); - y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - - return x; - } -}; - - -template struct VResizeCubicVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - ushort* dst = (ushort*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - __m128i t0, t1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - 
y0 = _mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); - t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); - - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); - _mm_storeu_si128( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeCubicVec_32f16 VResizeCubicVec_32f16u; -typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; - -struct VResizeCubicVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - float* dst = (float*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - y0 = _mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - _mm_storeu_ps( dst + x, s0); - _mm_storeu_ps( dst + x + 4, s1); - } - - return x; - } -}; - -#if CV_TRY_SSE4_1 - -struct VResizeLanczos4Vec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); - else return 0; - } -}; - -#else - -typedef VResizeNoVec VResizeLanczos4Vec_32f16u; - -#endif - -struct VResizeLanczos4Vec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - __m128 v_dst1 = _mm_mul_ps(v_b0, 
_mm_loadu_ps(S0 + x + 4)); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); - - __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); - __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1)); - } - - return x; - } -}; - - -struct VResizeLanczos4Vec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - float* dst = (float*)_dst; - int x = 0; - - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 4; x += 4 ) - { - __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - _mm_storeu_ps(dst + x, v_dst); - } - - return x; - } -}; - - -#elif CV_NEON - -struct VResizeLinearVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; - const short* beta = (const short*)_beta; - int x = 0; - int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); - - for( ; x <= width - 16; x += 16) - { - int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); - int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); - - int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); - - v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); - v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); - v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); - v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); - - v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); - - vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), 
vqmovun_s16(v_dst1))); - } + for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) + v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x ) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load(S1 + x ) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1), + v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1))); + + for( ; x < width - v_int16::nlanes; x += v_int16::nlanes) + v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) + + v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1)); return x; } @@ -1773,18 +1224,20 @@ struct VResizeLinearVec_32f16u ushort* dst = (ushort*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - for( ; x <= width - 8; x += 8 ) + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), + v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + else + for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); + for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_store_low(dst + x, v_pack_u(t0, t0)); } return x; @@ -1801,18 +1254,20 @@ struct VResizeLinearVec_32f16s short* dst = (short*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - for( ; x <= width - 8; x += 8 ) + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x ), b0, vx_load_aligned(S1 + x ) * b1)), + v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1)))); + else + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, vx_load(S1 + x ) * b1)), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1)))); + for( ; x < width - v_float32::nlanes; x += v_float32::nlanes) { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), 
v_src11, v_b1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); + v_store_low(dst + x, v_pack(t0, t0)); } return x; @@ -1829,22 +1284,56 @@ struct VResizeLinearVec_32f float* dst = (float*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); - vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); - } + if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1)); + else + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1)); return x; } }; -typedef VResizeNoVec VResizeCubicVec_32s8u; + +struct VResizeCubicVec_32s8u +{ + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const + { + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int x = 0; + float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); + + v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale), + b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 ) + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x )), b0, + v_muladd(v_cvt_f32(vx_load_aligned(S1 + x )), b1, + v_muladd(v_cvt_f32(vx_load_aligned(S2 + x )), b2, + v_cvt_f32(vx_load_aligned(S3 + x )) * b3)))), + v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)), b0, + v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)), b1, + v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)), b2, + v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3)))))); + else + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x )), b0, + v_muladd(v_cvt_f32(vx_load(S1 + x )), b1, + v_muladd(v_cvt_f32(vx_load(S2 + x )), b2, + v_cvt_f32(vx_load(S3 + x )) * b3)))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)), b0, + v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)), b1, + v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)), b2, + v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3)))))); + return x; + } +}; struct VResizeCubicVec_32f16u { @@ -1855,23 +1344,18 @@ struct VResizeCubicVec_32f16u const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; ushort* dst = (ushort*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = 
vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } + for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + vx_load(S3 + x ) * b3)))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + vx_load(S3 + x + v_float32::nlanes) * b3)))))); return x; } @@ -1886,23 +1370,18 @@ struct VResizeCubicVec_32f16s const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; short* dst = (short*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + vx_load(S3 + x ) * b3)))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + vx_load(S3 + x + v_float32::nlanes) * b3)))))); return x; } @@ -1917,25 +1396,33 @@ struct VResizeCubicVec_32f const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; float* dst = (float*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]); - for( ; x <= width - 8; x += 8 ) - { - vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x))); - vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4))); - } + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, + v_muladd(vx_load(S1 + x), b1, + v_muladd(vx_load(S2 + x), b2, + vx_load(S3 + x) * b3)))); return x; } }; + +#if CV_TRY_SSE4_1 + +struct VResizeLanczos4Vec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if (CV_CPU_HAS_SUPPORT_SSE4_1) return 
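The CV_TRY_SSE4_1 wrapper above keeps the dispatch contract of these functors: the call operator returns how many elements it processed, and returning 0 sends the caller to its scalar loop. A self-contained sketch of that idiom with placeholder names (checkFast/fastCopy stand in for CV_CPU_HAS_SUPPORT_SSE4_1 and the opt_SSE4_1 implementation; they are not OpenCV symbols):

```cpp
static bool checkFast() { return false; }                 // pretend the CPU flag is absent
static int  fastCopy(const float*, float*, int) { return 0; }

struct VCopy
{
    int operator()(const float* src, float* dst, int width) const
    {
        if (checkFast())
            return fastCopy(src, dst, width);             // optimized path handles the row
        return 0;                                         // nothing handled: caller runs scalar code
    }
};

static void copyRow(const float* src, float* dst, int width)
{
    int x = VCopy()(src, dst, width);                     // vector part (possibly 0 elements)
    for (; x < width; x++)                                // scalar tail / full fallback
        dst[x] = src[x];
}
```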
opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); + else return 0; + } +}; + +#else + struct VResizeLanczos4Vec_32f16u { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const @@ -1946,41 +1433,35 @@ struct VResizeLanczos4Vec_32f16u *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; ushort * dst = (ushort*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]), + b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), + b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } + for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes) + v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + v_muladd(vx_load(S3 + x ), b3, + v_muladd(vx_load(S4 + x ), b4, + v_muladd(vx_load(S5 + x ), b5, + v_muladd(vx_load(S6 + x ), b6, + vx_load(S7 + x ) * b7)))))))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, + v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, + v_muladd(vx_load(S5 + x + v_float32::nlanes), b5, + v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, + vx_load(S7 + x + v_float32::nlanes) * b7)))))))))); return x; } }; +#endif + struct VResizeLanczos4Vec_32f16s { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const @@ -1991,36 +1472,28 @@ struct VResizeLanczos4Vec_32f16s *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; short * dst = (short*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]), + b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), + b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, 
vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes) + v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x ), b0, + v_muladd(vx_load(S1 + x ), b1, + v_muladd(vx_load(S2 + x ), b2, + v_muladd(vx_load(S3 + x ), b3, + v_muladd(vx_load(S4 + x ), b4, + v_muladd(vx_load(S5 + x ), b5, + v_muladd(vx_load(S6 + x ), b6, + vx_load(S7 + x ) * b7)))))))), + v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, + v_muladd(vx_load(S1 + x + v_float32::nlanes), b1, + v_muladd(vx_load(S2 + x + v_float32::nlanes), b2, + v_muladd(vx_load(S3 + x + v_float32::nlanes), b3, + v_muladd(vx_load(S4 + x + v_float32::nlanes), b4, + v_muladd(vx_load(S5 + x + v_float32::nlanes), b5, + v_muladd(vx_load(S6 + x + v_float32::nlanes), b6, + vx_load(S7 + x + v_float32::nlanes) * b7)))))))))); return x; } @@ -2036,23 +1509,21 @@ struct VResizeLanczos4Vec_32f *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; float* dst = (float*)_dst; int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); - for( ; x <= width - 4; x += 4 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); - } + v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]), + b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]), + b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]), + b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]); + + for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes) + v_store(dst + x, v_muladd(vx_load(S0 + x), b0, + v_muladd(vx_load(S1 + x), b1, + v_muladd(vx_load(S2 + x), b2, + v_muladd(vx_load(S3 + x), b3, + v_muladd(vx_load(S4 + x), b4, + v_muladd(vx_load(S5 + x), b5, + v_muladd(vx_load(S6 + x), b6, + vx_load(S7 + x) * b7)))))))); return x; } @@ -2695,95 +2166,94 @@ private: int step; }; -#elif CV_SSE2 +#elif CV_SIMD class ResizeAreaFastVec_SIMD_8u { public: ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } + cn(_cn), step(_step) {} int operator() (const uchar* S, uchar* D, int w) const { - if (!use_simd) - return 0; - int dx = 0; const uchar* S0 = S; const uchar* S1 = S0 + step; - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi16(2); if (cn == 1) 
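ResizeAreaFastVec_SIMD_8u above now builds the 2x downscale from universal intrinsics; per output pixel the result is still the rounded mean of a 2x2 block, which is exactly what v_rshr_pack<2>/v_rshr_pack_store<2> produce after the row sums. The scalar reference (illustrative helper, not part of the patch):

```cpp
// One output pixel of the 2x area-fast kernel: (a + b + c + d + 2) >> 2.
static void area2x2_u8_c1(const unsigned char* S0, const unsigned char* S1,
                          unsigned char* D, int w)
{
    for (int dx = 0; dx < w; dx++, S0 += 2, S1 += 2)
        D[dx] = (unsigned char)((S0[0] + S0[1] + S1[0] + S1[1] + 2) >> 2);
}
```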
{ - __m128i masklow = _mm_set1_epi16(0x00ff); - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + v_uint16 masklow = vx_setall_u16(0x00ff); + for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); - __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - - _mm_storel_epi64((__m128i*)D, s0); + v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0)); + v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1)); + v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow)); } } else if (cn == 3) - for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) + { + if (CV_SIMD_WIDTH > 64) + return 0; + for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); - __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)(D+3), s0); + v_uint16 t0, t1, t2, t3, t4, t5; + v_uint16 s0, s1, s2, s3, s4, s5; + s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + s1 = vx_load_expand(S0 + v_uint16::nlanes) + vx_load_expand(S1 + v_uint16::nlanes); + s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes); + s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes); + s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes); + s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_uint16 bl, gl, rl; +#if CV_SIMD_WIDTH == 16 + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#elif CV_SIMD_WIDTH == 32 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; +#elif CV_SIMD_WIDTH == 64 + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#endif + s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes); + s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes); + s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes); + s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes); + s4 = 
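The single-channel path above relies on a reinterpret trick: viewing the packed 8-bit pixels as 16-bit lanes makes (r >> 8) the second pixel of each pair and (r & 0x00ff) the first, so one shift, mask and add replaces the old unpack sequence. A standalone sketch of that horizontal pair-sum, assuming a CV_SIMD-enabled OpenCV build (function name and tail loop are illustrative):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// dst[i] = src[2*i] + src[2*i + 1] for npairs adjacent pixel pairs.
static void hpair_sum_u8(const unsigned char* src, unsigned short* dst, int npairs)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD
    v_uint16 masklow = vx_setall_u16(0x00ff);
    for (; i <= npairs - v_uint16::nlanes; i += v_uint16::nlanes, src += v_uint8::nlanes)
    {
        v_uint16 r = v_reinterpret_as_u16(vx_load(src));
        v_store(dst + i, (r >> 8) + (r & masklow));   // high byte + low byte of each lane
    }
#endif
    for (; i < npairs; i++, src += 2)
        dst[i] = (unsigned short)(src[0] + src[1]);
}
```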
vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes); + s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_uint16 bh, gh, rh; +#if CV_SIMD_WIDTH == 16 + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#elif CV_SIMD_WIDTH == 32 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; +#elif CV_SIMD_WIDTH == 64 + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#endif + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); } + } else { CV_Assert(cn == 4); - int v[] = { 0, 0, -1, -1 }; - __m128i mask = _mm_loadu_si128((const __m128i*)v); - - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + v_uint32 r00, r01, r10, r11; + v_load_deinterleave((uint32_t*)S0, r00, r01); + v_load_deinterleave((uint32_t*)S1, r10, r11); - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpackhi_epi8(r0, zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i r1_16h = _mm_unpackhi_epi8(r1, zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); - __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res0 = _mm_srli_epi16(s0, 2); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res1 = _mm_srli_epi16(s0, 2); - s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), - _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); - _mm_storel_epi64((__m128i*)(D), s0); + v_uint16 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h; + v_expand(v_reinterpret_as_u8(r00), r00l, r00h); + v_expand(v_reinterpret_as_u8(r01), r01l, r01h); + v_expand(v_reinterpret_as_u8(r10), r10l, r10h); + v_expand(v_reinterpret_as_u8(r11), r11l, r11h); + v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); } } @@ -2792,7 +2262,6 @@ public: private: int cn; - bool use_simd; int step; }; @@ -2800,164 +2269,258 @@ class ResizeAreaFastVec_SIMD_16u { public: ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } + cn(_cn), step(_step) {} int operator() (const ushort* S, ushort* D, int w) const { - if (!use_simd) - return 0; - int dx = 0; const ushort* S0 = (const ushort*)S; const ushort* S1 = (const ushort*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); - -#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero) if (cn == 1) { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + v_uint32 masklow = vx_setall_u32(0x0000ffff); + for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) { - __m128i r0 = 
_mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); - __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srli_epi32(s0, 2); - s0 = _mm_packus_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); + v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0)); + v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1)); + v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow)); } } else if (cn == 3) + { +#if CV_SIMD_WIDTH == 16 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) +#if CV_SSE4_1 { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_uint32 r0, r1, r2, r3; + v_expand(vx_load(S0), r0, r1); + v_expand(vx_load(S1), r2, r3); + r0 += r2; r1 += r3; + v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0)); } +#else + v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); +#endif +#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 + for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + { + v_uint32 t0, t1, t2, t3, t4, t5; + v_uint32 s0, s1, s2, s3, s4, s5; + s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + s1 = vx_load_expand(S0 + v_uint32::nlanes) + vx_load_expand(S1 + v_uint32::nlanes); + s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes); + s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + vx_load_expand(S1 + 3*v_uint32::nlanes); + s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes); + s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_uint32 bl, gl, rl; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; +#endif + s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes); + s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes); + s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes); + s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes); + s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes); + s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_uint32 bh, gh, rh; + v_zip(s0, s3, t0, 
t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; +#endif + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); + } +#elif CV_SIMD_WIDTH >= 64 + v_uint32 masklow = vx_setall_u32(0x0000ffff); + for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + { + v_uint16 b0, g0, r0, b1, g1, r1; + v_load_deinterleave(S0, b0, g0, r0); + v_load_deinterleave(S1, b1, g1, r1); + v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); + v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); + v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow); + v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0); + v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1); + v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); + v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); + v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow); + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); + } +#endif + } else { CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) +#if CV_SIMD_WIDTH >= 64 + for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + v_uint64 r00, r01, r10, r11; + v_load_deinterleave((uint64_t*)S0, r00, r01); + v_load_deinterleave((uint64_t*)S1, r10, r11); - __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); - __m128i r1_32l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_32h = _mm_unpackhi_epi16(r1, zero); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_uint32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h; + v_expand(v_reinterpret_as_u16(r00), r00l, r00h); + v_expand(v_reinterpret_as_u16(r01), r01l, r01h); + v_expand(v_reinterpret_as_u16(r10), r10l, r10h); + v_expand(v_reinterpret_as_u16(r11), r11l, r11h); + v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); } +#else + for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes) + { + v_uint32 r0, r1, r2, r3; + v_expand(vx_load(S0), r0, r1); + v_expand(vx_load(S1), r2, r3); + r0 += r2; r1 += r3; + v_uint32 v_d; +#if CV_SIMD_WIDTH == 16 + v_d = r0 + r1; +#elif CV_SIMD_WIDTH == 32 + v_uint32 t0, t1; + 
v_recombine(r0, r1, t0, t1); + v_d = t0 + t1; +#endif + v_rshr_pack_store<2>(D, v_d); + } +#endif } -#undef _mm_packus_epi32 - return dx; } private: int cn; int step; - bool use_simd; }; class ResizeAreaFastVec_SIMD_16s { public: ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } + cn(_cn), step(_step) {} int operator() (const short* S, short* D, int w) const { - if (!use_simd) - return 0; - int dx = 0; const short* S0 = (const short*)S; const short* S1 = (const short*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); if (cn == 1) { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + v_int32 masklow = vx_setall_s32(0x0000ffff); + for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); - __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srai_epi32(s0, 2); - s0 = _mm_packs_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); + v_int32 r0 = v_reinterpret_as_s32(vx_load(S0)); + v_int32 r1 = v_reinterpret_as_s32(vx_load(S1)); + v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16)); } } else if (cn == 3) + { +#if CV_SIMD_WIDTH == 16 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) + v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); +#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 + for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); - __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_int32 t0, t1, t2, t3, t4, t5; + v_int32 s0, s1, s2, s3, s4, s5; + s0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + s1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); + s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); + s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); + s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes); + s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_int32 bl, gl, rl; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 
+ bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; +#endif + s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes); + s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes); + s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes); + s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes); + s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes); + s5 = vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes); + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + v_int32 bh, gh, rh; + v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); +#if CV_SIMD_WIDTH == 32 + bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; +#else //CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); + bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; +#endif + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); } +#elif CV_SIMD_WIDTH >= 64 + for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) + { + v_int16 b0, g0, r0, b1, g1, r1; + v_load_deinterleave(S0, b0, g0, r0); + v_load_deinterleave(S1, b1, g1, r1); + v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); + v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); + v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); + v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0); + v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1); + v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); + v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); + v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); + v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); + } +#endif + } else { CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes) { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); +#if CV_SIMD_WIDTH >= 64 + v_int64 r00, r01, r10, r11; + v_load_deinterleave((int64_t*)S0, r00, r01); + v_load_deinterleave((int64_t*)S1, r10, r11); - __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); - __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, 
r1), 16); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); + v_int32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h; + v_expand(v_reinterpret_as_s16(r00), r00l, r00h); + v_expand(v_reinterpret_as_s16(r01), r01l, r01h); + v_expand(v_reinterpret_as_s16(r10), r10l, r10h); + v_expand(v_reinterpret_as_s16(r11), r11l, r11h); + v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h)); +#else + v_int32 r0, r1, r2, r3; + r0 = vx_load_expand(S0 ) + vx_load_expand(S1 ); + r1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes); + r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes); + r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes); + v_int32 dl, dh; +#if CV_SIMD_WIDTH == 16 + dl = r0 + r1; dh = r2 + r3; +#elif CV_SIMD_WIDTH == 32 + v_int32 t0, t1, t2, t3; + v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3); + dl = t0 + t1; dh = t2 + t3; +#endif + v_store(D, v_rshr_pack<2>(dl, dh)); +#endif } } @@ -2967,7 +2530,6 @@ public: private: int cn; int step; - bool use_simd; }; struct ResizeAreaFastVec_SIMD_32f @@ -2976,7 +2538,6 @@ struct ResizeAreaFastVec_SIMD_32f cn(_cn), step(_step) { fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); - fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); } int operator() (const float * S, float * D, int w) const @@ -2987,33 +2548,32 @@ struct ResizeAreaFastVec_SIMD_32f const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); int dx = 0; - __m128 v_025 = _mm_set1_ps(0.25f); - if (cn == 1) { - const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + v_float32 v_025 = vx_setall_f32(0.25f); + for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) { - __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), - v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); - - __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), - _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); - __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), - _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); - - _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + v_float32 v_row00, v_row01, v_row10, v_row11; + v_load_deinterleave(S0, v_row00, v_row01); + v_load_deinterleave(S1, v_row10, v_row11); + v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025); } } else if (cn == 4) { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) +#if CV_SIMD_WIDTH == 16 + v_float32 v_025 = vx_setall_f32(0.25f); + for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) + v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025); +#elif CV_SIMD256 + v_float32x8 v_025 = v256_setall_f32(0.25f); + for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes) { - __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); - __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); - - 
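For floats, the cn == 1 path above uses v_load_deinterleave as an even/odd column split before averaging the 2x2 block with a 0.25 factor. A minimal sketch under the same assumption of a CV_SIMD-enabled build (helper name is illustrative):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

static void area2x2_f32_c1(const float* S0, const float* S1, float* D, int w)
{
    using namespace cv;
    int dx = 0;
#if CV_SIMD
    v_float32 quarter = vx_setall_f32(0.25f);
    for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes)
    {
        v_float32 e0, o0, e1, o1;
        v_load_deinterleave(S0 + 2*dx, e0, o0);   // even/odd columns of the top row
        v_load_deinterleave(S1 + 2*dx, e1, o1);   // even/odd columns of the bottom row
        v_store(D + dx, ((e0 + o0) + (e1 + o1)) * quarter);
    }
#endif
    for (; dx < w; dx++)
        D[dx] = (S0[2*dx] + S0[2*dx + 1] + S1[2*dx] + S1[2*dx + 1]) * 0.25f;
}
```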
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + v_float32x8 dst0, dst1; + v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1); + v_store(D, (dst0 + dst1) * v_025); } +#endif } return dx; diff --git a/modules/objdetect/CMakeLists.txt b/modules/objdetect/CMakeLists.txt index 4e330af6c0..a51740c280 100644 --- a/modules/objdetect/CMakeLists.txt +++ b/modules/objdetect/CMakeLists.txt @@ -1,2 +1,8 @@ set(the_description "Object Detection") -ocv_define_module(objdetect opencv_core opencv_imgproc WRAP java python js) +ocv_define_module(objdetect opencv_core opencv_imgproc opencv_calib3d WRAP java python js) + +if(HAVE_QUIRC) + get_property(QUIRC_INCLUDE GLOBAL PROPERTY QUIRC_INCLUDE_DIR) + ocv_include_directories(${QUIRC_INCLUDE}) + ocv_target_link_libraries(${PROJECT_NAME} quirc) +endif() diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp index 3ccdfe60a3..34f58cdf79 100644 --- a/modules/objdetect/include/opencv2/objdetect.hpp +++ b/modules/objdetect/include/opencv2/objdetect.hpp @@ -690,6 +690,13 @@ protected: */ CV_EXPORTS bool detectQRCode(InputArray in, std::vector &points, double eps_x = 0.2, double eps_y = 0.1); +/** @brief Decode a QR code in an image and return the text encoded in it. + @param in Matrix of the type CV_8UC1 containing an image where the QR code is detected. + @param points Input vector of vertices of a quadrangle of minimal area that describes the QR code. + @param decoded_info String information that is encoded in the QR code. + @param straight_qrcode Matrix of the type CV_8UC1 containing a binary straight QR code. + */ +CV_EXPORTS bool decodeQRCode(InputArray in, InputArray points, std::string &decoded_info, OutputArray straight_qrcode = noArray()); //!
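A caller-side example of the new API declared above, combining the existing detectQRCode() with decodeQRCode(). Sketch only: the image path is a placeholder, and the corner container is assumed to be std::vector<cv::Point2f>, the point type used by the detector.

```cpp
#include <opencv2/objdetect.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    cv::Mat src = cv::imread("image.jpg", cv::IMREAD_GRAYSCALE);   // placeholder path
    if (src.empty()) return 1;

    std::vector<cv::Point2f> corners;
    if (!cv::detectQRCode(src, corners))        // locate the quadrangle first
        return 2;

    std::string decoded_info;
    cv::Mat straight;                           // optional rectified binary code
    if (cv::decodeQRCode(src, corners, decoded_info, straight))
        std::cout << decoded_info << std::endl;
    return 0;
}
```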
@} objdetect } diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp index 5633c31037..aa269d9e4a 100644 --- a/modules/objdetect/src/qrcode.cpp +++ b/modules/objdetect/src/qrcode.cpp @@ -7,10 +7,16 @@ #include "precomp.hpp" #include "opencv2/objdetect.hpp" +#include "opencv2/calib3d.hpp" + +#ifdef HAVE_QUIRC +#include "quirc.h" +#endif #include #include #include +#include namespace cv { @@ -25,11 +31,11 @@ public: Mat getBinBarcode() { return bin_barcode; } Mat getStraightBarcode() { return straight_barcode; } vector getTransformationPoints() { return transformation_points; } + static Point2f intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2); protected: vector searchHorizontalLines(); vector separateVerticalLines(const vector &list_lines); void fixationPoints(vector &local_point); - Point2f intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2); vector getQuadrilateral(vector angle_list); bool testBypassRoute(vector hull, int start, int finish); inline double getCosVectors(Point2f a, Point2f b, Point2f c); @@ -61,6 +67,7 @@ void QRDetect::init(const Mat& src, double eps_vertical_, double eps_horizontal_ eps_vertical = eps_vertical_; eps_horizontal = eps_horizontal_; adaptiveThreshold(barcode, bin_barcode, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 83, 2); + } vector QRDetect::searchHorizontalLines() @@ -538,7 +545,7 @@ vector QRDetect::getQuadrilateral(vector angle_list) vector locations; Mat mask_roi = mask(Range(1, bin_barcode.rows - 1), Range(1, bin_barcode.cols - 1)); - cv::findNonZero(mask_roi, locations); + findNonZero(mask_roi, locations); for (size_t i = 0; i < angle_list.size(); i++) { @@ -783,7 +790,7 @@ bool QRCodeDetector::detect(InputArray in, OutputArray points) const return true; } -CV_EXPORTS bool detectQRCode(InputArray in, std::vector &points, double eps_x, double eps_y) +CV_EXPORTS bool detectQRCode(InputArray in, vector &points, double eps_x, double eps_y) { QRCodeDetector qrdetector; qrdetector.setEpsX(eps_x); @@ -792,4 +799,276 @@ CV_EXPORTS bool detectQRCode(InputArray in, std::vector &points, double e return qrdetector.detect(in, points); } +class QRDecode +{ +public: + void init(const Mat &src, const vector &points); + Mat getIntermediateBarcode() { return intermediate; } + Mat getStraightBarcode() { return straight; } + size_t getVersion() { return version; } + std::string getDecodeInformation() { return result_info; } + bool fullDecodingProcess(); +protected: + bool updatePerspective(); + bool versionDefinition(); + bool samplingForVersion(); + bool decodingProcess(); + Mat original, no_border_intermediate, intermediate, straight; + vector original_points; + std::string result_info; + uint8_t version, version_size; + float test_perspective_size; +}; + +void QRDecode::init(const Mat &src, const vector &points) +{ + original = src.clone(); + intermediate = Mat::zeros(src.size(), CV_8UC1); + original_points = points; + version = 0; + version_size = 0; + test_perspective_size = 251; + result_info = ""; +} + +bool QRDecode::updatePerspective() +{ + const Size temporary_size(cvRound(test_perspective_size), cvRound(test_perspective_size)); + + vector perspective_points; + perspective_points.push_back(Point2f(0.f, 0.f)); + perspective_points.push_back(Point2f(test_perspective_size, 0.f)); + + perspective_points.push_back(Point2f(static_cast(test_perspective_size * 0.5), + static_cast(test_perspective_size * 0.5))); + original_points.insert(original_points.begin() + 2, + QRDetect::intersectionLines( + 
original_points[0], original_points[2], + original_points[1], original_points[3])); + + perspective_points.push_back(Point2f(test_perspective_size, test_perspective_size)); + perspective_points.push_back(Point2f(0.f, test_perspective_size)); + + Mat H = findHomography(original_points, perspective_points); + Mat bin_original = Mat::zeros(original.size(), CV_8UC1); + adaptiveThreshold(original, bin_original, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 83, 2); + Mat temp_intermediate = Mat::zeros(temporary_size, CV_8UC1); + warpPerspective(bin_original, temp_intermediate, H, temporary_size, INTER_NEAREST); + no_border_intermediate = temp_intermediate(Range(1, temp_intermediate.rows), Range(1, temp_intermediate.cols)); + + const int border = cvRound(0.1 * test_perspective_size); + const int borderType = BORDER_CONSTANT; + copyMakeBorder(no_border_intermediate, intermediate, border, border, border, border, borderType, Scalar(255)); + return true; +} + +bool QRDecode::versionDefinition() +{ + LineIterator line_iter(intermediate, Point2f(0, 0), Point2f(test_perspective_size, test_perspective_size)); + Point black_point = Point(0, 0); + for(int j = 0; j < line_iter.count; j++, ++line_iter) + { + const uint8_t value = intermediate.at(line_iter.pos()); + if (value == 0) { black_point = line_iter.pos(); break; } + } + + Mat mask = Mat::zeros(intermediate.rows + 2, intermediate.cols + 2, CV_8UC1); + floodFill(intermediate, mask, black_point, 255, 0, Scalar(), Scalar(), FLOODFILL_MASK_ONLY); + + vector locations, non_zero_elem; + Mat mask_roi = mask(Range(1, intermediate.rows - 1), Range(1, intermediate.cols - 1)); + findNonZero(mask_roi, non_zero_elem); + convexHull(Mat(non_zero_elem), locations); + + Point temp_remote = locations[0], remote_point; + const Point delta_diff = Point(4, 4); + for (size_t i = 0; i < locations.size(); i++) + { + if (norm(black_point - temp_remote) < norm(black_point - locations[i])) + { + const uint8_t value = intermediate.at(temp_remote - delta_diff); + if (value == 0) { remote_point = temp_remote - delta_diff; } + else { remote_point = temp_remote; } + temp_remote = locations[i]; + } + } + + size_t transition_x = 0 , transition_y = 0; + + uint8_t future_pixel = 255; + const uint8_t *intermediate_row = intermediate.ptr(remote_point.y); + for(int i = remote_point.x; i < intermediate.cols; i++) + { + if (intermediate_row[i] == future_pixel) + { + future_pixel = 255 - future_pixel; + transition_x++; + } + } + + future_pixel = 255; + for(int j = remote_point.y; j < intermediate.rows; j++) + { + const uint8_t value = intermediate.at(Point(j, remote_point.x)); + if (value == future_pixel) + { + future_pixel = 255 - future_pixel; + transition_y++; + } + } + + version = saturate_cast((std::min(transition_x, transition_y) - 1) * 0.25 - 1); + if ( !( 0 < version && version <= 40 ) ) { return false; } + version_size = 21 + (version - 1) * 4; + return true; +} + +bool QRDecode::samplingForVersion() +{ + const double multiplyingFactor = (version < 3) ? 1 : + (version == 3) ? 
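updatePerspective() above inserts the crossing point of the quadrangle's diagonals as an extra correspondence before findHomography(). The QRDetect::intersectionLines() it calls is defined earlier in qrcode.cpp and is not shown in this hunk; a standard closed-form line-line intersection of that kind looks like the sketch below (not the OpenCV implementation; the degenerate parallel case is not handled):

```cpp
#include <opencv2/core.hpp>

// Intersection of the line through a1,a2 with the line through b1,b2
// via the usual 2x2 determinant (cross-product) formula.
static cv::Point2f intersect(cv::Point2f a1, cv::Point2f a2,
                             cv::Point2f b1, cv::Point2f b2)
{
    float d = (a1.x - a2.x) * (b1.y - b2.y) - (a1.y - a2.y) * (b1.x - b2.x);
    float p = a1.x * a2.y - a1.y * a2.x;
    float q = b1.x * b2.y - b1.y * b2.x;
    return cv::Point2f((p * (b1.x - b2.x) - (a1.x - a2.x) * q) / d,
                       (p * (b1.y - b2.y) - (a1.y - a2.y) * q) / d);
}
```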
1.5 : + version * (5 + version - 4); + const Size newFactorSize( + cvRound(no_border_intermediate.size().width * multiplyingFactor), + cvRound(no_border_intermediate.size().height * multiplyingFactor)); + Mat postIntermediate(newFactorSize, CV_8UC1); + resize(no_border_intermediate, postIntermediate, newFactorSize, 0, 0, INTER_AREA); + + const int no_inter_rows = postIntermediate.rows; + const int no_inter_cols = postIntermediate.cols; + const int delta_rows = cvRound((no_inter_rows * 1.0) / version_size); + const int delta_cols = cvRound((no_inter_cols * 1.0) / version_size); + + vector listFrequencyElem; + for (int r = 0; r < no_inter_rows; r += delta_rows) + { + for (int c = 0; c < no_inter_cols; c += delta_cols) + { + Mat tile = postIntermediate( + Range(r, min(r + delta_rows, no_inter_rows)), + Range(c, min(c + delta_cols, no_inter_cols))); + const double frequencyElem = (countNonZero(tile) * 1.0) / tile.total(); + listFrequencyElem.push_back(frequencyElem); + } + } + + double dispersionEFE = std::numeric_limits::max(); + double experimentalFrequencyElem = 0; + for (double expVal = 0; expVal < 1; expVal+=0.001) + { + double testDispersionEFE = 0.0; + for (size_t i = 0; i < listFrequencyElem.size(); i++) + { + testDispersionEFE += (listFrequencyElem[i] - expVal) * + (listFrequencyElem[i] - expVal); + } + testDispersionEFE /= (listFrequencyElem.size() - 1); + if (dispersionEFE > testDispersionEFE) + { + dispersionEFE = testDispersionEFE; + experimentalFrequencyElem = expVal; + } + } + + straight = Mat(Size(version_size, version_size), CV_8UC1, Scalar(0)); + size_t k = 0; + for (int r = 0; r < no_inter_rows && + k < listFrequencyElem.size() && + floor((r * 1.0) / delta_rows) < version_size; r += delta_rows) + { + for (int c = 0; c < no_inter_cols && + k < listFrequencyElem.size() && + floor((c * 1.0) / delta_cols) < version_size; c += delta_cols, k++) + { + Mat tile = postIntermediate( + Range(r, min(r + delta_rows, no_inter_rows)), + Range(c, min(c + delta_cols, no_inter_cols))); + + if (listFrequencyElem[k] < experimentalFrequencyElem) { tile.setTo(0); } + else + { + tile.setTo(255); + straight.at(cvRound(floor((r * 1.0) / delta_rows)), + cvRound(floor((c * 1.0) / delta_cols))) = 255; + } + } + } + return true; +} + +bool QRDecode::decodingProcess() +{ +#ifdef HAVE_QUIRC + if (straight.empty()) { return false; } + + quirc_code qr_code; + memset(&qr_code, 0, sizeof(qr_code)); + + qr_code.size = straight.size().width; + for (int x = 0; x < qr_code.size; x++) + { + for (int y = 0; y < qr_code.size; y++) + { + int position = y * qr_code.size + x; + qr_code.cell_bitmap[position >> 3] + |= straight.at(y, x) ? 0 : (1 << (position & 7)); + } + } + + quirc_data qr_code_data; + quirc_decode_error_t errorCode = quirc_decode(&qr_code, &qr_code_data); + if (errorCode != 0) { return false; } + + for (int i = 0; i < qr_code_data.payload_len; i++) + { + result_info += qr_code_data.payload[i]; + } + return true; +#else + return false; +#endif + +} + +bool QRDecode::fullDecodingProcess() +{ +#ifdef HAVE_QUIRC + if (!updatePerspective()) { return false; } + if (!versionDefinition()) { return false; } + if (!samplingForVersion()) { return false; } + if (!decodingProcess()) { return false; } + return true; +#else + std::cout << "Library QUIRC is not linked. No decoding is performed. Take it to the OpenCV repository." 
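decodingProcess() above hands the sampled grid to quirc by packing it into a flat bitmask: module (x, y) maps to bit index i = y*size + x, stored at bit (i & 7) of byte (i >> 3), with a set bit marking a dark module (value 0 in `straight`). A compact equivalent of that double loop (sketch; `grid` stands in for `straight`):

```cpp
#include <opencv2/core.hpp>
#include <vector>

// Pack a square CV_8UC1 module grid (0 = dark module) into a quirc-style bitmask.
static std::vector<unsigned char> packModules(const cv::Mat& grid)
{
    const int size = grid.cols;                          // version_size x version_size
    std::vector<unsigned char> bitmap((size * size + 7) / 8, 0);
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
        {
            const int i = y * size + x;
            if (grid.at<unsigned char>(y, x) == 0)       // dark module -> set the bit
                bitmap[i >> 3] |= (unsigned char)(1 << (i & 7));
        }
    return bitmap;
}
```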
<< std::endl; + return false; +#endif +} + +CV_EXPORTS bool decodeQRCode(InputArray in, InputArray points, std::string &decoded_info, OutputArray straight_qrcode) +{ + Mat inarr = in.getMat(); + CV_Assert(!inarr.empty()); + inarr.convertTo(inarr, CV_8UC1); + + CV_Assert(points.isVector()); + vector src_points; + points.copyTo(src_points); + CV_Assert(src_points.size() == 4); + + QRDecode qrdec; + qrdec.init(inarr, src_points); + bool exit_flag = qrdec.fullDecodingProcess(); + + decoded_info = qrdec.getDecodeInformation(); + + if (straight_qrcode.needed()) + { + qrdec.getStraightBarcode().convertTo(straight_qrcode, + straight_qrcode.fixedType() ? + straight_qrcode.type() : CV_32FC2); + } + + return exit_flag; +} + } diff --git a/modules/objdetect/test/test_qrcode.cpp b/modules/objdetect/test/test_qrcode.cpp index c0cea50428..0f4b4852c5 100644 --- a/modules/objdetect/test/test_qrcode.cpp +++ b/modules/objdetect/test/test_qrcode.cpp @@ -4,19 +4,15 @@ #include "test_precomp.hpp" - namespace opencv_test { namespace { std::string qrcode_images_name[] = { - // "20110817_030.jpg", - "20110817_048.jpg", - "img_20120226_161648.jpg", - "img_2714.jpg", - "img_2716.jpg", - "img_3011.jpg", - "img_3029.jpg", - "img_3070.jpg", - "qr_test_030.jpg" + "version_1_down.jpg", "version_1_left.jpg", "version_1_right.jpg", "version_1_up.jpg", "version_1_top.jpg", + "version_2_down.jpg", "version_2_left.jpg", "version_2_right.jpg", "version_2_up.jpg", "version_2_top.jpg", + "version_3_down.jpg", "version_3_left.jpg", "version_3_right.jpg", "version_3_up.jpg", "version_3_top.jpg", + "version_4_down.jpg", "version_4_left.jpg", "version_4_right.jpg", "version_4_up.jpg", "version_4_top.jpg", + "version_5_down.jpg", "version_5_left.jpg", "version_5_right.jpg", "version_5_up.jpg", "version_5_top.jpg", + "russian.jpg", "kanji.jpg", "link_github_ocv.jpg", "link_ocv.jpg", "link_wiki_cv.jpg" }; // #define UPDATE_QRCODE_TEST_DATA @@ -35,15 +31,21 @@ TEST(Objdetect_QRCode, generate_test_data) file_config << "{:" << "image_name" << qrcode_images_name[i]; std::string image_path = findDataFile(root + qrcode_images_name[i]); std::vector corners; - Mat src = imread(image_path, IMREAD_GRAYSCALE); + Mat src = imread(image_path, IMREAD_GRAYSCALE), straight_barcode; + std::string decoded_info; ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path; EXPECT_TRUE(detectQRCode(src, corners)); +#ifdef HAVE_QUIRC + EXPECT_TRUE(decodeQRCode(src, corners, decoded_info, straight_barcode)); +#endif file_config << "x" << "[:"; for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].x; } file_config << "]"; file_config << "y" << "[:"; for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].y; } - file_config << "]" << "}"; + file_config << "]"; + file_config << "info" << decoded_info; + file_config << "}"; } file_config << "]"; file_config.release(); @@ -59,11 +61,15 @@ TEST_P(Objdetect_QRCode, regression) const int pixels_error = 3; std::string image_path = findDataFile(root + name_current_image); - Mat src = imread(image_path, IMREAD_GRAYSCALE); + Mat src = imread(image_path, IMREAD_GRAYSCALE), straight_barcode; ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path; std::vector corners; + std::string decoded_info; ASSERT_TRUE(detectQRCode(src, corners)); +#ifdef HAVE_QUIRC + ASSERT_TRUE(decodeQRCode(src, corners, decoded_info, straight_barcode)); +#endif const std::string dataset_config = findDataFile(root + "dataset_config.json", false); FileStorage file_config(dataset_config, 
FileStorage::READ); @@ -86,6 +92,12 @@ TEST_P(Objdetect_QRCode, regression) EXPECT_NEAR(x, corners[i].x, pixels_error); EXPECT_NEAR(y, corners[i].y, pixels_error); } + +#ifdef HAVE_QUIRC + std::string original_info = config["info"]; + EXPECT_EQ(decoded_info, original_info); +#endif + return; // done } } @@ -103,9 +115,14 @@ INSTANTIATE_TEST_CASE_P(/**/, Objdetect_QRCode, testing::ValuesIn(qrcode_images_ TEST(Objdetect_QRCode_basic, not_found_qrcode) { - std::vector corners; + std::vector corners, straight_barcode; + std::string decoded_info; Mat zero_image = Mat::zeros(256, 256, CV_8UC1); EXPECT_FALSE(detectQRCode(zero_image, corners)); +#ifdef HAVE_QUIRC + corners = std::vector(4); + EXPECT_FALSE(decodeQRCode(zero_image, corners, decoded_info, straight_barcode)); +#endif } diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp index 6d583c8a6a..795d51adc8 100644 --- a/modules/stitching/src/matchers.cpp +++ b/modules/stitching/src/matchers.cpp @@ -52,6 +52,10 @@ using namespace cv::cuda; #include "opencv2/xfeatures2d.hpp" using xfeatures2d::SURF; using xfeatures2d::SIFT; +#else +# if defined(_MSC_VER) +# pragma warning(disable:4702) // unreachable code +# endif #endif #ifdef HAVE_OPENCV_CUDAIMGPROC diff --git a/platforms/js/build_js.py b/platforms/js/build_js.py index 2e55b7b308..9b7776cd45 100644 --- a/platforms/js/build_js.py +++ b/platforms/js/build_js.py @@ -113,10 +113,10 @@ class Builder: "-DWITH_ITT=OFF", "-DBUILD_ZLIB=ON", "-DBUILD_opencv_apps=OFF", - "-DBUILD_opencv_calib3d=ON", + "-DBUILD_opencv_calib3d=ON", # No bindings provided. This module is used as a dependency for other modules. "-DBUILD_opencv_dnn=ON", "-DBUILD_opencv_features2d=ON", - "-DBUILD_opencv_flann=OFF", + "-DBUILD_opencv_flann=ON", # No bindings provided. This module is used as a dependency for other modules. "-DBUILD_opencv_gapi=OFF", "-DBUILD_opencv_ml=OFF", "-DBUILD_opencv_photo=OFF", diff --git a/samples/_winpack_build_sample.cmd b/samples/_winpack_build_sample.cmd index c671d140fa..84a3b56428 100644 --- a/samples/_winpack_build_sample.cmd +++ b/samples/_winpack_build_sample.cmd @@ -38,6 +38,35 @@ echo =========================================================================== :: Path to FFMPEG binary files set "PATH=!PATH!;!SCRIPTDIR!\..\..\build\bin\" +:: Detect compiler +cl /? >NUL 2>NUL NUL 2>NUL NUL 2>NUL if !ERRORLEVEL! EQU 0 ( @@ -55,32 +84,10 @@ if NOT DEFINED CMAKE_FOUND ( set "MSG=CMake is required to build OpenCV samples. Download it from here: https://cmake.org/download/ and install into 'C:\Program Files\CMake'" goto die ) else ( + call :execute cmake --version echo CMake is detected ) -:: Detect compiler -cl /? 
>NUL 2>NUL NUL 2>NUL transform; cap >> frame; if(frame.empty()) { break; } @@ -97,6 +98,11 @@ int liveQRCodeDetect() total.start(); bool result_detection = detectQRCode(src, transform); + if (result_detection) + { + bool result_decode = decodeQRCode(src, transform, decode_info, straight_barcode); + if (result_decode) { cout << decode_info << '\n'; } + } total.stop(); double fps = 1 / total.getTimeSec(); total.reset(); @@ -112,11 +118,12 @@ int liveQRCodeDetect() int showImageQRCodeDetect(string in, string out) { - Mat src = imread(in, IMREAD_GRAYSCALE); + Mat src = imread(in, IMREAD_GRAYSCALE), straight_barcode; + string decode_info; vector transform; const int count_experiments = 10; double transform_time = 0.0; - bool result_detection = false; + bool result_detection = false, result_decode = false; TickMeter total; for (size_t i = 0; i < count_experiments; i++) { @@ -125,12 +132,20 @@ int showImageQRCodeDetect(string in, string out) result_detection = detectQRCode(src, transform); total.stop(); transform_time += total.getTimeSec(); - if (!result_detection) { break; } total.reset(); + if (!result_detection) { break; } + + total.start(); + result_decode = decodeQRCode(src, transform, decode_info, straight_barcode); + total.stop(); + transform_time += total.getTimeSec(); + total.reset(); + if (!result_decode) { break; } } double fps = count_experiments / transform_time; if (!result_detection) { cout << "Not find QR-code." << '\n'; return -2; } + if (!result_decode) { cout << "Not decode QR-code." << '\n'; return -3; } Mat color_src = imread(in); getMatWithQRCodeContour(color_src, transform); @@ -151,6 +166,7 @@ int showImageQRCodeDetect(string in, string out) cout << "Output image file path: " << out << '\n'; cout << "Size: " << color_src.size() << '\n'; cout << "FPS: " << fps << '\n'; + cout << "Decode info: " << decode_info << '\n'; vector compression_params; compression_params.push_back(IMWRITE_PNG_COMPRESSION);
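The updated sample averages detection and decoding time over several runs with cv::TickMeter and reports an FPS figure; the same measurement pattern in isolation (sketch; the timed body is left as a placeholder comment):

```cpp
#include <opencv2/core/utility.hpp>
#include <iostream>

int main()
{
    const int count_experiments = 10;
    cv::TickMeter tm;
    double total_sec = 0.0;
    for (int i = 0; i < count_experiments; i++)
    {
        tm.start();
        // ... run detectQRCode()/decodeQRCode() on the test image here ...
        tm.stop();
        total_sec += tm.getTimeSec();
        tm.reset();                          // time each run independently
    }
    if (total_sec > 0.0)
        std::cout << "FPS: " << count_experiments / total_sec << std::endl;
    return 0;
}
```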