diff --git a/3rdparty/libjpeg-turbo/CMakeLists.txt b/3rdparty/libjpeg-turbo/CMakeLists.txt index cf1f77aaa3..531a00e017 100644 --- a/3rdparty/libjpeg-turbo/CMakeLists.txt +++ b/3rdparty/libjpeg-turbo/CMakeLists.txt @@ -78,10 +78,11 @@ configure_file(jconfigint.h.in jconfigint.h) include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src) -set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c +set(JPEG_SOURCES + jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c jcicc.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c - jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c + jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c) diff --git a/3rdparty/libjpeg-turbo/src/jcicc.c b/3rdparty/libjpeg-turbo/src/jcicc.c new file mode 100644 index 0000000000..11037ff694 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/jcicc.c @@ -0,0 +1,105 @@ +/* + * jcicc.c + * + * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman. + * Copyright (C) 2017, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file provides code to write International Color Consortium (ICC) device + * profiles embedded in JFIF JPEG image files. The ICC has defined a standard + * for including such data in JPEG "APP2" markers. The code given here does + * not know anything about the internal structure of the ICC profile data; it + * just knows how to embed the profile data in a JPEG file while writing it. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jerror.h" + + +/* + * Since an ICC profile can be larger than the maximum size of a JPEG marker + * (64K), we need provisions to split it into multiple markers. The format + * defined by the ICC specifies one or more APP2 markers containing the + * following data: + * Identifying string ASCII "ICC_PROFILE\0" (12 bytes) + * Marker sequence number 1 for first APP2, 2 for next, etc (1 byte) + * Number of markers Total number of APP2's used (1 byte) + * Profile data (remainder of APP2 data) + * Decoders should use the marker sequence numbers to reassemble the profile, + * rather than assuming that the APP2 markers appear in the correct sequence. + */ + +#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */ +#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */ +#define MAX_BYTES_IN_MARKER 65533 /* maximum data len of a JPEG marker */ +#define MAX_DATA_BYTES_IN_MARKER (MAX_BYTES_IN_MARKER - ICC_OVERHEAD_LEN) + + +/* + * This routine writes the given ICC profile data into a JPEG file. It *must* + * be called AFTER calling jpeg_start_compress() and BEFORE the first call to + * jpeg_write_scanlines(). (This ordering ensures that the APP2 marker(s) will + * appear after the SOI and JFIF or Adobe markers, but before all else.) 
+ */ + +GLOBAL(void) +jpeg_write_icc_profile(j_compress_ptr cinfo, const JOCTET *icc_data_ptr, + unsigned int icc_data_len) +{ + unsigned int num_markers; /* total number of markers we'll write */ + int cur_marker = 1; /* per spec, counting starts at 1 */ + unsigned int length; /* number of bytes to write in this marker */ + + if (icc_data_ptr == NULL || icc_data_len == 0) + ERREXIT(cinfo, JERR_BUFFER_SIZE); + if (cinfo->global_state < CSTATE_SCANNING) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + /* Calculate the number of markers we'll need, rounding up of course */ + num_markers = icc_data_len / MAX_DATA_BYTES_IN_MARKER; + if (num_markers * MAX_DATA_BYTES_IN_MARKER != icc_data_len) + num_markers++; + + while (icc_data_len > 0) { + /* length of profile to put in this marker */ + length = icc_data_len; + if (length > MAX_DATA_BYTES_IN_MARKER) + length = MAX_DATA_BYTES_IN_MARKER; + icc_data_len -= length; + + /* Write the JPEG marker header (APP2 code and marker length) */ + jpeg_write_m_header(cinfo, ICC_MARKER, + (unsigned int)(length + ICC_OVERHEAD_LEN)); + + /* Write the marker identifying string "ICC_PROFILE" (null-terminated). We + * code it in this less-than-transparent way so that the code works even if + * the local character set is not ASCII. + */ + jpeg_write_m_byte(cinfo, 0x49); + jpeg_write_m_byte(cinfo, 0x43); + jpeg_write_m_byte(cinfo, 0x43); + jpeg_write_m_byte(cinfo, 0x5F); + jpeg_write_m_byte(cinfo, 0x50); + jpeg_write_m_byte(cinfo, 0x52); + jpeg_write_m_byte(cinfo, 0x4F); + jpeg_write_m_byte(cinfo, 0x46); + jpeg_write_m_byte(cinfo, 0x49); + jpeg_write_m_byte(cinfo, 0x4C); + jpeg_write_m_byte(cinfo, 0x45); + jpeg_write_m_byte(cinfo, 0x0); + + /* Add the sequencing info */ + jpeg_write_m_byte(cinfo, cur_marker); + jpeg_write_m_byte(cinfo, (int)num_markers); + + /* Add the profile data */ + while (length--) { + jpeg_write_m_byte(cinfo, *icc_data_ptr); + icc_data_ptr++; + } + cur_marker++; + } +} diff --git a/3rdparty/libjpeg-turbo/src/jdicc.c b/3rdparty/libjpeg-turbo/src/jdicc.c new file mode 100644 index 0000000000..7224695816 --- /dev/null +++ b/3rdparty/libjpeg-turbo/src/jdicc.c @@ -0,0 +1,171 @@ +/* + * jdicc.c + * + * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman. + * Copyright (C) 2017, D. R. Commander. + * For conditions of distribution and use, see the accompanying README.ijg + * file. + * + * This file provides code to read International Color Consortium (ICC) device + * profiles embedded in JFIF JPEG image files. The ICC has defined a standard + * for including such data in JPEG "APP2" markers. The code given here does + * not know anything about the internal structure of the ICC profile data; it + * just knows how to get the profile data from a JPEG file while reading it. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jerror.h" + +#ifndef HAVE_STDLIB_H /* should declare malloc() */ +extern void *malloc(size_t size); +#endif + + +#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */ +#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */ + + +/* + * Handy subroutine to test whether a saved marker is an ICC profile marker. 
+ */ + +LOCAL(boolean) +marker_is_icc(jpeg_saved_marker_ptr marker) +{ + return + marker->marker == ICC_MARKER && + marker->data_length >= ICC_OVERHEAD_LEN && + /* verify the identifying string */ + GETJOCTET(marker->data[0]) == 0x49 && + GETJOCTET(marker->data[1]) == 0x43 && + GETJOCTET(marker->data[2]) == 0x43 && + GETJOCTET(marker->data[3]) == 0x5F && + GETJOCTET(marker->data[4]) == 0x50 && + GETJOCTET(marker->data[5]) == 0x52 && + GETJOCTET(marker->data[6]) == 0x4F && + GETJOCTET(marker->data[7]) == 0x46 && + GETJOCTET(marker->data[8]) == 0x49 && + GETJOCTET(marker->data[9]) == 0x4C && + GETJOCTET(marker->data[10]) == 0x45 && + GETJOCTET(marker->data[11]) == 0x0; +} + + +/* + * See if there was an ICC profile in the JPEG file being read; if so, + * reassemble and return the profile data. + * + * TRUE is returned if an ICC profile was found, FALSE if not. If TRUE is + * returned, *icc_data_ptr is set to point to the returned data, and + * *icc_data_len is set to its length. + * + * IMPORTANT: the data at *icc_data_ptr is allocated with malloc() and must be + * freed by the caller with free() when the caller no longer needs it. + * (Alternatively, we could write this routine to use the IJG library's memory + * allocator, so that the data would be freed implicitly when + * jpeg_finish_decompress() is called. But it seems likely that many + * applications will prefer to have the data stick around after decompression + * finishes.) + */ + +GLOBAL(boolean) +jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr, + unsigned int *icc_data_len) +{ + jpeg_saved_marker_ptr marker; + int num_markers = 0; + int seq_no; + JOCTET *icc_data; + unsigned int total_length; +#define MAX_SEQ_NO 255 /* sufficient since marker numbers are bytes */ + char marker_present[MAX_SEQ_NO + 1]; /* 1 if marker found */ + unsigned int data_length[MAX_SEQ_NO + 1]; /* size of profile data in marker */ + unsigned int data_offset[MAX_SEQ_NO + 1]; /* offset for data in marker */ + + if (icc_data_ptr == NULL || icc_data_len == NULL) + ERREXIT(cinfo, JERR_BUFFER_SIZE); + if (cinfo->global_state < DSTATE_READY) + ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); + + *icc_data_ptr = NULL; /* avoid confusion if FALSE return */ + *icc_data_len = 0; + + /* This first pass over the saved markers discovers whether there are + * any ICC markers and verifies the consistency of the marker numbering. + */ + + for (seq_no = 1; seq_no <= MAX_SEQ_NO; seq_no++) + marker_present[seq_no] = 0; + + for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) { + if (marker_is_icc(marker)) { + if (num_markers == 0) + num_markers = GETJOCTET(marker->data[13]); + else if (num_markers != GETJOCTET(marker->data[13])) { + WARNMS(cinfo, JWRN_BOGUS_ICC); /* inconsistent num_markers fields */ + return FALSE; + } + seq_no = GETJOCTET(marker->data[12]); + if (seq_no <= 0 || seq_no > num_markers) { + WARNMS(cinfo, JWRN_BOGUS_ICC); /* bogus sequence number */ + return FALSE; + } + if (marker_present[seq_no]) { + WARNMS(cinfo, JWRN_BOGUS_ICC); /* duplicate sequence numbers */ + return FALSE; + } + marker_present[seq_no] = 1; + data_length[seq_no] = marker->data_length - ICC_OVERHEAD_LEN; + } + } + + if (num_markers == 0) + return FALSE; + + /* Check for missing markers, count total space needed, + * compute offset of each marker's part of the data. 
+ */ + + total_length = 0; + for (seq_no = 1; seq_no <= num_markers; seq_no++) { + if (marker_present[seq_no] == 0) { + WARNMS(cinfo, JWRN_BOGUS_ICC); /* missing sequence number */ + return FALSE; + } + data_offset[seq_no] = total_length; + total_length += data_length[seq_no]; + } + + if (total_length == 0) { + WARNMS(cinfo, JWRN_BOGUS_ICC); /* found only empty markers? */ + return FALSE; + } + + /* Allocate space for assembled data */ + icc_data = (JOCTET *)malloc(total_length * sizeof(JOCTET)); + if (icc_data == NULL) + ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 11); /* oops, out of memory */ + + /* and fill it in */ + for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) { + if (marker_is_icc(marker)) { + JOCTET FAR *src_ptr; + JOCTET *dst_ptr; + unsigned int length; + seq_no = GETJOCTET(marker->data[12]); + dst_ptr = icc_data + data_offset[seq_no]; + src_ptr = marker->data + ICC_OVERHEAD_LEN; + length = data_length[seq_no]; + while (length--) { + *dst_ptr++ = *src_ptr++; + } + } + } + + *icc_data_ptr = icc_data; + *icc_data_len = total_length; + + return TRUE; +} diff --git a/3rdparty/libjpeg/README b/3rdparty/libjpeg/README index 4c8e82e9ba..56cdb60038 100644 --- a/3rdparty/libjpeg/README +++ b/3rdparty/libjpeg/README @@ -1,7 +1,7 @@ The Independent JPEG Group's JPEG software ========================================== -README for release 9b of 17-Jan-2016 +README for release 9c of 14-Jan-2018 ==================================== This distribution contains the ninth public release of the Independent JPEG @@ -115,7 +115,7 @@ with respect to this software, its quality, accuracy, merchantability, or fitness for a particular purpose. This software is provided "AS IS", and you, its user, assume the entire risk as to its quality and accuracy. -This software is copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding. +This software is copyright (C) 1991-2018, Thomas G. Lane, Guido Vollbeding. All Rights Reserved except as specified below. Permission is hereby granted to use, copy, modify, and distribute this @@ -246,8 +246,8 @@ ARCHIVE LOCATIONS The "official" archive site for this software is www.ijg.org. The most recent released version can always be found there in directory "files". This particular version will be archived as -http://www.ijg.org/files/jpegsrc.v9b.tar.gz, and in Windows-compatible -"zip" archive format as http://www.ijg.org/files/jpegsr9b.zip. +http://www.ijg.org/files/jpegsrc.v9c.tar.gz, and in Windows-compatible +"zip" archive format as http://www.ijg.org/files/jpegsr9c.zip. The JPEG FAQ (Frequently Asked Questions) article is a source of some general information about JPEG. @@ -293,8 +293,11 @@ communication about JPEG configuration in Sigma Photo Pro software. Thank to Andrew Finkenstadt for hosting the ijg.org site. -Last but not least special thank to Thomas G. Lane for the original -design and development of this singular software package. +Thank to Thomas G. Lane for the original design and development of +this singular software package. + +Thank to Lars Goehler, Andreas Heinecke, Sebastian Fuss, Yvonne Roebert, +Andrej Werner, and Ulf-Dietrich Braumann for support and public relations. 
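The two entry points added above, jpeg_write_icc_profile() in jcicc.c and jpeg_read_icc_profile() in jdicc.c, split the profile across APP2 markers carrying at most MAX_DATA_BYTES_IN_MARKER = 65533 - 14 = 65519 payload bytes each; a 614400-byte (600 KB) profile, for example, needs ceil(614400 / 65519) = 10 markers. The following is a minimal usage sketch, not part of this patch: the wrapper names are made up, error handling and the scanline loops are omitted, and the jpeg_save_markers() call is assumed because jpeg_read_icc_profile() only sees markers retained in cinfo->marker_list.

    /* Illustrative sketch only; assumes the prototypes for the two new
     * functions are visible (libjpeg-turbo declares them in jpeglib.h).
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include "jpeglib.h"

    static void write_with_profile(j_compress_ptr cinfo, const JOCTET *profile,
                                   unsigned int profile_len)
    {
      jpeg_start_compress(cinfo, TRUE);
      /* Per jcicc.c: after jpeg_start_compress(), before jpeg_write_scanlines() */
      jpeg_write_icc_profile(cinfo, profile, profile_len);
      /* ... jpeg_write_scanlines() loop, then jpeg_finish_compress() ... */
    }

    static void read_profile(j_decompress_ptr cinfo)
    {
      JOCTET *icc_data = NULL;
      unsigned int icc_len = 0;

      /* APP2 markers must be saved, otherwise cinfo->marker_list stays empty */
      jpeg_save_markers(cinfo, JPEG_APP0 + 2, 0xFFFF);
      (void) jpeg_read_header(cinfo, TRUE);
      if (jpeg_read_icc_profile(cinfo, &icc_data, &icc_len)) {
        /* ... use icc_data[0 .. icc_len-1] ... */
        free(icc_data);  /* jdicc.c malloc()s the buffer; the caller frees it */
      }
    }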
FILE FORMAT WARS diff --git a/3rdparty/libjpeg/change.log b/3rdparty/libjpeg/change.log index b8439f4143..674a558a72 100644 --- a/3rdparty/libjpeg/change.log +++ b/3rdparty/libjpeg/change.log @@ -1,6 +1,27 @@ CHANGE LOG for Independent JPEG Group's JPEG software +Version 9c 14-Jan-2018 +----------------------- + +jpegtran: add an option to the -wipe switch to fill the region +with the average of adjacent blocks, instead of gray out. +Thank to Caitlyn Feddock and Maddie Ziegler for inspiration. + +Make range extension bits adjustable (in jpegint.h). +Thank to Robin Watts for suggestion. + +Provide macros for fflush() and ferror() in jinclude.h in order +to facilitate adaption by applications using an own FILE class. +Thank to Gerhard Huber for suggestion. + +Add libjpeg pkg-config file. Thank to Mark Lavi, Vincent Torri, +Patrick McMunn, and Huw Davies for suggestion. + +Add sanity checks in cjpeg image reader modules. +Thank to Bingchang, Liu for reports. + + Version 9b 17-Jan-2016 ----------------------- diff --git a/3rdparty/libjpeg/jcinit.c b/3rdparty/libjpeg/jcinit.c index 1e13e3462d..2aea7ca2a1 100644 --- a/3rdparty/libjpeg/jcinit.c +++ b/3rdparty/libjpeg/jcinit.c @@ -2,7 +2,7 @@ * jcinit.c * * Copyright (C) 1991-1997, Thomas G. Lane. - * Modified 2003-2013 by Guido Vollbeding. + * Modified 2003-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -21,6 +21,168 @@ #include "jpeglib.h" +/* + * Compute JPEG image dimensions and related values. + * NOTE: this is exported for possible use by application. + * Hence it mustn't do anything that can't be done twice. + */ + +GLOBAL(void) +jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo) +/* Do computations that are needed before master selection phase */ +{ + /* Sanity check on input image dimensions to prevent overflow in + * following calculations. + * We do check jpeg_width and jpeg_height in initial_setup in jcmaster.c, + * but image_width and image_height can come from arbitrary data, + * and we need some space for multiplication by block_size. + */ + if (((long) cinfo->image_width >> 24) || ((long) cinfo->image_height >> 24)) + ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION); + +#ifdef DCT_SCALING_SUPPORTED + + /* Compute actual JPEG image dimensions and DCT scaling choices. 
*/ + if (cinfo->scale_num >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/1 scaling */ + cinfo->jpeg_width = cinfo->image_width * cinfo->block_size; + cinfo->jpeg_height = cinfo->image_height * cinfo->block_size; + cinfo->min_DCT_h_scaled_size = 1; + cinfo->min_DCT_v_scaled_size = 1; + } else if (cinfo->scale_num * 2 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/2 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 2L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 2L); + cinfo->min_DCT_h_scaled_size = 2; + cinfo->min_DCT_v_scaled_size = 2; + } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/3 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 3L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 3L); + cinfo->min_DCT_h_scaled_size = 3; + cinfo->min_DCT_v_scaled_size = 3; + } else if (cinfo->scale_num * 4 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/4 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 4L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 4L); + cinfo->min_DCT_h_scaled_size = 4; + cinfo->min_DCT_v_scaled_size = 4; + } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/5 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 5L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 5L); + cinfo->min_DCT_h_scaled_size = 5; + cinfo->min_DCT_v_scaled_size = 5; + } else if (cinfo->scale_num * 6 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/6 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 6L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 6L); + cinfo->min_DCT_h_scaled_size = 6; + cinfo->min_DCT_v_scaled_size = 6; + } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/7 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 7L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 7L); + cinfo->min_DCT_h_scaled_size = 7; + cinfo->min_DCT_v_scaled_size = 7; + } else if (cinfo->scale_num * 8 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/8 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 8L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 8L); + cinfo->min_DCT_h_scaled_size = 8; + cinfo->min_DCT_v_scaled_size = 8; + } else if (cinfo->scale_num * 9 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/9 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 9L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 9L); + cinfo->min_DCT_h_scaled_size = 9; + cinfo->min_DCT_v_scaled_size = 9; + } else if (cinfo->scale_num * 10 >= cinfo->scale_denom * cinfo->block_size) { 
+ /* Provide block_size/10 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 10L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 10L); + cinfo->min_DCT_h_scaled_size = 10; + cinfo->min_DCT_v_scaled_size = 10; + } else if (cinfo->scale_num * 11 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/11 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 11L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 11L); + cinfo->min_DCT_h_scaled_size = 11; + cinfo->min_DCT_v_scaled_size = 11; + } else if (cinfo->scale_num * 12 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/12 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 12L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 12L); + cinfo->min_DCT_h_scaled_size = 12; + cinfo->min_DCT_v_scaled_size = 12; + } else if (cinfo->scale_num * 13 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/13 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 13L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 13L); + cinfo->min_DCT_h_scaled_size = 13; + cinfo->min_DCT_v_scaled_size = 13; + } else if (cinfo->scale_num * 14 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/14 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 14L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 14L); + cinfo->min_DCT_h_scaled_size = 14; + cinfo->min_DCT_v_scaled_size = 14; + } else if (cinfo->scale_num * 15 >= cinfo->scale_denom * cinfo->block_size) { + /* Provide block_size/15 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 15L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 15L); + cinfo->min_DCT_h_scaled_size = 15; + cinfo->min_DCT_v_scaled_size = 15; + } else { + /* Provide block_size/16 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 16L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 16L); + cinfo->min_DCT_h_scaled_size = 16; + cinfo->min_DCT_v_scaled_size = 16; + } + +#else /* !DCT_SCALING_SUPPORTED */ + + /* Hardwire it to "no scaling" */ + cinfo->jpeg_width = cinfo->image_width; + cinfo->jpeg_height = cinfo->image_height; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; + +#endif /* DCT_SCALING_SUPPORTED */ +} + + /* * Master selection of compression modules. * This is done once at the start of processing an image. 
We determine @@ -37,7 +199,7 @@ jinit_compress_master (j_compress_ptr cinfo) if (cinfo->data_precision != BITS_IN_JSAMPLE) ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision); - /* Sanity check on image dimensions */ + /* Sanity check on input image dimensions */ if (cinfo->image_height <= 0 || cinfo->image_width <= 0 || cinfo->input_components <= 0) ERREXIT(cinfo, JERR_EMPTY_IMAGE); @@ -48,6 +210,9 @@ jinit_compress_master (j_compress_ptr cinfo) if ((long) jd_samplesperrow != samplesperrow) ERREXIT(cinfo, JERR_WIDTH_OVERFLOW); + /* Compute JPEG image dimensions and related values. */ + jpeg_calc_jpeg_dimensions(cinfo); + /* Initialize master control (includes parameter checking/processing) */ jinit_c_master_control(cinfo, FALSE /* full compression */); diff --git a/3rdparty/libjpeg/jcmaster.c b/3rdparty/libjpeg/jcmaster.c index 2a8ae63303..43d49aec45 100644 --- a/3rdparty/libjpeg/jcmaster.c +++ b/3rdparty/libjpeg/jcmaster.c @@ -2,7 +2,7 @@ * jcmaster.c * * Copyright (C) 1991-1997, Thomas G. Lane. - * Modified 2003-2013 by Guido Vollbeding. + * Modified 2003-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -43,191 +43,13 @@ typedef my_comp_master * my_master_ptr; * Support routines that do various essential calculations. */ -/* - * Compute JPEG image dimensions and related values. - * NOTE: this is exported for possible use by application. - * Hence it mustn't do anything that can't be done twice. - */ - -GLOBAL(void) -jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo) -/* Do computations that are needed before master selection phase */ -{ -#ifdef DCT_SCALING_SUPPORTED - - /* Sanity check on input image dimensions to prevent overflow in - * following calculation. - * We do check jpeg_width and jpeg_height in initial_setup below, - * but image_width and image_height can come from arbitrary data, - * and we need some space for multiplication by block_size. - */ - if (((long) cinfo->image_width >> 24) || ((long) cinfo->image_height >> 24)) - ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION); - - /* Compute actual JPEG image dimensions and DCT scaling choices. 
*/ - if (cinfo->scale_num >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/1 scaling */ - cinfo->jpeg_width = cinfo->image_width * cinfo->block_size; - cinfo->jpeg_height = cinfo->image_height * cinfo->block_size; - cinfo->min_DCT_h_scaled_size = 1; - cinfo->min_DCT_v_scaled_size = 1; - } else if (cinfo->scale_num * 2 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/2 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 2L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 2L); - cinfo->min_DCT_h_scaled_size = 2; - cinfo->min_DCT_v_scaled_size = 2; - } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/3 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 3L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 3L); - cinfo->min_DCT_h_scaled_size = 3; - cinfo->min_DCT_v_scaled_size = 3; - } else if (cinfo->scale_num * 4 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/4 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 4L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 4L); - cinfo->min_DCT_h_scaled_size = 4; - cinfo->min_DCT_v_scaled_size = 4; - } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/5 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 5L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 5L); - cinfo->min_DCT_h_scaled_size = 5; - cinfo->min_DCT_v_scaled_size = 5; - } else if (cinfo->scale_num * 6 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/6 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 6L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 6L); - cinfo->min_DCT_h_scaled_size = 6; - cinfo->min_DCT_v_scaled_size = 6; - } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/7 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 7L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 7L); - cinfo->min_DCT_h_scaled_size = 7; - cinfo->min_DCT_v_scaled_size = 7; - } else if (cinfo->scale_num * 8 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/8 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 8L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 8L); - cinfo->min_DCT_h_scaled_size = 8; - cinfo->min_DCT_v_scaled_size = 8; - } else if (cinfo->scale_num * 9 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/9 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 9L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 9L); - cinfo->min_DCT_h_scaled_size = 9; - cinfo->min_DCT_v_scaled_size = 9; - } else if (cinfo->scale_num * 10 >= cinfo->scale_denom * cinfo->block_size) { 
- /* Provide block_size/10 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 10L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 10L); - cinfo->min_DCT_h_scaled_size = 10; - cinfo->min_DCT_v_scaled_size = 10; - } else if (cinfo->scale_num * 11 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/11 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 11L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 11L); - cinfo->min_DCT_h_scaled_size = 11; - cinfo->min_DCT_v_scaled_size = 11; - } else if (cinfo->scale_num * 12 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/12 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 12L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 12L); - cinfo->min_DCT_h_scaled_size = 12; - cinfo->min_DCT_v_scaled_size = 12; - } else if (cinfo->scale_num * 13 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/13 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 13L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 13L); - cinfo->min_DCT_h_scaled_size = 13; - cinfo->min_DCT_v_scaled_size = 13; - } else if (cinfo->scale_num * 14 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/14 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 14L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 14L); - cinfo->min_DCT_h_scaled_size = 14; - cinfo->min_DCT_v_scaled_size = 14; - } else if (cinfo->scale_num * 15 >= cinfo->scale_denom * cinfo->block_size) { - /* Provide block_size/15 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 15L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 15L); - cinfo->min_DCT_h_scaled_size = 15; - cinfo->min_DCT_v_scaled_size = 15; - } else { - /* Provide block_size/16 scaling */ - cinfo->jpeg_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 16L); - cinfo->jpeg_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 16L); - cinfo->min_DCT_h_scaled_size = 16; - cinfo->min_DCT_v_scaled_size = 16; - } - -#else /* !DCT_SCALING_SUPPORTED */ - - /* Hardwire it to "no scaling" */ - cinfo->jpeg_width = cinfo->image_width; - cinfo->jpeg_height = cinfo->image_height; - cinfo->min_DCT_h_scaled_size = DCTSIZE; - cinfo->min_DCT_v_scaled_size = DCTSIZE; - -#endif /* DCT_SCALING_SUPPORTED */ -} - - LOCAL(void) -jpeg_calc_trans_dimensions (j_compress_ptr cinfo) -{ - if (cinfo->min_DCT_h_scaled_size != cinfo->min_DCT_v_scaled_size) - ERREXIT2(cinfo, JERR_BAD_DCTSIZE, - cinfo->min_DCT_h_scaled_size, cinfo->min_DCT_v_scaled_size); - - cinfo->block_size = cinfo->min_DCT_h_scaled_size; -} - - -LOCAL(void) -initial_setup (j_compress_ptr cinfo, boolean transcode_only) +initial_setup (j_compress_ptr cinfo) /* Do computations that are needed before master selection phase */ { int ci, ssize; jpeg_component_info *compptr; - if (transcode_only) - jpeg_calc_trans_dimensions(cinfo); - else - 
jpeg_calc_jpeg_dimensions(cinfo); - /* Sanity check on block_size */ if (cinfo->block_size < 1 || cinfo->block_size > 16) ERREXIT2(cinfo, JERR_BAD_DCTSIZE, cinfo->block_size, cinfo->block_size); @@ -414,13 +236,9 @@ validate_script (j_compress_ptr cinfo) * out-of-range reconstructed DC values during the first DC scan, * which might cause problems for some decoders. */ -#if BITS_IN_JSAMPLE == 8 -#define MAX_AH_AL 10 -#else -#define MAX_AH_AL 13 -#endif if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 || - Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL) + Ah < 0 || Ah > (cinfo->data_precision > 8 ? 13 : 10) || + Al < 0 || Al > (cinfo->data_precision > 8 ? 13 : 10)) ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); if (Ss == 0) { if (Se != 0) /* DC and AC together not OK */ @@ -812,7 +630,7 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only) master->pub.is_last_pass = FALSE; /* Validate parameters, determine derived values */ - initial_setup(cinfo, transcode_only); + initial_setup(cinfo); if (cinfo->scan_info != NULL) { #ifdef C_MULTISCAN_FILES_SUPPORTED diff --git a/3rdparty/libjpeg/jctrans.c b/3rdparty/libjpeg/jctrans.c index 7cd077e4f6..5780de42e2 100644 --- a/3rdparty/libjpeg/jctrans.c +++ b/3rdparty/libjpeg/jctrans.c @@ -2,7 +2,7 @@ * jctrans.c * * Copyright (C) 1995-1998, Thomas G. Lane. - * Modified 2000-2013 by Guido Vollbeding. + * Modified 2000-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -85,12 +85,15 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, jpeg_set_defaults(dstinfo); /* jpeg_set_defaults may choose wrong colorspace, eg YCbCr if input is RGB. * Fix it to get the right header markers for the image colorspace. - * Note: Entropy table assignment in jpeg_set_colorspace depends - * on color_transform. + * Note: Entropy table assignment in jpeg_set_colorspace + * depends on color_transform. + * Adaption is also required for setting the appropriate + * entropy coding mode dependent on image data precision. */ dstinfo->color_transform = srcinfo->color_transform; jpeg_set_colorspace(dstinfo, srcinfo->jpeg_color_space); dstinfo->data_precision = srcinfo->data_precision; + dstinfo->arith_code = srcinfo->data_precision > 8 ? TRUE : FALSE; dstinfo->CCIR601_sampling = srcinfo->CCIR601_sampling; /* Copy the source's quantization tables. */ for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) { @@ -157,6 +160,18 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, } +LOCAL(void) +jpeg_calc_trans_dimensions (j_compress_ptr cinfo) +/* Do computations that are needed before master selection phase */ +{ + if (cinfo->min_DCT_h_scaled_size != cinfo->min_DCT_v_scaled_size) + ERREXIT2(cinfo, JERR_BAD_DCTSIZE, + cinfo->min_DCT_h_scaled_size, cinfo->min_DCT_v_scaled_size); + + cinfo->block_size = cinfo->min_DCT_h_scaled_size; +} + + /* * Master selection of compression modules for transcoding. * This substitutes for jcinit.c's initialization of the full compressor. 
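The sixteen-branch ladder moved out of jcmaster.c above (and re-added, together with the 24-bit overflow check, as jpeg_calc_jpeg_dimensions() in jcinit.c) always selects the smallest divisor N in 1..16 with scale_num * N >= scale_denom * block_size, then rounds image_width * block_size and image_height * block_size up when dividing by N and records N in min_DCT_h/v_scaled_size. A compact sketch of that selection, illustrative only (the helper names are made up and jdiv_round_up() from jutils.c is inlined):

    #include <stdio.h>
    #include "jpeglib.h"

    static long div_round_up(long a, long b)   /* stand-in for jdiv_round_up() */
    {
      return (a + b - 1L) / b;
    }

    /* Sketch of the DCT_SCALING_SUPPORTED branch of jpeg_calc_jpeg_dimensions();
     * assumes image_width/image_height already passed the 24-bit sanity check.
     */
    static void calc_jpeg_dimensions_sketch(j_compress_ptr cinfo)
    {
      int N;

      for (N = 1; N < 16; N++)
        if (cinfo->scale_num * N >= cinfo->scale_denom * cinfo->block_size)
          break;                      /* otherwise fall through to N == 16 */

      cinfo->jpeg_width = (JDIMENSION)
        div_round_up((long) cinfo->image_width * cinfo->block_size, (long) N);
      cinfo->jpeg_height = (JDIMENSION)
        div_round_up((long) cinfo->image_height * cinfo->block_size, (long) N);
      cinfo->min_DCT_h_scaled_size = N;
      cinfo->min_DCT_v_scaled_size = N;
    }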
@@ -166,6 +181,9 @@ LOCAL(void) transencode_master_selection (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays) { + /* Do computations that are needed before master selection phase */ + jpeg_calc_trans_dimensions(cinfo); + /* Initialize master control (includes parameter checking/processing) */ jinit_c_master_control(cinfo, TRUE /* transcode only */); diff --git a/3rdparty/libjpeg/jdatadst.c b/3rdparty/libjpeg/jdatadst.c index 5c8681c9e4..dcac2293ae 100644 --- a/3rdparty/libjpeg/jdatadst.c +++ b/3rdparty/libjpeg/jdatadst.c @@ -2,7 +2,7 @@ * jdatadst.c * * Copyright (C) 1994-1996, Thomas G. Lane. - * Modified 2009-2012 by Guido Vollbeding. + * Modified 2009-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -170,9 +170,9 @@ term_destination (j_compress_ptr cinfo) if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount) ERREXIT(cinfo, JERR_FILE_WRITE); } - fflush(dest->outfile); + JFFLUSH(dest->outfile); /* Make sure we wrote the output file OK */ - if (ferror(dest->outfile)) + if (JFERROR(dest->outfile)) ERREXIT(cinfo, JERR_FILE_WRITE); } diff --git a/3rdparty/libjpeg/jdcolor.c b/3rdparty/libjpeg/jdcolor.c index 29c30fae51..0316354dac 100644 --- a/3rdparty/libjpeg/jdcolor.c +++ b/3rdparty/libjpeg/jdcolor.c @@ -2,7 +2,7 @@ * jdcolor.c * * Copyright (C) 1991-1997, Thomas G. Lane. - * Modified 2011-2015 by Guido Vollbeding. + * Modified 2011-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -14,6 +14,12 @@ #include "jpeglib.h" +#if RANGE_BITS < 2 + /* Deliberate syntax err */ + Sorry, this code requires 2 or more range extension bits. +#endif + + /* Private subobject */ typedef struct { diff --git a/3rdparty/libjpeg/jdct.h b/3rdparty/libjpeg/jdct.h index 5d0fe83fb2..bcfedfcfd2 100644 --- a/3rdparty/libjpeg/jdct.h +++ b/3rdparty/libjpeg/jdct.h @@ -2,7 +2,7 @@ * jdct.h * * Copyright (C) 1994-1996, Thomas G. Lane. - * Modified 2002-2015 by Guido Vollbeding. + * Modified 2002-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -79,13 +79,12 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */ * converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could * be quite far out of range if the input data is corrupt, so a bulletproof * range-limiting step is required. We use a mask-and-table-lookup method - * to do the combined operations quickly, assuming that MAXJSAMPLE+1 - * is a power of 2. See the comments with prepare_range_limit_table - * (in jdmaster.c) for more info. + * to do the combined operations quickly, assuming that RANGE_CENTER + * (defined in jpegint.h) is a power of 2. See the comments with + * prepare_range_limit_table (in jdmaster.c) for more info. */ -#define RANGE_MASK (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */ -#define RANGE_CENTER (MAXJSAMPLE * 2 + 2) +#define RANGE_MASK (RANGE_CENTER * 2 - 1) #define RANGE_SUBSET (RANGE_CENTER - CENTERJSAMPLE) #define IDCT_range_limit(cinfo) ((cinfo)->sample_range_limit - RANGE_SUBSET) diff --git a/3rdparty/libjpeg/jdhuff.c b/3rdparty/libjpeg/jdhuff.c index 6920e207c8..835d06ecb6 100644 --- a/3rdparty/libjpeg/jdhuff.c +++ b/3rdparty/libjpeg/jdhuff.c @@ -2,7 +2,7 @@ * jdhuff.c * * Copyright (C) 1991-1997, Thomas G. Lane. 
- * Modified 2006-2013 by Guido Vollbeding. + * Modified 2006-2016 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -799,10 +799,6 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) */ if (! entropy->insufficient_data) { - Se = cinfo->Se; - Al = cinfo->Al; - natural_order = cinfo->natural_order; - /* Load up working state. * We can avoid loading/saving bitread state if in an EOB run. */ @@ -814,6 +810,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) EOBRUN--; /* ...process it now (we do nothing) */ else { BITREAD_LOAD_STATE(cinfo,entropy->bitstate); + Se = cinfo->Se; + Al = cinfo->Al; + natural_order = cinfo->natural_order; block = MCU_data[0]; tbl = entropy->ac_derived_tbl; diff --git a/3rdparty/libjpeg/jdmainct.c b/3rdparty/libjpeg/jdmainct.c index 52091fb2be..4d738fbaed 100644 --- a/3rdparty/libjpeg/jdmainct.c +++ b/3rdparty/libjpeg/jdmainct.c @@ -2,7 +2,7 @@ * jdmainct.c * * Copyright (C) 1994-1996, Thomas G. Lane. - * Modified 2002-2012 by Guido Vollbeding. + * Modified 2002-2016 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -26,8 +26,8 @@ * trivial. Its responsibility is to provide context rows for upsampling/ * rescaling, and doing this in an efficient fashion is a bit tricky. * - * Postprocessor input data is counted in "row groups". A row group - * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size) + * Postprocessor input data is counted in "row groups". A row group is + * defined to be (v_samp_factor * DCT_v_scaled_size / min_DCT_v_scaled_size) * sample rows of each component. (We require DCT_scaled_size values to be * chosen such that these numbers are integers. In practice DCT_scaled_size * values will likely be powers of two, so we actually have the stronger @@ -37,8 +37,8 @@ * applying). * * The coefficient controller will deliver data to us one iMCU row at a time; - * each iMCU row contains v_samp_factor * DCT_scaled_size sample rows, or - * exactly min_DCT_scaled_size row groups. (This amount of data corresponds + * each iMCU row contains v_samp_factor * DCT_v_scaled_size sample rows, or + * exactly min_DCT_v_scaled_size row groups. (This amount of data corresponds * to one row of MCUs when the image is fully interleaved.) Note that the * number of sample rows varies across components, but the number of row * groups does not. Some garbage sample rows may be included in the last iMCU @@ -75,7 +75,7 @@ * We could do this most simply by copying data around in our buffer, but * that'd be very slow. We can avoid copying any data by creating a rather * strange pointer structure. Here's how it works. We allocate a workspace - * consisting of M+2 row groups (where M = min_DCT_scaled_size is the number + * consisting of M+2 row groups (where M = min_DCT_v_scaled_size is the number * of row groups per iMCU row). We create two sets of redundant pointers to * the workspace. Labeling the physical row groups 0 to M+1, the synthesized * pointer lists look like this: @@ -100,11 +100,11 @@ * the first or last sample row as necessary (this is cheaper than copying * sample rows around). * - * This scheme breaks down if M < 2, ie, min_DCT_scaled_size is 1. In that + * This scheme breaks down if M < 2, ie, min_DCT_v_scaled_size is 1. 
In that * situation each iMCU row provides only one row group so the buffering logic * must be different (eg, we must read two iMCU rows before we can emit the * first row group). For now, we simply do not support providing context - * rows when min_DCT_scaled_size is 1. That combination seems unlikely to + * rows when min_DCT_v_scaled_size is 1. That combination seems unlikely to * be worth providing --- if someone wants a 1/8th-size preview, they probably * want it quick and dirty, so a context-free upsampler is sufficient. */ @@ -118,17 +118,18 @@ typedef struct { /* Pointer to allocated workspace (M or M+2 row groups). */ JSAMPARRAY buffer[MAX_COMPONENTS]; - boolean buffer_full; /* Have we gotten an iMCU row from decoder? */ JDIMENSION rowgroup_ctr; /* counts row groups output to postprocessor */ + JDIMENSION rowgroups_avail; /* row groups available to postprocessor */ /* Remaining fields are only used in the context case. */ + boolean buffer_full; /* Have we gotten an iMCU row from decoder? */ + /* These are the master pointers to the funny-order pointer lists. */ JSAMPIMAGE xbuffer[2]; /* pointers to weird pointer lists */ int whichptr; /* indicates which pointer set is now in use */ int context_state; /* process_data state machine status */ - JDIMENSION rowgroups_avail; /* row groups available to postprocessor */ JDIMENSION iMCU_row_ctr; /* counts iMCU rows to detect image top/bot */ } my_main_controller; @@ -195,7 +196,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo) LOCAL(void) make_funny_pointers (j_decompress_ptr cinfo) /* Create the funny pointer lists discussed in the comments above. - * The actual workspace is already allocated (in main->buffer), + * The actual workspace is already allocated (in mainp->buffer), * and the space for the pointer lists is allocated too. * This routine just fills in the curiously ordered lists. * This will be repeated at the beginning of each pass. @@ -317,12 +318,12 @@ start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode) mainp->whichptr = 0; /* Read first iMCU row into xbuffer[0] */ mainp->context_state = CTX_PREPARE_FOR_IMCU; mainp->iMCU_row_ctr = 0; + mainp->buffer_full = FALSE; /* Mark buffer empty */ } else { /* Simple case with no context needed */ mainp->pub.process_data = process_data_simple_main; + mainp->rowgroup_ctr = mainp->rowgroups_avail; /* Mark buffer empty */ } - mainp->buffer_full = FALSE; /* Mark buffer empty */ - mainp->rowgroup_ctr = 0; break; #ifdef QUANT_2PASS_SUPPORTED case JBUF_CRANK_DEST: @@ -348,17 +349,14 @@ process_data_simple_main (j_decompress_ptr cinfo, JDIMENSION out_rows_avail) { my_main_ptr mainp = (my_main_ptr) cinfo->main; - JDIMENSION rowgroups_avail; /* Read input data if we haven't filled the main buffer yet */ - if (! mainp->buffer_full) { + if (mainp->rowgroup_ctr >= mainp->rowgroups_avail) { if (! (*cinfo->coef->decompress_data) (cinfo, mainp->buffer)) return; /* suspension forced, can do nothing more */ - mainp->buffer_full = TRUE; /* OK, we have an iMCU row to work with */ + mainp->rowgroup_ctr = 0; /* OK, we have an iMCU row to work with */ } - /* There are always min_DCT_scaled_size row groups in an iMCU row. */ - rowgroups_avail = (JDIMENSION) cinfo->min_DCT_v_scaled_size; /* Note: at the bottom of the image, we may pass extra garbage row groups * to the postprocessor. The postprocessor has to check for bottom * of image anyway (at row resolution), so no point in us doing it too. 
@@ -366,14 +364,8 @@ process_data_simple_main (j_decompress_ptr cinfo, /* Feed the postprocessor */ (*cinfo->post->post_process_data) (cinfo, mainp->buffer, - &mainp->rowgroup_ctr, rowgroups_avail, - output_buf, out_row_ctr, out_rows_avail); - - /* Has postprocessor consumed all the data yet? If so, mark buffer empty */ - if (mainp->rowgroup_ctr >= rowgroups_avail) { - mainp->buffer_full = FALSE; - mainp->rowgroup_ctr = 0; - } + &mainp->rowgroup_ctr, mainp->rowgroups_avail, + output_buf, out_row_ctr, out_rows_avail); } @@ -498,7 +490,9 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer) alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */ ngroups = cinfo->min_DCT_v_scaled_size + 2; } else { + /* There are always min_DCT_v_scaled_size row groups in an iMCU row. */ ngroups = cinfo->min_DCT_v_scaled_size; + mainp->rowgroups_avail = (JDIMENSION) ngroups; } for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; diff --git a/3rdparty/libjpeg/jdmaster.c b/3rdparty/libjpeg/jdmaster.c index ab95090f4d..62c07671f7 100644 --- a/3rdparty/libjpeg/jdmaster.c +++ b/3rdparty/libjpeg/jdmaster.c @@ -2,7 +2,7 @@ * jdmaster.c * * Copyright (C) 1991-1997, Thomas G. Lane. - * Modified 2002-2015 by Guido Vollbeding. + * Modified 2002-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -237,18 +237,17 @@ prepare_range_limit_table (j_decompress_ptr cinfo) JSAMPLE * table; int i; - table = (JSAMPLE *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - 5 * (MAXJSAMPLE+1) * SIZEOF(JSAMPLE)); + table = (JSAMPLE *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, + JPOOL_IMAGE, (RANGE_CENTER * 2 + MAXJSAMPLE + 1) * SIZEOF(JSAMPLE)); /* First segment of range limit table: limit[x] = 0 for x < 0 */ - MEMZERO(table, 2 * (MAXJSAMPLE+1) * SIZEOF(JSAMPLE)); - table += 2 * (MAXJSAMPLE+1); /* allow negative subscripts of table */ + MEMZERO(table, RANGE_CENTER * SIZEOF(JSAMPLE)); + table += RANGE_CENTER; /* allow negative subscripts of table */ cinfo->sample_range_limit = table; /* Main part of range limit table: limit[x] = x */ for (i = 0; i <= MAXJSAMPLE; i++) table[i] = (JSAMPLE) i; /* End of range limit table: limit[x] = MAXJSAMPLE for x > MAXJSAMPLE */ - for (; i < 3 * (MAXJSAMPLE+1); i++) + for (; i <= MAXJSAMPLE + RANGE_CENTER; i++) table[i] = MAXJSAMPLE; } diff --git a/3rdparty/libjpeg/jdmerge.c b/3rdparty/libjpeg/jdmerge.c index 192da5829d..866693f529 100644 --- a/3rdparty/libjpeg/jdmerge.c +++ b/3rdparty/libjpeg/jdmerge.c @@ -2,7 +2,7 @@ * jdmerge.c * * Copyright (C) 1994-1996, Thomas G. Lane. - * Modified 2013-2015 by Guido Vollbeding. + * Modified 2013-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -40,6 +40,12 @@ #ifdef UPSAMPLE_MERGING_SUPPORTED +#if RANGE_BITS < 2 + /* Deliberate syntax err */ + Sorry, this code requires 2 or more range extension bits. +#endif + + /* Private subobject */ typedef struct { diff --git a/3rdparty/libjpeg/jfdctflt.c b/3rdparty/libjpeg/jfdctflt.c index 0ebc186d22..013f29e0ca 100644 --- a/3rdparty/libjpeg/jfdctflt.c +++ b/3rdparty/libjpeg/jfdctflt.c @@ -2,7 +2,7 @@ * jfdctflt.c * * Copyright (C) 1994-1996, Thomas G. Lane. - * Modified 2003-2015 by Guido Vollbeding. + * Modified 2003-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. 
* For conditions of distribution and use, see the accompanying README file. * @@ -48,7 +48,7 @@ */ #if DCTSIZE != 8 - Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ + Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */ #endif diff --git a/3rdparty/libjpeg/jfdctfst.c b/3rdparty/libjpeg/jfdctfst.c index d779f78bf4..5e4e017225 100644 --- a/3rdparty/libjpeg/jfdctfst.c +++ b/3rdparty/libjpeg/jfdctfst.c @@ -2,7 +2,7 @@ * jfdctfst.c * * Copyright (C) 1994-1996, Thomas G. Lane. - * Modified 2003-2015 by Guido Vollbeding. + * Modified 2003-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -44,7 +44,7 @@ */ #if DCTSIZE != 8 - Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ + Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */ #endif diff --git a/3rdparty/libjpeg/jidctflt.c b/3rdparty/libjpeg/jidctflt.c index c7e832a3bb..e33a2b5e42 100644 --- a/3rdparty/libjpeg/jidctflt.c +++ b/3rdparty/libjpeg/jidctflt.c @@ -2,7 +2,7 @@ * jidctflt.c * * Copyright (C) 1994-1998, Thomas G. Lane. - * Modified 2010-2015 by Guido Vollbeding. + * Modified 2010-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -50,7 +50,7 @@ */ #if DCTSIZE != 8 - Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ + Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */ #endif diff --git a/3rdparty/libjpeg/jidctfst.c b/3rdparty/libjpeg/jidctfst.c index 474cc45fd7..1ac3e39cb7 100644 --- a/3rdparty/libjpeg/jidctfst.c +++ b/3rdparty/libjpeg/jidctfst.c @@ -2,7 +2,7 @@ * jidctfst.c * * Copyright (C) 1994-1998, Thomas G. Lane. - * Modified 2015 by Guido Vollbeding. + * Modified 2015-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -46,7 +46,7 @@ */ #if DCTSIZE != 8 - Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ + Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */ #endif diff --git a/3rdparty/libjpeg/jidctint.c b/3rdparty/libjpeg/jidctint.c index de233ec996..6437079a31 100644 --- a/3rdparty/libjpeg/jidctint.c +++ b/3rdparty/libjpeg/jidctint.c @@ -2,7 +2,7 @@ * jidctint.c * * Copyright (C) 1991-1998, Thomas G. Lane. - * Modification developed 2002-2015 by Guido Vollbeding. + * Modification developed 2002-2016 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -166,6 +166,7 @@ /* * Perform dequantization and inverse DCT on one block of coefficients. * + * Optimized algorithm with 12 multiplications in the 1-D kernel. * cK represents sqrt(2) * cos(K*pi/16). */ @@ -428,7 +429,7 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* * Perform dequantization and inverse DCT on one block of coefficients, - * producing a 7x7 output block. + * producing a reduced-size 7x7 output block. * * Optimized algorithm with 12 multiplications in the 1-D kernel. * cK represents sqrt(2) * cos(K*pi/14). 
@@ -2623,7 +2624,7 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); tmp0 <<= CONST_BITS; /* Add fudge factor here for final descale. */ - tmp0 += 1 << (CONST_BITS-PASS1_BITS-1); + tmp0 += ONE << (CONST_BITS-PASS1_BITS-1); z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ @@ -2920,13 +2921,6 @@ jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr, * The rotator is c(-6). */ - z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); - z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); - - z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ - tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ - tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */ - z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); z2 <<= CONST_BITS; @@ -2937,6 +2931,13 @@ jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp0 = z2 + z3; tmp1 = z2 - z3; + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */ + tmp10 = tmp0 + tmp2; tmp13 = tmp0 - tmp2; tmp11 = tmp1 + tmp3; @@ -4883,13 +4884,6 @@ jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr, * The rotator is c(-6). */ - z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); - z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); - - z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ - tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ - tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */ - z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); z2 <<= CONST_BITS; @@ -4900,6 +4894,13 @@ jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp0 = z2 + z3; tmp1 = z2 - z3; + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */ + tmp10 = tmp0 + tmp2; tmp13 = tmp0 - tmp2; tmp11 = tmp1 + tmp3; diff --git a/3rdparty/libjpeg/jinclude.h b/3rdparty/libjpeg/jinclude.h index 0a4f15146a..20ed4ef11f 100644 --- a/3rdparty/libjpeg/jinclude.h +++ b/3rdparty/libjpeg/jinclude.h @@ -2,6 +2,7 @@ * jinclude.h * * Copyright (C) 1991-1994, Thomas G. Lane. + * Modified 2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -83,9 +84,14 @@ * The modules that use fread() and fwrite() always invoke them through * these macros. On some systems you may need to twiddle the argument casts. * CAUTION: argument order is different from underlying functions! + * + * Furthermore, macros are provided for fflush() and ferror() in order + * to facilitate adaption by applications using an own FILE class. 
*/ #define JFREAD(file,buf,sizeofbuf) \ ((size_t) fread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file))) #define JFWRITE(file,buf,sizeofbuf) \ ((size_t) fwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file))) +#define JFFLUSH(file) fflush(file) +#define JFERROR(file) ferror(file) diff --git a/3rdparty/libjpeg/jpegint.h b/3rdparty/libjpeg/jpegint.h index 18bb8879aa..e312e1af97 100644 --- a/3rdparty/libjpeg/jpegint.h +++ b/3rdparty/libjpeg/jpegint.h @@ -2,7 +2,7 @@ * jpegint.h * * Copyright (C) 1991-1997, Thomas G. Lane. - * Modified 1997-2013 by Guido Vollbeding. + * Modified 1997-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -260,6 +260,19 @@ struct jpeg_color_quantizer { }; +/* Definition of range extension bits for decompression processes. + * See the comments with prepare_range_limit_table (in jdmaster.c) + * for more info. + * The recommended default value for normal applications is 2. + * Applications with special requirements may use a different value. + * For example, Ghostscript wants to use 3 for proper handling of + * wacky images with oversize coefficient values. + */ + +#define RANGE_BITS 2 +#define RANGE_CENTER (CENTERJSAMPLE << RANGE_BITS) + + /* Miscellaneous useful macros */ #undef MAX diff --git a/3rdparty/libjpeg/jpeglib.h b/3rdparty/libjpeg/jpeglib.h index 939b50be58..4bd985316e 100644 --- a/3rdparty/libjpeg/jpeglib.h +++ b/3rdparty/libjpeg/jpeglib.h @@ -2,7 +2,7 @@ * jpeglib.h * * Copyright (C) 1991-1998, Thomas G. Lane. - * Modified 2002-2015 by Guido Vollbeding. + * Modified 2002-2017 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -39,7 +39,7 @@ extern "C" { #define JPEG_LIB_VERSION 90 /* Compatibility version 9.0 */ #define JPEG_LIB_VERSION_MAJOR 9 -#define JPEG_LIB_VERSION_MINOR 2 +#define JPEG_LIB_VERSION_MINOR 3 /* Various constants determining the sizes of things. @@ -137,9 +137,9 @@ typedef struct { /* The decompressor output side may not use these variables. */ int dc_tbl_no; /* DC entropy table selector (0..3) */ int ac_tbl_no; /* AC entropy table selector (0..3) */ - + /* Remaining fields should be treated as private by applications. */ - + /* These values are computed during compression or decompression startup: */ /* Component's size in DCT blocks. * Any dummy blocks added to complete an MCU are not counted; therefore @@ -411,10 +411,10 @@ struct jpeg_compress_struct { JDIMENSION total_iMCU_rows; /* # of iMCU rows to be input to coef ctlr */ /* The coefficient controller receives data in units of MCU rows as defined * for fully interleaved scans (whether the JPEG file is interleaved or not). - * There are v_samp_factor * DCTSIZE sample rows of each component in an - * "iMCU" (interleaved MCU) row. + * There are v_samp_factor * DCT_v_scaled_size sample rows of each component + * in an "iMCU" (interleaved MCU) row. */ - + /* * These fields are valid during any one scan. * They describe the components and MCUs actually appearing in the scan. 
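As a worked check of the new range-limit definitions: for 8-bit samples (CENTERJSAMPLE = 128, MAXJSAMPLE = 255) the default RANGE_BITS = 2 gives RANGE_CENTER = 128 << 2 = 512, so RANGE_MASK in jdct.h becomes 2 * 512 - 1 = 1023 and RANGE_SUBSET becomes 512 - 128 = 384, while prepare_range_limit_table() in jdmaster.c now allocates 2 * 512 + 255 + 1 = 1280 JSAMPLEs: 512 zeroes reachable through negative subscripts, the identity ramp 0..255, and 512 entries clamped to MAXJSAMPLE. That matches the old hard-wired layout exactly (5 * (MAXJSAMPLE + 1) = 1280 entries, RANGE_CENTER = 2 * 255 + 2 = 512, RANGE_MASK = 4 * 255 + 3 = 1023); the rewrite only makes the number of range extension bits adjustable, as noted in the 9c change log.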
@@ -422,10 +422,10 @@ struct jpeg_compress_struct { int comps_in_scan; /* # of JPEG components in this scan */ jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN]; /* *cur_comp_info[i] describes component that appears i'th in SOS */ - + JDIMENSION MCUs_per_row; /* # of MCUs across the image */ JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */ - + int blocks_in_MCU; /* # of DCT blocks per MCU */ int MCU_membership[C_MAX_BLOCKS_IN_MCU]; /* MCU_membership[i] is index in cur_comp_info of component owning */ @@ -636,7 +636,7 @@ struct jpeg_decompress_struct { * in fully interleaved JPEG scans, but are used whether the scan is * interleaved or not. We define an iMCU row as v_samp_factor DCT block * rows of each component. Therefore, the IDCT output contains - * v_samp_factor*DCT_v_scaled_size sample rows of a component per iMCU row. + * v_samp_factor * DCT_v_scaled_size sample rows of a component per iMCU row. */ JSAMPLE * sample_range_limit; /* table for fast range-limiting */ @@ -711,7 +711,7 @@ struct jpeg_error_mgr { #define JMSG_LENGTH_MAX 200 /* recommended size of format_message buffer */ /* Reset error state variables at start of a new image */ JMETHOD(void, reset_error_mgr, (j_common_ptr cinfo)); - + /* The message ID code and any parameters are saved here. * A message can have one string parameter or up to 8 int parameters. */ @@ -721,11 +721,11 @@ struct jpeg_error_mgr { int i[8]; char s[JMSG_STR_PARM_MAX]; } msg_parm; - + /* Standard state variables for error facility */ - + int trace_level; /* max msg_level that will be displayed */ - + /* For recoverable corrupt-data errors, we emit a warning message, * but keep going unless emit_message chooses to abort. emit_message * should count warnings in num_warnings. The surrounding application diff --git a/3rdparty/libjpeg/jversion.h b/3rdparty/libjpeg/jversion.h index 0740b317d7..d096384f7e 100644 --- a/3rdparty/libjpeg/jversion.h +++ b/3rdparty/libjpeg/jversion.h @@ -1,7 +1,7 @@ /* * jversion.h * - * Copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding. + * Copyright (C) 1991-2018, Thomas G. Lane, Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -9,6 +9,6 @@ */ -#define JVERSION "9b 17-Jan-2016" +#define JVERSION "9c 14-Jan-2018" -#define JCOPYRIGHT "Copyright (C) 2016, Thomas G. Lane, Guido Vollbeding" +#define JCOPYRIGHT "Copyright (C) 2018, Thomas G. Lane, Guido Vollbeding" diff --git a/3rdparty/libpng/CHANGES b/3rdparty/libpng/CHANGES index bdd4480654..f0b0a9342c 100644 --- a/3rdparty/libpng/CHANGES +++ b/3rdparty/libpng/CHANGES @@ -6066,31 +6066,44 @@ Version 1.6.35 [July 15, 2018] Version 1.6.36 [December 1, 2018] Optimized png_do_expand_palette for ARM processors. Improved performance by around 10-22% on a recent ARM Chromebook. - (Contributed by Richard Townsend, ARM Holdings) + (Contributed by Richard Townsend, ARM Holdings) Fixed manipulation of machine-specific optimization options. - (Contributed by Vicki Pfau) + (Contributed by Vicki Pfau) Used memcpy instead of manual pointer arithmetic on Intel SSE2. - (Contributed by Samuel Williams) + (Contributed by Samuel Williams) Fixed build errors with MSVC on ARM64. - (Contributed by Zhijie Liang) + (Contributed by Zhijie Liang) Fixed detection of libm in CMakeLists. - (Contributed by Cameron Cawley) + (Contributed by Cameron Cawley) Fixed incorrect creation of pkg-config file in CMakeLists. 
- (Contributed by Kyle Bentley) + (Contributed by Kyle Bentley) Fixed the CMake build on Windows MSYS by avoiding symlinks. Fixed a build warning on OpenBSD. - (Contributed by Theo Buehler) + (Contributed by Theo Buehler) Fixed various typos in comments. - (Contributed by "luz.paz") + (Contributed by "luz.paz") Raised the minimum required CMake version from 3.0.2 to 3.1. Removed yet more of the vestigial support for pre-ANSI C compilers. Removed ancient makefiles for ancient systems that have been broken - across all previous libpng-1.6.x versions. + across all previous libpng-1.6.x versions. Removed the Y2K compliance statement and the export control - information. + information. Applied various code style and documentation fixes. -Send comments/corrections/commendations to png-mng-implement at lists.sf.net -(subscription required; visit +Version 1.6.37 [April 14, 2019] + Fixed a use-after-free vulnerability (CVE-2019-7317) in png_image_free. + Fixed a memory leak in the ARM NEON implementation of png_do_expand_palette. + Fixed a memory leak in pngtest.c. + Fixed two vulnerabilities (CVE-2018-14048, CVE-2018-14550) in + contrib/pngminus; refactor. + Changed the license of contrib/pngminus to MIT; refresh makefile and docs. + (Contributed by Willem van Schaik) + Fixed a typo in the libpng license v2. + (Contributed by Miguel Ojeda) + Added makefiles for AddressSanitizer-enabled builds. + Cleaned up various makefiles. + +Send comments/corrections/commendations to png-mng-implement at lists.sf.net. +Subscription is required; visit https://lists.sourceforge.net/lists/listinfo/png-mng-implement -to subscribe). +to subscribe. diff --git a/3rdparty/libpng/LICENSE b/3rdparty/libpng/LICENSE index 62ab8e48dc..e0c5b531cf 100644 --- a/3rdparty/libpng/LICENSE +++ b/3rdparty/libpng/LICENSE @@ -4,8 +4,8 @@ COPYRIGHT NOTICE, DISCLAIMER, and LICENSE PNG Reference Library License version 2 --------------------------------------- - * Copyright (c) 1995-2018 The PNG Reference Library Authors. - * Copyright (c) 2018 Cosmin Truta. + * Copyright (c) 1995-2019 The PNG Reference Library Authors. + * Copyright (c) 2018-2019 Cosmin Truta. * Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson. * Copyright (c) 1996-1997 Andreas Dilger. * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. @@ -13,7 +13,7 @@ PNG Reference Library License version 2 The software is supplied "as is", without warranty of any kind, express or implied, including, without limitation, the warranties of merchantability, fitness for a particular purpose, title, and -non-infringement. In no even shall the Copyright owners, or +non-infringement. 
In no event shall the Copyright owners, or anyone distributing the software, be liable for any damages or other liability, whether in contract, tort or otherwise, arising from, out of, or in connection with the software, or the use or @@ -39,7 +39,7 @@ subject to the following restrictions: PNG Reference Library License version 1 (for libpng 0.5 through 1.6.35) ----------------------------------------------------------------------- -libpng versions 1.0.7, July 1, 2000 through 1.6.35, July 15, 2018 are +libpng versions 1.0.7, July 1, 2000, through 1.6.35, July 15, 2018 are Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson, are derived from libpng-1.0.6, and are distributed according to the same disclaimer and license as libpng-1.0.6 with the following individuals diff --git a/3rdparty/libpng/README b/3rdparty/libpng/README index e41e0f549b..cfc1f0e3dc 100644 --- a/3rdparty/libpng/README +++ b/3rdparty/libpng/README @@ -1,5 +1,5 @@ -README for libpng version 1.6.36 - December 1, 2018 -=================================================== +README for libpng version 1.6.37 - April 14, 2019 +================================================= See the note about version numbers near the top of png.h. See INSTALL for instructions on how to install libpng. diff --git a/3rdparty/libpng/arm/palette_neon_intrinsics.c b/3rdparty/libpng/arm/palette_neon_intrinsics.c index fa02d6a8b3..b4d1fd2abf 100644 --- a/3rdparty/libpng/arm/palette_neon_intrinsics.c +++ b/3rdparty/libpng/arm/palette_neon_intrinsics.c @@ -1,7 +1,7 @@ /* palette_neon_intrinsics.c - NEON optimised palette expansion functions * - * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2018-2019 Cosmin Truta * Copyright (c) 2017-2018 Arm Holdings. All rights reserved. * Written by Richard Townsend , February 2017. * @@ -20,9 +20,9 @@ # include #endif -/* Build an RGBA palette from the RGB and separate alpha palettes. */ +/* Build an RGBA8 palette from the separate RGB and alpha palettes. */ void -png_riffle_palette_rgba(png_structrp png_ptr, png_row_infop row_info) +png_riffle_palette_neon(png_structrp png_ptr) { png_const_colorp palette = png_ptr->palette; png_bytep riffled_palette = png_ptr->riffled_palette; @@ -30,6 +30,8 @@ png_riffle_palette_rgba(png_structrp png_ptr, png_row_infop row_info) int num_trans = png_ptr->num_trans; int i; + png_debug(1, "in png_riffle_palette_neon"); + /* Initially black, opaque. */ uint8x16x4_t w = {{ vdupq_n_u8(0x00), @@ -38,16 +40,10 @@ png_riffle_palette_rgba(png_structrp png_ptr, png_row_infop row_info) vdupq_n_u8(0xff), }}; - if (row_info->bit_depth != 8) - { - png_error(png_ptr, "bit_depth must be 8 for png_riffle_palette_rgba"); - return; - } - - /* First, riffle the RGB colours into a RGBA palette, the A value is - * set to opaque for now. + /* First, riffle the RGB colours into an RGBA8 palette. + * The alpha component is set to opaque for now. */ - for (i = 0; i < (1 << row_info->bit_depth); i += 16) + for (i = 0; i < 256; i += 16) { uint8x16x3_t v = vld3q_u8((png_const_bytep)(palette + i)); w.val[0] = v.val[0]; @@ -61,9 +57,9 @@ png_riffle_palette_rgba(png_structrp png_ptr, png_row_infop row_info) riffled_palette[(i << 2) + 3] = trans_alpha[i]; } -/* Expands a palettized row into RGBA. */ +/* Expands a palettized row into RGBA8. 
*/ int -png_do_expand_palette_neon_rgba(png_structrp png_ptr, png_row_infop row_info, +png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info, png_const_bytep row, png_bytepp ssp, png_bytepp ddp) { png_uint_32 row_width = row_info->width; @@ -72,6 +68,8 @@ png_do_expand_palette_neon_rgba(png_structrp png_ptr, png_row_infop row_info, const png_int_32 pixels_per_chunk = 4; int i; + png_debug(1, "in png_do_expand_palette_rgba8_neon"); + if (row_width < pixels_per_chunk) return 0; @@ -103,9 +101,9 @@ png_do_expand_palette_neon_rgba(png_structrp png_ptr, png_row_infop row_info, return i; } -/* Expands a palettized row into RGB format. */ +/* Expands a palettized row into RGB8. */ int -png_do_expand_palette_neon_rgb(png_structrp png_ptr, png_row_infop row_info, +png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info, png_const_bytep row, png_bytepp ssp, png_bytepp ddp) { png_uint_32 row_width = row_info->width; @@ -113,6 +111,8 @@ png_do_expand_palette_neon_rgb(png_structrp png_ptr, png_row_infop row_info, const png_uint_32 pixels_per_chunk = 8; int i; + png_debug(1, "in png_do_expand_palette_rgb8_neon"); + if (row_width <= pixels_per_chunk) return 0; diff --git a/3rdparty/libpng/png.c b/3rdparty/libpng/png.c index 3dce191d17..757c755f97 100644 --- a/3rdparty/libpng/png.c +++ b/3rdparty/libpng/png.c @@ -1,7 +1,7 @@ /* png.c - location for general purpose libpng functions * - * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2018-2019 Cosmin Truta * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson * Copyright (c) 1996-1997 Andreas Dilger * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. @@ -14,7 +14,7 @@ #include "pngpriv.h" /* Generate a compiler error if there is an old png.h in the search path. */ -typedef png_libpng_version_1_6_36 Your_png_h_is_not_version_1_6_36; +typedef png_libpng_version_1_6_37 Your_png_h_is_not_version_1_6_37; #ifdef __GNUC__ /* The version tests may need to be added to, but the problem warning has @@ -815,8 +815,8 @@ png_get_copyright(png_const_structrp png_ptr) return PNG_STRING_COPYRIGHT #else return PNG_STRING_NEWLINE \ - "libpng version 1.6.36" PNG_STRING_NEWLINE \ - "Copyright (c) 2018 Cosmin Truta" PNG_STRING_NEWLINE \ + "libpng version 1.6.37" PNG_STRING_NEWLINE \ + "Copyright (c) 2018-2019 Cosmin Truta" PNG_STRING_NEWLINE \ "Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson" \ PNG_STRING_NEWLINE \ "Copyright (c) 1996-1997 Andreas Dilger" PNG_STRING_NEWLINE \ @@ -4588,8 +4588,7 @@ png_image_free(png_imagep image) if (image != NULL && image->opaque != NULL && image->opaque->error_buf == NULL) { - /* Ignore errors here: */ - (void)png_safe_execute(image, png_image_free_function, image); + png_image_free_function(image); image->opaque = NULL; } } diff --git a/3rdparty/libpng/png.h b/3rdparty/libpng/png.h index 8e272a0553..139eb0dc0f 100644 --- a/3rdparty/libpng/png.h +++ b/3rdparty/libpng/png.h @@ -1,9 +1,9 @@ /* png.h - header file for PNG reference library * - * libpng version 1.6.36 - December 1, 2018 + * libpng version 1.6.37 - April 14, 2019 * - * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2018-2019 Cosmin Truta * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson * Copyright (c) 1996-1997 Andreas Dilger * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. 
@@ -14,8 +14,9 @@ * libpng versions 0.71, May 1995, through 0.88, January 1996: Guy Schalnat * libpng versions 0.89, June 1996, through 0.96, May 1997: Andreas Dilger * libpng versions 0.97, January 1998, through 1.6.35, July 2018: - * Glenn Randers-Pehrson. - * libpng version 1.6.36, December 1, 2018: Cosmin Truta + * Glenn Randers-Pehrson + * libpng versions 1.6.36, December 2018, through 1.6.37, April 2019: + * Cosmin Truta * See also "Contributing Authors", below. */ @@ -26,8 +27,8 @@ * PNG Reference Library License version 2 * --------------------------------------- * - * * Copyright (c) 1995-2018 The PNG Reference Library Authors. - * * Copyright (c) 2018 Cosmin Truta. + * * Copyright (c) 1995-2019 The PNG Reference Library Authors. + * * Copyright (c) 2018-2019 Cosmin Truta. * * Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson. * * Copyright (c) 1996-1997 Andreas Dilger. * * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. @@ -35,7 +36,7 @@ * The software is supplied "as is", without warranty of any kind, * express or implied, including, without limitation, the warranties * of merchantability, fitness for a particular purpose, title, and - * non-infringement. In no even shall the Copyright owners, or + * non-infringement. In no event shall the Copyright owners, or * anyone distributing the software, be liable for any damages or * other liability, whether in contract, tort or otherwise, arising * from, out of, or in connection with the software, or the use or @@ -61,7 +62,7 @@ * PNG Reference Library License version 1 (for libpng 0.5 through 1.6.35) * ----------------------------------------------------------------------- * - * libpng versions 1.0.7, July 1, 2000 through 1.6.35, July 15, 2018 are + * libpng versions 1.0.7, July 1, 2000, through 1.6.35, July 15, 2018 are * Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson, are * derived from libpng-1.0.6, and are distributed according to the same * disclaimer and license as libpng-1.0.6 with the following individuals @@ -238,7 +239,7 @@ * ... * 1.5.30 15 10530 15.so.15.30[.0] * ... - * 1.6.36 16 10636 16.so.16.36[.0] + * 1.6.37 16 10637 16.so.16.37[.0] * * Henceforth the source version will match the shared-library major and * minor numbers; the shared-library major version number will be used for @@ -277,8 +278,8 @@ */ /* Version information for png.h - this should match the version in png.c */ -#define PNG_LIBPNG_VER_STRING "1.6.36" -#define PNG_HEADER_VERSION_STRING " libpng version 1.6.36 - December 1, 2018\n" +#define PNG_LIBPNG_VER_STRING "1.6.37" +#define PNG_HEADER_VERSION_STRING " libpng version 1.6.37 - April 14, 2019\n" #define PNG_LIBPNG_VER_SONUM 16 #define PNG_LIBPNG_VER_DLLNUM 16 @@ -286,12 +287,11 @@ /* These should match the first 3 components of PNG_LIBPNG_VER_STRING: */ #define PNG_LIBPNG_VER_MAJOR 1 #define PNG_LIBPNG_VER_MINOR 6 -#define PNG_LIBPNG_VER_RELEASE 36 +#define PNG_LIBPNG_VER_RELEASE 37 -/* This should match the numeric part of the final component of - * PNG_LIBPNG_VER_STRING, omitting any leading zero: +/* This should be zero for a public release, or non-zero for a + * development version. [Deprecated] */ - #define PNG_LIBPNG_VER_BUILD 0 /* Release Status */ @@ -318,7 +318,7 @@ * From version 1.0.1 it is: * XXYYZZ, where XX=major, YY=minor, ZZ=release */ -#define PNG_LIBPNG_VER 10636 /* 1.6.36 */ +#define PNG_LIBPNG_VER 10637 /* 1.6.37 */ /* Library configuration: these options cannot be changed after * the library has been built. 
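The XXYYZZ encoding spelled out in the hunk above makes the version bump easy to check from application code. A minimal sketch (illustrative, not part of the patch):

#include <png.h>

#if PNG_LIBPNG_VER < 10637  /* 1.6.37 encodes as XX=1, YY=6, ZZ=37 */
#  error "libpng 1.6.37 or later expected (includes the CVE-2019-7317 fix)"
#endif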
@@ -428,7 +428,7 @@ extern "C" { /* This triggers a compiler error in png.c, if png.c and png.h * do not agree upon the version number. */ -typedef char* png_libpng_version_1_6_36; +typedef char* png_libpng_version_1_6_37; /* Basic control structions. Read libpng-manual.txt or libpng.3 for more info. * diff --git a/3rdparty/libpng/pngconf.h b/3rdparty/libpng/pngconf.h index 5e641b2509..927a769dbe 100644 --- a/3rdparty/libpng/pngconf.h +++ b/3rdparty/libpng/pngconf.h @@ -1,9 +1,9 @@ -/* pngconf.h - machine configurable file for libpng +/* pngconf.h - machine-configurable file for libpng * - * libpng version 1.6.36 + * libpng version 1.6.37 * - * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2018-2019 Cosmin Truta * Copyright (c) 1998-2002,2004,2006-2016,2018 Glenn Randers-Pehrson * Copyright (c) 1996-1997 Andreas Dilger * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. diff --git a/3rdparty/libpng/pnglibconf.h b/3rdparty/libpng/pnglibconf.h index 00340c678b..e1e27e957e 100644 --- a/3rdparty/libpng/pnglibconf.h +++ b/3rdparty/libpng/pnglibconf.h @@ -1,8 +1,8 @@ /* pnglibconf.h - library build configuration */ -/* libpng version 1.6.36 */ +/* libpng version 1.6.37 */ -/* Copyright (c) 2018 Cosmin Truta */ +/* Copyright (c) 2018-2019 Cosmin Truta */ /* Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson */ /* This code is released under the libpng license. */ diff --git a/3rdparty/libpng/pngpriv.h b/3rdparty/libpng/pngpriv.h index 973c3eac1f..583c26f9bd 100644 --- a/3rdparty/libpng/pngpriv.h +++ b/3rdparty/libpng/pngpriv.h @@ -1,7 +1,7 @@ /* pngpriv.h - private declarations for use inside libpng * - * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2018-2019 Cosmin Truta * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson * Copyright (c) 1996-1997 Andreas Dilger * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. @@ -2119,11 +2119,11 @@ PNG_INTERNAL_FUNCTION(png_uint_32, png_check_keyword, (png_structrp png_ptr, #if PNG_ARM_NEON_IMPLEMENTATION == 1 PNG_INTERNAL_FUNCTION(void, - png_riffle_palette_rgba, - (png_structrp, png_row_infop), + png_riffle_palette_neon, + (png_structrp), PNG_EMPTY); PNG_INTERNAL_FUNCTION(int, - png_do_expand_palette_neon_rgba, + png_do_expand_palette_rgba8_neon, (png_structrp, png_row_infop, png_const_bytep, @@ -2131,7 +2131,7 @@ PNG_INTERNAL_FUNCTION(int, const png_bytepp), PNG_EMPTY); PNG_INTERNAL_FUNCTION(int, - png_do_expand_palette_neon_rgb, + png_do_expand_palette_rgb8_neon, (png_structrp, png_row_infop, png_const_bytep, diff --git a/3rdparty/libpng/pngread.c b/3rdparty/libpng/pngread.c index f8e762196e..8fa7d9f162 100644 --- a/3rdparty/libpng/pngread.c +++ b/3rdparty/libpng/pngread.c @@ -1,7 +1,7 @@ /* pngread.c - read a PNG file * - * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2018-2019 Cosmin Truta * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson * Copyright (c) 1996-1997 Andreas Dilger * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. @@ -994,6 +994,12 @@ png_read_destroy(png_structrp png_ptr) png_ptr->chunk_list = NULL; #endif +#if defined(PNG_READ_EXPAND_SUPPORTED) && \ + defined(PNG_ARM_NEON_IMPLEMENTATION) + png_free(png_ptr, png_ptr->riffled_palette); + png_ptr->riffled_palette = NULL; +#endif + /* NOTE: the 'setjmp' buffer may still be allocated and the memory and error * callbacks are still set at this point. They are required to complete the * destruction of the png_struct itself. 
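The riffled_palette buffer released above is the 256-entry RGBA8 table that png_riffle_palette_neon() builds elsewhere in this patch. A scalar sketch of the layout it produces (hypothetical helper; the NEON path reaches the same result with interleaving loads such as vld3q_u8):

static void
riffle_palette_scalar (const png_color *palette, const png_byte *trans_alpha,
                       int num_trans, png_byte riffled[256 * 4])
{
   int i;
   for (i = 0; i < 256; i++) {
      riffled[4 * i + 0] = palette[i].red;
      riffled[4 * i + 1] = palette[i].green;
      riffled[4 * i + 2] = palette[i].blue;
      riffled[4 * i + 3] = (i < num_trans) ? trans_alpha[i] : 0xff; /* opaque */
   }
}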
diff --git a/3rdparty/libpng/pngrtran.c b/3rdparty/libpng/pngrtran.c index ccc58ce6f1..9a8fad9f4a 100644 --- a/3rdparty/libpng/pngrtran.c +++ b/3rdparty/libpng/pngrtran.c @@ -1,7 +1,7 @@ /* pngrtran.c - transforms the data in a row for PNG readers * - * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2018-2019 Cosmin Truta * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson * Copyright (c) 1996-1997 Andreas Dilger * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. @@ -1182,20 +1182,20 @@ png_init_palette_transformations(png_structrp png_ptr) png_ptr->palette[png_ptr->background.index].blue; #ifdef PNG_READ_INVERT_ALPHA_SUPPORTED - if ((png_ptr->transformations & PNG_INVERT_ALPHA) != 0) - { - if ((png_ptr->transformations & PNG_EXPAND_tRNS) == 0) - { - /* Invert the alpha channel (in tRNS) unless the pixels are - * going to be expanded, in which case leave it for later - */ - int i, istop = png_ptr->num_trans; + if ((png_ptr->transformations & PNG_INVERT_ALPHA) != 0) + { + if ((png_ptr->transformations & PNG_EXPAND_tRNS) == 0) + { + /* Invert the alpha channel (in tRNS) unless the pixels are + * going to be expanded, in which case leave it for later + */ + int i, istop = png_ptr->num_trans; - for (i=0; itrans_alpha[i] = (png_byte)(255 - - png_ptr->trans_alpha[i]); - } - } + for (i = 0; i < istop; i++) + png_ptr->trans_alpha[i] = + (png_byte)(255 - png_ptr->trans_alpha[i]); + } + } #endif /* READ_INVERT_ALPHA */ } } /* background expand and (therefore) no alpha association. */ @@ -4320,9 +4320,11 @@ png_do_expand_palette(png_structrp png_ptr, png_row_infop row_info, * but sometimes row_info->bit_depth has been changed to 8. * In these cases, the palette hasn't been riffled. */ - i = png_do_expand_palette_neon_rgba(png_ptr, row_info, row, + i = png_do_expand_palette_rgba8_neon(png_ptr, row_info, row, &sp, &dp); } +#else + PNG_UNUSED(png_ptr) #endif for (; i < row_width; i++) @@ -4349,8 +4351,10 @@ png_do_expand_palette(png_structrp png_ptr, png_row_infop row_info, dp = row + (size_t)(row_width * 3) - 1; i = 0; #ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE - i = png_do_expand_palette_neon_rgb(png_ptr, row_info, row, + i = png_do_expand_palette_rgb8_neon(png_ptr, row_info, row, &sp, &dp); +#else + PNG_UNUSED(png_ptr) #endif for (; i < row_width; i++) @@ -4770,19 +4774,17 @@ png_do_read_transformations(png_structrp png_ptr, png_row_infop row_info) #ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE if ((png_ptr->num_trans > 0) && (png_ptr->bit_depth == 8)) { - /* Allocate space for the decompressed full palette. */ if (png_ptr->riffled_palette == NULL) { - png_ptr->riffled_palette = png_malloc(png_ptr, 256*4); - if (png_ptr->riffled_palette == NULL) - png_error(png_ptr, "NULL row buffer"); - /* Build the RGBA palette. */ - png_riffle_palette_rgba(png_ptr, row_info); + /* Initialize the accelerated palette expansion. 
*/ + png_ptr->riffled_palette = + (png_bytep)png_malloc(png_ptr, 256 * 4); + png_riffle_palette_neon(png_ptr); } } #endif png_do_expand_palette(png_ptr, row_info, png_ptr->row_buf + 1, - png_ptr->palette, png_ptr->trans_alpha, png_ptr->num_trans); + png_ptr->palette, png_ptr->trans_alpha, png_ptr->num_trans); } else diff --git a/3rdparty/libpng/pngstruct.h b/3rdparty/libpng/pngstruct.h index 94a6d041ff..8bdc7ce46d 100644 --- a/3rdparty/libpng/pngstruct.h +++ b/3rdparty/libpng/pngstruct.h @@ -1,7 +1,7 @@ /* pngstruct.h - header file for PNG reference library * - * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2018-2019 Cosmin Truta * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson * Copyright (c) 1996-1997 Andreas Dilger * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. @@ -228,10 +228,6 @@ struct png_struct_def * big_row_buf; while writing it is separately * allocated. */ -#ifdef PNG_READ_EXPAND_SUPPORTED - /* Buffer to accelerate palette transformations. */ - png_bytep riffled_palette; -#endif #ifdef PNG_WRITE_FILTER_SUPPORTED png_bytep try_row; /* buffer to save trial row when filtering */ png_bytep tst_row; /* buffer to save best trial row when filtering */ @@ -396,6 +392,12 @@ struct png_struct_def /* deleted in 1.5.5: rgb_to_gray_blue_coeff; */ #endif +/* New member added in libpng-1.6.36 */ +#if defined(PNG_READ_EXPAND_SUPPORTED) && \ + defined(PNG_ARM_NEON_IMPLEMENTATION) + png_bytep riffled_palette; /* buffer for accelerated palette expansion */ +#endif + /* New member added in libpng-1.0.4 (renamed in 1.0.9) */ #if defined(PNG_MNG_FEATURES_SUPPORTED) /* Changed from png_byte to png_uint_32 at version 1.2.0 */ diff --git a/3rdparty/libpng/pngwrite.c b/3rdparty/libpng/pngwrite.c index 160c877d38..59377a4dde 100644 --- a/3rdparty/libpng/pngwrite.c +++ b/3rdparty/libpng/pngwrite.c @@ -1,7 +1,7 @@ /* pngwrite.c - general routines to write a PNG file * - * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2018-2019 Cosmin Truta * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson * Copyright (c) 1996-1997 Andreas Dilger * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. @@ -948,10 +948,6 @@ png_write_destroy(png_structrp png_ptr) png_free_buffer_list(png_ptr, &png_ptr->zbuffer_list); png_free(png_ptr, png_ptr->row_buf); png_ptr->row_buf = NULL; -#ifdef PNG_READ_EXPANDED_SUPPORTED - png_free(png_ptr, png_ptr->riffled_palette); - png_ptr->riffled_palette = NULL; -#endif #ifdef PNG_WRITE_FILTER_SUPPORTED png_free(png_ptr, png_ptr->prev_row); png_free(png_ptr, png_ptr->try_row); diff --git a/3rdparty/libwebp/CMakeLists.txt b/3rdparty/libwebp/CMakeLists.txt index cd4d61f687..83884c9d4d 100644 --- a/3rdparty/libwebp/CMakeLists.txt +++ b/3rdparty/libwebp/CMakeLists.txt @@ -21,6 +21,11 @@ if(ANDROID AND ARMEABI_V7A AND NOT NEON) endforeach() endif() +# FIX for quant.h - requires C99 for() loops +ocv_check_flag_support(C "-std=c99" _varname "${CMAKE_C_FLAGS}") +if(${_varname}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") +endif() # ---------------------------------------------------------------------------------- diff --git a/3rdparty/libwebp/COPYING b/3rdparty/libwebp/COPYING new file mode 100644 index 0000000000..7a6f99547d --- /dev/null +++ b/3rdparty/libwebp/COPYING @@ -0,0 +1,30 @@ +Copyright (c) 2010, Google Inc. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/3rdparty/libwebp/src/dec/alphai_dec.h b/3rdparty/libwebp/src/dec/alphai_dec.h index e0fa281a55..a64104abeb 100644 --- a/3rdparty/libwebp/src/dec/alphai_dec.h +++ b/3rdparty/libwebp/src/dec/alphai_dec.h @@ -51,4 +51,4 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec); } // extern "C" #endif -#endif /* WEBP_DEC_ALPHAI_DEC_H_ */ +#endif // WEBP_DEC_ALPHAI_DEC_H_ diff --git a/3rdparty/libwebp/src/dec/buffer_dec.c b/3rdparty/libwebp/src/dec/buffer_dec.c index 75eb3c40b4..3cd94eb4d9 100644 --- a/3rdparty/libwebp/src/dec/buffer_dec.c +++ b/3rdparty/libwebp/src/dec/buffer_dec.c @@ -74,7 +74,8 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) { } else { // RGB checks const WebPRGBABuffer* const buf = &buffer->u.RGBA; const int stride = abs(buf->stride); - const uint64_t size = MIN_BUFFER_SIZE(width, height, stride); + const uint64_t size = + MIN_BUFFER_SIZE(width * kModeBpp[mode], height, stride); ok &= (size <= buf->size); ok &= (stride >= width * kModeBpp[mode]); ok &= (buf->rgba != NULL); diff --git a/3rdparty/libwebp/src/dec/common_dec.h b/3rdparty/libwebp/src/dec/common_dec.h index 9995f1a51a..b158550a80 100644 --- a/3rdparty/libwebp/src/dec/common_dec.h +++ b/3rdparty/libwebp/src/dec/common_dec.h @@ -51,4 +51,4 @@ enum { MB_FEATURE_TREE_PROBS = 3, NUM_PROBAS = 11 }; -#endif // WEBP_DEC_COMMON_DEC_H_ +#endif // WEBP_DEC_COMMON_DEC_H_ diff --git a/3rdparty/libwebp/src/dec/frame_dec.c b/3rdparty/libwebp/src/dec/frame_dec.c index a9d5430d00..bda9e1a6f6 100644 --- a/3rdparty/libwebp/src/dec/frame_dec.c +++ b/3rdparty/libwebp/src/dec/frame_dec.c @@ -338,7 +338,6 @@ void VP8InitDithering(const WebPDecoderOptions* const options, for (s = 0; s < NUM_MB_SEGMENTS; ++s) { VP8QuantMatrix* const dqm = &dec->dqm_[s]; if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) { - // TODO(skal): should we specially dither more for uv_quant_ < 0? const int idx = (dqm->uv_quant_ < 0) ? 
0 : dqm->uv_quant_; dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3; } @@ -669,15 +668,9 @@ int VP8GetThreadMethod(const WebPDecoderOptions* const options, (void)height; assert(headers == NULL || !headers->is_lossless); #if defined(WEBP_USE_THREAD) - if (width < MIN_WIDTH_FOR_THREADS) return 0; - // TODO(skal): tune the heuristic further -#if 0 - if (height < 2 * width) return 2; + if (width >= MIN_WIDTH_FOR_THREADS) return 2; #endif - return 2; -#else // !WEBP_USE_THREAD return 0; -#endif } #undef MT_CACHE_LINES diff --git a/3rdparty/libwebp/src/dec/idec_dec.c b/3rdparty/libwebp/src/dec/idec_dec.c index a371ed7500..9bc9166808 100644 --- a/3rdparty/libwebp/src/dec/idec_dec.c +++ b/3rdparty/libwebp/src/dec/idec_dec.c @@ -140,10 +140,9 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) { if (NeedCompressedAlpha(idec)) { ALPHDecoder* const alph_dec = dec->alph_dec_; dec->alpha_data_ += offset; - if (alph_dec != NULL) { + if (alph_dec != NULL && alph_dec->vp8l_dec_ != NULL) { if (alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION) { VP8LDecoder* const alph_vp8l_dec = alph_dec->vp8l_dec_; - assert(alph_vp8l_dec != NULL); assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN); VP8LBitReaderSetBuffer(&alph_vp8l_dec->br_, dec->alpha_data_ + ALPHA_HEADER_LEN, @@ -283,10 +282,8 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec, static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) { if (idec->state_ == STATE_VP8_DATA) { - VP8Io* const io = &idec->io_; - if (io->teardown != NULL) { - io->teardown(io); - } + // Synchronize the thread, clean-up and check for errors. + VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_); } idec->state_ = STATE_ERROR; return error; @@ -451,7 +448,10 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { VP8Decoder* const dec = (VP8Decoder*)idec->dec_; VP8Io* const io = &idec->io_; - assert(dec->ready_); + // Make sure partition #0 has been read before, to set dec to ready_. + if (!dec->ready_) { + return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); + } for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) { if (idec->last_mb_y_ != dec->mb_y_) { if (!VP8ParseIntraModeRow(&dec->br_, dec)) { @@ -473,6 +473,12 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { MemDataSize(&idec->mem_) > MAX_MB_SIZE) { return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); } + // Synchronize the threads. + if (dec->mt_method_ > 0) { + if (!WebPGetWorkerInterface()->Sync(&dec->worker_)) { + return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); + } + } RestoreContext(&context, dec, token_br); return VP8_STATUS_SUSPENDED; } @@ -491,6 +497,7 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { } // Synchronize the thread and check for errors. if (!VP8ExitCritical(dec, io)) { + idec->state_ = STATE_ERROR; // prevent re-entry in IDecError return IDecError(idec, VP8_STATUS_USER_ABORT); } dec->ready_ = 0; @@ -571,6 +578,10 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) { status = DecodePartition0(idec); } if (idec->state_ == STATE_VP8_DATA) { + const VP8Decoder* const dec = (VP8Decoder*)idec->dec_; + if (dec == NULL) { + return VP8_STATUS_SUSPENDED; // can't continue if we have no decoder. 
+ } status = DecodeRemaining(idec); } if (idec->state_ == STATE_VP8L_HEADER) { diff --git a/3rdparty/libwebp/src/dec/vp8_dec.h b/3rdparty/libwebp/src/dec/vp8_dec.h index ca85b340cf..a05405df72 100644 --- a/3rdparty/libwebp/src/dec/vp8_dec.h +++ b/3rdparty/libwebp/src/dec/vp8_dec.h @@ -182,4 +182,4 @@ WEBP_EXTERN int VP8LGetInfo( } // extern "C" #endif -#endif /* WEBP_DEC_VP8_DEC_H_ */ +#endif // WEBP_DEC_VP8_DEC_H_ diff --git a/3rdparty/libwebp/src/dec/vp8i_dec.h b/3rdparty/libwebp/src/dec/vp8i_dec.h index c929933e1c..2d7900aae1 100644 --- a/3rdparty/libwebp/src/dec/vp8i_dec.h +++ b/3rdparty/libwebp/src/dec/vp8i_dec.h @@ -32,7 +32,7 @@ extern "C" { // version numbers #define DEC_MAJ_VERSION 1 #define DEC_MIN_VERSION 0 -#define DEC_REV_VERSION 0 +#define DEC_REV_VERSION 2 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline). // Constraints are: We need to store one 16x16 block of luma samples (y), @@ -316,4 +316,4 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec, } // extern "C" #endif -#endif /* WEBP_DEC_VP8I_DEC_H_ */ +#endif // WEBP_DEC_VP8I_DEC_H_ diff --git a/3rdparty/libwebp/src/dec/vp8l_dec.c b/3rdparty/libwebp/src/dec/vp8l_dec.c index 0570f53a77..333bb3e80d 100644 --- a/3rdparty/libwebp/src/dec/vp8l_dec.c +++ b/3rdparty/libwebp/src/dec/vp8l_dec.c @@ -362,12 +362,19 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, VP8LMetadata* const hdr = &dec->hdr_; uint32_t* huffman_image = NULL; HTreeGroup* htree_groups = NULL; + // When reading htrees, some might be unused, as the format allows it. + // We will still read them but put them in this htree_group_bogus. + HTreeGroup htree_group_bogus; HuffmanCode* huffman_tables = NULL; + HuffmanCode* huffman_tables_bogus = NULL; HuffmanCode* next = NULL; int num_htree_groups = 1; + int num_htree_groups_max = 1; int max_alphabet_size = 0; int* code_lengths = NULL; const int table_size = kTableSize[color_cache_bits]; + int* mapping = NULL; + int ok = 0; if (allow_recursion && VP8LReadBits(br, 1)) { // use meta Huffman codes. @@ -384,10 +391,42 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, // The huffman data is stored in red and green bytes. const int group = (huffman_image[i] >> 8) & 0xffff; huffman_image[i] = group; - if (group >= num_htree_groups) { - num_htree_groups = group + 1; + if (group >= num_htree_groups_max) { + num_htree_groups_max = group + 1; } } + // Check the validity of num_htree_groups_max. If it seems too big, use a + // smaller value for later. This will prevent big memory allocations to end + // up with a bad bitstream anyway. + // The value of 1000 is totally arbitrary. We know that num_htree_groups_max + // is smaller than (1 << 16) and should be smaller than the number of pixels + // (though the format allows it to be bigger). + if (num_htree_groups_max > 1000 || num_htree_groups_max > xsize * ysize) { + // Create a mapping from the used indices to the minimal set of used + // values [0, num_htree_groups) + mapping = (int*)WebPSafeMalloc(num_htree_groups_max, sizeof(*mapping)); + if (mapping == NULL) { + dec->status_ = VP8_STATUS_OUT_OF_MEMORY; + goto Error; + } + // -1 means a value is unmapped, and therefore unused in the Huffman + // image. + memset(mapping, 0xff, num_htree_groups_max * sizeof(*mapping)); + for (num_htree_groups = 0, i = 0; i < huffman_pixs; ++i) { + // Get the current mapping for the group and remap the Huffman image. 
+ int* const mapped_group = &mapping[huffman_image[i]]; + if (*mapped_group == -1) *mapped_group = num_htree_groups++; + huffman_image[i] = *mapped_group; + } + huffman_tables_bogus = (HuffmanCode*)WebPSafeMalloc( + table_size, sizeof(*huffman_tables_bogus)); + if (huffman_tables_bogus == NULL) { + dec->status_ = VP8_STATUS_OUT_OF_MEMORY; + goto Error; + } + } else { + num_htree_groups = num_htree_groups_max; + } } if (br->eos_) goto Error; @@ -403,11 +442,11 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, } } + code_lengths = (int*)WebPSafeCalloc((uint64_t)max_alphabet_size, + sizeof(*code_lengths)); huffman_tables = (HuffmanCode*)WebPSafeMalloc(num_htree_groups * table_size, sizeof(*huffman_tables)); htree_groups = VP8LHtreeGroupsNew(num_htree_groups); - code_lengths = (int*)WebPSafeCalloc((uint64_t)max_alphabet_size, - sizeof(*code_lengths)); if (htree_groups == NULL || code_lengths == NULL || huffman_tables == NULL) { dec->status_ = VP8_STATUS_OUT_OF_MEMORY; @@ -415,28 +454,35 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, } next = huffman_tables; - for (i = 0; i < num_htree_groups; ++i) { - HTreeGroup* const htree_group = &htree_groups[i]; + for (i = 0; i < num_htree_groups_max; ++i) { + // If the index "i" is unused in the Huffman image, read the coefficients + // but store them to a bogus htree_group. + const int is_bogus = (mapping != NULL && mapping[i] == -1); + HTreeGroup* const htree_group = + is_bogus ? &htree_group_bogus : + &htree_groups[(mapping == NULL) ? i : mapping[i]]; HuffmanCode** const htrees = htree_group->htrees; + HuffmanCode* huffman_tables_i = is_bogus ? huffman_tables_bogus : next; int size; int total_size = 0; int is_trivial_literal = 1; int max_bits = 0; for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { int alphabet_size = kAlphabetSize[j]; - htrees[j] = next; + htrees[j] = huffman_tables_i; if (j == 0 && color_cache_bits > 0) { alphabet_size += 1 << color_cache_bits; } - size = ReadHuffmanCode(alphabet_size, dec, code_lengths, next); + size = + ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_tables_i); if (size == 0) { goto Error; } if (is_trivial_literal && kLiteralMap[j] == 1) { - is_trivial_literal = (next->bits == 0); + is_trivial_literal = (huffman_tables_i->bits == 0); } - total_size += next->bits; - next += size; + total_size += huffman_tables_i->bits; + huffman_tables_i += size; if (j <= ALPHA) { int local_max_bits = code_lengths[0]; int k; @@ -448,38 +494,41 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, max_bits += local_max_bits; } } + if (!is_bogus) next = huffman_tables_i; htree_group->is_trivial_literal = is_trivial_literal; htree_group->is_trivial_code = 0; if (is_trivial_literal) { const int red = htrees[RED][0].value; const int blue = htrees[BLUE][0].value; const int alpha = htrees[ALPHA][0].value; - htree_group->literal_arb = - ((uint32_t)alpha << 24) | (red << 16) | blue; + htree_group->literal_arb = ((uint32_t)alpha << 24) | (red << 16) | blue; if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) { htree_group->is_trivial_code = 1; htree_group->literal_arb |= htrees[GREEN][0].value << 8; } } - htree_group->use_packed_table = !htree_group->is_trivial_code && - (max_bits < HUFFMAN_PACKED_BITS); + htree_group->use_packed_table = + !htree_group->is_trivial_code && (max_bits < HUFFMAN_PACKED_BITS); if (htree_group->use_packed_table) BuildPackedTable(htree_group); } - WebPSafeFree(code_lengths); + ok = 1; - // All OK. 
Finalize pointers and return. + // All OK. Finalize pointers. hdr->huffman_image_ = huffman_image; hdr->num_htree_groups_ = num_htree_groups; hdr->htree_groups_ = htree_groups; hdr->huffman_tables_ = huffman_tables; - return 1; Error: WebPSafeFree(code_lengths); - WebPSafeFree(huffman_image); - WebPSafeFree(huffman_tables); - VP8LHtreeGroupsFree(htree_groups); - return 0; + WebPSafeFree(huffman_tables_bogus); + WebPSafeFree(mapping); + if (!ok) { + WebPSafeFree(huffman_image); + WebPSafeFree(huffman_tables); + VP8LHtreeGroupsFree(htree_groups); + } + return ok; } //------------------------------------------------------------------------------ @@ -884,7 +933,11 @@ static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) { #endif break; case 2: +#if !defined(WORDS_BIGENDIAN) memcpy(&pattern, src, sizeof(uint16_t)); +#else + pattern = ((uint32_t)src[0] << 8) | src[1]; +#endif #if defined(__arm__) || defined(_M_ARM) pattern |= pattern << 16; #elif defined(WEBP_USE_MIPS_DSP_R2) @@ -1523,7 +1576,6 @@ int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec, if (dec == NULL) return 0; assert(alph_dec != NULL); - alph_dec->vp8l_dec_ = dec; dec->width_ = alph_dec->width_; dec->height_ = alph_dec->height_; @@ -1555,11 +1607,12 @@ int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec, if (!ok) goto Err; + // Only set here, once we are sure it is valid (to avoid thread races). + alph_dec->vp8l_dec_ = dec; return 1; Err: - VP8LDelete(alph_dec->vp8l_dec_); - alph_dec->vp8l_dec_ = NULL; + VP8LDelete(dec); return 0; } diff --git a/3rdparty/libwebp/src/dec/vp8li_dec.h b/3rdparty/libwebp/src/dec/vp8li_dec.h index 8e500cf9ff..0a4d613f99 100644 --- a/3rdparty/libwebp/src/dec/vp8li_dec.h +++ b/3rdparty/libwebp/src/dec/vp8li_dec.h @@ -132,4 +132,4 @@ void VP8LDelete(VP8LDecoder* const dec); } // extern "C" #endif -#endif /* WEBP_DEC_VP8LI_DEC_H_ */ +#endif // WEBP_DEC_VP8LI_DEC_H_ diff --git a/3rdparty/libwebp/src/dec/webpi_dec.h b/3rdparty/libwebp/src/dec/webpi_dec.h index c378ba6fc3..24baff5d27 100644 --- a/3rdparty/libwebp/src/dec/webpi_dec.h +++ b/3rdparty/libwebp/src/dec/webpi_dec.h @@ -130,4 +130,4 @@ int WebPAvoidSlowMemory(const WebPDecBuffer* const output, } // extern "C" #endif -#endif /* WEBP_DEC_WEBPI_DEC_H_ */ +#endif // WEBP_DEC_WEBPI_DEC_H_ diff --git a/3rdparty/libwebp/src/demux/demux.c b/3rdparty/libwebp/src/demux/demux.c index 684215e3de..d8f7a40a56 100644 --- a/3rdparty/libwebp/src/demux/demux.c +++ b/3rdparty/libwebp/src/demux/demux.c @@ -25,7 +25,7 @@ #define DMUX_MAJ_VERSION 1 #define DMUX_MIN_VERSION 0 -#define DMUX_REV_VERSION 0 +#define DMUX_REV_VERSION 2 typedef struct { size_t start_; // start location of the data diff --git a/3rdparty/libwebp/src/dsp/cost.c b/3rdparty/libwebp/src/dsp/cost.c index 634ccc2085..cc681cdd4b 100644 --- a/3rdparty/libwebp/src/dsp/cost.c +++ b/3rdparty/libwebp/src/dsp/cost.c @@ -377,6 +377,7 @@ VP8SetResidualCoeffsFunc VP8SetResidualCoeffs; extern void VP8EncDspCostInitMIPS32(void); extern void VP8EncDspCostInitMIPSdspR2(void); extern void VP8EncDspCostInitSSE2(void); +extern void VP8EncDspCostInitNEON(void); WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) { VP8GetResidualCost = GetResidualCost_C; @@ -398,6 +399,11 @@ WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) { if (VP8GetCPUInfo(kSSE2)) { VP8EncDspCostInitSSE2(); } +#endif +#if defined(WEBP_USE_NEON) + if (VP8GetCPUInfo(kNEON)) { + VP8EncDspCostInitNEON(); + } #endif } } diff --git a/3rdparty/libwebp/src/dsp/cost_neon.c b/3rdparty/libwebp/src/dsp/cost_neon.c new file mode 100644 index 
0000000000..8cc8ce58aa --- /dev/null +++ b/3rdparty/libwebp/src/dsp/cost_neon.c @@ -0,0 +1,122 @@ +// Copyright 2018 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// ARM NEON version of cost functions + +#include "src/dsp/dsp.h" + +#if defined(WEBP_USE_NEON) + +#include "src/dsp/neon.h" +#include "src/enc/cost_enc.h" + +static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16 }; + +static void SetResidualCoeffs_NEON(const int16_t* const coeffs, + VP8Residual* const res) { + const int16x8_t minus_one = vdupq_n_s16(-1); + const int16x8_t coeffs_0 = vld1q_s16(coeffs); + const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8); + const uint16x8_t eob_0 = vtstq_s16(coeffs_0, minus_one); + const uint16x8_t eob_1 = vtstq_s16(coeffs_1, minus_one); + const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1)); + const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position)); + +#ifdef __aarch64__ + res->last = vmaxvq_u8(masked) - 1; +#else + const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked)); + const uint16x8_t eob_16x8 = vmovl_u8(eob_8x8); + const uint16x4_t eob_16x4 = + vmax_u16(vget_low_u16(eob_16x8), vget_high_u16(eob_16x8)); + const uint32x4_t eob_32x4 = vmovl_u16(eob_16x4); + uint32x2_t eob_32x2 = + vmax_u32(vget_low_u32(eob_32x4), vget_high_u32(eob_32x4)); + eob_32x2 = vpmax_u32(eob_32x2, eob_32x2); + + vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0); + --res->last; +#endif // __aarch64__ + + res->coeffs = coeffs; +} + +static int GetResidualCost_NEON(int ctx0, const VP8Residual* const res) { + uint8_t levels[16], ctxs[16]; + uint16_t abs_levels[16]; + int n = res->first; + // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 + const int p0 = res->prob[n][ctx0][0]; + CostArrayPtr const costs = res->costs; + const uint16_t* t = costs[n][ctx0]; + // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 + // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll + // be missing during the loop. + int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0; + + if (res->last < 0) { + return VP8BitCost(0, p0); + } + + { // precompute clamped levels and contexts, packed to 8b. 
+ const uint8x16_t kCst2 = vdupq_n_u8(2); + const uint8x16_t kCst67 = vdupq_n_u8(MAX_VARIABLE_LEVEL); + const int16x8_t c0 = vld1q_s16(res->coeffs); + const int16x8_t c1 = vld1q_s16(res->coeffs + 8); + const uint16x8_t E0 = vreinterpretq_u16_s16(vabsq_s16(c0)); + const uint16x8_t E1 = vreinterpretq_u16_s16(vabsq_s16(c1)); + const uint8x16_t F = vcombine_u8(vqmovn_u16(E0), vqmovn_u16(E1)); + const uint8x16_t G = vminq_u8(F, kCst2); // context = 0,1,2 + const uint8x16_t H = vminq_u8(F, kCst67); // clamp_level in [0..67] + + vst1q_u8(ctxs, G); + vst1q_u8(levels, H); + + vst1q_u16(abs_levels, E0); + vst1q_u16(abs_levels + 8, E1); + } + for (; n < res->last; ++n) { + const int ctx = ctxs[n]; + const int level = levels[n]; + const int flevel = abs_levels[n]; // full level + cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost() + t = costs[n + 1][ctx]; + } + // Last coefficient is always non-zero + { + const int level = levels[n]; + const int flevel = abs_levels[n]; + assert(flevel != 0); + cost += VP8LevelFixedCosts[flevel] + t[level]; + if (n < 15) { + const int b = VP8EncBands[n + 1]; + const int ctx = ctxs[n]; + const int last_p0 = res->prob[b][ctx][0]; + cost += VP8BitCost(0, last_p0); + } + } + return cost; +} + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8EncDspCostInitNEON(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitNEON(void) { + VP8SetResidualCoeffs = SetResidualCoeffs_NEON; + VP8GetResidualCost = GetResidualCost_NEON; +} + +#else // !WEBP_USE_NEON + +WEBP_DSP_INIT_STUB(VP8EncDspCostInitNEON) + +#endif // WEBP_USE_NEON diff --git a/3rdparty/libwebp/src/dsp/dsp.h b/3rdparty/libwebp/src/dsp/dsp.h index 4ab77a5130..fafc2d05d3 100644 --- a/3rdparty/libwebp/src/dsp/dsp.h +++ b/3rdparty/libwebp/src/dsp/dsp.h @@ -76,10 +76,6 @@ extern "C" { #define WEBP_USE_SSE41 #endif -#if defined(__AVX2__) || defined(WEBP_HAVE_AVX2) -#define WEBP_USE_AVX2 -#endif - // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. #if (defined(__ARM_NEON__) || \ @@ -679,4 +675,4 @@ void VP8FiltersInit(void); } // extern "C" #endif -#endif /* WEBP_DSP_DSP_H_ */ +#endif // WEBP_DSP_DSP_H_ diff --git a/3rdparty/libwebp/src/dsp/enc.c b/3rdparty/libwebp/src/dsp/enc.c index fa23b40a30..2fddbc4c52 100644 --- a/3rdparty/libwebp/src/dsp/enc.c +++ b/3rdparty/libwebp/src/dsp/enc.c @@ -734,7 +734,6 @@ VP8BlockCopy VP8Copy16x8; extern void VP8EncDspInitSSE2(void); extern void VP8EncDspInitSSE41(void); -extern void VP8EncDspInitAVX2(void); extern void VP8EncDspInitNEON(void); extern void VP8EncDspInitMIPS32(void); extern void VP8EncDspInitMIPSdspR2(void); @@ -784,11 +783,6 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { #endif } #endif -#if defined(WEBP_USE_AVX2) - if (VP8GetCPUInfo(kAVX2)) { - VP8EncDspInitAVX2(); - } -#endif #if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { VP8EncDspInitMIPS32(); diff --git a/3rdparty/libwebp/src/dsp/enc_avx2.c b/3rdparty/libwebp/src/dsp/enc_avx2.c deleted file mode 100644 index 8bc5798fee..0000000000 --- a/3rdparty/libwebp/src/dsp/enc_avx2.c +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2014 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. 
All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// AVX2 version of speed-critical encoding functions. - -#include "src/dsp/dsp.h" - -#if defined(WEBP_USE_AVX2) - -#endif // WEBP_USE_AVX2 - -//------------------------------------------------------------------------------ -// Entry point - -WEBP_DSP_INIT_STUB(VP8EncDspInitAVX2) diff --git a/3rdparty/libwebp/src/dsp/lossless.c b/3rdparty/libwebp/src/dsp/lossless.c index f9b3c182d3..d21aa6a0a0 100644 --- a/3rdparty/libwebp/src/dsp/lossless.c +++ b/3rdparty/libwebp/src/dsp/lossless.c @@ -23,8 +23,6 @@ #include "src/dsp/lossless.h" #include "src/dsp/lossless_common.h" -#define MAX_DIFF_COST (1e30f) - //------------------------------------------------------------------------------ // Image transforms. diff --git a/3rdparty/libwebp/src/dsp/lossless.h b/3rdparty/libwebp/src/dsp/lossless.h index b2bbdfc93c..f709cc86b2 100644 --- a/3rdparty/libwebp/src/dsp/lossless.h +++ b/3rdparty/libwebp/src/dsp/lossless.h @@ -163,7 +163,7 @@ extern VP8LCostCombinedFunc VP8LExtraCostCombined; extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy; typedef struct { // small struct to hold counters - int counts[2]; // index: 0=zero steak, 1=non-zero streak + int counts[2]; // index: 0=zero streak, 1=non-zero streak int streaks[2][2]; // [zero/non-zero][streak<3 / streak>=3] } VP8LStreaks; @@ -194,10 +194,14 @@ extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined; void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n, VP8LBitEntropy* const entropy); -typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out); -extern VP8LHistogramAddFunc VP8LHistogramAdd; +typedef void (*VP8LAddVectorFunc)(const uint32_t* a, const uint32_t* b, + uint32_t* out, int size); +extern VP8LAddVectorFunc VP8LAddVector; +typedef void (*VP8LAddVectorEqFunc)(const uint32_t* a, uint32_t* out, int size); +extern VP8LAddVectorEqFunc VP8LAddVectorEq; +void VP8LHistogramAdd(const VP8LHistogram* const a, + const VP8LHistogram* const b, + VP8LHistogram* const out); // ----------------------------------------------------------------------------- // PrefixEncode() diff --git a/3rdparty/libwebp/src/dsp/lossless_enc.c b/3rdparty/libwebp/src/dsp/lossless_enc.c index d608326fef..1408fbf580 100644 --- a/3rdparty/libwebp/src/dsp/lossless_enc.c +++ b/3rdparty/libwebp/src/dsp/lossless_enc.c @@ -632,38 +632,67 @@ static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, //------------------------------------------------------------------------------ -static void HistogramAdd_C(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out) { +static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out, + int size) { + int i; + for (i = 0; i < size; ++i) out[i] = a[i] + b[i]; +} + +static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) { + int i; + for (i = 0; i < size; ++i) out[i] += a[i]; +} + +#define ADD(X, ARG, LEN) do { \ + if (a->is_used_[X]) { \ + if (b->is_used_[X]) { \ + VP8LAddVector(a->ARG, b->ARG, out->ARG, (LEN)); \ + } else { \ + memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \ + } \ + } else if (b->is_used_[X]) { \ + memcpy(&out->ARG[0], &b->ARG[0], (LEN) * sizeof(out->ARG[0])); \ + } else { \ + memset(&out->ARG[0], 0, (LEN) * sizeof(out->ARG[0])); \ + } \ +} while (0) + +#define 
ADD_EQ(X, ARG, LEN) do { \ + if (a->is_used_[X]) { \ + if (out->is_used_[X]) { \ + VP8LAddVectorEq(a->ARG, out->ARG, (LEN)); \ + } else { \ + memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \ + } \ + } \ +} while (0) + +void VP8LHistogramAdd(const VP8LHistogram* const a, + const VP8LHistogram* const b, VP8LHistogram* const out) { int i; const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); assert(a->palette_code_bits_ == b->palette_code_bits_); + if (b != out) { - for (i = 0; i < literal_size; ++i) { - out->literal_[i] = a->literal_[i] + b->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] = a->distance_[i] + b->distance_[i]; - } - for (i = 0; i < NUM_LITERAL_CODES; ++i) { - out->red_[i] = a->red_[i] + b->red_[i]; - out->blue_[i] = a->blue_[i] + b->blue_[i]; - out->alpha_[i] = a->alpha_[i] + b->alpha_[i]; + ADD(0, literal_, literal_size); + ADD(1, red_, NUM_LITERAL_CODES); + ADD(2, blue_, NUM_LITERAL_CODES); + ADD(3, alpha_, NUM_LITERAL_CODES); + ADD(4, distance_, NUM_DISTANCE_CODES); + for (i = 0; i < 5; ++i) { + out->is_used_[i] = (a->is_used_[i] | b->is_used_[i]); } } else { - for (i = 0; i < literal_size; ++i) { - out->literal_[i] += a->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] += a->distance_[i]; - } - for (i = 0; i < NUM_LITERAL_CODES; ++i) { - out->red_[i] += a->red_[i]; - out->blue_[i] += a->blue_[i]; - out->alpha_[i] += a->alpha_[i]; - } + ADD_EQ(0, literal_, literal_size); + ADD_EQ(1, red_, NUM_LITERAL_CODES); + ADD_EQ(2, blue_, NUM_LITERAL_CODES); + ADD_EQ(3, alpha_, NUM_LITERAL_CODES); + ADD_EQ(4, distance_, NUM_DISTANCE_CODES); + for (i = 0; i < 5; ++i) out->is_used_[i] |= a->is_used_[i]; } } +#undef ADD +#undef ADD_EQ //------------------------------------------------------------------------------ // Image transforms. 
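To make the ADD/ADD_EQ control flow concrete, this is roughly what ADD(1, red_, NUM_LITERAL_CODES) expands to once the is_used_[] flags are consulted (written out by hand for illustration; it mirrors the macro above and is not additional patch content):

if (a->is_used_[1]) {
  if (b->is_used_[1]) {        /* both histograms used red_: add element-wise */
    VP8LAddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
  } else {                     /* only 'a' has data: plain copy */
    memcpy(&out->red_[0], &a->red_[0], NUM_LITERAL_CODES * sizeof(out->red_[0]));
  }
} else if (b->is_used_[1]) {   /* only 'b' has data: plain copy */
  memcpy(&out->red_[0], &b->red_[0], NUM_LITERAL_CODES * sizeof(out->red_[0]));
} else {                       /* neither used: zero the output */
  memset(&out->red_[0], 0, NUM_LITERAL_CODES * sizeof(out->red_[0]));
}

Only the arrays actually marked as used reach the (potentially SIMD) VP8LAddVector path; the others fall back to memcpy/memset, which is the point of the new is_used_[] bookkeeping.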
@@ -848,7 +877,8 @@ VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy; VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined; VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined; -VP8LHistogramAddFunc VP8LHistogramAdd; +VP8LAddVectorFunc VP8LAddVector; +VP8LAddVectorEqFunc VP8LAddVectorEq; VP8LVectorMismatchFunc VP8LVectorMismatch; VP8LBundleColorMapFunc VP8LBundleColorMap; @@ -885,7 +915,8 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) { VP8LGetEntropyUnrefined = GetEntropyUnrefined_C; VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C; - VP8LHistogramAdd = HistogramAdd_C; + VP8LAddVector = AddVector_C; + VP8LAddVectorEq = AddVectorEq_C; VP8LVectorMismatch = VectorMismatch_C; VP8LBundleColorMap = VP8LBundleColorMap_C; @@ -971,7 +1002,8 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) { assert(VP8LCombinedShannonEntropy != NULL); assert(VP8LGetEntropyUnrefined != NULL); assert(VP8LGetCombinedEntropyUnrefined != NULL); - assert(VP8LHistogramAdd != NULL); + assert(VP8LAddVector != NULL); + assert(VP8LAddVectorEq != NULL); assert(VP8LVectorMismatch != NULL); assert(VP8LBundleColorMap != NULL); assert(VP8LPredictorsSub[0] != NULL); diff --git a/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c b/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c index e7b58f4e8c..0412a093cf 100644 --- a/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c +++ b/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c @@ -344,65 +344,29 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], ASM_END_COMMON_0 \ ASM_END_COMMON_1 -#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do { \ - const uint32_t* pa = (const uint32_t*)(A); \ - const uint32_t* pb = (const uint32_t*)(B); \ - uint32_t* pout = (uint32_t*)(OUT); \ - const uint32_t* const LoopEnd = pa + (SIZE); \ - assert((SIZE) % 4 == 0); \ - ASM_START \ - ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) \ - ASM_END_0 \ - if ((EXTRA_SIZE) > 0) { \ - const int last = (EXTRA_SIZE); \ - int i; \ - for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \ - } \ -} while (0) - -#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do { \ - const uint32_t* pa = (const uint32_t*)(A); \ - uint32_t* pout = (uint32_t*)(OUT); \ - const uint32_t* const LoopEnd = pa + (SIZE); \ - assert((SIZE) % 4 == 0); \ - ASM_START \ - ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) \ - ASM_END_1 \ - if ((EXTRA_SIZE) > 0) { \ - const int last = (EXTRA_SIZE); \ - int i; \ - for (i = 0; i < last; ++i) pout[i] += pa[i]; \ - } \ -} while (0) - -static void HistogramAdd_MIPS32(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out) { +static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb, + uint32_t* pout, int size) { uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_) - - (NUM_LITERAL_CODES + NUM_LENGTH_CODES); - assert(a->palette_code_bits_ == b->palette_code_bits_); - - if (b != out) { - ADD_VECTOR(a->literal_, b->literal_, out->literal_, - NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size); - ADD_VECTOR(a->distance_, b->distance_, out->distance_, - NUM_DISTANCE_CODES, 0); - ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0); - ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0); - ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0); - } else { - ADD_VECTOR_EQ(a->literal_, out->literal_, - NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size); - ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0); - 
ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0); - ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0); - ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0); - } + const uint32_t end = ((size) / 4) * 4; + const uint32_t* const LoopEnd = pa + end; + int i; + ASM_START + ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) + ASM_END_0 + for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i]; +} + +static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) { + uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + const uint32_t end = ((size) / 4) * 4; + const uint32_t* const LoopEnd = pa + end; + int i; + ASM_START + ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) + ASM_END_1 + for (i = end; i < size; ++i) pout[i] += pa[i]; } -#undef ADD_VECTOR_EQ -#undef ADD_VECTOR #undef ASM_END_1 #undef ASM_END_0 #undef ASM_END_COMMON_1 @@ -422,7 +386,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) { VP8LExtraCostCombined = ExtraCostCombined_MIPS32; VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32; VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32; - VP8LHistogramAdd = HistogramAdd_MIPS32; + VP8LAddVector = AddVector_MIPS32; + VP8LAddVectorEq = AddVectorEq_MIPS32; } #else // !WEBP_USE_MIPS32 diff --git a/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c index f84a9909e1..36478c4912 100644 --- a/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -170,12 +170,13 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride, //------------------------------------------------------------------------------ +// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But +// that's ok since the histogram values are less than 1<<28 (max picture size). #define LINE_SIZE 16 // 8 or 16 static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out, int size) { int i; - assert(size % LINE_SIZE == 0); - for (i = 0; i < size; i += LINE_SIZE) { + for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); #if (LINE_SIZE == 16) @@ -195,12 +196,14 @@ static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out, _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); #endif } + for (; i < size; ++i) { + out[i] = a[i] + b[i]; + } } static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { int i; - assert(size % LINE_SIZE == 0); - for (i = 0; i < size; i += LINE_SIZE) { + for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); #if (LINE_SIZE == 16) @@ -220,36 +223,12 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); #endif } + for (; i < size; ++i) { + out[i] += a[i]; + } } #undef LINE_SIZE -// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But -// that's ok since the histogram values are less than 1<<28 (max picture size). 
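AddVector_SSE2()/AddVectorEq_SSE2() above drop the old assert(size % LINE_SIZE == 0) and instead run the SIMD loop over whole 16-lane groups, finishing any leftover entries with a scalar tail, since the length passed in (for example the cache-dependent literal_ size) is no longer guaranteed to be a multiple of LINE_SIZE. The same body-plus-tail shape in portable C, as a sketch only: AddVectorTail_C is a hypothetical name, and uint32_t is assumed to come from <stdint.h> or src/webp/types.h.

static void AddVectorTail_C(const uint32_t* a, const uint32_t* b,
                            uint32_t* out, int size) {
  int i;
  // Body: process as many full groups of 4 as fit (the SSE2 version works on
  // LINE_SIZE 32-bit lanes at a time instead).
  for (i = 0; i + 4 <= size; i += 4) {
    out[i + 0] = a[i + 0] + b[i + 0];
    out[i + 1] = a[i + 1] + b[i + 1];
    out[i + 2] = a[i + 2] + b[i + 2];
    out[i + 3] = a[i + 3] + b[i + 3];
  }
  // Tail: whatever does not fill a full group is added one element at a time.
  for (; i < size; ++i) out[i] = a[i] + b[i];
}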
-static void HistogramAdd_SSE2(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out) { - int i; - const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); - assert(a->palette_code_bits_ == b->palette_code_bits_); - if (b != out) { - AddVector_SSE2(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES); - AddVector_SSE2(a->red_, b->red_, out->red_, NUM_LITERAL_CODES); - AddVector_SSE2(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES); - AddVector_SSE2(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES); - } else { - AddVectorEq_SSE2(a->literal_, out->literal_, NUM_LITERAL_CODES); - AddVectorEq_SSE2(a->red_, out->red_, NUM_LITERAL_CODES); - AddVectorEq_SSE2(a->blue_, out->blue_, NUM_LITERAL_CODES); - AddVectorEq_SSE2(a->alpha_, out->alpha_, NUM_LITERAL_CODES); - } - for (i = NUM_LITERAL_CODES; i < literal_size; ++i) { - out->literal_[i] = a->literal_[i] + b->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] = a->distance_[i] + b->distance_[i]; - } -} - //------------------------------------------------------------------------------ // Entropy @@ -675,7 +654,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { VP8LTransformColor = TransformColor_SSE2; VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2; VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2; - VP8LHistogramAdd = HistogramAdd_SSE2; + VP8LAddVector = AddVector_SSE2; + VP8LAddVectorEq = AddVectorEq_SSE2; VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2; VP8LVectorMismatch = VectorMismatch_SSE2; VP8LBundleColorMap = BundleColorMap_SSE2; diff --git a/3rdparty/libwebp/src/dsp/msa_macro.h b/3rdparty/libwebp/src/dsp/msa_macro.h index dfacda6ccd..de026a1d9e 100644 --- a/3rdparty/libwebp/src/dsp/msa_macro.h +++ b/3rdparty/libwebp/src/dsp/msa_macro.h @@ -1389,4 +1389,4 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { } while (0) #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) -#endif /* WEBP_DSP_MSA_MACRO_H_ */ +#endif // WEBP_DSP_MSA_MACRO_H_ diff --git a/3rdparty/libwebp/src/dsp/quant.h b/3rdparty/libwebp/src/dsp/quant.h new file mode 100644 index 0000000000..5ba6f9c377 --- /dev/null +++ b/3rdparty/libwebp/src/dsp/quant.h @@ -0,0 +1,70 @@ +// Copyright 2018 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- + +#ifndef WEBP_DSP_QUANT_H_ +#define WEBP_DSP_QUANT_H_ + +#include "src/dsp/dsp.h" +#include "src/webp/types.h" + +#if defined(WEBP_USE_NEON) && !defined(WEBP_ANDROID_NEON) && \ + !defined(WEBP_HAVE_NEON_RTCD) +#include + +#define IsFlat IsFlat_NEON + +static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { + const uint64x2_t b = vpaddlq_u32(a); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); +} + +static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, + int thresh) { + const int16x8_t tst_ones = vdupq_n_s16(-1); + uint32x4_t sum = vdupq_n_u32(0); + + for (int i = 0; i < num_blocks; ++i) { + // Set DC to zero. 
+ const int16x8_t a_0 = vsetq_lane_s16(0, vld1q_s16(levels), 0); + const int16x8_t a_1 = vld1q_s16(levels + 8); + + const uint16x8_t b_0 = vshrq_n_u16(vtstq_s16(a_0, tst_ones), 15); + const uint16x8_t b_1 = vshrq_n_u16(vtstq_s16(a_1, tst_ones), 15); + + sum = vpadalq_u16(sum, b_0); + sum = vpadalq_u16(sum, b_1); + + levels += 16; + } + return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0); +} + +#else + +#define IsFlat IsFlat_C + +static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, + int thresh) { + int score = 0; + while (num_blocks-- > 0) { // TODO(skal): refine positional scoring? + int i; + for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC + score += (levels[i] != 0); + if (score > thresh) return 0; + } + levels += 16; + } + return 1; +} + +#endif // defined(WEBP_USE_NEON) && !defined(WEBP_ANDROID_NEON) && + // !defined(WEBP_HAVE_NEON_RTCD) + +#endif // WEBP_DSP_QUANT_H_ diff --git a/3rdparty/libwebp/src/dsp/rescaler.c b/3rdparty/libwebp/src/dsp/rescaler.c index f307d35056..753f84fcf4 100644 --- a/3rdparty/libwebp/src/dsp/rescaler.c +++ b/3rdparty/libwebp/src/dsp/rescaler.c @@ -21,6 +21,7 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) //------------------------------------------------------------------------------ // Row import @@ -138,7 +139,7 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) { if (yscale) { for (x_out = 0; x_out < x_out_max; ++x_out) { const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale); - const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; irow[x_out] = frac; // new fractional start @@ -153,6 +154,7 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) { } } +#undef MULT_FIX_FLOOR #undef MULT_FIX #undef ROUNDER diff --git a/3rdparty/libwebp/src/dsp/rescaler_mips32.c b/3rdparty/libwebp/src/dsp/rescaler_mips32.c index 542f7e5970..61f63c616c 100644 --- a/3rdparty/libwebp/src/dsp/rescaler_mips32.c +++ b/3rdparty/libwebp/src/dsp/rescaler_mips32.c @@ -209,6 +209,7 @@ static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) { } } +#if 0 // disabled for now. 
TODO(skal): make match the C-code static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) { const int x_out_max = wrk->dst_width * wrk->num_channels; uint8_t* dst = wrk->dst; @@ -273,6 +274,7 @@ static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) { ); } } +#endif // 0 //------------------------------------------------------------------------------ // Entry point @@ -283,7 +285,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) { WebPRescalerImportRowExpand = ImportRowExpand_MIPS32; WebPRescalerImportRowShrink = ImportRowShrink_MIPS32; WebPRescalerExportRowExpand = ExportRowExpand_MIPS32; - WebPRescalerExportRowShrink = ExportRowShrink_MIPS32; +// WebPRescalerExportRowShrink = ExportRowShrink_MIPS32; } #else // !WEBP_USE_MIPS32 diff --git a/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c b/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c index b78aac15e6..ce9e64862e 100644 --- a/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c +++ b/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c @@ -20,10 +20,12 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) //------------------------------------------------------------------------------ // Row export +#if 0 // disabled for now. TODO(skal): make match the C-code static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { int i; const int x_out_max = wrk->dst_width * wrk->num_channels; @@ -106,7 +108,7 @@ static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { } for (i = 0; i < (x_out_max & 0x3); ++i) { const uint32_t frac = (uint32_t)MULT_FIX(*frow++, yscale); - const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(*irow - frac, wrk->fxy_scale); assert(v >= 0 && v <= 255); *dst++ = v; *irow++ = frac; // new fractional start @@ -154,13 +156,14 @@ static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { ); } for (i = 0; i < (x_out_max & 0x3); ++i) { - const int v = (int)MULT_FIX(*irow, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(*irow, wrk->fxy_scale); assert(v >= 0 && v <= 255); *dst++ = v; *irow++ = 0; } } } +#endif // 0 static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { int i; @@ -294,6 +297,7 @@ static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { } } +#undef MULT_FIX_FLOOR #undef MULT_FIX #undef ROUNDER @@ -304,7 +308,7 @@ extern void WebPRescalerDspInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) { WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2; - WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2; +// WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/3rdparty/libwebp/src/dsp/rescaler_msa.c b/3rdparty/libwebp/src/dsp/rescaler_msa.c index f3bc99f1cd..c559254836 100644 --- a/3rdparty/libwebp/src/dsp/rescaler_msa.c +++ b/3rdparty/libwebp/src/dsp/rescaler_msa.c @@ -22,6 +22,7 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) #define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \ v4u32 tmp0, tmp1, tmp2, tmp3; \ @@ -262,6 +263,7 @@ static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { } } +#if 0 // disabled for now. 
TODO(skal): make match the C-code static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, uint8_t* dst, int length, const uint32_t yscale, @@ -341,7 +343,7 @@ static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, } for (x_out = 0; x_out < length; ++x_out) { const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale); - const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; irow[x_out] = frac; @@ -426,6 +428,7 @@ static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { ExportRowShrink_1(irow, dst, x_out_max, wrk); } } +#endif // 0 //------------------------------------------------------------------------------ // Entry point @@ -434,7 +437,7 @@ extern void WebPRescalerDspInitMSA(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) { WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2; - WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2; +// WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2; } #else // !WEBP_USE_MSA diff --git a/3rdparty/libwebp/src/dsp/rescaler_neon.c b/3rdparty/libwebp/src/dsp/rescaler_neon.c index 3eff9fbaf4..a553f06f79 100644 --- a/3rdparty/libwebp/src/dsp/rescaler_neon.c +++ b/3rdparty/libwebp/src/dsp/rescaler_neon.c @@ -22,6 +22,7 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR_C(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) #define LOAD_32x4(SRC, DST) const uint32x4_t DST = vld1q_u32((SRC)) #define LOAD_32x8(SRC, DST0, DST1) \ @@ -35,8 +36,11 @@ #if (WEBP_RESCALER_RFIX == 32) #define MAKE_HALF_CST(C) vdupq_n_s32((int32_t)((C) >> 1)) -#define MULT_FIX(A, B) /* note: B is actualy scale>>1. See MAKE_HALF_CST */ \ +// note: B is actualy scale>>1. 
See MAKE_HALF_CST +#define MULT_FIX(A, B) \ vreinterpretq_u32_s32(vqrdmulhq_s32(vreinterpretq_s32_u32((A)), (B))) +#define MULT_FIX_FLOOR(A, B) \ + vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32((A)), (B))) #else #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work" #endif @@ -135,8 +139,8 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) { const uint32x4_t A1 = MULT_FIX(in1, yscale_half); const uint32x4_t B0 = vqsubq_u32(in2, A0); const uint32x4_t B1 = vqsubq_u32(in3, A1); - const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half); - const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half); + const uint32x4_t C0 = MULT_FIX_FLOOR(B0, fxy_scale_half); + const uint32x4_t C1 = MULT_FIX_FLOOR(B1, fxy_scale_half); const uint16x4_t D0 = vmovn_u32(C0); const uint16x4_t D1 = vmovn_u32(C1); const uint8x8_t E = vmovn_u16(vcombine_u16(D0, D1)); @@ -145,7 +149,7 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) { } for (; x_out < x_out_max; ++x_out) { const uint32_t frac = (uint32_t)MULT_FIX_C(frow[x_out], yscale); - const int v = (int)MULT_FIX_C(irow[x_out] - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR_C(irow[x_out] - frac, fxy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; irow[x_out] = frac; // new fractional start @@ -170,6 +174,12 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) { } } +#undef MULT_FIX_FLOOR_C +#undef MULT_FIX_C +#undef MULT_FIX_FLOOR +#undef MULT_FIX +#undef ROUNDER + //------------------------------------------------------------------------------ extern void WebPRescalerDspInitNEON(void); diff --git a/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/3rdparty/libwebp/src/dsp/rescaler_sse2.c index 64c50deab5..f7461a452c 100644 --- a/3rdparty/libwebp/src/dsp/rescaler_sse2.c +++ b/3rdparty/libwebp/src/dsp/rescaler_sse2.c @@ -25,6 +25,7 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) // input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0 static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) { @@ -224,6 +225,35 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0, _mm_storel_epi64((__m128i*)dst, G); } +static WEBP_INLINE void ProcessRow_Floor_SSE2(const __m128i* const A0, + const __m128i* const A1, + const __m128i* const A2, + const __m128i* const A3, + const __m128i* const mult, + uint8_t* const dst) { + const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0); + const __m128i B0 = _mm_mul_epu32(*A0, *mult); + const __m128i B1 = _mm_mul_epu32(*A1, *mult); + const __m128i B2 = _mm_mul_epu32(*A2, *mult); + const __m128i B3 = _mm_mul_epu32(*A3, *mult); + const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX); + const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX); +#if (WEBP_RESCALER_RFIX < 32) + const __m128i D2 = + _mm_and_si128(_mm_slli_epi64(B2, 32 - WEBP_RESCALER_RFIX), mask); + const __m128i D3 = + _mm_and_si128(_mm_slli_epi64(B3, 32 - WEBP_RESCALER_RFIX), mask); +#else + const __m128i D2 = _mm_and_si128(B2, mask); + const __m128i D3 = _mm_and_si128(B3, mask); +#endif + const __m128i E0 = _mm_or_si128(D0, D2); + const __m128i E1 = _mm_or_si128(D1, D3); + const __m128i F = _mm_packs_epi32(E0, E1); + const __m128i G = _mm_packus_epi16(F, F); + _mm_storel_epi64((__m128i*)dst, G); +} + static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; @@ -322,12 +352,12 @@ 
static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) { const __m128i G1 = _mm_or_si128(D1, F3); _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0); _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1); - ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); + ProcessRow_Floor_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); } } for (; x_out < x_out_max; ++x_out) { const uint32_t frac = (int)MULT_FIX(frow[x_out], yscale); - const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; irow[x_out] = frac; // new fractional start @@ -352,6 +382,7 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) { } } +#undef MULT_FIX_FLOOR #undef MULT_FIX #undef ROUNDER diff --git a/3rdparty/libwebp/src/dsp/yuv.h b/3rdparty/libwebp/src/dsp/yuv.h index eb787270d2..c12be1d094 100644 --- a/3rdparty/libwebp/src/dsp/yuv.h +++ b/3rdparty/libwebp/src/dsp/yuv.h @@ -207,4 +207,4 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) { } // extern "C" #endif -#endif /* WEBP_DSP_YUV_H_ */ +#endif // WEBP_DSP_YUV_H_ diff --git a/3rdparty/libwebp/src/enc/analysis_enc.c b/3rdparty/libwebp/src/enc/analysis_enc.c index a47ff7d4e8..687757ae03 100644 --- a/3rdparty/libwebp/src/enc/analysis_enc.c +++ b/3rdparty/libwebp/src/enc/analysis_enc.c @@ -458,7 +458,7 @@ static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) { dst->uv_alpha += src->uv_alpha; } -// initialize the job struct with some TODOs +// initialize the job struct with some tasks to perform static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job, int start_row, int end_row) { WebPGetWorkerInterface()->Init(&job->worker); diff --git a/3rdparty/libwebp/src/enc/backward_references_cost_enc.c b/3rdparty/libwebp/src/enc/backward_references_cost_enc.c index 7175496c7f..516abd73eb 100644 --- a/3rdparty/libwebp/src/enc/backward_references_cost_enc.c +++ b/3rdparty/libwebp/src/enc/backward_references_cost_enc.c @@ -67,7 +67,7 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits, // The following code is similar to VP8LHistogramCreate but converts the // distance to plane code. 
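Across the rescaler changes above, ExportRowShrink switches its final scaling step from MULT_FIX() to the new MULT_FIX_FLOOR(), which omits the ROUNDER term so the fixed-point product is truncated rather than rounded; the SIMD variants mirror this with ProcessRow_Floor_SSE2 and the NEON vqdmulhq_s32 path. Below is a small standalone check of the difference, assuming WEBP_RESCALER_RFIX == 32 as in the library; the local defines restate the macros from the patch purely for illustration.

#include <assert.h>
#include <stdint.h>

#define WEBP_RESCALER_RFIX 32
#define WEBP_RESCALER_ONE (1ull << WEBP_RESCALER_RFIX)
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)

int main(void) {
  const uint32_t half = (uint32_t)(WEBP_RESCALER_ONE >> 1);  // 0.5 in fixed point
  assert(MULT_FIX(3, half) == 2);        // 3 * 0.5 = 1.5 rounds up to 2
  assert(MULT_FIX_FLOOR(3, half) == 1);  // 3 * 0.5 = 1.5 truncates to 1
  return 0;
}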
- VP8LHistogramInit(histo, cache_bits); + VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 1); while (VP8LRefsCursorOk(&c)) { VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode, xsize); diff --git a/3rdparty/libwebp/src/enc/backward_references_enc.c b/3rdparty/libwebp/src/enc/backward_references_enc.c index 39230188b9..3ab7b0ac7d 100644 --- a/3rdparty/libwebp/src/enc/backward_references_enc.c +++ b/3rdparty/libwebp/src/enc/backward_references_enc.c @@ -715,6 +715,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality, for (i = 0; i <= cache_bits_max; ++i) { histos[i] = VP8LAllocateHistogram(i); if (histos[i] == NULL) goto Error; + VP8LHistogramInit(histos[i], i, /*init_arrays=*/ 1); if (i == 0) continue; cc_init[i] = VP8LColorCacheInit(&hashers[i], i); if (!cc_init[i]) goto Error; diff --git a/3rdparty/libwebp/src/enc/cost_enc.h b/3rdparty/libwebp/src/enc/cost_enc.h index bdce1e6a3b..a4b177b342 100644 --- a/3rdparty/libwebp/src/enc/cost_enc.h +++ b/3rdparty/libwebp/src/enc/cost_enc.h @@ -79,4 +79,4 @@ extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES]; } // extern "C" #endif -#endif /* WEBP_ENC_COST_ENC_H_ */ +#endif // WEBP_ENC_COST_ENC_H_ diff --git a/3rdparty/libwebp/src/enc/histogram_enc.c b/3rdparty/libwebp/src/enc/histogram_enc.c index 9fdbc627a1..8ac6fa8e02 100644 --- a/3rdparty/libwebp/src/enc/histogram_enc.c +++ b/3rdparty/libwebp/src/enc/histogram_enc.c @@ -51,10 +51,12 @@ static void HistogramCopy(const VP8LHistogram* const src, VP8LHistogram* const dst) { uint32_t* const dst_literal = dst->literal_; const int dst_cache_bits = dst->palette_code_bits_; + const int literal_size = VP8LHistogramNumCodes(dst_cache_bits); const int histo_size = VP8LGetHistogramSize(dst_cache_bits); assert(src->palette_code_bits_ == dst_cache_bits); memcpy(dst, src, histo_size); dst->literal_ = dst_literal; + memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_)); } int VP8LGetHistogramSize(int cache_bits) { @@ -91,9 +93,19 @@ void VP8LHistogramCreate(VP8LHistogram* const p, VP8LHistogramStoreRefs(refs, p); } -void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits) { +void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits, + int init_arrays) { p->palette_code_bits_ = palette_code_bits; - HistogramClear(p); + if (init_arrays) { + HistogramClear(p); + } else { + p->trivial_symbol_ = 0; + p->bit_cost_ = 0.; + p->literal_cost_ = 0.; + p->red_cost_ = 0.; + p->blue_cost_ = 0.; + memset(p->is_used_, 0, sizeof(p->is_used_)); + } } VP8LHistogram* VP8LAllocateHistogram(int cache_bits) { @@ -104,37 +116,84 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits) { histo = (VP8LHistogram*)memory; // literal_ won't necessary be aligned. histo->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram)); - VP8LHistogramInit(histo, cache_bits); + VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 0); return histo; } +// Resets the pointers of the histograms to point to the bit buffer in the set. +static void HistogramSetResetPointers(VP8LHistogramSet* const set, + int cache_bits) { + int i; + const int histo_size = VP8LGetHistogramSize(cache_bits); + uint8_t* memory = (uint8_t*) (set->histograms); + memory += set->max_size * sizeof(*set->histograms); + for (i = 0; i < set->max_size; ++i) { + memory = (uint8_t*) WEBP_ALIGN(memory); + set->histograms[i] = (VP8LHistogram*) memory; + // literal_ won't necessary be aligned. 
+ set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram)); + memory += histo_size; + } +} + +// Returns the total size of the VP8LHistogramSet. +static size_t HistogramSetTotalSize(int size, int cache_bits) { + const int histo_size = VP8LGetHistogramSize(cache_bits); + return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) + + histo_size + WEBP_ALIGN_CST)); +} + VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) { int i; VP8LHistogramSet* set; - const int histo_size = VP8LGetHistogramSize(cache_bits); - const size_t total_size = - sizeof(*set) + size * (sizeof(*set->histograms) + - histo_size + WEBP_ALIGN_CST); + const size_t total_size = HistogramSetTotalSize(size, cache_bits); uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory)); if (memory == NULL) return NULL; set = (VP8LHistogramSet*)memory; memory += sizeof(*set); set->histograms = (VP8LHistogram**)memory; - memory += size * sizeof(*set->histograms); set->max_size = size; set->size = size; + HistogramSetResetPointers(set, cache_bits); for (i = 0; i < size; ++i) { - memory = (uint8_t*)WEBP_ALIGN(memory); - set->histograms[i] = (VP8LHistogram*)memory; - // literal_ won't necessary be aligned. - set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram)); - VP8LHistogramInit(set->histograms[i], cache_bits); - memory += histo_size; + VP8LHistogramInit(set->histograms[i], cache_bits, /*init_arrays=*/ 0); } return set; } +void VP8LHistogramSetClear(VP8LHistogramSet* const set) { + int i; + const int cache_bits = set->histograms[0]->palette_code_bits_; + const int size = set->max_size; + const size_t total_size = HistogramSetTotalSize(size, cache_bits); + uint8_t* memory = (uint8_t*)set; + + memset(memory, 0, total_size); + memory += sizeof(*set); + set->histograms = (VP8LHistogram**)memory; + set->max_size = size; + set->size = size; + HistogramSetResetPointers(set, cache_bits); + for (i = 0; i < size; ++i) { + set->histograms[i]->palette_code_bits_ = cache_bits; + } +} + +// Removes the histogram 'i' from 'set' by setting it to NULL. +static void HistogramSetRemoveHistogram(VP8LHistogramSet* const set, int i, + int* const num_used) { + assert(set->histograms[i] != NULL); + set->histograms[i] = NULL; + --*num_used; + // If we remove the last valid one, shrink until the next valid one. + if (i == set->size - 1) { + while (set->size >= 1 && set->histograms[set->size - 1] == NULL) { + --set->size; + } + } +} + // ----------------------------------------------------------------------------- void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo, @@ -237,7 +296,8 @@ static double FinalHuffmanCost(const VP8LStreaks* const stats) { // Get the symbol entropy for the distribution 'population'. // Set 'trivial_sym', if there's only one symbol present in the distribution. static double PopulationCost(const uint32_t* const population, int length, - uint32_t* const trivial_sym) { + uint32_t* const trivial_sym, + uint8_t* const is_used) { VP8LBitEntropy bit_entropy; VP8LStreaks stats; VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats); @@ -245,6 +305,8 @@ static double PopulationCost(const uint32_t* const population, int length, *trivial_sym = (bit_entropy.nonzeros == 1) ? bit_entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM; } + // The histogram is used if there is at least one non-zero streak. 
+ *is_used = (stats.streaks[1][0] != 0 || stats.streaks[1][1] != 0); return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats); } @@ -253,7 +315,9 @@ static double PopulationCost(const uint32_t* const population, int length, // non-zero: both the zero-th one, or both the last one. static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X, const uint32_t* const Y, - int length, int trivial_at_end) { + int length, int is_X_used, + int is_Y_used, + int trivial_at_end) { VP8LStreaks stats; if (trivial_at_end) { // This configuration is due to palettization that transforms an indexed @@ -262,28 +326,43 @@ static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X, // Only FinalHuffmanCost needs to be evaluated. memset(&stats, 0, sizeof(stats)); // Deal with the non-zero value at index 0 or length-1. - stats.streaks[1][0] += 1; + stats.streaks[1][0] = 1; // Deal with the following/previous zero streak. - stats.counts[0] += 1; - stats.streaks[0][1] += length - 1; + stats.counts[0] = 1; + stats.streaks[0][1] = length - 1; return FinalHuffmanCost(&stats); } else { VP8LBitEntropy bit_entropy; - VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats); + if (is_X_used) { + if (is_Y_used) { + VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats); + } else { + VP8LGetEntropyUnrefined(X, length, &bit_entropy, &stats); + } + } else { + if (is_Y_used) { + VP8LGetEntropyUnrefined(Y, length, &bit_entropy, &stats); + } else { + memset(&stats, 0, sizeof(stats)); + stats.counts[0] = 1; + stats.streaks[0][length > 3] = length; + VP8LBitEntropyInit(&bit_entropy); + } + } return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats); } } // Estimates the Entropy + Huffman + other block overhead size cost. -double VP8LHistogramEstimateBits(const VP8LHistogram* const p) { +double VP8LHistogramEstimateBits(VP8LHistogram* const p) { return - PopulationCost( - p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), NULL) - + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL) - + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL) - + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL) - + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL) + PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), + NULL, &p->is_used_[0]) + + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1]) + + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2]) + + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) + + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4]) + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES) + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES); } @@ -299,7 +378,8 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, int trivial_at_end = 0; assert(a->palette_code_bits_ == b->palette_code_bits_); *cost += GetCombinedEntropy(a->literal_, b->literal_, - VP8LHistogramNumCodes(palette_code_bits), 0); + VP8LHistogramNumCodes(palette_code_bits), + a->is_used_[0], b->is_used_[0], 0); *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES, b->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES); @@ -319,19 +399,23 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, } *cost += - GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, trivial_at_end); + GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, a->is_used_[1], + b->is_used_[1], trivial_at_end); if (*cost > cost_threshold) return 0; *cost += - GetCombinedEntropy(a->blue_, 
b->blue_, NUM_LITERAL_CODES, trivial_at_end); - if (*cost > cost_threshold) return 0; - - *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES, - trivial_at_end); + GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, a->is_used_[2], + b->is_used_[2], trivial_at_end); if (*cost > cost_threshold) return 0; *cost += - GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, 0); + GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES, + a->is_used_[3], b->is_used_[3], trivial_at_end); + if (*cost > cost_threshold) return 0; + + *cost += + GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, + a->is_used_[4], b->is_used_[4], 0); *cost += VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES); if (*cost > cost_threshold) return 0; @@ -377,7 +461,9 @@ static double HistogramAddEval(const VP8LHistogram* const a, static double HistogramAddThresh(const VP8LHistogram* const a, const VP8LHistogram* const b, double cost_threshold) { - double cost = -a->bit_cost_; + double cost; + assert(a != NULL && b != NULL); + cost = -a->bit_cost_; GetCombinedHistogramEntropy(a, b, cost_threshold, &cost); return cost; } @@ -419,16 +505,19 @@ static void UpdateDominantCostRange( static void UpdateHistogramCost(VP8LHistogram* const h) { uint32_t alpha_sym, red_sym, blue_sym; const double alpha_cost = - PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym); + PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, + &h->is_used_[3]); const double distance_cost = - PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL) + + PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) + VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES); const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_); - h->literal_cost_ = PopulationCost(h->literal_, num_codes, NULL) + - VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, - NUM_LENGTH_CODES); - h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym); - h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym); + h->literal_cost_ = + PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) + + VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES); + h->red_cost_ = + PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]); + h->blue_cost_ = + PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym, &h->is_used_[2]); h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ + alpha_cost + distance_cost; if ((alpha_sym | red_sym | blue_sym) == VP8L_NON_TRIVIAL_SYM) { @@ -473,6 +562,7 @@ static void HistogramBuild( VP8LHistogram** const histograms = image_histo->histograms; VP8LRefsCursor c = VP8LRefsCursorInit(backward_refs); assert(histo_bits > 0); + VP8LHistogramSetClear(image_histo); while (VP8LRefsCursorOk(&c)) { const PixOrCopy* const v = c.cur_pos; const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits); @@ -487,17 +577,37 @@ static void HistogramBuild( } // Copies the histograms and computes its bit_cost. 
-static void HistogramCopyAndAnalyze( - VP8LHistogramSet* const orig_histo, VP8LHistogramSet* const image_histo) { - int i; - const int histo_size = orig_histo->size; +static const uint16_t kInvalidHistogramSymbol = (uint16_t)(-1); +static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo, + VP8LHistogramSet* const image_histo, + int* const num_used, + uint16_t* const histogram_symbols) { + int i, cluster_id; + int num_used_orig = *num_used; VP8LHistogram** const orig_histograms = orig_histo->histograms; VP8LHistogram** const histograms = image_histo->histograms; - for (i = 0; i < histo_size; ++i) { + assert(image_histo->max_size == orig_histo->max_size); + for (cluster_id = 0, i = 0; i < orig_histo->max_size; ++i) { VP8LHistogram* const histo = orig_histograms[i]; UpdateHistogramCost(histo); - // Copy histograms from orig_histo[] to image_histo[]. - HistogramCopy(histo, histograms[i]); + + // Skip the histogram if it is completely empty, which can happen for tiles + // with no information (when they are skipped because of LZ77). + if (!histo->is_used_[0] && !histo->is_used_[1] && !histo->is_used_[2] + && !histo->is_used_[3] && !histo->is_used_[4]) { + // The first histogram is always used. If an histogram is empty, we set + // its id to be the same as the previous one: this will improve + // compressibility for later LZ77. + assert(i > 0); + HistogramSetRemoveHistogram(image_histo, i, num_used); + HistogramSetRemoveHistogram(orig_histo, i, &num_used_orig); + histogram_symbols[i] = kInvalidHistogramSymbol; + } else { + // Copy histograms from orig_histo[] to image_histo[]. + HistogramCopy(histo, histograms[i]); + histogram_symbols[i] = cluster_id++; + assert(cluster_id <= image_histo->max_size); + } } } @@ -514,29 +624,33 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo, // Analyze the dominant (literal, red and blue) entropy costs. for (i = 0; i < histo_size; ++i) { + if (histograms[i] == NULL) continue; UpdateDominantCostRange(histograms[i], &cost_range); } // bin-hash histograms on three of the dominant (literal, red and blue) // symbol costs and store the resulting bin_id for each histogram. for (i = 0; i < histo_size; ++i) { + // bin_map[i] is not set to a special value as its use will later be guarded + // by another (histograms[i] == NULL). + if (histograms[i] == NULL) continue; bin_map[i] = GetHistoBinIndex(histograms[i], &cost_range, low_effort); } } -// Compact image_histo[] by merging some histograms with same bin_id together if -// it's advantageous. +// Merges some histograms with same bin_id together if it's advantageous. +// Sets the remaining histograms to NULL. static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo, + int *num_used, + const uint16_t* const clusters, + uint16_t* const cluster_mappings, VP8LHistogram* cur_combo, const uint16_t* const bin_map, - int bin_map_size, int num_bins, + int num_bins, double combine_cost_factor, int low_effort) { VP8LHistogram** const histograms = image_histo->histograms; int idx; - // Work in-place: processed histograms are put at the beginning of - // image_histo[]. At the end, we just have to truncate the array. 
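With the removed work-in-place scheme described just above, merged-away histograms were compacted to the front of image_histo[] and the array truncated. The patch instead marks a removed entry as NULL via HistogramSetRemoveHistogram() (defined earlier in this file), only shrinking 'size' when trailing entries become NULL, so every later loop must skip NULL slots. A toy sketch of that convention follows, using stand-in string pointers rather than real VP8LHistogram objects.

#include <assert.h>
#include <stddef.h>

int main(void) {
  const char* histograms[4] = {"h0", "h1", "h2", "h3"};  // stand-ins only
  int size = 4, num_used = 4;

  // Removing the last entry lets 'size' shrink past any trailing NULLs...
  histograms[3] = NULL; --num_used;
  while (size >= 1 && histograms[size - 1] == NULL) --size;

  // ...but removing a middle entry only leaves a NULL hole to be skipped.
  histograms[1] = NULL; --num_used;

  assert(size == 3 && num_used == 2 && histograms[1] == NULL);
  return 0;
}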
- int size = 0; struct { int16_t first; // position of the histogram that accumulates all // histograms with the same bin_id @@ -549,16 +663,19 @@ static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo, bin_info[idx].num_combine_failures = 0; } - for (idx = 0; idx < bin_map_size; ++idx) { - const int bin_id = bin_map[idx]; - const int first = bin_info[bin_id].first; - assert(size <= idx); + // By default, a cluster matches itself. + for (idx = 0; idx < *num_used; ++idx) cluster_mappings[idx] = idx; + for (idx = 0; idx < image_histo->size; ++idx) { + int bin_id, first; + if (histograms[idx] == NULL) continue; + bin_id = bin_map[idx]; + first = bin_info[bin_id].first; if (first == -1) { - // just move histogram #idx to its final position - histograms[size] = histograms[idx]; - bin_info[bin_id].first = size++; + bin_info[bin_id].first = idx; } else if (low_effort) { HistogramAdd(histograms[idx], histograms[first], histograms[first]); + HistogramSetRemoveHistogram(image_histo, idx, num_used); + cluster_mappings[clusters[idx]] = clusters[first]; } else { // try to merge #idx into #first (both share the same bin_id) const double bit_cost = histograms[idx]->bit_cost_; @@ -581,19 +698,18 @@ static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo, bin_info[bin_id].num_combine_failures >= max_combine_failures) { // move the (better) merged histogram to its final slot HistogramSwap(&cur_combo, &histograms[first]); + HistogramSetRemoveHistogram(image_histo, idx, num_used); + cluster_mappings[clusters[idx]] = clusters[first]; } else { - histograms[size++] = histograms[idx]; ++bin_info[bin_id].num_combine_failures; } - } else { - histograms[size++] = histograms[idx]; } } } - image_histo->size = size; if (low_effort) { // for low_effort case, update the final cost when everything is merged - for (idx = 0; idx < size; ++idx) { + for (idx = 0; idx < image_histo->size; ++idx) { + if (histograms[idx] == NULL) continue; UpdateHistogramCost(histograms[idx]); } } @@ -624,16 +740,9 @@ typedef struct { int max_size; } HistoQueue; -static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) { +static int HistoQueueInit(HistoQueue* const histo_queue, const int max_size) { histo_queue->size = 0; - // max_index^2 for the queue size is safe. If you look at - // HistogramCombineGreedy, and imagine that UpdateQueueFront always pushes - // data to the queue, you insert at most: - // - max_index*(max_index-1)/2 (the first two for loops) - // - max_index - 1 in the last for loop at the first iteration of the while - // loop, max_index - 2 at the second iteration ... therefore - // max_index*(max_index-1)/2 overall too - histo_queue->max_size = max_index * max_index; + histo_queue->max_size = max_size; // We allocate max_size + 1 because the last element at index "size" is // used as temporary data (and it could be up to max_size). histo_queue->queue = (HistogramPair*)WebPSafeMalloc( @@ -674,6 +783,18 @@ static void HistoQueueUpdateHead(HistoQueue* const histo_queue, } } +// Update the cost diff and combo of a pair of histograms. This needs to be +// called when the the histograms have been merged with a third one. 
+static void HistoQueueUpdatePair(const VP8LHistogram* const h1, + const VP8LHistogram* const h2, + double threshold, + HistogramPair* const pair) { + const double sum_cost = h1->bit_cost_ + h2->bit_cost_; + pair->cost_combo = 0.; + GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo); + pair->cost_diff = pair->cost_combo - sum_cost; +} + // Create a pair from indices "idx1" and "idx2" provided its cost // is inferior to "threshold", a negative entropy. // It returns the cost of the pair, or 0. if it superior to threshold. @@ -683,8 +804,9 @@ static double HistoQueuePush(HistoQueue* const histo_queue, const VP8LHistogram* h1; const VP8LHistogram* h2; HistogramPair pair; - double sum_cost; + // Stop here if the queue is full. + if (histo_queue->size == histo_queue->max_size) return 0.; assert(threshold <= 0.); if (idx1 > idx2) { const int tmp = idx2; @@ -695,16 +817,12 @@ static double HistoQueuePush(HistoQueue* const histo_queue, pair.idx2 = idx2; h1 = histograms[idx1]; h2 = histograms[idx2]; - sum_cost = h1->bit_cost_ + h2->bit_cost_; - pair.cost_combo = 0.; - GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair.cost_combo); - pair.cost_diff = pair.cost_combo - sum_cost; + + HistoQueueUpdatePair(h1, h2, threshold, &pair); // Do not even consider the pair if it does not improve the entropy. if (pair.cost_diff >= threshold) return 0.; - // We cannot add more elements than the capacity. - assert(histo_queue->size < histo_queue->max_size); histo_queue->queue[histo_queue->size++] = pair; HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]); @@ -715,42 +833,43 @@ static double HistoQueuePush(HistoQueue* const histo_queue, // Combines histograms by continuously choosing the one with the highest cost // reduction. -static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) { +static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo, + int* const num_used) { int ok = 0; - int image_histo_size = image_histo->size; + const int image_histo_size = image_histo->size; int i, j; VP8LHistogram** const histograms = image_histo->histograms; - // Indexes of remaining histograms. - int* const clusters = - (int*)WebPSafeMalloc(image_histo_size, sizeof(*clusters)); // Priority queue of histogram pairs. HistoQueue histo_queue; - if (!HistoQueueInit(&histo_queue, image_histo_size) || clusters == NULL) { + // image_histo_size^2 for the queue size is safe. If you look at + // HistogramCombineGreedy, and imagine that UpdateQueueFront always pushes + // data to the queue, you insert at most: + // - image_histo_size*(image_histo_size-1)/2 (the first two for loops) + // - image_histo_size - 1 in the last for loop at the first iteration of + // the while loop, image_histo_size - 2 at the second iteration ... + // therefore image_histo_size*(image_histo_size-1)/2 overall too + if (!HistoQueueInit(&histo_queue, image_histo_size * image_histo_size)) { goto End; } for (i = 0; i < image_histo_size; ++i) { - // Initialize clusters indexes. - clusters[i] = i; + if (image_histo->histograms[i] == NULL) continue; for (j = i + 1; j < image_histo_size; ++j) { - // Initialize positions array. + // Initialize queue. 
+ if (image_histo->histograms[j] == NULL) continue; HistoQueuePush(&histo_queue, histograms, i, j, 0.); } } - while (image_histo_size > 1 && histo_queue.size > 0) { + while (histo_queue.size > 0) { const int idx1 = histo_queue.queue[0].idx1; const int idx2 = histo_queue.queue[0].idx2; HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]); histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo; + // Remove merged histogram. - for (i = 0; i + 1 < image_histo_size; ++i) { - if (clusters[i] >= idx2) { - clusters[i] = clusters[i + 1]; - } - } - --image_histo_size; + HistogramSetRemoveHistogram(image_histo, idx2, num_used); // Remove pairs intersecting the just combined best pair. for (i = 0; i < histo_queue.size;) { @@ -765,24 +884,15 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) { } // Push new pairs formed with combined histogram to the queue. - for (i = 0; i < image_histo_size; ++i) { - if (clusters[i] != idx1) { - HistoQueuePush(&histo_queue, histograms, idx1, clusters[i], 0.); - } - } - } - // Move remaining histograms to the beginning of the array. - for (i = 0; i < image_histo_size; ++i) { - if (i != clusters[i]) { // swap the two histograms - HistogramSwap(&histograms[i], &histograms[clusters[i]]); + for (i = 0; i < image_histo->size; ++i) { + if (i == idx1 || image_histo->histograms[i] == NULL) continue; + HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0.); } } - image_histo->size = image_histo_size; ok = 1; End: - WebPSafeFree(clusters); HistoQueueClear(&histo_queue); return ok; } @@ -790,47 +900,69 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) { // Perform histogram aggregation using a stochastic approach. // 'do_greedy' is set to 1 if a greedy approach needs to be performed // afterwards, 0 otherwise. +static int PairComparison(const void* idx1, const void* idx2) { + // To be used with bsearch: <0 when *idx1<*idx2, >0 if >, 0 when ==. + return (*(int*) idx1 - *(int*) idx2); +} static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo, - int min_cluster_size, + int* const num_used, int min_cluster_size, int* const do_greedy) { - int iter; + int j, iter; uint32_t seed = 1; int tries_with_no_success = 0; - int image_histo_size = image_histo->size; - const int outer_iters = image_histo_size; + const int outer_iters = *num_used; const int num_tries_no_success = outer_iters / 2; VP8LHistogram** const histograms = image_histo->histograms; - // Priority queue of histogram pairs. Its size of "kCostHeapSizeSqrt"^2 + // Priority queue of histogram pairs. Its size of 'kHistoQueueSize' // impacts the quality of the compression and the speed: the smaller the // faster but the worse for the compression. HistoQueue histo_queue; - const int kHistoQueueSizeSqrt = 3; + const int kHistoQueueSize = 9; int ok = 0; + // mapping from an index in image_histo with no NULL histogram to the full + // blown image_histo. + int* mappings; - if (!HistoQueueInit(&histo_queue, kHistoQueueSizeSqrt)) { + if (*num_used < min_cluster_size) { + *do_greedy = 1; + return 1; + } + + mappings = (int*) WebPSafeMalloc(*num_used, sizeof(*mappings)); + if (mappings == NULL || !HistoQueueInit(&histo_queue, kHistoQueueSize)) { goto End; } + // Fill the initial mapping. + for (j = 0, iter = 0; iter < image_histo->size; ++iter) { + if (histograms[iter] == NULL) continue; + mappings[j++] = iter; + } + assert(j == *num_used); + // Collapse similar histograms in 'image_histo'. 
- ++min_cluster_size; - for (iter = 0; iter < outer_iters && image_histo_size >= min_cluster_size && - ++tries_with_no_success < num_tries_no_success; + for (iter = 0; + iter < outer_iters && *num_used >= min_cluster_size && + ++tries_with_no_success < num_tries_no_success; ++iter) { + int* mapping_index; double best_cost = (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff; int best_idx1 = -1, best_idx2 = 1; - int j; - const uint32_t rand_range = (image_histo_size - 1) * image_histo_size; - // image_histo_size / 2 was chosen empirically. Less means faster but worse + const uint32_t rand_range = (*num_used - 1) * (*num_used); + // (*num_used) / 2 was chosen empirically. Less means faster but worse // compression. - const int num_tries = image_histo_size / 2; + const int num_tries = (*num_used) / 2; - for (j = 0; j < num_tries; ++j) { + // Pick random samples. + for (j = 0; *num_used >= 2 && j < num_tries; ++j) { double curr_cost; // Choose two different histograms at random and try to combine them. const uint32_t tmp = MyRand(&seed) % rand_range; - const uint32_t idx1 = tmp / (image_histo_size - 1); - uint32_t idx2 = tmp % (image_histo_size - 1); + uint32_t idx1 = tmp / (*num_used - 1); + uint32_t idx2 = tmp % (*num_used - 1); if (idx2 >= idx1) ++idx2; + idx1 = mappings[idx1]; + idx2 = mappings[idx2]; // Calculate cost reduction on combination. curr_cost = @@ -843,18 +975,21 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo, } if (histo_queue.size == 0) continue; - // Merge the two best histograms. + // Get the best histograms. best_idx1 = histo_queue.queue[0].idx1; best_idx2 = histo_queue.queue[0].idx2; assert(best_idx1 < best_idx2); - HistogramAddEval(histograms[best_idx1], histograms[best_idx2], - histograms[best_idx1], 0); - // Swap the best_idx2 histogram with the last one (which is now unused). - --image_histo_size; - if (best_idx2 != image_histo_size) { - HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]); - } - histograms[image_histo_size] = NULL; + // Pop best_idx2 from mappings. + mapping_index = (int*) bsearch(&best_idx2, mappings, *num_used, + sizeof(best_idx2), &PairComparison); + assert(mapping_index != NULL); + memmove(mapping_index, mapping_index + 1, sizeof(*mapping_index) * + ((*num_used) - (mapping_index - mappings) - 1)); + // Merge the histograms and remove best_idx2 from the queue. + HistogramAdd(histograms[best_idx2], histograms[best_idx1], + histograms[best_idx1]); + histograms[best_idx1]->bit_cost_ = histo_queue.queue[0].cost_combo; + HistogramSetRemoveHistogram(image_histo, best_idx2, num_used); // Parse the queue and update each pair that deals with best_idx1, // best_idx2 or image_histo_size. for (j = 0; j < histo_queue.size;) { @@ -877,12 +1012,6 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo, p->idx2 = best_idx1; do_eval = 1; } - if (p->idx2 == image_histo_size) { - // No need to re-evaluate here as it does not involve a pair - // containing best_idx1 or best_idx2. - p->idx2 = best_idx2; - } - assert(p->idx2 < image_histo_size); // Make sure the index order is respected. if (p->idx1 > p->idx2) { const int tmp = p->idx2; @@ -891,8 +1020,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo, } if (do_eval) { // Re-evaluate the cost of an updated pair. - GetCombinedHistogramEntropy(histograms[p->idx1], histograms[p->idx2], 0, - &p->cost_diff); + HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0., p); if (p->cost_diff >= 0.) 
{ HistoQueuePopPair(&histo_queue, p); continue; @@ -901,15 +1029,14 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo, HistoQueueUpdateHead(&histo_queue, p); ++j; } - tries_with_no_success = 0; } - image_histo->size = image_histo_size; - *do_greedy = (image_histo->size <= min_cluster_size); + *do_greedy = (*num_used <= min_cluster_size); ok = 1; End: HistoQueueClear(&histo_queue); + WebPSafeFree(mappings); return ok; } @@ -917,23 +1044,29 @@ End: // Histogram refinement // Find the best 'out' histogram for each of the 'in' histograms. +// At call-time, 'out' contains the histograms of the clusters. // Note: we assume that out[]->bit_cost_ is already up-to-date. static void HistogramRemap(const VP8LHistogramSet* const in, - const VP8LHistogramSet* const out, + VP8LHistogramSet* const out, uint16_t* const symbols) { int i; VP8LHistogram** const in_histo = in->histograms; VP8LHistogram** const out_histo = out->histograms; - const int in_size = in->size; + const int in_size = out->max_size; const int out_size = out->size; if (out_size > 1) { for (i = 0; i < in_size; ++i) { int best_out = 0; double best_bits = MAX_COST; int k; + if (in_histo[i] == NULL) { + // Arbitrarily set to the previous value if unused to help future LZ77. + symbols[i] = symbols[i - 1]; + continue; + } for (k = 0; k < out_size; ++k) { - const double cur_bits = - HistogramAddThresh(out_histo[k], in_histo[i], best_bits); + double cur_bits; + cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits); if (k == 0 || cur_bits < best_bits) { best_bits = cur_bits; best_out = k; @@ -949,12 +1082,13 @@ static void HistogramRemap(const VP8LHistogramSet* const in, } // Recompute each out based on raw and symbols. - for (i = 0; i < out_size; ++i) { - HistogramClear(out_histo[i]); - } + VP8LHistogramSetClear(out); + out->size = out_size; for (i = 0; i < in_size; ++i) { - const int idx = symbols[i]; + int idx; + if (in_histo[i] == NULL) continue; + idx = symbols[i]; HistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]); } } @@ -970,6 +1104,70 @@ static double GetCombineCostFactor(int histo_size, int quality) { return combine_cost_factor; } +// Given a HistogramSet 'set', the mapping of clusters 'cluster_mapping' and the +// current assignment of the cells in 'symbols', merge the clusters and +// assign the smallest possible clusters values. +static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set, + uint16_t* const cluster_mappings, + int num_clusters, + uint16_t* const cluster_mappings_tmp, + uint16_t* const symbols) { + int i, cluster_max; + int do_continue = 1; + // First, assign the lowest cluster to each pixel. + while (do_continue) { + do_continue = 0; + for (i = 0; i < num_clusters; ++i) { + int k; + k = cluster_mappings[i]; + while (k != cluster_mappings[k]) { + cluster_mappings[k] = cluster_mappings[cluster_mappings[k]]; + k = cluster_mappings[k]; + } + if (k != cluster_mappings[i]) { + do_continue = 1; + cluster_mappings[i] = k; + } + } + } + // Create a mapping from a cluster id to its minimal version. + cluster_max = 0; + memset(cluster_mappings_tmp, 0, + set->max_size * sizeof(*cluster_mappings_tmp)); + assert(cluster_mappings[0] == 0); + // Re-map the ids. 
+ for (i = 0; i < set->max_size; ++i) { + int cluster; + if (symbols[i] == kInvalidHistogramSymbol) continue; + cluster = cluster_mappings[symbols[i]]; + assert(symbols[i] < num_clusters); + if (cluster > 0 && cluster_mappings_tmp[cluster] == 0) { + ++cluster_max; + cluster_mappings_tmp[cluster] = cluster_max; + } + symbols[i] = cluster_mappings_tmp[cluster]; + } + + // Make sure all cluster values are used. + cluster_max = 0; + for (i = 0; i < set->max_size; ++i) { + if (symbols[i] == kInvalidHistogramSymbol) continue; + if (symbols[i] <= cluster_max) continue; + ++cluster_max; + assert(symbols[i] == cluster_max); + } +} + +static void RemoveEmptyHistograms(VP8LHistogramSet* const image_histo) { + uint32_t size; + int i; + for (i = 0, size = 0; i < image_histo->size; ++i) { + if (image_histo->histograms[i] == NULL) continue; + image_histo->histograms[size++] = image_histo->histograms[i]; + } + image_histo->size = size; +} + int VP8LGetHistoImageSymbols(int xsize, int ysize, const VP8LBackwardRefs* const refs, int quality, int low_effort, @@ -987,28 +1185,37 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize, // histograms of small sizes (as bin_map will be very sparse) and // maximum quality q==100 (to preserve the compression gains at that level). const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE; - const int entropy_combine = - (orig_histo->size > entropy_combine_num_bins * 2) && (quality < 100); - - if (orig_histo == NULL) goto Error; + int entropy_combine; + uint16_t* const map_tmp = + WebPSafeMalloc(2 * image_histo_raw_size, sizeof(map_tmp)); + uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size; + int num_used = image_histo_raw_size; + if (orig_histo == NULL || map_tmp == NULL) goto Error; // Construct the histograms from backward references. HistogramBuild(xsize, histo_bits, refs, orig_histo); // Copies the histograms and computes its bit_cost. - HistogramCopyAndAnalyze(orig_histo, image_histo); + // histogram_symbols is optimized + HistogramCopyAndAnalyze(orig_histo, image_histo, &num_used, + histogram_symbols); + + entropy_combine = + (num_used > entropy_combine_num_bins * 2) && (quality < 100); if (entropy_combine) { - const int bin_map_size = orig_histo->size; - // Reuse histogram_symbols storage. By definition, it's guaranteed to be ok. - uint16_t* const bin_map = histogram_symbols; + uint16_t* const bin_map = map_tmp; const double combine_cost_factor = GetCombineCostFactor(image_histo_raw_size, quality); + const uint32_t num_clusters = num_used; - HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort); + HistogramAnalyzeEntropyBin(image_histo, bin_map, low_effort); // Collapse histograms with similar entropy. 
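OptimizeHistogramSymbols() above first flattens cluster_mappings[] so that every cluster points directly at its final representative (a union-find style path compression; the patch additionally shortens each chain as it walks it), then renumbers the surviving clusters densely from 0 so the symbol image stays small. A toy sketch of the flattening step alone, with made-up mapping data, is shown here.

#include <assert.h>

int main(void) {
  // Toy chain: 4 -> 2 -> 1 -> 0 and 3 -> 3 (illustrative values only).
  int mapping[5] = {0, 0, 1, 3, 2};
  int i, changed = 1;
  while (changed) {  // iterate until every entry maps straight to its root
    changed = 0;
    for (i = 0; i < 5; ++i) {
      int k = mapping[i];
      while (k != mapping[k]) k = mapping[k];
      if (k != mapping[i]) { mapping[i] = k; changed = 1; }
    }
  }
  for (i = 0; i < 5; ++i) assert(mapping[i] == ((i == 3) ? 3 : 0));
  return 0;
}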
- HistogramCombineEntropyBin(image_histo, tmp_histo, bin_map, bin_map_size, + HistogramCombineEntropyBin(image_histo, &num_used, histogram_symbols, + cluster_mappings, tmp_histo, bin_map, entropy_combine_num_bins, combine_cost_factor, low_effort); + OptimizeHistogramSymbols(image_histo, cluster_mappings, num_clusters, + map_tmp, histogram_symbols); } // Don't combine the histograms using stochastic and greedy heuristics for @@ -1018,21 +1225,26 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize, // cubic ramp between 1 and MAX_HISTO_GREEDY: const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1)); int do_greedy; - if (!HistogramCombineStochastic(image_histo, threshold_size, &do_greedy)) { + if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size, + &do_greedy)) { goto Error; } - if (do_greedy && !HistogramCombineGreedy(image_histo)) { - goto Error; + if (do_greedy) { + RemoveEmptyHistograms(image_histo); + if (!HistogramCombineGreedy(image_histo, &num_used)) { + goto Error; + } } } - // TODO(vrabaud): Optimize HistogramRemap for low-effort compression mode. // Find the optimal map from original histograms to the final ones. + RemoveEmptyHistograms(image_histo); HistogramRemap(orig_histo, image_histo, histogram_symbols); ok = 1; Error: VP8LFreeHistogramSet(orig_histo); + WebPSafeFree(map_tmp); return ok; } diff --git a/3rdparty/libwebp/src/enc/histogram_enc.h b/3rdparty/libwebp/src/enc/histogram_enc.h index e8c4c83f6f..54c2d21783 100644 --- a/3rdparty/libwebp/src/enc/histogram_enc.h +++ b/3rdparty/libwebp/src/enc/histogram_enc.h @@ -44,6 +44,7 @@ typedef struct { double literal_cost_; // Cached values of dominant entropy costs: double red_cost_; // literal, red & blue. double blue_cost_; + uint8_t is_used_[5]; // 5 for literal, red, blue, alpha, distance } VP8LHistogram; // Collection of histograms with fixed capacity, allocated as one @@ -67,7 +68,9 @@ void VP8LHistogramCreate(VP8LHistogram* const p, int VP8LGetHistogramSize(int palette_code_bits); // Set the palette_code_bits and reset the stats. -void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits); +// If init_arrays is true, the arrays are also filled with 0's. +void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits, + int init_arrays); // Collect all the references into a histogram (without reset) void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs, @@ -83,6 +86,9 @@ void VP8LFreeHistogramSet(VP8LHistogramSet* const histo); // using 'cache_bits'. Return NULL in case of memory error. VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits); +// Set the histograms in set to 0. +void VP8LHistogramSetClear(VP8LHistogramSet* const set); + // Allocate and initialize histogram object with specified 'cache_bits'. // Returns NULL in case of memory error. // Special case of VP8LAllocateHistogramSet, with size equals 1. @@ -113,7 +119,7 @@ double VP8LBitsEntropy(const uint32_t* const array, int n); // Estimate how many bits the combined entropy of literals and distance // approximately maps to. 
-double VP8LHistogramEstimateBits(const VP8LHistogram* const p); +double VP8LHistogramEstimateBits(VP8LHistogram* const p); #ifdef __cplusplus } diff --git a/3rdparty/libwebp/src/enc/iterator_enc.c b/3rdparty/libwebp/src/enc/iterator_enc.c index 7c47d51272..29f91d8315 100644 --- a/3rdparty/libwebp/src/enc/iterator_enc.c +++ b/3rdparty/libwebp/src/enc/iterator_enc.c @@ -128,7 +128,7 @@ static void ImportLine(const uint8_t* src, int src_stride, for (; i < total_len; ++i) dst[i] = dst[len - 1]; } -void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) { +void VP8IteratorImport(VP8EncIterator* const it, uint8_t* const tmp_32) { const VP8Encoder* const enc = it->enc_; const int x = it->x_, y = it->y_; const WebPPicture* const pic = enc->pic_; diff --git a/3rdparty/libwebp/src/enc/picture_tools_enc.c b/3rdparty/libwebp/src/enc/picture_tools_enc.c index be292d4391..d0e8a495da 100644 --- a/3rdparty/libwebp/src/enc/picture_tools_enc.c +++ b/3rdparty/libwebp/src/enc/picture_tools_enc.c @@ -16,10 +16,6 @@ #include "src/enc/vp8i_enc.h" #include "src/dsp/yuv.h" -static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) { - return (0xff000000u | (r << 16) | (g << 8) | b); -} - //------------------------------------------------------------------------------ // Helper: clean up fully transparent area to help compressibility. @@ -195,6 +191,10 @@ void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) { #define BLEND_10BIT(V0, V1, ALPHA) \ ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 1024) >> 18) +static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) { + return (0xff000000u | (r << 16) | (g << 8) | b); +} + void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) { const int red = (background_rgb >> 16) & 0xff; const int green = (background_rgb >> 8) & 0xff; @@ -208,39 +208,44 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) { const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF); const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF); const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT; - if (!has_alpha || pic->a == NULL) return; // nothing to do + uint8_t* y_ptr = pic->y; + uint8_t* u_ptr = pic->u; + uint8_t* v_ptr = pic->v; + uint8_t* a_ptr = pic->a; + if (!has_alpha || a_ptr == NULL) return; // nothing to do for (y = 0; y < pic->height; ++y) { // Luma blending - uint8_t* const y_ptr = pic->y + y * pic->y_stride; - uint8_t* const a_ptr = pic->a + y * pic->a_stride; for (x = 0; x < pic->width; ++x) { - const int alpha = a_ptr[x]; + const uint8_t alpha = a_ptr[x]; if (alpha < 0xff) { - y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]); + y_ptr[x] = BLEND(Y0, y_ptr[x], alpha); } } // Chroma blending every even line if ((y & 1) == 0) { - uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride; - uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride; uint8_t* const a_ptr2 = (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride; for (x = 0; x < uv_width; ++x) { // Average four alpha values into a single blending weight. // TODO(skal): might lead to visible contouring. Can we do better? 
- const int alpha = + const uint32_t alpha = a_ptr[2 * x + 0] + a_ptr[2 * x + 1] + a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1]; - u[x] = BLEND_10BIT(U0, u[x], alpha); - v[x] = BLEND_10BIT(V0, v[x], alpha); + u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha); + v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha); } if (pic->width & 1) { // rightmost pixel - const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]); - u[x] = BLEND_10BIT(U0, u[x], alpha); - v[x] = BLEND_10BIT(V0, v[x], alpha); + const uint32_t alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]); + u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha); + v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha); } + } else { + u_ptr += pic->uv_stride; + v_ptr += pic->uv_stride; } - memset(a_ptr, 0xff, pic->width); + memset(a_ptr, 0xff, pic->width); // reset alpha value to opaque + a_ptr += pic->a_stride; + y_ptr += pic->y_stride; } } else { uint32_t* argb = pic->argb; diff --git a/3rdparty/libwebp/src/enc/predictor_enc.c b/3rdparty/libwebp/src/enc/predictor_enc.c index f3715f515e..802e89693e 100644 --- a/3rdparty/libwebp/src/enc/predictor_enc.c +++ b/3rdparty/libwebp/src/enc/predictor_enc.c @@ -177,12 +177,15 @@ static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict, } } +static WEBP_INLINE uint8_t NearLosslessDiff(uint8_t a, uint8_t b) { + return (uint8_t)((((int)(a) - (int)(b))) & 0xff); +} + // Quantize every component of the difference between the actual pixel value and // its prediction to a multiple of a quantization (a power of 2, not larger than // max_quantization which is a power of 2, smaller than max_diff). Take care if // value and predict have undergone subtract green, which means that red and // blue are represented as offsets from green. -#define NEAR_LOSSLESS_DIFF(a, b) (uint8_t)((((int)(a) - (int)(b))) & 0xff) static uint32_t NearLossless(uint32_t value, uint32_t predict, int max_quantization, int max_diff, int used_subtract_green) { @@ -199,7 +202,7 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict, } if ((value >> 24) == 0 || (value >> 24) == 0xff) { // Preserve transparency of fully transparent or fully opaque pixels. - a = NEAR_LOSSLESS_DIFF(value >> 24, predict >> 24); + a = NearLosslessDiff(value >> 24, predict >> 24); } else { a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization); } @@ -212,16 +215,15 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict, // The amount by which green has been adjusted during quantization. It is // subtracted from red and blue for compensation, to avoid accumulating two // quantization errors in them. - green_diff = NEAR_LOSSLESS_DIFF(new_green, value >> 8); + green_diff = NearLosslessDiff(new_green, value >> 8); } - r = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value >> 16, green_diff), + r = NearLosslessComponent(NearLosslessDiff(value >> 16, green_diff), (predict >> 16) & 0xff, 0xff - new_green, quantization); - b = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value, green_diff), + b = NearLosslessComponent(NearLosslessDiff(value, green_diff), predict & 0xff, 0xff - new_green, quantization); return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b; } -#undef NEAR_LOSSLESS_DIFF #endif // (WEBP_NEAR_LOSSLESS == 1) // Stores the difference between the pixel and its prediction in "out". 
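(Note on the near-lossless change just above: a standalone sketch, ours and not part of the patch, of the wrap-around byte difference that the new NearLosslessDiff() helper computes, and why a plain 8-bit addition on the decoding side undoes it exactly.)

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the expression of the inline helper introduced by the patch. */
static uint8_t NearLosslessDiff(uint8_t a, uint8_t b) {
  return (uint8_t)(((int)a - (int)b) & 0xff);
}

int main(void) {
  const uint8_t value = 10, predict = 200;
  const uint8_t residual = NearLosslessDiff(value, predict);  /* (10 - 200) & 0xff == 66 */
  /* Reconstruction wraps modulo 256: 200 + 66 == 266 -> 10. */
  assert((uint8_t)(predict + residual) == value);
  printf("residual=%u reconstructed=%u\n", (unsigned)residual,
         (unsigned)(uint8_t)(predict + residual));
  return 0;
}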
diff --git a/3rdparty/libwebp/src/enc/quant_enc.c b/3rdparty/libwebp/src/enc/quant_enc.c index 35bfaf21ef..03c682e3ae 100644 --- a/3rdparty/libwebp/src/enc/quant_enc.c +++ b/3rdparty/libwebp/src/enc/quant_enc.c @@ -15,6 +15,7 @@ #include #include // for abs() +#include "src/dsp/quant.h" #include "src/enc/vp8i_enc.h" #include "src/enc/cost_enc.h" @@ -977,19 +978,6 @@ static void SwapOut(VP8EncIterator* const it) { SwapPtr(&it->yuv_out_, &it->yuv_out2_); } -static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) { - score_t score = 0; - while (num_blocks-- > 0) { // TODO(skal): refine positional scoring? - int i; - for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC - score += (levels[i] != 0); - if (score > thresh) return 0; - } - levels += 16; - } - return 1; -} - static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) { const int kNumBlocks = 16; VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_]; diff --git a/3rdparty/libwebp/src/enc/vp8i_enc.h b/3rdparty/libwebp/src/enc/vp8i_enc.h index 624e8f8e66..3a1967da88 100644 --- a/3rdparty/libwebp/src/enc/vp8i_enc.h +++ b/3rdparty/libwebp/src/enc/vp8i_enc.h @@ -32,7 +32,7 @@ extern "C" { // version numbers #define ENC_MAJ_VERSION 1 #define ENC_MIN_VERSION 0 -#define ENC_REV_VERSION 0 +#define ENC_REV_VERSION 2 enum { MAX_LF_LEVELS = 64, // Maximum loop filter level MAX_VARIABLE_LEVEL = 67, // last (inclusive) level with variable cost @@ -278,7 +278,7 @@ int VP8IteratorIsDone(const VP8EncIterator* const it); // Import uncompressed samples from source. // If tmp_32 is not NULL, import boundary samples too. // tmp_32 is a 32-bytes scratch buffer that must be aligned in memory. -void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32); +void VP8IteratorImport(VP8EncIterator* const it, uint8_t* const tmp_32); // export decimated samples void VP8IteratorExport(const VP8EncIterator* const it); // go to next macroblock. Returns false if not finished. @@ -515,4 +515,4 @@ void WebPCleanupTransparentAreaLossless(WebPPicture* const pic); } // extern "C" #endif -#endif /* WEBP_ENC_VP8I_ENC_H_ */ +#endif // WEBP_ENC_VP8I_ENC_H_ diff --git a/3rdparty/libwebp/src/enc/vp8l_enc.c b/3rdparty/libwebp/src/enc/vp8l_enc.c index a89184eb08..2efd403f77 100644 --- a/3rdparty/libwebp/src/enc/vp8l_enc.c +++ b/3rdparty/libwebp/src/enc/vp8l_enc.c @@ -462,6 +462,7 @@ static int GetHuffBitLengthsAndCodes( for (i = 0; i < histogram_image_size; ++i) { const VP8LHistogram* const histo = histogram_image->histograms[i]; HuffmanTreeCode* const codes = &huffman_codes[5 * i]; + assert(histo != NULL); for (k = 0; k < 5; ++k) { const int num_symbols = (k == 0) ? VP8LHistogramNumCodes(histo->palette_code_bits_) : @@ -809,6 +810,7 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw, err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; } + VP8LHistogramSetClear(histogram_image); // Build histogram image and symbols from backward references. 
VP8LHistogramStoreRefs(refs, histogram_image->histograms[0]); @@ -1248,14 +1250,20 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) { const WebPPicture* const picture = enc->pic_; const int width = picture->width; const int height = picture->height; - int y; + err = AllocateTransformBuffer(enc, width, height); if (err != VP8_ENC_OK) return err; if (enc->argb_content_ == kEncoderARGB) return VP8_ENC_OK; - for (y = 0; y < height; ++y) { - memcpy(enc->argb_ + y * width, - picture->argb + y * picture->argb_stride, - width * sizeof(*enc->argb_)); + + { + uint32_t* dst = enc->argb_; + const uint32_t* src = picture->argb; + int y; + for (y = 0; y < height; ++y) { + memcpy(dst, src, width * sizeof(*dst)); + dst += width; + src += picture->argb_stride; + } } enc->argb_content_ = kEncoderARGB; assert(enc->current_width_ == width); diff --git a/3rdparty/libwebp/src/enc/vp8li_enc.h b/3rdparty/libwebp/src/enc/vp8li_enc.h index 298a4a0014..d2d0fc509c 100644 --- a/3rdparty/libwebp/src/enc/vp8li_enc.h +++ b/3rdparty/libwebp/src/enc/vp8li_enc.h @@ -115,4 +115,4 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality, } // extern "C" #endif -#endif /* WEBP_ENC_VP8LI_ENC_H_ */ +#endif // WEBP_ENC_VP8LI_ENC_H_ diff --git a/3rdparty/libwebp/src/mux/animi.h b/3rdparty/libwebp/src/mux/animi.h index 88899532aa..34c45ba4da 100644 --- a/3rdparty/libwebp/src/mux/animi.h +++ b/3rdparty/libwebp/src/mux/animi.h @@ -40,4 +40,4 @@ int WebPAnimEncoderRefineRect( } // extern "C" #endif -#endif /* WEBP_MUX_ANIMI_H_ */ +#endif // WEBP_MUX_ANIMI_H_ diff --git a/3rdparty/libwebp/src/mux/muxedit.c b/3rdparty/libwebp/src/mux/muxedit.c index 7a027b3cb4..ccf14b2a0c 100644 --- a/3rdparty/libwebp/src/mux/muxedit.c +++ b/3rdparty/libwebp/src/mux/muxedit.c @@ -69,12 +69,12 @@ void WebPMuxDelete(WebPMux* mux) { if (idx == (INDEX)) { \ err = ChunkAssignData(&chunk, data, copy_data, tag); \ if (err == WEBP_MUX_OK) { \ - err = ChunkSetNth(&chunk, (LIST), nth); \ + err = ChunkSetHead(&chunk, (LIST)); \ } \ return err; \ } -static WebPMuxError MuxSet(WebPMux* const mux, uint32_t tag, uint32_t nth, +static WebPMuxError MuxSet(WebPMux* const mux, uint32_t tag, const WebPData* const data, int copy_data) { WebPChunk chunk; WebPMuxError err = WEBP_MUX_NOT_FOUND; @@ -190,7 +190,7 @@ WebPMuxError WebPMuxSetChunk(WebPMux* mux, const char fourcc[4], if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err; // Add the given chunk. - return MuxSet(mux, tag, 1, chunk_data, copy_data); + return MuxSet(mux, tag, chunk_data, copy_data); } // Creates a chunk from given 'data' and sets it as 1st chunk in 'chunk_list'. @@ -202,7 +202,7 @@ static WebPMuxError AddDataToChunkList( ChunkInit(&chunk); err = ChunkAssignData(&chunk, data, copy_data, tag); if (err != WEBP_MUX_OK) goto Err; - err = ChunkSetNth(&chunk, chunk_list, 1); + err = ChunkSetHead(&chunk, chunk_list); if (err != WEBP_MUX_OK) goto Err; return WEBP_MUX_OK; Err: @@ -266,14 +266,14 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* info, int copy_data) { WebPMuxImage wpi; WebPMuxError err; - const WebPData* const bitstream = &info->bitstream; // Sanity checks. 
if (mux == NULL || info == NULL) return WEBP_MUX_INVALID_ARGUMENT; if (info->id != WEBP_CHUNK_ANMF) return WEBP_MUX_INVALID_ARGUMENT; - if (bitstream->bytes == NULL || bitstream->size > MAX_CHUNK_PAYLOAD) { + if (info->bitstream.bytes == NULL || + info->bitstream.size > MAX_CHUNK_PAYLOAD) { return WEBP_MUX_INVALID_ARGUMENT; } @@ -287,7 +287,7 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* info, } MuxImageInit(&wpi); - err = SetAlphaAndImageChunks(bitstream, copy_data, &wpi); + err = SetAlphaAndImageChunks(&info->bitstream, copy_data, &wpi); if (err != WEBP_MUX_OK) goto Err; assert(wpi.img_ != NULL); // As SetAlphaAndImageChunks() was successful. @@ -342,7 +342,7 @@ WebPMuxError WebPMuxSetAnimationParams(WebPMux* mux, // Set the animation parameters. PutLE32(data, params->bgcolor); PutLE16(data + 4, params->loop_count); - return MuxSet(mux, kChunks[IDX_ANIM].tag, 1, &anim, 1); + return MuxSet(mux, kChunks[IDX_ANIM].tag, &anim, 1); } WebPMuxError WebPMuxSetCanvasSize(WebPMux* mux, @@ -540,7 +540,7 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) { PutLE24(data + 4, width - 1); // canvas width. PutLE24(data + 7, height - 1); // canvas height. - return MuxSet(mux, kChunks[IDX_VP8X].tag, 1, &vp8x, 1); + return MuxSet(mux, kChunks[IDX_VP8X].tag, &vp8x, 1); } // Cleans up 'mux' by removing any unnecessary chunks. diff --git a/3rdparty/libwebp/src/mux/muxi.h b/3rdparty/libwebp/src/mux/muxi.h index 6b57eea30f..3e9d8c48d8 100644 --- a/3rdparty/libwebp/src/mux/muxi.h +++ b/3rdparty/libwebp/src/mux/muxi.h @@ -14,6 +14,7 @@ #ifndef WEBP_MUX_MUXI_H_ #define WEBP_MUX_MUXI_H_ +#include #include #include "src/dec/vp8i_dec.h" #include "src/dec/vp8li_dec.h" @@ -28,7 +29,7 @@ extern "C" { #define MUX_MAJ_VERSION 1 #define MUX_MIN_VERSION 0 -#define MUX_REV_VERSION 0 +#define MUX_REV_VERSION 2 // Chunk object. typedef struct WebPChunk WebPChunk; @@ -126,11 +127,14 @@ WebPChunk* ChunkSearchList(WebPChunk* first, uint32_t nth, uint32_t tag); WebPMuxError ChunkAssignData(WebPChunk* chunk, const WebPData* const data, int copy_data, uint32_t tag); -// Sets 'chunk' at nth position in the 'chunk_list'. -// nth = 0 has the special meaning "last of the list". +// Sets 'chunk' as the only element in 'chunk_list' if it is empty. // On success ownership is transferred from 'chunk' to the 'chunk_list'. -WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list, - uint32_t nth); +WebPMuxError ChunkSetHead(WebPChunk* const chunk, WebPChunk** const chunk_list); +// Sets 'chunk' at last position in the 'chunk_list'. +// On success ownership is transferred from 'chunk' to the 'chunk_list'. +// *chunk_list also points towards the last valid element of the initial +// *chunk_list. +WebPMuxError ChunkAppend(WebPChunk* const chunk, WebPChunk*** const chunk_list); // Releases chunk and returns chunk->next_. WebPChunk* ChunkRelease(WebPChunk* const chunk); @@ -143,13 +147,13 @@ void ChunkListDelete(WebPChunk** const chunk_list); // Returns size of the chunk including chunk header and padding byte (if any). static WEBP_INLINE size_t SizeWithPadding(size_t chunk_size) { + assert(chunk_size <= MAX_CHUNK_PAYLOAD); return CHUNK_HEADER_SIZE + ((chunk_size + 1) & ~1U); } // Size of a chunk including header and padding. 
static WEBP_INLINE size_t ChunkDiskSize(const WebPChunk* chunk) { const size_t data_size = chunk->data_.size; - assert(data_size < MAX_CHUNK_PAYLOAD); return SizeWithPadding(data_size); } @@ -227,4 +231,4 @@ WebPMuxError MuxValidate(const WebPMux* const mux); } // extern "C" #endif -#endif /* WEBP_MUX_MUXI_H_ */ +#endif // WEBP_MUX_MUXI_H_ diff --git a/3rdparty/libwebp/src/mux/muxinternal.c b/3rdparty/libwebp/src/mux/muxinternal.c index 1473f100e5..b9ee6717d3 100644 --- a/3rdparty/libwebp/src/mux/muxinternal.c +++ b/3rdparty/libwebp/src/mux/muxinternal.c @@ -111,27 +111,6 @@ WebPChunk* ChunkSearchList(WebPChunk* first, uint32_t nth, uint32_t tag) { return ((nth > 0) && (iter > 0)) ? NULL : first; } -// Outputs a pointer to 'prev_chunk->next_', -// where 'prev_chunk' is the pointer to the chunk at position (nth - 1). -// Returns true if nth chunk was found. -static int ChunkSearchListToSet(WebPChunk** chunk_list, uint32_t nth, - WebPChunk*** const location) { - uint32_t count = 0; - assert(chunk_list != NULL); - *location = chunk_list; - - while (*chunk_list != NULL) { - WebPChunk* const cur_chunk = *chunk_list; - ++count; - if (count == nth) return 1; // Found. - chunk_list = &cur_chunk->next_; - *location = chunk_list; - } - - // *chunk_list is ok to be NULL if adding at last location. - return (nth == 0 || (count == nth - 1)) ? 1 : 0; -} - //------------------------------------------------------------------------------ // Chunk writer methods. @@ -156,11 +135,12 @@ WebPMuxError ChunkAssignData(WebPChunk* chunk, const WebPData* const data, return WEBP_MUX_OK; } -WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list, - uint32_t nth) { +WebPMuxError ChunkSetHead(WebPChunk* const chunk, + WebPChunk** const chunk_list) { WebPChunk* new_chunk; - if (!ChunkSearchListToSet(chunk_list, nth, &chunk_list)) { + assert(chunk_list != NULL); + if (*chunk_list != NULL) { return WEBP_MUX_NOT_FOUND; } @@ -168,11 +148,26 @@ WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list, if (new_chunk == NULL) return WEBP_MUX_MEMORY_ERROR; *new_chunk = *chunk; chunk->owner_ = 0; - new_chunk->next_ = *chunk_list; + new_chunk->next_ = NULL; *chunk_list = new_chunk; return WEBP_MUX_OK; } +WebPMuxError ChunkAppend(WebPChunk* const chunk, + WebPChunk*** const chunk_list) { + assert(chunk_list != NULL && *chunk_list != NULL); + + if (**chunk_list == NULL) { + ChunkSetHead(chunk, *chunk_list); + } else { + WebPChunk* last_chunk = **chunk_list; + while (last_chunk->next_ != NULL) last_chunk = last_chunk->next_; + ChunkSetHead(chunk, &last_chunk->next_); + *chunk_list = &last_chunk->next_; + } + return WEBP_MUX_OK; +} + //------------------------------------------------------------------------------ // Chunk deletion method(s). 
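(Note on the new ChunkAppend() above: its WebPChunk*** argument is simply a caller-owned pointer-to-tail cursor that the function advances. A standalone sketch of that append pattern with a toy Node type, our illustration rather than libwebp code; the real caller-side usage is the cached chunk_list_ends in muxread.c further down.)

#include <stdio.h>
#include <stdlib.h>

typedef struct Node { int value; struct Node* next; } Node;

/* Append at the cached tail and advance the caller's cursor, so repeated
 * appends never rescan the list from its head. */
static int Append(int value, Node*** const list_end) {
  Node* const node = (Node*)malloc(sizeof(*node));
  if (node == NULL) return 0;
  node->value = value;
  node->next = NULL;
  **list_end = node;
  *list_end = &node->next;
  return 1;
}

int main(void) {
  Node* head = NULL;
  Node** tail = &head;   /* cursor handed to Append() on every call */
  int i;
  for (i = 0; i < 3; ++i) {
    if (!Append(i, &tail)) return 1;
  }
  while (head != NULL) {        /* prints: 0 1 2 */
    Node* const next = head->next;
    printf("%d ", head->value);
    free(head);
    head = next;
  }
  printf("\n");
  return 0;
}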
@@ -232,9 +227,11 @@ void MuxImageInit(WebPMuxImage* const wpi) { WebPMuxImage* MuxImageRelease(WebPMuxImage* const wpi) { WebPMuxImage* next; if (wpi == NULL) return NULL; - ChunkDelete(wpi->header_); - ChunkDelete(wpi->alpha_); - ChunkDelete(wpi->img_); + // There should be at most one chunk of header_, alpha_, img_ but we call + // ChunkListDelete to be safe + ChunkListDelete(&wpi->header_); + ChunkListDelete(&wpi->alpha_); + ChunkListDelete(&wpi->img_); ChunkListDelete(&wpi->unknown_); next = wpi->next_; diff --git a/3rdparty/libwebp/src/mux/muxread.c b/3rdparty/libwebp/src/mux/muxread.c index 0b55286862..268f6acb53 100644 --- a/3rdparty/libwebp/src/mux/muxread.c +++ b/3rdparty/libwebp/src/mux/muxread.c @@ -59,6 +59,7 @@ static WebPMuxError ChunkVerifyAndAssign(WebPChunk* chunk, // Sanity checks. if (data_size < CHUNK_HEADER_SIZE) return WEBP_MUX_NOT_ENOUGH_DATA; chunk_size = GetLE32(data + TAG_SIZE); + if (chunk_size > MAX_CHUNK_PAYLOAD) return WEBP_MUX_BAD_DATA; { const size_t chunk_disk_size = SizeWithPadding(chunk_size); @@ -102,6 +103,7 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data, const uint8_t* const last = bytes + size; WebPChunk subchunk; size_t subchunk_size; + WebPChunk** unknown_chunk_list = &wpi->unknown_; ChunkInit(&subchunk); assert(chunk->tag_ == kChunks[IDX_ANMF].tag); @@ -116,7 +118,7 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data, if (size < hdr_size) goto Fail; ChunkAssignData(&subchunk, &temp, copy_data, chunk->tag_); } - ChunkSetNth(&subchunk, &wpi->header_, 1); + ChunkSetHead(&subchunk, &wpi->header_); wpi->is_partial_ = 1; // Waiting for ALPH and/or VP8/VP8L chunks. // Rest of the chunks. @@ -133,18 +135,23 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data, switch (ChunkGetIdFromTag(subchunk.tag_)) { case WEBP_CHUNK_ALPHA: if (wpi->alpha_ != NULL) goto Fail; // Consecutive ALPH chunks. - if (ChunkSetNth(&subchunk, &wpi->alpha_, 1) != WEBP_MUX_OK) goto Fail; + if (ChunkSetHead(&subchunk, &wpi->alpha_) != WEBP_MUX_OK) goto Fail; wpi->is_partial_ = 1; // Waiting for a VP8 chunk. break; case WEBP_CHUNK_IMAGE: - if (ChunkSetNth(&subchunk, &wpi->img_, 1) != WEBP_MUX_OK) goto Fail; + if (wpi->img_ != NULL) goto Fail; // Only 1 image chunk allowed. + if (ChunkSetHead(&subchunk, &wpi->img_) != WEBP_MUX_OK) goto Fail; if (!MuxImageFinalize(wpi)) goto Fail; wpi->is_partial_ = 0; // wpi is completely filled. break; case WEBP_CHUNK_UNKNOWN: - if (wpi->is_partial_) goto Fail; // Encountered an unknown chunk - // before some image chunks. - if (ChunkSetNth(&subchunk, &wpi->unknown_, 0) != WEBP_MUX_OK) goto Fail; + if (wpi->is_partial_) { + goto Fail; // Encountered an unknown chunk + // before some image chunks. + } + if (ChunkAppend(&subchunk, &unknown_chunk_list) != WEBP_MUX_OK) { + goto Fail; + } break; default: goto Fail; @@ -175,6 +182,9 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, const uint8_t* data; size_t size; WebPChunk chunk; + // Stores the end of the chunk lists so that it is faster to append data to + // their ends. + WebPChunk** chunk_list_ends[WEBP_CHUNK_NIL + 1] = { NULL }; ChunkInit(&chunk); // Sanity checks. 
@@ -187,7 +197,7 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, size = bitstream->size; if (data == NULL) return NULL; - if (size < RIFF_HEADER_SIZE) return NULL; + if (size < RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE) return NULL; if (GetLE32(data + 0) != MKFOURCC('R', 'I', 'F', 'F') || GetLE32(data + CHUNK_HEADER_SIZE) != MKFOURCC('W', 'E', 'B', 'P')) { return NULL; @@ -196,8 +206,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, mux = WebPMuxNew(); if (mux == NULL) return NULL; - if (size < RIFF_HEADER_SIZE + TAG_SIZE) goto Err; - tag = GetLE32(data + RIFF_HEADER_SIZE); if (tag != kChunks[IDX_VP8].tag && tag != kChunks[IDX_VP8L].tag && @@ -205,13 +213,17 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, goto Err; // First chunk should be VP8, VP8L or VP8X. } - riff_size = SizeWithPadding(GetLE32(data + TAG_SIZE)); - if (riff_size > MAX_CHUNK_PAYLOAD || riff_size > size) { - goto Err; - } else { - if (riff_size < size) { // Redundant data after last chunk. - size = riff_size; // To make sure we don't read any data beyond mux_size. - } + riff_size = GetLE32(data + TAG_SIZE); + if (riff_size > MAX_CHUNK_PAYLOAD) goto Err; + + // Note this padding is historical and differs from demux.c which does not + // pad the file size. + riff_size = SizeWithPadding(riff_size); + if (riff_size < CHUNK_HEADER_SIZE) goto Err; + if (riff_size > size) goto Err; + // There's no point in reading past the end of the RIFF chunk. + if (size > riff_size + CHUNK_HEADER_SIZE) { + size = riff_size + CHUNK_HEADER_SIZE; } end = data + size; @@ -226,7 +238,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, while (data != end) { size_t data_size; WebPChunkId id; - WebPChunk** chunk_list; if (ChunkVerifyAndAssign(&chunk, data, size, riff_size, copy_data) != WEBP_MUX_OK) { goto Err; @@ -236,11 +247,11 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, switch (id) { case WEBP_CHUNK_ALPHA: if (wpi->alpha_ != NULL) goto Err; // Consecutive ALPH chunks. - if (ChunkSetNth(&chunk, &wpi->alpha_, 1) != WEBP_MUX_OK) goto Err; + if (ChunkSetHead(&chunk, &wpi->alpha_) != WEBP_MUX_OK) goto Err; wpi->is_partial_ = 1; // Waiting for a VP8 chunk. break; case WEBP_CHUNK_IMAGE: - if (ChunkSetNth(&chunk, &wpi->img_, 1) != WEBP_MUX_OK) goto Err; + if (ChunkSetHead(&chunk, &wpi->img_) != WEBP_MUX_OK) goto Err; if (!MuxImageFinalize(wpi)) goto Err; wpi->is_partial_ = 0; // wpi is completely filled. PushImage: @@ -257,9 +268,13 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, default: // A non-image chunk. if (wpi->is_partial_) goto Err; // Encountered a non-image chunk before // getting all chunks of an image. - chunk_list = MuxGetChunkListFromId(mux, id); // List to add this chunk. - if (ChunkSetNth(&chunk, chunk_list, 0) != WEBP_MUX_OK) goto Err; + if (chunk_list_ends[id] == NULL) { + chunk_list_ends[id] = + MuxGetChunkListFromId(mux, id); // List to add this chunk. 
+ } + if (ChunkAppend(&chunk, &chunk_list_ends[id]) != WEBP_MUX_OK) goto Err; if (id == WEBP_CHUNK_VP8X) { // grab global specs + if (data_size < CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE) goto Err; mux->canvas_width_ = GetLE24(data + 12) + 1; mux->canvas_height_ = GetLE24(data + 15) + 1; } @@ -385,6 +400,10 @@ static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi, uint8_t* const data = (uint8_t*)WebPSafeMalloc(1ULL, size); if (data == NULL) return WEBP_MUX_MEMORY_ERROR; + // There should be at most one alpha_ chunk and exactly one img_ chunk. + assert(wpi->alpha_ == NULL || wpi->alpha_->next_ == NULL); + assert(wpi->img_ != NULL && wpi->img_->next_ == NULL); + // Main RIFF header. dst = MuxEmitRiffHeader(data, size); diff --git a/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h b/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h index 2ccc6ed326..7e607f370a 100644 --- a/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h +++ b/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h @@ -187,4 +187,4 @@ static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) { } // extern "C" #endif -#endif // WEBP_UTILS_BIT_READER_INL_UTILS_H_ +#endif // WEBP_UTILS_BIT_READER_INL_UTILS_H_ diff --git a/3rdparty/libwebp/src/utils/bit_reader_utils.h b/3rdparty/libwebp/src/utils/bit_reader_utils.h index 317d311c17..0f8db2caf2 100644 --- a/3rdparty/libwebp/src/utils/bit_reader_utils.h +++ b/3rdparty/libwebp/src/utils/bit_reader_utils.h @@ -172,4 +172,4 @@ static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) { } // extern "C" #endif -#endif /* WEBP_UTILS_BIT_READER_UTILS_H_ */ +#endif // WEBP_UTILS_BIT_READER_UTILS_H_ diff --git a/3rdparty/libwebp/src/utils/bit_writer_utils.c b/3rdparty/libwebp/src/utils/bit_writer_utils.c index f4f476ce3f..7f83b4c8a2 100644 --- a/3rdparty/libwebp/src/utils/bit_writer_utils.c +++ b/3rdparty/libwebp/src/utils/bit_writer_utils.c @@ -248,6 +248,7 @@ int VP8LBitWriterClone(const VP8LBitWriter* const src, dst->bits_ = src->bits_; dst->used_ = src->used_; dst->error_ = src->error_; + dst->cur_ = dst->buf_ + current_size; return 1; } diff --git a/3rdparty/libwebp/src/utils/bit_writer_utils.h b/3rdparty/libwebp/src/utils/bit_writer_utils.h index 2cf5976fe3..b9d5102a5a 100644 --- a/3rdparty/libwebp/src/utils/bit_writer_utils.h +++ b/3rdparty/libwebp/src/utils/bit_writer_utils.h @@ -151,4 +151,4 @@ static WEBP_INLINE void VP8LPutBits(VP8LBitWriter* const bw, } // extern "C" #endif -#endif /* WEBP_UTILS_BIT_WRITER_UTILS_H_ */ +#endif // WEBP_UTILS_BIT_WRITER_UTILS_H_ diff --git a/3rdparty/libwebp/src/utils/filters_utils.h b/3rdparty/libwebp/src/utils/filters_utils.h index 410f2fcdf2..61da66e212 100644 --- a/3rdparty/libwebp/src/utils/filters_utils.h +++ b/3rdparty/libwebp/src/utils/filters_utils.h @@ -29,4 +29,4 @@ WEBP_FILTER_TYPE WebPEstimateBestFilter(const uint8_t* data, } // extern "C" #endif -#endif /* WEBP_UTILS_FILTERS_UTILS_H_ */ +#endif // WEBP_UTILS_FILTERS_UTILS_H_ diff --git a/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c b/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c index 3818a78b93..f65b6cdbb6 100644 --- a/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c +++ b/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c @@ -261,9 +261,15 @@ static void CleanupParams(SmoothParams* const p) { int WebPDequantizeLevels(uint8_t* const data, int width, int height, int stride, int strength) { - const int radius = 4 * strength / 100; + int radius = 4 * strength / 100; + if (strength < 0 || strength > 100) return 0; if (data == NULL || width <= 
0 || height <= 0) return 0; // bad params + + // limit the filter size to not exceed the image dimensions + if (2 * radius + 1 > width) radius = (width - 1) >> 1; + if (2 * radius + 1 > height) radius = (height - 1) >> 1; + if (radius > 0) { SmoothParams p; memset(&p, 0, sizeof(p)); diff --git a/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h b/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h index f822107a72..327f19f336 100644 --- a/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h +++ b/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h @@ -32,4 +32,4 @@ int WebPDequantizeLevels(uint8_t* const data, int width, int height, int stride, } // extern "C" #endif -#endif /* WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_ */ +#endif // WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_ diff --git a/3rdparty/libwebp/src/utils/quant_levels_utils.h b/3rdparty/libwebp/src/utils/quant_levels_utils.h index 75df2ba6a4..9ee3ea0075 100644 --- a/3rdparty/libwebp/src/utils/quant_levels_utils.h +++ b/3rdparty/libwebp/src/utils/quant_levels_utils.h @@ -33,4 +33,4 @@ int QuantizeLevels(uint8_t* const data, int width, int height, int num_levels, } // extern "C" #endif -#endif /* WEBP_UTILS_QUANT_LEVELS_UTILS_H_ */ +#endif // WEBP_UTILS_QUANT_LEVELS_UTILS_H_ diff --git a/3rdparty/libwebp/src/utils/random_utils.h b/3rdparty/libwebp/src/utils/random_utils.h index 6d36c667e7..a5006f84f7 100644 --- a/3rdparty/libwebp/src/utils/random_utils.h +++ b/3rdparty/libwebp/src/utils/random_utils.h @@ -60,4 +60,4 @@ static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) { } // extern "C" #endif -#endif /* WEBP_UTILS_RANDOM_UTILS_H_ */ +#endif // WEBP_UTILS_RANDOM_UTILS_H_ diff --git a/3rdparty/libwebp/src/utils/rescaler_utils.h b/3rdparty/libwebp/src/utils/rescaler_utils.h index 8890e6fa13..ca41e42c4a 100644 --- a/3rdparty/libwebp/src/utils/rescaler_utils.h +++ b/3rdparty/libwebp/src/utils/rescaler_utils.h @@ -98,4 +98,4 @@ int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) { } // extern "C" #endif -#endif /* WEBP_UTILS_RESCALER_UTILS_H_ */ +#endif // WEBP_UTILS_RESCALER_UTILS_H_ diff --git a/3rdparty/libwebp/src/utils/thread_utils.h b/3rdparty/libwebp/src/utils/thread_utils.h index c8ae6c9033..29ad49f74b 100644 --- a/3rdparty/libwebp/src/utils/thread_utils.h +++ b/3rdparty/libwebp/src/utils/thread_utils.h @@ -87,4 +87,4 @@ WEBP_EXTERN const WebPWorkerInterface* WebPGetWorkerInterface(void); } // extern "C" #endif -#endif /* WEBP_UTILS_THREAD_UTILS_H_ */ +#endif // WEBP_UTILS_THREAD_UTILS_H_ diff --git a/3rdparty/libwebp/src/utils/utils.h b/3rdparty/libwebp/src/utils/utils.h index 52921bf24e..c7620f91ec 100644 --- a/3rdparty/libwebp/src/utils/utils.h +++ b/3rdparty/libwebp/src/utils/utils.h @@ -107,19 +107,6 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) { PutLE16(data + 2, (int)(val >> 16)); } -// Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either -// based on table or not. Can be used as fallback if clz() is not available. -#define WEBP_NEED_LOG_TABLE_8BIT -extern const uint8_t WebPLogTable8bit[256]; -static WEBP_INLINE int WebPLog2FloorC(uint32_t n) { - int log_value = 0; - while (n >= 256) { - log_value += 8; - n >>= 8; - } - return log_value + WebPLogTable8bit[n]; -} - // Returns (int)floor(log2(n)). n must be > 0. // use GNU builtins where available. #if defined(__GNUC__) && \ @@ -138,6 +125,19 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return first_set_bit; } #else // default: use the C-version. +// Returns 31 ^ clz(n) = log2(n). 
This is the default C-implementation, either +// based on table or not. Can be used as fallback if clz() is not available. +#define WEBP_NEED_LOG_TABLE_8BIT +extern const uint8_t WebPLogTable8bit[256]; +static WEBP_INLINE int WebPLog2FloorC(uint32_t n) { + int log_value = 0; + while (n >= 256) { + log_value += 8; + n >>= 8; + } + return log_value + WebPLogTable8bit[n]; +} + static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); } #endif @@ -175,4 +175,4 @@ WEBP_EXTERN int WebPGetColorPalette(const struct WebPPicture* const pic, } // extern "C" #endif -#endif /* WEBP_UTILS_UTILS_H_ */ +#endif // WEBP_UTILS_UTILS_H_ diff --git a/3rdparty/libwebp/src/webp/decode.h b/3rdparty/libwebp/src/webp/decode.h index 2165e96c95..ae8bfe840e 100644 --- a/3rdparty/libwebp/src/webp/decode.h +++ b/3rdparty/libwebp/src/webp/decode.h @@ -42,6 +42,12 @@ WEBP_EXTERN int WebPGetDecoderVersion(void); // This function will also validate the header, returning true on success, // false otherwise. '*width' and '*height' are only valid on successful return. // Pointers 'width' and 'height' can be passed NULL if deemed irrelevant. +// Note: The following chunk sequences (before the raw VP8/VP8L data) are +// considered valid by this function: +// RIFF + VP8(L) +// RIFF + VP8X + (optional chunks) + VP8(L) +// ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose. +// VP8(L) <-- Not a valid WebP format: only allowed for internal purpose. WEBP_EXTERN int WebPGetInfo(const uint8_t* data, size_t data_size, int* width, int* height); @@ -425,6 +431,12 @@ WEBP_EXTERN VP8StatusCode WebPGetFeaturesInternal( // Returns VP8_STATUS_OK when the features are successfully retrieved. Returns // VP8_STATUS_NOT_ENOUGH_DATA when more data is needed to retrieve the // features from headers. Returns error in other cases. +// Note: The following chunk sequences (before the raw VP8/VP8L data) are +// considered valid by this function: +// RIFF + VP8(L) +// RIFF + VP8X + (optional chunks) + VP8(L) +// ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose. +// VP8(L) <-- Not a valid WebP format: only allowed for internal purpose. static WEBP_INLINE VP8StatusCode WebPGetFeatures( const uint8_t* data, size_t data_size, WebPBitstreamFeatures* features) { @@ -491,4 +503,4 @@ WEBP_EXTERN VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size, } // extern "C" #endif -#endif /* WEBP_WEBP_DECODE_H_ */ +#endif // WEBP_WEBP_DECODE_H_ diff --git a/3rdparty/libwebp/src/webp/demux.h b/3rdparty/libwebp/src/webp/demux.h index 555d641338..846eeb15a9 100644 --- a/3rdparty/libwebp/src/webp/demux.h +++ b/3rdparty/libwebp/src/webp/demux.h @@ -360,4 +360,4 @@ WEBP_EXTERN void WebPAnimDecoderDelete(WebPAnimDecoder* dec); } // extern "C" #endif -#endif /* WEBP_WEBP_DEMUX_H_ */ +#endif // WEBP_WEBP_DEMUX_H_ diff --git a/3rdparty/libwebp/src/webp/encode.h b/3rdparty/libwebp/src/webp/encode.h index 7ec3543dc2..549cf07730 100644 --- a/3rdparty/libwebp/src/webp/encode.h +++ b/3rdparty/libwebp/src/webp/encode.h @@ -542,4 +542,4 @@ WEBP_EXTERN int WebPEncode(const WebPConfig* config, WebPPicture* picture); } // extern "C" #endif -#endif /* WEBP_WEBP_ENCODE_H_ */ +#endif // WEBP_WEBP_ENCODE_H_ diff --git a/3rdparty/libwebp/src/webp/format_constants.h b/3rdparty/libwebp/src/webp/format_constants.h index 329fc8a3b0..eca6981a47 100644 --- a/3rdparty/libwebp/src/webp/format_constants.h +++ b/3rdparty/libwebp/src/webp/format_constants.h @@ -84,4 +84,4 @@ typedef enum { // overflow a uint32_t. 
#define MAX_CHUNK_PAYLOAD (~0U - CHUNK_HEADER_SIZE - 1) -#endif /* WEBP_WEBP_FORMAT_CONSTANTS_H_ */ +#endif // WEBP_WEBP_FORMAT_CONSTANTS_H_ diff --git a/3rdparty/libwebp/src/webp/mux.h b/3rdparty/libwebp/src/webp/mux.h index 28bb4a41c9..66096a92e0 100644 --- a/3rdparty/libwebp/src/webp/mux.h +++ b/3rdparty/libwebp/src/webp/mux.h @@ -527,4 +527,4 @@ WEBP_EXTERN void WebPAnimEncoderDelete(WebPAnimEncoder* enc); } // extern "C" #endif -#endif /* WEBP_WEBP_MUX_H_ */ +#endif // WEBP_WEBP_MUX_H_ diff --git a/3rdparty/libwebp/src/webp/mux_types.h b/3rdparty/libwebp/src/webp/mux_types.h index b37e2c67aa..ceea77dfc6 100644 --- a/3rdparty/libwebp/src/webp/mux_types.h +++ b/3rdparty/libwebp/src/webp/mux_types.h @@ -95,4 +95,4 @@ static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) { } // extern "C" #endif -#endif /* WEBP_WEBP_MUX_TYPES_H_ */ +#endif // WEBP_WEBP_MUX_TYPES_H_ diff --git a/3rdparty/libwebp/src/webp/types.h b/3rdparty/libwebp/src/webp/types.h index 989a763f0d..0ce2622e41 100644 --- a/3rdparty/libwebp/src/webp/types.h +++ b/3rdparty/libwebp/src/webp/types.h @@ -49,4 +49,4 @@ typedef long long int int64_t; // Macro to check ABI compatibility (same major revision number) #define WEBP_ABI_IS_INCOMPATIBLE(a, b) (((a) >> 8) != ((b) >> 8)) -#endif /* WEBP_WEBP_TYPES_H_ */ +#endif // WEBP_WEBP_TYPES_H_ diff --git a/3rdparty/protobuf/README.md b/3rdparty/protobuf/README.md index e0edb75f58..30404036a8 100644 --- a/3rdparty/protobuf/README.md +++ b/3rdparty/protobuf/README.md @@ -1,3 +1,3 @@ Project: Protocol Buffers - Google's data interchange format -Source code: https://github.com/google/protobuf -Version: 3.5.1 +Source code: https://github.com/protocolbuffers/protobuf +Version: 3.5.2 diff --git a/3rdparty/protobuf/src/google/protobuf/stubs/io_win32.cc b/3rdparty/protobuf/src/google/protobuf/stubs/io_win32.cc index ad2d2d265e..b59b8e487b 100644 --- a/3rdparty/protobuf/src/google/protobuf/stubs/io_win32.cc +++ b/3rdparty/protobuf/src/google/protobuf/stubs/io_win32.cc @@ -91,7 +91,7 @@ struct CharTraits { template <typename char_type> bool null_or_empty(const char_type* s) { - return s == nullptr || *s == 0; + return s == NULL || *s == 0; } // Returns true if the path starts with a drive letter, e.g. "c:".
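(Note on the assert(chunk_size <= MAX_CHUNK_PAYLOAD) added to SizeWithPadding() earlier: a quick standalone check, ours, with the two constants mirrored from format_constants.h, that a maximal payload still yields a padded size that fits in 32 bits.)

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CHUNK_HEADER_SIZE 8u   /* fourcc (4 bytes) + size field (4 bytes) */
#define MAX_CHUNK_PAYLOAD (~0U - CHUNK_HEADER_SIZE - 1)

int main(void) {
  const uint32_t chunk_size = MAX_CHUNK_PAYLOAD;   /* 0xFFFFFFF6 */
  /* Same arithmetic as SizeWithPadding(): header + payload rounded up to even. */
  const uint64_t padded =
      (uint64_t)CHUNK_HEADER_SIZE + ((chunk_size + 1ull) & ~1ull);
  assert(padded <= UINT32_MAX);                    /* 0xFFFFFFFE: no overflow */
  printf("max padded chunk size: 0x%llx\n", (unsigned long long)padded);
  return 0;
}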
diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f09e3e7d6..523d234fdf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -463,7 +463,7 @@ OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OCV_OPTION(OPENCV_ENABLE_MEMORY_SANITIZER "Better support for memory/address sanitizers" OFF) OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CV_GCC ) OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) ) -OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CV_GCC AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_FAST_MATH "Enable compiler options for fast math optimizations on FP computations (not recommended)" OFF) if(NOT IOS AND (NOT ANDROID OR OPENCV_ANDROID_USE_LEGACY_FLAGS)) # Use CPU_BASELINE instead OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) ) OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) ) diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 8247a9a53c..fcf31135f0 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -635,10 +635,6 @@ macro(ocv_compiler_optimization_options_finalize) if(MSVC) # Generate Intrinsic Functions set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi") - - if((X86 OR X86_64) AND CMAKE_SIZEOF_VOID_P EQUAL 4 AND ";${CPU_BASELINE_FINAL};" MATCHES ";SSE;") - set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /fp:fast") # !! important - be on the same wave with x64 compilers - endif() endif(MSVC) endmacro() diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index c3d5ed593f..a5e6f5fd41 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -75,11 +75,25 @@ if(NOT MSVC) add_extra_compiler_option(-fsigned-char) endif() -if(CV_ICC AND NOT ENABLE_FAST_MATH) - if(MSVC) - add_extra_compiler_option("/fp:precise") - else() - add_extra_compiler_option("-fp-model precise") +if(MSVC) + if(NOT " ${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}" MATCHES " /fp:") + if(ENABLE_FAST_MATH) + add_extra_compiler_option("/fp:fast") + else() + add_extra_compiler_option("/fp:precise") + endif() + endif() +elseif(CV_ICC) + if(NOT " ${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}" MATCHES " /fp:" + AND NOT " ${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}" MATCHES " -fp-model" + ) + if(NOT ENABLE_FAST_MATH) + add_extra_compiler_option("-fp-model precise") + endif() + endif() +elseif(CV_GCC OR CV_CLANG) + if(ENABLE_FAST_MATH) + add_extra_compiler_option(-ffast-math) endif() endif() @@ -167,9 +181,6 @@ if(CV_GCC OR CV_CLANG) elseif(DEFINED ENABLE_OMIT_FRAME_POINTER) add_extra_compiler_option(-fno-omit-frame-pointer) endif() - if(ENABLE_FAST_MATH) - add_extra_compiler_option(-ffast-math) - endif() # Profiling? 
if(ENABLE_PROFILING) diff --git a/doc/py_tutorials/py_objdetect/images/face_icon.jpg b/doc/py_tutorials/py_objdetect/images/face_icon.jpg deleted file mode 100644 index a7def47137..0000000000 Binary files a/doc/py_tutorials/py_objdetect/images/face_icon.jpg and /dev/null differ diff --git a/doc/py_tutorials/py_objdetect/py_face_detection/images/face.jpg b/doc/py_tutorials/py_objdetect/py_face_detection/images/face.jpg deleted file mode 100644 index 913a7f18dd..0000000000 Binary files a/doc/py_tutorials/py_objdetect/py_face_detection/images/face.jpg and /dev/null differ diff --git a/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.markdown b/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.markdown index 3b4308a958..81973190aa 100644 --- a/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.markdown +++ b/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.markdown @@ -1,134 +1,4 @@ Face Detection using Haar Cascades {#tutorial_py_face_detection} ================================== -Goal ----- - -In this session, - -- We will see the basics of face detection using Haar Feature-based Cascade Classifiers -- We will extend the same for eye detection etc. - -Basics ------- - -Object Detection using Haar feature-based cascade classifiers is an effective object detection -method proposed by Paul Viola and Michael Jones in their paper, "Rapid Object Detection using a -Boosted Cascade of Simple Features" in 2001. It is a machine learning based approach where a cascade -function is trained from a lot of positive and negative images. It is then used to detect objects in -other images. - -Here we will work with face detection. Initially, the algorithm needs a lot of positive images -(images of faces) and negative images (images without faces) to train the classifier. Then we need -to extract features from it. For this, Haar features shown in the below image are used. They are just -like our convolutional kernel. Each feature is a single value obtained by subtracting sum of pixels -under the white rectangle from sum of pixels under the black rectangle. - -![image](images/haar_features.jpg) - -Now, all possible sizes and locations of each kernel are used to calculate lots of features. (Just -imagine how much computation it needs? Even a 24x24 window results over 160000 features). For each -feature calculation, we need to find the sum of the pixels under white and black rectangles. To solve -this, they introduced the integral image. However large your image, it reduces the calculations for a -given pixel to an operation involving just four pixels. Nice, isn't it? It makes things super-fast. - -But among all these features we calculated, most of them are irrelevant. For example, consider the -image below. The top row shows two good features. The first feature selected seems to focus on the -property that the region of the eyes is often darker than the region of the nose and cheeks. The -second feature selected relies on the property that the eyes are darker than the bridge of the nose. -But the same windows applied to cheeks or any other place is irrelevant. So how do we select the -best features out of 160000+ features? It is achieved by **Adaboost**. - -![image](images/haar.png) - -For this, we apply each and every feature on all the training images. For each feature, it finds the -best threshold which will classify the faces to positive and negative. Obviously, there will be -errors or misclassifications. 
We select the features with minimum error rate, which means they are -the features that most accurately classify the face and non-face images. (The process is not as simple as -this. Each image is given an equal weight in the beginning. After each classification, weights of -misclassified images are increased. Then the same process is done. New error rates are calculated. -Also new weights. The process is continued until the required accuracy or error rate is achieved or -the required number of features are found). - -The final classifier is a weighted sum of these weak classifiers. It is called weak because it alone -can't classify the image, but together with others forms a strong classifier. The paper says even -200 features provide detection with 95% accuracy. Their final setup had around 6000 features. -(Imagine a reduction from 160000+ features to 6000 features. That is a big gain). - -So now you take an image. Take each 24x24 window. Apply 6000 features to it. Check if it is face or -not. Wow.. Isn't it a little inefficient and time consuming? Yes, it is. The authors have a good -solution for that. - -In an image, most of the image is non-face region. So it is a better idea to have a simple -method to check if a window is not a face region. If it is not, discard it in a single shot, and don't -process it again. Instead, focus on regions where there can be a face. This way, we spend more time -checking possible face regions. - -For this they introduced the concept of **Cascade of Classifiers**. Instead of applying all 6000 -features on a window, the features are grouped into different stages of classifiers and applied one-by-one. -(Normally the first few stages will contain very many fewer features). If a window fails the first -stage, discard it. We don't consider the remaining features on it. If it passes, apply the second stage -of features and continue the process. The window which passes all stages is a face region. How is -that plan! - -The authors' detector had 6000+ features with 38 stages with 1, 10, 25, 25 and 50 features in the first five -stages. (The two features in the above image are actually obtained as the best two features from -Adaboost). According to the authors, on average 10 features out of 6000+ are evaluated per -sub-window. - -So this is a simple intuitive explanation of how Viola-Jones face detection works. Read the paper for -more details or check out the references in the Additional Resources section. - -Haar-cascade Detection in OpenCV --------------------------------- - -OpenCV comes with a trainer as well as detector. If you want to train your own classifier for any -object like car, planes etc. you can use OpenCV to create one. Its full details are given here: -[Cascade Classifier Training](@ref tutorial_traincascade). - -Here we will deal with detection. OpenCV already contains many pre-trained classifiers for face, -eyes, smiles, etc. Those XML files are stored in the opencv/data/haarcascades/ folder. Let's create a -face and eye detector with OpenCV. - -First we need to load the required XML classifiers. Then load our input image (or video) in -grayscale mode. -@code{.py} -import numpy as np -import cv2 as cv - -face_cascade = cv.CascadeClassifier('haarcascade_frontalface_default.xml') -eye_cascade = cv.CascadeClassifier('haarcascade_eye.xml') - -img = cv.imread('sachin.jpg') -gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY) -@endcode -Now we find the faces in the image. If faces are found, it returns the positions of detected faces -as Rect(x,y,w,h). 
Once we get these locations, we can create a ROI for the face and apply eye -detection on this ROI (since eyes are always on the face !!! ). -@code{.py} -faces = face_cascade.detectMultiScale(gray, 1.3, 5) -for (x,y,w,h) in faces: - cv.rectangle(img,(x,y),(x+w,y+h),(255,0,0),2) - roi_gray = gray[y:y+h, x:x+w] - roi_color = img[y:y+h, x:x+w] - eyes = eye_cascade.detectMultiScale(roi_gray) - for (ex,ey,ew,eh) in eyes: - cv.rectangle(roi_color,(ex,ey),(ex+ew,ey+eh),(0,255,0),2) - -cv.imshow('img',img) -cv.waitKey(0) -cv.destroyAllWindows() -@endcode -Result looks like below: - -![image](images/face.jpg) - -Additional Resources --------------------- - --# Video Lecture on [Face Detection and Tracking](https://www.youtube.com/watch?v=WfdYYNamHZ8) --# An interesting interview regarding Face Detection by [Adam - Harvey](https://web.archive.org/web/20171204220159/http://www.makematics.com/research/viola-jones/) - -Exercises ---------- +Tutorial content has been moved: @ref tutorial_cascade_classifier diff --git a/doc/py_tutorials/py_objdetect/py_table_of_contents_objdetect.markdown b/doc/py_tutorials/py_objdetect/py_table_of_contents_objdetect.markdown index 8ea0504ea4..9375ca1e4b 100644 --- a/doc/py_tutorials/py_objdetect/py_table_of_contents_objdetect.markdown +++ b/doc/py_tutorials/py_objdetect/py_table_of_contents_objdetect.markdown @@ -1,7 +1,4 @@ Object Detection {#tutorial_py_table_of_contents_objdetect} ================ -- @subpage tutorial_py_face_detection - - Face detection - using haar-cascades +Content has been moved: @ref tutorial_table_of_content_objdetect diff --git a/doc/py_tutorials/py_tutorials.markdown b/doc/py_tutorials/py_tutorials.markdown index 7d9298d68e..6957cac53d 100644 --- a/doc/py_tutorials/py_tutorials.markdown +++ b/doc/py_tutorials/py_tutorials.markdown @@ -45,10 +45,10 @@ OpenCV-Python Tutorials {#tutorial_py_root} In this section you will learn different computational photography techniques like image denoising etc. -- @subpage tutorial_py_table_of_contents_objdetect +- @ref tutorial_table_of_content_objdetect In this section you - will object detection techniques like face detection etc. + will learn object detection techniques like face detection etc. - @subpage tutorial_py_table_of_contents_bindings diff --git a/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown b/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown index c2f7851a7b..122c1ca0f1 100644 --- a/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown +++ b/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown @@ -4,9 +4,11 @@ Cascade Classifier {#tutorial_cascade_classifier} Goal ---- -In this tutorial you will learn how to: +In this tutorial, -- Use the @ref cv::CascadeClassifier class to detect objects in a video stream. Particularly, we +- We will learn how the Haar cascade object detection works. +- We will see the basics of face detection and eye detection using the Haar Feature-based Cascade Classifiers +- We will use the @ref cv::CascadeClassifier class to detect objects in a video stream. Particularly, we will use the functions: - @ref cv::CascadeClassifier::load to load a .xml classifier file. It can be either a Haar or a LBP classifer - @ref cv::CascadeClassifier::detectMultiScale to perform the detection. 
@@ -14,8 +16,81 @@ In this tutorial you will learn how to: Theory ------ -Code ----- +Object Detection using Haar feature-based cascade classifiers is an effective object detection +method proposed by Paul Viola and Michael Jones in their paper, "Rapid Object Detection using a +Boosted Cascade of Simple Features" in 2001. It is a machine learning based approach where a cascade +function is trained from a lot of positive and negative images. It is then used to detect objects in +other images. + +Here we will work with face detection. Initially, the algorithm needs a lot of positive images +(images of faces) and negative images (images without faces) to train the classifier. Then we need +to extract features from it. For this, Haar features shown in the below image are used. They are just +like our convolutional kernel. Each feature is a single value obtained by subtracting sum of pixels +under the white rectangle from sum of pixels under the black rectangle. + +![image](images/haar_features.jpg) + +Now, all possible sizes and locations of each kernel are used to calculate lots of features. (Just +imagine how much computation it needs? Even a 24x24 window results over 160000 features). For each +feature calculation, we need to find the sum of the pixels under white and black rectangles. To solve +this, they introduced the integral image. However large your image, it reduces the calculations for a +given pixel to an operation involving just four pixels. Nice, isn't it? It makes things super-fast. + +But among all these features we calculated, most of them are irrelevant. For example, consider the +image below. The top row shows two good features. The first feature selected seems to focus on the +property that the region of the eyes is often darker than the region of the nose and cheeks. The +second feature selected relies on the property that the eyes are darker than the bridge of the nose. +But the same windows applied to cheeks or any other place is irrelevant. So how do we select the +best features out of 160000+ features? It is achieved by **Adaboost**. + +![image](images/haar.png) + +For this, we apply each and every feature on all the training images. For each feature, it finds the +best threshold which will classify the faces to positive and negative. Obviously, there will be +errors or misclassifications. We select the features with minimum error rate, which means they are +the features that most accurately classify the face and non-face images. (The process is not as simple as +this. Each image is given an equal weight in the beginning. After each classification, weights of +misclassified images are increased. Then the same process is done. New error rates are calculated. +Also new weights. The process is continued until the required accuracy or error rate is achieved or +the required number of features are found). + +The final classifier is a weighted sum of these weak classifiers. It is called weak because it alone +can't classify the image, but together with others forms a strong classifier. The paper says even +200 features provide detection with 95% accuracy. Their final setup had around 6000 features. +(Imagine a reduction from 160000+ features to 6000 features. That is a big gain). + +So now you take an image. Take each 24x24 window. Apply 6000 features to it. Check if it is face or +not. Wow.. Isn't it a little inefficient and time consuming? Yes, it is. The authors have a good +solution for that. + +In an image, most of the image is non-face region. 
So it is a better idea to have a simple +method to check if a window is not a face region. If it is not, discard it in a single shot, and don't +process it again. Instead, focus on regions where there can be a face. This way, we spend more time +checking possible face regions. + +For this they introduced the concept of **Cascade of Classifiers**. Instead of applying all 6000 +features on a window, the features are grouped into different stages of classifiers and applied one-by-one. +(Normally the first few stages will contain very many fewer features). If a window fails the first +stage, discard it. We don't consider the remaining features on it. If it passes, apply the second stage +of features and continue the process. The window which passes all stages is a face region. How is +that plan! + +The authors' detector had 6000+ features with 38 stages with 1, 10, 25, 25 and 50 features in the first five +stages. (The two features in the above image are actually obtained as the best two features from +Adaboost). According to the authors, on average 10 features out of 6000+ are evaluated per +sub-window. + +So this is a simple intuitive explanation of how Viola-Jones face detection works. Read the paper for +more details or check out the references in the Additional Resources section. + +Haar-cascade Detection in OpenCV +-------------------------------- +OpenCV provides a training method (see @ref tutorial_traincascade) or pretrained models, that can be read using the @ref cv::CascadeClassifier::load method. +The pretrained models are located in the data folder in the OpenCV installation or can be found [here](https://github.com/opencv/opencv/tree/master/data). + +The following code example will use pretrained Haar cascade models to detect faces and eyes in an image. +First, a @ref cv::CascadeClassifier is created and the necessary XML file is loaded using the @ref cv::CascadeClassifier::load method. +Afterwards, the detection is done using the @ref cv::CascadeClassifier::detectMultiScale method, which returns boundary rectangles for the detected faces or eyes. @add_toggle_cpp This tutorial code's is shown lines below. You can also download it from @@ -35,9 +110,6 @@ This tutorial code's is shown lines below. 
You can also download it from @include samples/python/tutorial_code/objectDetection/cascade_classifier/objectDetection.py @end_toggle -Explanation ------------ - Result ------ diff --git a/doc/py_tutorials/py_objdetect/py_face_detection/images/haar.png b/doc/tutorials/objdetect/cascade_classifier/images/haar.png similarity index 100% rename from doc/py_tutorials/py_objdetect/py_face_detection/images/haar.png rename to doc/tutorials/objdetect/cascade_classifier/images/haar.png diff --git a/doc/py_tutorials/py_objdetect/py_face_detection/images/haar_features.jpg b/doc/tutorials/objdetect/cascade_classifier/images/haar_features.jpg similarity index 100% rename from doc/py_tutorials/py_objdetect/py_face_detection/images/haar_features.jpg rename to doc/tutorials/objdetect/cascade_classifier/images/haar_features.jpg diff --git a/modules/calib3d/src/calibration.cpp b/modules/calib3d/src/calibration.cpp index 8f0a4a2449..d58571afcf 100644 --- a/modules/calib3d/src/calibration.cpp +++ b/modules/calib3d/src/calibration.cpp @@ -2501,7 +2501,7 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2, CvRect* roi1, CvRect* roi2 ) { double _om[3], _t[3] = {0}, _uu[3]={0,0,0}, _r_r[3][3], _pp[3][4]; - double _ww[3], _wr[3][3], _z[3] = {0,0,0}, _ri[3][3], _w3[3]; + double _ww[3], _wr[3][3], _z[3] = {0,0,0}, _ri[3][3]; cv::Rect_ inner1, inner2, outer1, outer2; CvMat om = cvMat(3, 1, CV_64F, _om); @@ -2510,13 +2510,11 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2, CvMat r_r = cvMat(3, 3, CV_64F, _r_r); CvMat pp = cvMat(3, 4, CV_64F, _pp); CvMat ww = cvMat(3, 1, CV_64F, _ww); // temps - CvMat w3 = cvMat(3, 1, CV_64F, _w3); // temps CvMat wR = cvMat(3, 3, CV_64F, _wr); CvMat Z = cvMat(3, 1, CV_64F, _z); CvMat Ri = cvMat(3, 3, CV_64F, _ri); double nx = imageSize.width, ny = imageSize.height; int i, k; - double nt, nw; if( matR->rows == 3 && matR->cols == 3 ) cvRodrigues2(matR, &om); // get vector rotation @@ -2527,36 +2525,15 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2, cvMatMul(&r_r, matT, &t); int idx = fabs(_t[0]) > fabs(_t[1]) ? 0 : 1; + double c = _t[idx], nt = cvNorm(&t, 0, CV_L2); + _uu[idx] = c > 0 ? 
1 : -1; - // if idx == 0 - // e1 = T / ||T|| - // e2 = e1 x [0,0,1] - - // if idx == 1 - // e2 = T / ||T|| - // e1 = e2 x [0,0,1] - - // e3 = e1 x e2 - - _uu[2] = 1; - cvCrossProduct(&uu, &t, &ww); - nt = cvNorm(&t, 0, CV_L2); - CV_Assert(fabs(nt) > 0); - nw = cvNorm(&ww, 0, CV_L2); - CV_Assert(fabs(nw) > 0); - cvConvertScale(&ww, &ww, 1 / nw); - cvCrossProduct(&t, &ww, &w3); - nw = cvNorm(&w3, 0, CV_L2); - CV_Assert(fabs(nw) > 0); - cvConvertScale(&w3, &w3, 1 / nw); - _uu[2] = 0; - - for (i = 0; i < 3; ++i) - { - _wr[idx][i] = -_t[i] / nt; - _wr[idx ^ 1][i] = -_ww[i]; - _wr[2][i] = _w3[i] * (1 - 2 * idx); // if idx == 1 -> opposite direction - } + // calculate global Z rotation + cvCrossProduct(&t,&uu,&ww); + double nw = cvNorm(&ww, 0, CV_L2); + if (nw > 0.0) + cvConvertScale(&ww, &ww, acos(fabs(c)/nt)/nw); + cvRodrigues2(&ww, &wR); // apply to both views cvGEMM(&wR, &r_r, 1, 0, 0, &Ri, CV_GEMM_B_T); diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp index f0a2324a38..6274c1b2c3 100644 --- a/modules/calib3d/test/test_cameracalibration.cpp +++ b/modules/calib3d/test/test_cameracalibration.cpp @@ -2089,6 +2089,44 @@ TEST(Calib3d_StereoCalibrate, regression_10791) EXPECT_GE(roi2.area(), 400*300) << roi2; } +TEST(Calib3d_StereoCalibrate, regression_11131) +{ + const Matx33d M1( + 1457.572438721727, 0, 1212.945694211622, + 0, 1457.522226502963, 1007.32058848921, + 0, 0, 1 + ); + const Matx33d M2( + 1460.868570835972, 0, 1215.024068023046, + 0, 1460.791367088, 1011.107202932225, + 0, 0, 1 + ); + const Matx D1(0, 0, 0, 0, 0); + const Matx D2(0, 0, 0, 0, 0); + + const Matx33d R( + 0.9985404059825475, 0.02963547172078553, -0.04515303352041626, + -0.03103795276460111, 0.9990471552537432, -0.03068268351343364, + 0.04420071389006859, 0.03203935697372317, 0.9985087763742083 + ); + const Matx31d T(0.9995500167379527, 0.0116311595111068, 0.02764923448462666); + + const Size imageSize(2456, 2058); + + Mat R1, R2, P1, P2, Q; + Rect roi1, roi2; + stereoRectify(M1, D1, M2, D2, imageSize, R, T, + R1, R2, P1, P2, Q, + CALIB_ZERO_DISPARITY, 1, imageSize, &roi1, &roi2); + + EXPECT_GT(P1.at(0, 0), 0); + EXPECT_GT(P2.at(0, 0), 0); + EXPECT_GT(R1.at(0, 0), 0); + EXPECT_GT(R2.at(0, 0), 0); + EXPECT_GE(roi1.area(), 400*300) << roi1; + EXPECT_GE(roi2.area(), 400*300) << roi2; +} + TEST(Calib3d_Triangulate, accuracy) { // the testcase from http://code.opencv.org/issues/4334 diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index c12847e3e7..246f23b0b8 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -13,8 +13,9 @@ ocv_add_dispatched_file(split SSE2 AVX2) ocv_add_dispatched_file(sum SSE2 AVX2) # dispatching for accuracy tests -ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2) -ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2) +ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX) +ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX) +ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX) ocv_add_module(core OPTIONAL opencv_cudev diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 460c5c5900..adce1b3fb1 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -180,6 +180,18 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #endif +// 
AVX512 can be used together with SSE2 and AVX2, so +// we define those sets of intrinsics at once. +// For some of AVX512 intrinsics get v512_ prefix instead of v_, e.g. v512_load() vs v_load(). +// Wide intrinsics will be mapped to v512_ counterparts in this case(e.g. vx_load() => v512_load()) +#if CV_AVX512_SKX + +#define CV__SIMD_FORWARD 512 +#include "opencv2/core/hal/intrin_forward.hpp" +#include "opencv2/core/hal/intrin_avx512.hpp" + +#endif + //! @cond IGNORED namespace cv { @@ -321,13 +333,41 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8); #endif +#if CV_SIMD512 + CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void); + CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void); + CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void); + CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void); + CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void); + CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void); + CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16); + CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void); + CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void); + CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16); +#endif + #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512) #define CV__SIMD_NAMESPACE simd512 namespace CV__SIMD_NAMESPACE { #define CV_SIMD 1 #define CV_SIMD_64F CV_SIMD512_64F + #define CV_SIMD_FP16 CV_SIMD512_FP16 #define CV_SIMD_WIDTH 64 - // TODO typedef v_uint8 / v_int32 / etc types here + typedef v_uint8x64 v_uint8; + typedef v_int8x64 v_int8; + typedef v_uint16x32 v_uint16; + typedef v_int16x32 v_int16; + typedef v_uint32x16 v_uint32; + typedef v_int32x16 v_int32; + typedef v_uint64x8 v_uint64; + typedef v_int64x8 v_int64; + typedef v_float32x16 v_float32; + CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v512) +#if CV_SIMD512_64F + typedef v_float64x8 v_float64; + CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v512, load) +#endif + inline void vx_cleanup() { v512_cleanup(); } } // namespace using namespace CV__SIMD_NAMESPACE; #elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index cd7490bb0d..f765c22a10 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1083,7 +1083,7 @@ OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32) __m128 v1 = _v256_extract_high(a.val); \ v0 = intrin(v0, v1); \ v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \ - v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 3))); \ + v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 1))); \ return _mm_cvtss_f32(v0); \ } diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp new file mode 100644 index 0000000000..69d8d8398d --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -0,0 
+1,2743 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_AVX512_HPP +#define OPENCV_HAL_INTRIN_AVX512_HPP + +#define CVT_ROUND_MODES_IMPLEMENTED 0 + +#define CV_SIMD512 1 +#define CV_SIMD512_64F 1 +#define CV_SIMD512_FP16 0 // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1) + +#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0)) +#define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \ + ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0)) +#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \ + ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \ + ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \ + ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0)) +#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \ + a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \ + a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \ + ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \ + ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \ + ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \ + ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \ + ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \ + ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \ + ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( 
a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0)) +#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \ + a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \ + a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \ + (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \ + (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \ + (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \ + (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \ + (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \ + (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \ + (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0)) + +#ifndef _mm512_cvtpd_pslo +#ifdef _mm512_zextsi256_si512 +#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a)) +#else +//if preferred way to extend with zeros is unavailable +#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a)) +#endif +#endif +///////// Utils //////////// + +namespace +{ + +inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi) +{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); } + +inline __m512 _v512_combine(const __m256& lo, const __m256& hi) +{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); } + +inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi) +{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); } + +inline int _v_cvtsi512_si32(const __m512i& a) +{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); } + +inline __m256i _v512_extract_high(const __m512i& v) +{ return _mm512_extracti32x8_epi32(v, 1); } + +inline __m256 _v512_extract_high(const __m512& v) +{ return _mm512_extractf32x8_ps(v, 1); } + +inline __m256d _v512_extract_high(const __m512d& v) +{ return _mm512_extractf64x4_pd(v, 1); } + +inline __m256i _v512_extract_low(const __m512i& v) +{ return _mm512_castsi512_si256(v); } + +inline __m256 _v512_extract_low(const __m512& v) +{ return _mm512_castps512_ps256(v); } + +inline __m256d _v512_extract_low(const __m512d& v) +{ return _mm512_castpd512_pd256(v); } + +inline __m512i _v512_insert(const __m512i& a, const __m256i& b) +{ return _mm512_inserti32x8(a, b, 0); } + +inline __m512 _v512_insert(const __m512& a, const __m256& b) +{ return _mm512_insertf32x8(a, b, 0); } + +inline __m512d _v512_insert(const __m512d& a, const __m256d& b) +{ return _mm512_insertf64x4(a, b, 0); } + +} + +namespace cv +{ + +//! 
@cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Types //////////// + +struct v_uint8x64 +{ + typedef uchar lane_type; + enum { nlanes = 64 }; + __m512i val; + + explicit v_uint8x64(__m512i v) : val(v) {} + v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31, + uchar v32, uchar v33, uchar v34, uchar v35, + uchar v36, uchar v37, uchar v38, uchar v39, + uchar v40, uchar v41, uchar v42, uchar v43, + uchar v44, uchar v45, uchar v46, uchar v47, + uchar v48, uchar v49, uchar v50, uchar v51, + uchar v52, uchar v53, uchar v54, uchar v55, + uchar v56, uchar v57, uchar v58, uchar v59, + uchar v60, uchar v61, uchar v62, uchar v63) + { + val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_uint8x64() : val(_mm512_setzero_si512()) {} + uchar get0() const { return (uchar)_v_cvtsi512_si32(val); } +}; + +struct v_int8x64 +{ + typedef schar lane_type; + enum { nlanes = 64 }; + __m512i val; + + explicit v_int8x64(__m512i v) : val(v) {} + v_int8x64(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31, + schar v32, schar v33, schar v34, schar v35, + schar v36, schar v37, schar v38, schar v39, + schar v40, schar v41, schar v42, schar v43, + schar v44, schar v45, schar v46, schar v47, + schar v48, schar v49, schar v50, schar v51, + schar v52, schar v53, schar v54, schar v55, + schar v56, schar v57, schar v58, schar v59, + schar v60, schar v61, schar v62, schar v63) + { + val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_int8x64() : val(_mm512_setzero_si512()) {} + schar get0() const { return (schar)_v_cvtsi512_si32(val); } +}; + +struct v_uint16x32 +{ + typedef ushort lane_type; + enum { nlanes = 32 }; + __m512i val; + + explicit v_uint16x32(__m512i v) : val(v) {} + v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15, + ushort v16, ushort v17, ushort v18, ushort v19, + ushort v20, ushort v21, ushort v22, ushort v23, + ushort v24, ushort v25, ushort v26, ushort v27, + ushort v28, ushort v29, ushort v30, ushort v31) + { + val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_uint16x32() : val(_mm512_setzero_si512()) {} + ushort get0() const 
{ return (ushort)_v_cvtsi512_si32(val); } +}; + +struct v_int16x32 +{ + typedef short lane_type; + enum { nlanes = 32 }; + __m512i val; + + explicit v_int16x32(__m512i v) : val(v) {} + v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15, + short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23, + short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31) + { + val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24, + (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16, + (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8, + (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0); + } + v_int16x32() : val(_mm512_setzero_si512()) {} + short get0() const { return (short)_v_cvtsi512_si32(val); } +}; + +struct v_uint32x16 +{ + typedef unsigned lane_type; + enum { nlanes = 16 }; + __m512i val; + + explicit v_uint32x16(__m512i v) : val(v) {} + v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7, + unsigned v8, unsigned v9, unsigned v10, unsigned v11, + unsigned v12, unsigned v13, unsigned v14, unsigned v15) + { + val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7, + (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15); + } + v_uint32x16() : val(_mm512_setzero_si512()) {} + unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); } +}; + +struct v_int32x16 +{ + typedef int lane_type; + enum { nlanes = 16 }; + __m512i val; + + explicit v_int32x16(__m512i v) : val(v) {} + v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15) + { + val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_int32x16() : val(_mm512_setzero_si512()) {} + int get0() const { return _v_cvtsi512_si32(val); } +}; + +struct v_float32x16 +{ + typedef float lane_type; + enum { nlanes = 16 }; + __m512 val; + + explicit v_float32x16(__m512 v) : val(v) {} + v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, + float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) + { + val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_float32x16() : val(_mm512_setzero_ps()) {} + float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); } +}; + +struct v_uint64x8 +{ + typedef uint64 lane_type; + enum { nlanes = 8 }; + __m512i val; + + explicit v_uint64x8(__m512i v) : val(v) {} + v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7) + { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); } + v_uint64x8() : val(_mm512_setzero_si512()) {} + uint64 get0() const + { + #if defined __x86_64__ || defined _M_X64 + return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val)); + int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32))); + return (unsigned)a | 
((uint64)(unsigned)b << 32); + #endif + } +}; + +struct v_int64x8 +{ + typedef int64 lane_type; + enum { nlanes = 8 }; + __m512i val; + + explicit v_int64x8(__m512i v) : val(v) {} + v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7) + { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); } + v_int64x8() : val(_mm512_setzero_si512()) {} + + int64 get0() const + { + #if defined __x86_64__ || defined _M_X64 + return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val)); + int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32))); + return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); + #endif + } +}; + +struct v_float64x8 +{ + typedef double lane_type; + enum { nlanes = 8 }; + __m512d val; + + explicit v_float64x8(__m512d v) : val(v) {} + v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) + { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); } + v_float64x8() : val(_mm512_setzero_pd()) {} + double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); } +}; + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v512_load(const _Tp* ptr) \ + { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \ + inline _Tpvec v512_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \ + inline _Tpvec v512_load_low(const _Tp* ptr) \ + { \ + __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \ + return _Tpvec(_mm512_castsi256_si512(v256)); \ + } \ + inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \ + __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \ + return _Tpvec(_v512_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm512_storeu_si512((__m512i*)ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm512_store_si512((__m512i*)ptr, a.val); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { _mm512_stream_si512((__m512i*)ptr, a.val); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm512_storeu_si512((__m512i*)ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm512_stream_si512((__m512i*)ptr, a.val); \ + else \ + _mm512_store_si512((__m512i*)ptr, a.val); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64, uchar) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64, schar) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32, short) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16, int) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8, uint64) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8, int64) + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \ + inline _Tpvec v512_load(const _Tp* ptr) \ + { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \ + inline _Tpvec v512_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm512_load_##suffix(ptr)); } \ + inline _Tpvec 
v512_load_low(const _Tp* ptr) \ + { \ + return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \ + (_mm256_loadu_##suffix(ptr))); \ + } \ + inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + halfreg vlo = _mm256_loadu_##suffix(ptr0); \ + halfreg vhi = _mm256_loadu_##suffix(ptr1); \ + return _Tpvec(_v512_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm512_storeu_##suffix(ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm512_store_##suffix(ptr, a.val); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { _mm512_stream_##suffix(ptr, a.val); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm512_storeu_##suffix(ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm512_stream_##suffix(ptr, a.val); \ + else \ + _mm512_store_##suffix(ptr, a.val); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float, ps, __m256) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d) + +#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v512_setzero_##suffix() \ + { return _Tpvec(_mm512_setzero_si512()); } \ + inline _Tpvec v512_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8, suffix, _mm512_castpd_si512) + +OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64, uchar, u8, epi8, char) +OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64, schar, s8, epi8, char) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort, u16, epi16, short) +OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32, short, s16, epi16, short) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int) +OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16, int, s32, epi32, int) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8, uint64, u64, epi64, int64) +OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64) + +#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v512_setzero_##suffix() \ + { return _Tpvec(_mm512_setzero_##zsuffix()); } \ + inline _Tpvec v512_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm512_set1_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \ 
+ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, cast) + +OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float, f32, ps, _mm512_castsi512_ps) +OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8, double, f64, pd, _mm512_castsi512_pd) + +inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a) +{ return a; } +inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a) +{ return v_float32x16(_mm512_castpd_ps(a.val)); } + +inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a) +{ return a; } +inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a) +{ return v_float64x8(_mm512_castps_pd(a.val)); } + +// FP16 +inline v_float32x16 v512_load_expand(const float16_t* ptr) +{ + return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x16& a) +{ + __m256i ah = _mm512_cvtps_ph(a.val, 0); + _mm256_storeu_si256((__m256i*)ptr, ah); +} + +/* Recombine & ZIP */ +inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1) +{ +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, + 87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, + 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, + 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0); + ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32); + ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val)); +#else + __m512i low = _mm512_unpacklo_epi8(a.val, b.val); + __m512i high = _mm512_unpackhi_epi8(a.val, b.val); + ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0), high)); + ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high)); +#endif +} +inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1) +{ + __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, + 39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0); + ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, + 55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16); + ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val)); +} +inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1) +{ + __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8); + ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val)); +} +inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1) +{ + __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0); + ab0 
= v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4); + ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val)); +} + +inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1) +{ + v_int8x64 i0, i1; + v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1); + ab0 = v_reinterpret_as_u8(i0); + ab1 = v_reinterpret_as_u8(i1); +} +inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1) +{ + v_int16x32 i0, i1; + v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1); + ab0 = v_reinterpret_as_u16(i0); + ab1 = v_reinterpret_as_u16(i1); +} +inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1) +{ + v_int32x16 i0, i1; + v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1); + ab0 = v_reinterpret_as_u32(i0); + ab1 = v_reinterpret_as_u32(i1); +} +inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1) +{ + v_int64x8 i0, i1; + v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1); + ab0 = v_reinterpret_as_u64(i0); + ab1 = v_reinterpret_as_u64(i1); +} +inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1) +{ + v_int32x16 i0, i1; + v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1); + ab0 = v_reinterpret_as_f32(i0); + ab1 = v_reinterpret_as_f32(i1); +} +inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1) +{ + v_int64x8 i0, i1; + v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1); + ab0 = v_reinterpret_as_f64(i0); + ab1 = v_reinterpret_as_f64(i1); +} + +#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val)); \ + d.val = _v512_insert(b.val,_v512_extract_high(a.val)); \ + } + + +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64, epi8) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64, epi8) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32, epi16) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32, epi16) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8, epi64) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8, epi64) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8, pd) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/* Element-wise binary and unary operations */ + +/** Non-saturating arithmetics **/ +#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8) 
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16) + +inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i ad = _mm512_srai_epi16(a.val, 8); + __m512i bd = _mm512_srai_epi16(b.val, 8); + __m512i p0 = _mm512_mullo_epi16(a.val, b.val); // even + __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8); // odd + return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1)); +} +inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64) + +/** Saturating arithmetics **/ +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd) + +// saturating multiply +inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b) +{ + v_uint16x32 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b) +{ + v_int16x32 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i pl = _mm512_mullo_epi16(a.val, b.val); + __m512i ph = _mm512_mulhi_epu16(a.val, b.val); + __m512i p0 = 
_mm512_unpacklo_epi16(pl, ph); + __m512i p1 = _mm512_unpackhi_epi16(pl, ph); + + const __m512i m = _mm512_set1_epi32(65535); + return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m))); +} +inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) +{ + __m512i pl = _mm512_mullo_epi16(a.val, b.val); + __m512i ph = _mm512_mulhi_epi16(a.val, b.val); + __m512i p0 = _mm512_unpacklo_epi16(pl, ph); + __m512i p1 = _mm512_unpackhi_epi16(pl, ph); + return v_int16x32(_mm512_packs_epi32(p0, p1)); +} + +inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b) +{ a = a * b; return a; } +inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b) +{ a = a * b; return a; } +inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b) +{ a = a * b; return a; } +inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b) +{ a = a * b; return a; } + +inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } +inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } + +// Multiply and expand +inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b, + v_uint16x32& c, v_uint16x32& d) +{ + v_uint16x32 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b, + v_int16x32& c, v_int16x32& d) +{ + v_int16x32 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b, + v_int32x16& c, v_int32x16& d) +{ + v_int16x32 v0, v1; + v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b, + v_uint32x16& c, v_uint32x16& d) +{ + v_uint16x32 v0, v1; + v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b, + v_uint64x8& c, v_uint64x8& d) +{ + v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)), + v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); +} + +inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b, + v_int64x8& c, v_int64x8& d) +{ + v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)), + v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); +} + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ + inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return 
_Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } + +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16) +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); } + +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8, si512, _mm512_set1_epi64(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8, si512, _mm512_set1_epi64(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps, _mm512_castsi512_ps(_mm512_set1_epi32(-1))) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8, pd, _mm512_castsi512_pd(_mm512_set1_epi32(-1))) + +/** Select **/ +#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64, epi8, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64, epi8, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32, epi16, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16, epi32, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8, epi64, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8, epi64, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps, ps) +OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd) + +/** Comparison **/ +#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); } + +#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) + +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32, epi16, epi16, (short)-1) 
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16, epi32, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1) + +#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); } + +#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) + +OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1) + +inline v_float32x16 v_not_nan(const v_float32x16& a) +{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); } +inline v_float64x8 v_not_nan(const v_float64x8& a) +{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); } + +/** min/max **/ +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64, _mm512_min_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64, _mm512_max_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64, _mm512_min_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64, _mm512_max_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32, _mm512_min_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32, _mm512_max_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32, _mm512_min_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32, _mm512_max_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16, _mm512_min_epu32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16, _mm512_max_epu32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16, _mm512_min_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16, _mm512_max_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8, _mm512_min_epu64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8, _mm512_max_epu64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8, _mm512_min_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8, _mm512_max_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8, _mm512_min_pd) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8, _mm512_max_pd) + +/** Rotate **/ +template +inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b) +{ + if (imm == 0) return a; + if (imm == 64) return b; + if (imm >= 128) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_permutex2var_epi8(a.val, + _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm, + 0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm, + 0x2f + imm,0x2e + imm,0x2d + imm,0x2c + 
imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm, + 0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm, + 0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm, + 0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm, + 0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm, + 0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), b.val)); +#else + __m512i pre = _mm512_alignr_epi32(b.val, a.val, imm/4); + if (imm % 4) + { + __m512i post; + if (imm/4 < 15) + post = _mm512_alignr_epi32(b.val, a.val, imm/4 + 1); + else if (imm/4 == 15) + post = b.val; + else + post = _mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm/4 - 15); + return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(pre, (imm % 4)*8), _mm512_slli_epi32(post, (4 - imm % 4)*8))); + } + else + return v_int8x64(pre); +#endif +} +template +inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b) +{ + if (imm == 0) return a; + if (imm == 64) return b; + if (imm >= 128) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_permutex2var_epi8(b.val, + _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm, + 0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm, + 0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm, + 0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm, + 0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm, + 0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm, + 0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm, + 0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val)); +#else + if (imm < 64) return v_rotate_right<64 - imm>(b, a); + else return v_rotate_right<128 - imm>(v512_setzero_s8(), b); +#endif +} +template +inline v_int8x64 v_rotate_right(const v_int8x64& a) +{ + if (imm == 0) return a; + if (imm >= 64) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm, + _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm, + 0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm, + 0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm, + 0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm, + 0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm, + 0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm, + 0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm, + 0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val)); +#else + return v_rotate_right(a, v512_setzero_s8()); +#endif +} +template +inline v_int8x64 v_rotate_left(const v_int8x64& a) +{ + if (imm == 0) return a; + if (imm >= 64) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm, + _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm, + 0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - 
imm,0x30 - imm, + 0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm, + 0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm, + 0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm, + 0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm, + 0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm, + 0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val)); +#else + return v_rotate_right<64 - imm>(v512_setzero_s8(), a); +#endif +} + +#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ return v_reinterpret_as_##suffix(v_rotate_left(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ return v_reinterpret_as_##suffix(v_rotate_right(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ return v_reinterpret_as_##suffix(v_rotate_left(v_reinterpret_as_s8(a))); } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ return v_reinterpret_as_##suffix(v_rotate_right(v_reinterpret_as_s8(a))); } + +#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \ +template \ +inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + enum { SHIFT2 = _Tpvec::nlanes - imm }; \ + enum { MASK = (1 << _Tpvec::nlanes) - 1 }; \ + if (imm == 0) return a; \ + if (imm == _Tpvec::nlanes) return b; \ + if (imm >= 2*_Tpvec::nlanes) return _Tpvec(); \ + return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << imm)&MASK, a.val)); \ +} \ +template \ +inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + enum { SHIFT2 = _Tpvec::nlanes - imm }; \ + enum { MASK = (1 << _Tpvec::nlanes) - 1 }; \ + if (imm == 0) return a; \ + if (imm == _Tpvec::nlanes) return b; \ + if (imm >= 2*_Tpvec::nlanes) return _Tpvec(); \ + return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << imm)&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \ +} \ +template \ +inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + enum { SHIFT2 = _Tpvec::nlanes - imm }; \ + enum { MASK = (1 << _Tpvec::nlanes) - 1 }; \ + if (imm == 0) return a; \ + if (imm >= _Tpvec::nlanes) return _Tpvec(); \ + return _Tpvec(_mm512_maskz_expand_##suffix((MASK << imm)&MASK, a.val)); \ +} \ +template \ +inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + enum { SHIFT2 = _Tpvec::nlanes - imm }; \ + enum { MASK = (1 << _Tpvec::nlanes) - 1 }; \ + if (imm == 0) return a; \ + if (imm >= _Tpvec::nlanes) return _Tpvec(); \ + return _Tpvec(_mm512_maskz_compress_##suffix((MASK << imm)&MASK, a.val)); \ +} + +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64, u8) +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32, u16) +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32, s16) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8, epi64) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd) + +////////// Reduce ///////// + +/** Reduce **/ +#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b +#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \ + 
inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + sctype CV_DECL_ALIGNED(64) idx[2]; \ + _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \ + return scop(idx[0], idx[1]); } +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, min, v_int64x8, min_epi64, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, max, v_int64x8, max_epi64, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, sum, v_int64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \ + inline double v_reduce_##func(const v_float64x8& a) \ + { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + double CV_DECL_ALIGNED(64) idx[2]; \ + _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \ + return scop(idx[0], idx[1]); } +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, min, v_int32x16, min_epi32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, max, v_int32x16, max_epi32) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \ + inline float v_reduce_##func(const v_float32x16& a) \ + { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \ + quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtss_f32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps) +OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps) + +inline float v_reduce_sum(const v_float32x16& a) +{ + __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val)); + __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); + quarter = _mm_hadd_ps(quarter, quarter); + return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter)); +} +inline int v_reduce_sum(const v_int32x16& a) +{ + __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + quarter = _mm_hadd_epi32(quarter, quarter); + return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter)); +} +inline uint v_reduce_sum(const v_uint32x16& a) +{ return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); } + 
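The reductions above fold an entire 512-bit register into one scalar. As a usage illustration only (not part of the patch; it assumes the v512_load and v512_setzero_f32 helpers defined earlier in this header, and a length that is a multiple of 16):

// Sketch: accumulate a float buffer 16 lanes at a time, then reduce once at the end.
static float sum_f32(const float* src, int n)   // n assumed to be a multiple of nlanes
{
    v_float32x16 s = v512_setzero_f32();
    for (int i = 0; i < n; i += v_float32x16::nlanes)
        s += v512_load(src + i);
    return v_reduce_sum(s);   // horizontal add of all 16 lanes (defined above)
}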
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16) + +inline int v_reduce_sum(const v_int16x32& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +inline uint v_reduce_sum(const v_uint16x32& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \ + inline sctype v_reduce_sum(const _Tpvec& a) \ + { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \ + _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \ + a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \ + __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \ + __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \ + a4 = _mm_hadd_epi32(a4, a4); \ + return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); } +OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8) + +inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b, + const v_float32x16& c, const v_float32x16& d) +{ + __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val)); + __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val)); + __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val)); + __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val)); + return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh))); +} + +inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i val = _mm512_sad_epu8(a.val, b.val); + __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), 
_mm256_extracti128_si256(half, 1)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); +} +inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b) +{ + __m512i val = _mm512_set1_epi8(0x80); + val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val)); + __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); +} +inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b) +{ return v_reduce_sum(v_add_wrap(a - b, b - a)); } +inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b) +{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); } +inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b) +{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); } +inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b) +{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); } +inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b) +{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); } +inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b) +{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); } + +/** Popcount **/ +inline v_uint8x64 v_popcount(const v_int8x64& a) +{ +#if CV_AVX_512BITALG + return v_uint8x64(_mm512_popcnt_epi8(a.val)); +#elif CV_AVX_512VBMI + __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, + 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0); + __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, + 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, + 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1); + return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val)))); +#else + __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100); + __m512i _popcnt_mask = _mm512_set1_epi8(0x0F); + + return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512( a.val, _popcnt_mask)), + _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask)))); +#endif +} +inline v_uint16x32 v_popcount(const v_int16x32& a) +{ +#if CV_AVX_512BITALG + return v_uint16x32(_mm512_popcnt_epi16(a.val)); +#elif CV_AVX_512VPOPCNTDQ + __m512i zero = _mm512_setzero_si512(); + return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)), + _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero)))); +#else + v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); + p += v_rotate_right<1>(p); + return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff); +#endif +} +inline v_uint32x16 v_popcount(const v_int32x16& a) +{ +#if CV_AVX_512VPOPCNTDQ + return v_uint32x16(_mm512_popcnt_epi32(a.val)); +#else + v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); + p += v_rotate_right<1>(p); + p += v_rotate_right<2>(p); + return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff); +#endif +} +inline 
v_uint64x8 v_popcount(const v_int64x8& a) +{ +#if CV_AVX_512VPOPCNTDQ + return v_uint64x8(_mm512_popcnt_epi64(a.val)); +#else + return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512())); +#endif +} + + +inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8 (a)); } +inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); } +inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); } +inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); } + + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b * b)); } + +OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd) + +inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) +{ return a * b + c; } +inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) +{ return v_fma(a, b, c); } + +inline v_float32x16 v_invsqrt(const v_float32x16& x) +{ +#if CV_AVX_512ER + return v_float32x16(_mm512_rsqrt28_ps(x.val)); +#else + v_float32x16 half = x * v512_setall_f32(0.5); + v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val)); + t *= v512_setall_f32(1.5) - ((t * t) * half); + return t; +#endif +} + +inline v_float64x8 v_invsqrt(const v_float64x8& x) +{ +#if CV_AVX_512ER + return v_float64x8(_mm512_rsqrt28_pd(x.val)); +#else + return v512_setall_f64(1.) 
/ v_sqrt(x); +// v_float64x8 half = x * v512_setall_f64(0.5); +// v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val)); +// t *= v512_setall_f64(1.5) - ((t * t) * half); +// t *= v512_setall_f64(1.5) - ((t * t) * half); +// return t; +#endif +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \ + inline _Tpuvec v_abs(const _Tpvec& x) \ + { return _Tpuvec(_mm512_abs_##suffix(x.val)); } + +OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8) +OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16) +OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64) + +inline v_float32x16 v_abs(const v_float32x16& x) +{ +#ifdef _mm512_abs_pd + return v_float32x16(_mm512_abs_ps(x.val)); +#else + return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val), + _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, + 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF)))); +#endif +} + +inline v_float64x8 v_abs(const v_float64x8& x) +{ +#ifdef _mm512_abs_pd + #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2)) + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476 + return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val))); + #else + return v_float64x8(_mm512_abs_pd(x.val)); + #endif +#else + return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val), + _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, + 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF)))); +#endif +} + +/** Absolute difference **/ +inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b) +{ + v_int8x64 d = v_sub_wrap(a, b); + v_int8x64 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} + +inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } + +inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b) +{ + v_int32x16 d = a - b; + v_int32x16 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b) +{ return v_abs(a - b); } + +inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b) +{ return v_abs(a - b); } + +/** Saturating absolute difference **/ +inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b) +{ + v_int8x64 d = a - b; + v_int8x64 m = a < b; + return (d ^ m) - m; +} +inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b) +{ return v_max(a, b) - v_min(a, b); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x16 v_round(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(a.val)); } + +inline v_int32x16 v_round(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); } + +inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b) +{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), 
_mm512_cvtpd_epi32(b.val))); } + +inline v_int32x16 v_trunc(const v_float32x16& a) +{ return v_int32x16(_mm512_cvttps_epi32(a.val)); } + +inline v_int32x16 v_trunc(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); } + +#if CVT_ROUND_MODES_IMPLEMENTED +inline v_int32x16 v_floor(const v_float32x16& a) +{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); } + +inline v_int32x16 v_floor(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); } + +inline v_int32x16 v_ceil(const v_float32x16& a) +{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); } + +inline v_int32x16 v_ceil(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); } +#else +inline v_int32x16 v_floor(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); } + +inline v_int32x16 v_floor(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); } + +inline v_int32x16 v_ceil(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); } + +inline v_int32x16 v_ceil(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); } +#endif + +/** To float **/ +inline v_float32x16 v_cvt_f32(const v_int32x16& a) +{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); } + +inline v_float32x16 v_cvt_f32(const v_float64x8& a) +{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); } + +inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b) +{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); } + +inline v_float64x8 v_cvt_f64(const v_int32x16& a) +{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); } + +inline v_float64x8 v_cvt_f64_high(const v_int32x16& a) +{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); } + +inline v_float64x8 v_cvt_f64(const v_float32x16& a) +{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); } + +inline v_float64x8 v_cvt_f64_high(const v_float32x16& a) +{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); } + +////////////// Lookup table access //////////////////// + +inline v_int8x64 v512_lut(const schar* tab, const int* idx) +{ + __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1)); + __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1)); + __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1)); + __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1)); + return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3)); +} +inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx) +{ + __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1)); + __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1)); + return v_int8x64(_v512_combine(p0, p1)); +} 
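The v512_lut* helpers implement gather-based table lookups; the suffix tells how many consecutive table elements each index fetches (the _quads variant follows just below). A small sketch of the indexing convention, for illustration only (not part of the patch):

// Sketch: same output type, different index granularity.
static void lut_demo(const schar* tab, const int* idx)
{
    v_int8x64 a = v512_lut(tab, idx);        // 64 indices: a[i]    = tab[idx[i]]
    v_int8x64 b = v512_lut_pairs(tab, idx);  // 32 indices: b[2i+j] = tab[idx[i] + j], j = 0,1
    (void)a; (void)b;                        // v512_lut_quads (below) reads 4 elements per index
}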
+inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1)); +} +inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); } +inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); } + +inline v_int16x32 v512_lut(const short* tab, const int* idx) +{ + __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 2)); + __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2)); + return v_int16x32(_v512_combine(p0, p1)); +} +inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2)); +} +inline v_int16x32 v512_lut_quads(const short* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2)); +#else + return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2)); +#endif +} +inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); } +inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); } +inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); } + +inline v_int32x16 v512_lut(const int* tab, const int* idx) +{ + return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4)); +} +inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4)); +#else + return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4)); +#endif +} +inline v_int32x16 v512_lut_quads(const int* tab, const int* idx) +{ + return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + _mm_loadu_si128((const __m128i*)(tab + idx[0]))), + _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1), + _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2), + _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3)); +} +inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); } +inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); } +inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); } + +inline v_int64x8 v512_lut(const int64* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8)); +#else + return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8)); +#endif +} +inline v_int64x8 
v512_lut_pairs(const int64* tab, const int* idx) +{ + return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + _mm_loadu_si128((const __m128i*)(tab + idx[0]))), + _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1), + _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2), + _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3)); +} +inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); } +inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); } + +inline v_float32x16 v512_lut(const float* tab, const int* idx) +{ + return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4)); +} +inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); } +inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); } + +inline v_float64x8 v512_lut(const double* tab, const int* idx) +{ + return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8)); +} +inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512( + _mm_loadu_pd(tab + idx[0])), + _mm_loadu_pd(tab + idx[1]), 1), + _mm_loadu_pd(tab + idx[2]), 2), + _mm_loadu_pd(tab + idx[3]), 3)); +} + +inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec) +{ + return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4)); +} + +inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec) +{ + return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4)); +} + +inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec) +{ + return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8)); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y) +{ + x.val = _mm512_i32gather_ps(idxvec.val, tab, 4); + y.val = _mm512_i32gather_ps(idxvec.val, tab + 1, 4); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y) +{ + x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8); + y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab + 1, 8); +} + +inline v_int8x64 v_interleave_pairs(const v_int8x64& vec) +{ + return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200))); +} +inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x64 v_interleave_quads(const v_int8x64& vec) +{ + return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400))); +} +inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x32 v_interleave_pairs(const v_int16x32& vec) +{ + return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100))); +} +inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return 
v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x32 v_interleave_quads(const v_int16x32& vec) +{ + return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100))); +} +inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x16 v_interleave_pairs(const v_int32x16& vec) +{ + return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD)); +} +inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x64 v_pack_triplets(const v_int8x64& vec) +{ + return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), + _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100)))); +} +inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x32 v_pack_triplets(const v_int16x32& vec) +{ + return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015, + 0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val)); +} +inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x16 v_pack_triplets(const v_int32x16& vec) +{ + return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val)); +} +inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } +inline v_float32x16 v_pack_triplets(const v_float32x16& vec) +{ + return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val)); +} + +////////// Matrix operations ///////// + +inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) +{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } + +inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) +{ return v_dotprod(a, b) + c; } + +#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ + v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) + +inline v_float32x16 v_matmul(const v_float32x16& v, + const v_float32x16& m0, const v_float32x16& m1, + const v_float32x16& m2, const v_float32x16& m3) +{ + v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0); + v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); + v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); + v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); +} + +inline v_float32x16 v_matmuladd(const v_float32x16& v, + const v_float32x16& m0, const v_float32x16& m1, + const v_float32x16& m2, const v_float32x16& a) +{ + 
v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0); + v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); + v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + +#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \ + __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \ + __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \ + __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \ + b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \ + } + +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps) + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(_v512_extract_low(a.val)); \ + b1.val = intrin(_v512_extract_high(a.val)); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \ + inline _Tpwvec v512_load_expand(const _Tp* ptr) \ + { \ + __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \ + return _Tpwvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64) + +#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v512_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32) + +/* pack */ +// 16 +inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b) +{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } + +inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b) +{ + const __m512i t = _mm512_set1_epi16(255); + return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t)))); +} + +inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b) +{ + return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val))); +} + +inline void v_pack_store(schar* ptr, const v_int16x32& a) +{ 
v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(uchar* ptr, const v_uint16x32& a) +{ + const __m512i m = _mm512_set1_epi16(255); + _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m))); +} + +inline void v_pack_u_store(uchar* ptr, const v_int16x32& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + +template inline +v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b) +{ + // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. + v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); + return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), + v_reinterpret_as_s16((b + delta) >> n)); +} + +template inline +void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a) +{ + v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); + v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); +} + +template inline +v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b) +{ + v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); + return v_pack_u((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a) +{ + v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); + v_pack_u_store(ptr, (a + delta) >> n); +} + +template inline +v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b) +{ + v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(schar* ptr, const v_int16x32& a) +{ + v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); + v_pack_store(ptr, (a + delta) >> n); +} + +// 32 +inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b) +{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); } + +inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b) +{ + const __m512i m = _mm512_set1_epi32(65535); + return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m)))); +} + +inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b) +{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); } + +inline void v_pack_store(short* ptr, const v_int32x16& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(ushort* ptr, const v_uint32x16& a) +{ + const __m512i m = _mm512_set1_epi32(65535); + _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m))); +} + +inline void v_pack_u_store(ushort* ptr, const v_int32x16& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + + +template inline +v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b) +{ + v_uint32x16 delta = v512_setall_u32(1 << (n-1)); + return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), + v_reinterpret_as_s32((b + delta) >> n)); +} + +template inline +void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a) +{ + v_uint32x16 delta = v512_setall_u32(1 << (n-1)); + v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); +} + +template inline +v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b) +{ + v_int32x16 delta = v512_setall_s32(1 << (n-1)); + return v_pack_u((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a) +{ + v_int32x16 delta = v512_setall_s32(1 << (n-1)); + v_pack_u_store(ptr, (a + delta) >> n); +} 
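The v_rshr_pack* templates pair a rounding right shift with saturating narrowing, which is the usual final step of fixed-point filtering. For illustration only (not part of the patch; Q8 is just an example format):

// Sketch: narrow two registers of Q8 fixed-point 32-bit sums to 32 ushorts.
static v_uint16x32 narrow_q8(const v_int32x16& lo, const v_int32x16& hi)
{
    return v_rshr_pack_u<8>(lo, hi);   // per lane: (x + 128) >> 8, saturated to ushort
}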
+ +template inline +v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b) +{ + v_int32x16 delta = v512_setall_s32(1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(short* ptr, const v_int32x16& a) +{ + v_int32x16 delta = v512_setall_s32(1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +// 64 +// Non-saturating pack +inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b) +{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); } + +inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b) +{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } + +inline void v_pack_store(unsigned* ptr, const v_uint64x8& a) +{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); } + +inline void v_pack_store(int* ptr, const v_int64x8& b) +{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); } + +template inline +v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b) +{ + v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a) +{ + v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +template inline +v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b) +{ + v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(int* ptr, const v_int64x8& a) +{ + v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +// pack boolean +inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b) +{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } + +inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b, + const v_uint32x16& c, const v_uint32x16& d) +{ + __m512i ab = _mm512_packs_epi32(a.val, b.val); + __m512i cd = _mm512_packs_epi32(c.val, d.val); + + return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd))); +} + +inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c, + const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f, + const v_uint64x8& g, const v_uint64x8& h) +{ + __m512i ab = _mm512_packs_epi32(a.val, b.val); + __m512i cd = _mm512_packs_epi32(c.val, d.val); + __m512i ef = _mm512_packs_epi32(e.val, f.val); + __m512i gh = _mm512_packs_epi32(g.val, h.val); + + __m512i abcd = _mm512_packs_epi32(ab, cd); + __m512i efgh = _mm512_packs_epi32(ef, gh); + + return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4, + 27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh))); +} + +/* Recombine */ +// its up there with load and store operations + +/* Extract */ +#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \ + template \ + inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ + { return v_rotate_right(a, b); } + +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16) 
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16) +OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8) + + +///////////////////// load deinterleave ///////////////////////////// + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, + 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, + 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1)); + b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1)); +#else + __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200); + __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0); + __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0); + __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1)); + b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1)); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1)); + b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1)); + b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1)); + b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + 
__m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); + +#if CV_AVX_512VBMI2 + __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, + 78, 75, 72, 69, 66, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, + 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 62, 59, 56, 53, 50, + 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2); + __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1); + __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2); + __m512i r12b2 = _mm512_permutex2var_epi8(bgr1, + _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, + 77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97, + 94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49, + 46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2); + b = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01)); + g = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0)); + r = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2)); +#elif CV_AVX_512VBMI + __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0); + __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1); + __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2); + b = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, + 77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, + 46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, + 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2)); + g = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40, + 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17, + 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114, + 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0)); + r = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, + 15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98, + 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50, + 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1)); +#else + __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48, + 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); + + __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2); + __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, + 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0); + __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11); + b = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1)); + r = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1)); + g = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001))); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = 
_mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + + __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48, + 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); + + b = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2)); + g = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, + 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0)); + r = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + + __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1); + __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2); + __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0); + + b = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2)); + g = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11)); + r = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + + __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0); + + b = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2)); + r = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6)); + g = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0)); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r, v_uint8x64& a ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192)); + +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, + 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, + 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi8(bgra0, 
mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3); + + b = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23)); + r = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23)); + g = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23)); + a = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23)); +#else + __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); + __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask); + __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask); + __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask); + __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask); + + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1); + __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1); + __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3); + __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3); + + b = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23)); + r = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23)); + g = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + a = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23)); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r, v_uint16x32& a ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96)); + + __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3); + + b = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23)); + r = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23)); + g = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23)); + a = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r, v_uint32x16& a ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48)); + + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3); + + b = 
v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23)); + r = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23)); + g = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + a = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r, v_uint64x8& a ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24)); + + __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3); + + b = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23)); + r = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23)); + g = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23)); + a = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23)); +} + +///////////////////////////// store interleave ///////////////////////////////////// + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint8x64 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 64), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 64), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), high.val); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint16x32 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 32), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 32), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), high.val); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint32x16 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 16), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 16), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), high.val); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint64x8 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 8), high.val); + } + else if( 
mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 8), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 8), high.val); + } +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g, const v_uint8x64& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122, + 79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74, + 10, 116, 73, 9, 115, 72, 8, 114, 71, 7, 113, 70, 6, 112, 69, 5, + 111, 68, 4, 110, 67, 3, 109, 66, 2, 108, 65, 1, 107, 64, 0, 106); + __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16, + 37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32, + 95, 10, 31, 94, 9, 30, 93, 8, 29, 92, 7, 28, 91, 6, 27, 90, + 5, 26, 89, 4, 25, 88, 3, 24, 87, 2, 23, 86, 1, 22, 85, 0); + __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101, + 122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117, + 53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48, + 90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85); + __m512i r2g0r0 = _mm512_permutex2var_epi8(g.val, mask0, r.val); + __m512i b0r1b1 = _mm512_permutex2var_epi8(b.val, mask1, r.val); + __m512i g1b2g2 = _mm512_permutex2var_epi8(b.val, mask2, g.val); + + __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1); + __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2); + __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0); +#else + __m512i g1g0 = _mm512_shuffle_epi8(g.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)); + __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, g1g0); + __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r.val, b.val); + __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, r.val); + + __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, + 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0); + __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16, + 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); + __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, + 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); + __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1); + __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1); + __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1); + + __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); + __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1); + __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2); +#endif + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 64), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 128), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 64), bgr1); + _mm512_store_si512((__m512i*)(ptr + 128), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1); + 
_mm512_storeu_si512((__m512i*)(ptr + 128), bgr2); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g, const v_uint16x32& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, + 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0); + __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16, + 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); + __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, + 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); + __m512i b0g0b2 = _mm512_permutex2var_epi16(b.val, mask0, g.val); + __m512i r1b1r0 = _mm512_permutex2var_epi16(b.val, mask1, r.val); + __m512i g2r2g1 = _mm512_permutex2var_epi16(g.val, mask2, r.val); + + __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); + __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1); + __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 32), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 64), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 32), bgr1); + _mm512_store_si512((__m512i*)(ptr + 64), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g, const v_uint32x16& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21); + __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26); + __m512i g1b2g2 = _mm512_permutex2var_epi32(b.val, mask0, g.val); + __m512i r2r1b1 = _mm512_permutex2var_epi32(b.val, mask1, r.val); + + __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, b.val), 0x2492, g.val), 0x4924, r.val); + __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2); + __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 16), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 32), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 16), bgr1); + _mm512_store_si512((__m512i*)(ptr + 32), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g, const v_uint64x8& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10); + __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5); + __m512i r1b1b2 = _mm512_permutex2var_epi64(b.val, mask0, r.val); + __m512i g2r2g1 = _mm512_permutex2var_epi64(g.val, mask1, r.val); + + __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, b.val), 0x92, g.val), 0x24, r.val); + 
__m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2); + __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 8), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 16), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 8), bgr1); + _mm512_store_si512((__m512i*)(ptr + 16), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2); + } +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g, + const v_uint8x64& r, const v_uint8x64& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint8x64 br01, br23, ga01, ga23; + v_zip(b, r, br01, br23); + v_zip(g, a, ga01, ga23); + v_uint8x64 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g, + const v_uint16x32& r, const v_uint16x32& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint16x32 br01, br23, ga01, ga23; + v_zip(b, r, br01, br23); + v_zip(g, a, ga01, ga23); + v_uint16x32 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g, + const v_uint32x16& r, const v_uint32x16& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint32x16 br01, br23, ga01, ga23; + v_zip(b, r, br01, br23); + v_zip(g, a, ga01, ga23); + v_uint32x16 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val); + 
_mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g, + const v_uint64x8& r, const v_uint64x8& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint64x8 br01, br23, ga01, ga23; + v_zip(b, r, br01, br23); + v_zip(g, a, ga01, ga23); + v_uint64x8 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val); + } +} + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + 
hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ +} + +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64) + +////////// Mask and checks ///////// + +/** Mask **/ +inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } + +inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); } +inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); } +inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); } +inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); } + +/** Checks **/ +inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } + +inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } +inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } +inline bool v_check_all(const v_uint8x64& a) { return 
v_check_all(v_reinterpret_as_s8(a)); } +inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); } +inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); } +inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); } +inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } + +inline void v512_cleanup() { _mm256_zeroall(); } + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_AVX_HPP diff --git a/modules/core/include/opencv2/core/hal/intrin_forward.hpp b/modules/core/include/opencv2/core/hal/intrin_forward.hpp index 4618552907..6873633165 100644 --- a/modules/core/include/opencv2/core/hal/intrin_forward.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_forward.hpp @@ -14,9 +14,32 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN /** Types **/ -#if CV__SIMD_FORWARD == 512 -// [todo] 512 -#error "AVX512 Not implemented yet" +#if CV__SIMD_FORWARD == 1024 +// [todo] 1024 +#error "1024-long ops not implemented yet" +#elif CV__SIMD_FORWARD == 512 +// 512 +#define __CV_VX(fun) v512_##fun +#define __CV_V_UINT8 v_uint8x64 +#define __CV_V_INT8 v_int8x64 +#define __CV_V_UINT16 v_uint16x32 +#define __CV_V_INT16 v_int16x32 +#define __CV_V_UINT32 v_uint32x16 +#define __CV_V_INT32 v_int32x16 +#define __CV_V_UINT64 v_uint64x8 +#define __CV_V_INT64 v_int64x8 +#define __CV_V_FLOAT32 v_float32x16 +#define __CV_V_FLOAT64 v_float64x8 +struct v_uint8x64; +struct v_int8x64; +struct v_uint16x32; +struct v_int16x32; +struct v_uint32x16; +struct v_int32x16; +struct v_uint64x8; +struct v_int64x8; +struct v_float32x16; +struct v_float64x8; #elif CV__SIMD_FORWARD == 256 // 256 #define __CV_VX(fun) v256_##fun diff --git a/modules/core/misc/java/src/java/core+KeyPoint.java b/modules/core/misc/java/src/java/core+KeyPoint.java index de5b2151ff..e34d6f17ef 100644 --- a/modules/core/misc/java/src/java/core+KeyPoint.java +++ b/modules/core/misc/java/src/java/core+KeyPoint.java @@ -33,8 +33,7 @@ public class KeyPoint { public int class_id; // javadoc:KeyPoint::KeyPoint(x,y,_size,_angle,_response,_octave,_class_id) - public KeyPoint(float x, float y, float _size, float _angle, float _response, int _octave, int _class_id) - { + public KeyPoint(float x, float y, float _size, float _angle, float _response, int _octave, int _class_id) { pt = new Point(x, y); size = _size; angle = _angle; @@ -44,32 +43,27 @@ public class KeyPoint { } // javadoc: KeyPoint::KeyPoint() - public KeyPoint() - { + public KeyPoint() { this(0, 0, 0, -1, 0, 0, -1); } // javadoc: KeyPoint::KeyPoint(x, y, _size, _angle, _response, _octave) - public KeyPoint(float x, float y, float _size, float _angle, float _response, int _octave) - { + public KeyPoint(float x, float y, float _size, float _angle, float _response, int _octave) { this(x, y, _size, _angle, _response, _octave, -1); } // javadoc: KeyPoint::KeyPoint(x, y, _size, _angle, _response) - public KeyPoint(float x, float y, float _size, float _angle, float _response) - { + public KeyPoint(float x, float y, float _size, float _angle, float _response) { this(x, y, _size, _angle, _response, 0, -1); } // javadoc: 
KeyPoint::KeyPoint(x, y, _size, _angle) - public KeyPoint(float x, float y, float _size, float _angle) - { + public KeyPoint(float x, float y, float _size, float _angle) { this(x, y, _size, _angle, 0, 0, -1); } // javadoc: KeyPoint::KeyPoint(x, y, _size) - public KeyPoint(float x, float y, float _size) - { + public KeyPoint(float x, float y, float _size) { this(x, y, _size, -1, 0, 0, -1); } diff --git a/modules/core/misc/java/src/java/core+Mat.java b/modules/core/misc/java/src/java/core+Mat.java index 3bcb1ee9f7..990fb6787b 100644 --- a/modules/core/misc/java/src/java/core+Mat.java +++ b/modules/core/misc/java/src/java/core+Mat.java @@ -8,8 +8,7 @@ public class Mat { public final long nativeObj; - public Mat(long addr) - { + public Mat(long addr) { if (addr == 0) throw new UnsupportedOperationException("Native object address is NULL"); nativeObj = addr; @@ -20,12 +19,8 @@ public class Mat { // // javadoc: Mat::Mat() - public Mat() - { - + public Mat() { nativeObj = n_Mat(); - - return; } // @@ -33,12 +28,8 @@ public class Mat { // // javadoc: Mat::Mat(rows, cols, type) - public Mat(int rows, int cols, int type) - { - + public Mat(int rows, int cols, int type) { nativeObj = n_Mat(rows, cols, type); - - return; } // @@ -46,12 +37,8 @@ public class Mat { // // javadoc: Mat::Mat(rows, cols, type, data) - public Mat(int rows, int cols, int type, ByteBuffer data) - { - + public Mat(int rows, int cols, int type, ByteBuffer data) { nativeObj = n_Mat(rows, cols, type, data); - - return; } // @@ -59,12 +46,8 @@ public class Mat { // // javadoc: Mat::Mat(size, type) - public Mat(Size size, int type) - { - + public Mat(Size size, int type) { nativeObj = n_Mat(size.width, size.height, type); - - return; } // @@ -72,12 +55,8 @@ public class Mat { // // javadoc: Mat::Mat(sizes, type) - public Mat(int[] sizes, int type) - { - + public Mat(int[] sizes, int type) { nativeObj = n_Mat(sizes.length, sizes, type); - - return; } // @@ -85,12 +64,8 @@ public class Mat { // // javadoc: Mat::Mat(rows, cols, type, s) - public Mat(int rows, int cols, int type, Scalar s) - { - + public Mat(int rows, int cols, int type, Scalar s) { nativeObj = n_Mat(rows, cols, type, s.val[0], s.val[1], s.val[2], s.val[3]); - - return; } // @@ -98,12 +73,8 @@ public class Mat { // // javadoc: Mat::Mat(size, type, s) - public Mat(Size size, int type, Scalar s) - { - + public Mat(Size size, int type, Scalar s) { nativeObj = n_Mat(size.width, size.height, type, s.val[0], s.val[1], s.val[2], s.val[3]); - - return; } // @@ -111,12 +82,8 @@ public class Mat { // // javadoc: Mat::Mat(sizes, type, s) - public Mat(int[] sizes, int type, Scalar s) - { - + public Mat(int[] sizes, int type, Scalar s) { nativeObj = n_Mat(sizes.length, sizes, type, s.val[0], s.val[1], s.val[2], s.val[3]); - - return; } // @@ -124,21 +91,13 @@ public class Mat { // // javadoc: Mat::Mat(m, rowRange, colRange) - public Mat(Mat m, Range rowRange, Range colRange) - { - + public Mat(Mat m, Range rowRange, Range colRange) { nativeObj = n_Mat(m.nativeObj, rowRange.start, rowRange.end, colRange.start, colRange.end); - - return; } // javadoc: Mat::Mat(m, rowRange) - public Mat(Mat m, Range rowRange) - { - + public Mat(Mat m, Range rowRange) { nativeObj = n_Mat(m.nativeObj, rowRange.start, rowRange.end); - - return; } // @@ -146,12 +105,8 @@ public class Mat { // // javadoc: Mat::Mat(m, ranges) - public Mat(Mat m, Range[] ranges) - { - + public Mat(Mat m, Range[] ranges) { nativeObj = n_Mat(m.nativeObj, ranges); - - return; } // @@ -159,12 +114,8 @@ public class Mat { // // 
javadoc: Mat::Mat(m, roi) - public Mat(Mat m, Rect roi) - { - + public Mat(Mat m, Rect roi) { nativeObj = n_Mat(m.nativeObj, roi.y, roi.y + roi.height, roi.x, roi.x + roi.width); - - return; } // @@ -172,12 +123,8 @@ public class Mat { // // javadoc: Mat::adjustROI(dtop, dbottom, dleft, dright) - public Mat adjustROI(int dtop, int dbottom, int dleft, int dright) - { - - Mat retVal = new Mat(n_adjustROI(nativeObj, dtop, dbottom, dleft, dright)); - - return retVal; + public Mat adjustROI(int dtop, int dbottom, int dleft, int dright) { + return new Mat(n_adjustROI(nativeObj, dtop, dbottom, dleft, dright)); } // @@ -185,21 +132,13 @@ public class Mat { // // javadoc: Mat::assignTo(m, type) - public void assignTo(Mat m, int type) - { - + public void assignTo(Mat m, int type) { n_assignTo(nativeObj, m.nativeObj, type); - - return; } // javadoc: Mat::assignTo(m) - public void assignTo(Mat m) - { - + public void assignTo(Mat m) { n_assignTo(nativeObj, m.nativeObj); - - return; } // @@ -207,12 +146,8 @@ public class Mat { // // javadoc: Mat::channels() - public int channels() - { - - int retVal = n_channels(nativeObj); - - return retVal; + public int channels() { + return n_channels(nativeObj); } // @@ -221,30 +156,18 @@ public class Mat { // // javadoc: Mat::checkVector(elemChannels, depth, requireContinuous) - public int checkVector(int elemChannels, int depth, boolean requireContinuous) - { - - int retVal = n_checkVector(nativeObj, elemChannels, depth, requireContinuous); - - return retVal; + public int checkVector(int elemChannels, int depth, boolean requireContinuous) { + return n_checkVector(nativeObj, elemChannels, depth, requireContinuous); } // javadoc: Mat::checkVector(elemChannels, depth) - public int checkVector(int elemChannels, int depth) - { - - int retVal = n_checkVector(nativeObj, elemChannels, depth); - - return retVal; + public int checkVector(int elemChannels, int depth) { + return n_checkVector(nativeObj, elemChannels, depth); } // javadoc: Mat::checkVector(elemChannels) - public int checkVector(int elemChannels) - { - - int retVal = n_checkVector(nativeObj, elemChannels); - - return retVal; + public int checkVector(int elemChannels) { + return n_checkVector(nativeObj, elemChannels); } // @@ -252,12 +175,8 @@ public class Mat { // // javadoc: Mat::clone() - public Mat clone() - { - - Mat retVal = new Mat(n_clone(nativeObj)); - - return retVal; + public Mat clone() { + return new Mat(n_clone(nativeObj)); } // @@ -265,12 +184,8 @@ public class Mat { // // javadoc: Mat::col(x) - public Mat col(int x) - { - - Mat retVal = new Mat(n_col(nativeObj, x)); - - return retVal; + public Mat col(int x) { + return new Mat(n_col(nativeObj, x)); } // @@ -278,12 +193,8 @@ public class Mat { // // javadoc: Mat::colRange(startcol, endcol) - public Mat colRange(int startcol, int endcol) - { - - Mat retVal = new Mat(n_colRange(nativeObj, startcol, endcol)); - - return retVal; + public Mat colRange(int startcol, int endcol) { + return new Mat(n_colRange(nativeObj, startcol, endcol)); } // @@ -291,12 +202,8 @@ public class Mat { // // javadoc: Mat::colRange(r) - public Mat colRange(Range r) - { - - Mat retVal = new Mat(n_colRange(nativeObj, r.start, r.end)); - - return retVal; + public Mat colRange(Range r) { + return new Mat(n_colRange(nativeObj, r.start, r.end)); } // @@ -304,12 +211,8 @@ public class Mat { // // javadoc: Mat::dims() - public int dims() - { - - int retVal = n_dims(nativeObj); - - return retVal; + public int dims() { + return n_dims(nativeObj); } // @@ -317,12 +220,8 @@ public class 
Mat { // // javadoc: Mat::cols() - public int cols() - { - - int retVal = n_cols(nativeObj); - - return retVal; + public int cols() { + return n_cols(nativeObj); } // @@ -331,30 +230,18 @@ public class Mat { // // javadoc: Mat::convertTo(m, rtype, alpha, beta) - public void convertTo(Mat m, int rtype, double alpha, double beta) - { - + public void convertTo(Mat m, int rtype, double alpha, double beta) { n_convertTo(nativeObj, m.nativeObj, rtype, alpha, beta); - - return; } // javadoc: Mat::convertTo(m, rtype, alpha) - public void convertTo(Mat m, int rtype, double alpha) - { - + public void convertTo(Mat m, int rtype, double alpha) { n_convertTo(nativeObj, m.nativeObj, rtype, alpha); - - return; } // javadoc: Mat::convertTo(m, rtype) - public void convertTo(Mat m, int rtype) - { - + public void convertTo(Mat m, int rtype) { n_convertTo(nativeObj, m.nativeObj, rtype); - - return; } // @@ -362,12 +249,8 @@ public class Mat { // // javadoc: Mat::copyTo(m) - public void copyTo(Mat m) - { - + public void copyTo(Mat m) { n_copyTo(nativeObj, m.nativeObj); - - return; } // @@ -375,12 +258,8 @@ public class Mat { // // javadoc: Mat::copyTo(m, mask) - public void copyTo(Mat m, Mat mask) - { - + public void copyTo(Mat m, Mat mask) { n_copyTo(nativeObj, m.nativeObj, mask.nativeObj); - - return; } // @@ -388,12 +267,8 @@ public class Mat { // // javadoc: Mat::create(rows, cols, type) - public void create(int rows, int cols, int type) - { - + public void create(int rows, int cols, int type) { n_create(nativeObj, rows, cols, type); - - return; } // @@ -401,12 +276,8 @@ public class Mat { // // javadoc: Mat::create(size, type) - public void create(Size size, int type) - { - + public void create(Size size, int type) { n_create(nativeObj, size.width, size.height, type); - - return; } // @@ -414,12 +285,8 @@ public class Mat { // // javadoc: Mat::create(sizes, type) - public void create(int[] sizes, int type) - { - + public void create(int[] sizes, int type) { n_create(nativeObj, sizes.length, sizes, type); - - return; } // @@ -427,11 +294,8 @@ public class Mat { // // javadoc: Mat::copySize(m) - public void copySize(Mat m) - { + public void copySize(Mat m) { n_copySize(nativeObj, m.nativeObj); - - return; } // @@ -439,12 +303,8 @@ public class Mat { // // javadoc: Mat::cross(m) - public Mat cross(Mat m) - { - - Mat retVal = new Mat(n_cross(nativeObj, m.nativeObj)); - - return retVal; + public Mat cross(Mat m) { + return new Mat(n_cross(nativeObj, m.nativeObj)); } // @@ -452,12 +312,8 @@ public class Mat { // // javadoc: Mat::dataAddr() - public long dataAddr() - { - - long retVal = n_dataAddr(nativeObj); - - return retVal; + public long dataAddr() { + return n_dataAddr(nativeObj); } // @@ -465,12 +321,8 @@ public class Mat { // // javadoc: Mat::depth() - public int depth() - { - - int retVal = n_depth(nativeObj); - - return retVal; + public int depth() { + return n_depth(nativeObj); } // @@ -478,21 +330,13 @@ public class Mat { // // javadoc: Mat::diag(d) - public Mat diag(int d) - { - - Mat retVal = new Mat(n_diag(nativeObj, d)); - - return retVal; + public Mat diag(int d) { + return new Mat(n_diag(nativeObj, d)); } // javadoc: Mat::diag() - public Mat diag() - { - - Mat retVal = new Mat(n_diag(nativeObj, 0)); - - return retVal; + public Mat diag() { + return new Mat(n_diag(nativeObj, 0)); } // @@ -500,12 +344,8 @@ public class Mat { // // javadoc: Mat::diag(d) - public static Mat diag(Mat d) - { - - Mat retVal = new Mat(n_diag(d.nativeObj)); - - return retVal; + public static Mat diag(Mat d) { + return 
new Mat(n_diag(d.nativeObj)); } // @@ -513,12 +353,8 @@ public class Mat { // // javadoc: Mat::dot(m) - public double dot(Mat m) - { - - double retVal = n_dot(nativeObj, m.nativeObj); - - return retVal; + public double dot(Mat m) { + return n_dot(nativeObj, m.nativeObj); } // @@ -526,12 +362,8 @@ public class Mat { // // javadoc: Mat::elemSize() - public long elemSize() - { - - long retVal = n_elemSize(nativeObj); - - return retVal; + public long elemSize() { + return n_elemSize(nativeObj); } // @@ -539,12 +371,8 @@ public class Mat { // // javadoc: Mat::elemSize1() - public long elemSize1() - { - - long retVal = n_elemSize1(nativeObj); - - return retVal; + public long elemSize1() { + return n_elemSize1(nativeObj); } // @@ -552,12 +380,8 @@ public class Mat { // // javadoc: Mat::empty() - public boolean empty() - { - - boolean retVal = n_empty(nativeObj); - - return retVal; + public boolean empty() { + return n_empty(nativeObj); } // @@ -565,12 +389,8 @@ public class Mat { // // javadoc: Mat::eye(rows, cols, type) - public static Mat eye(int rows, int cols, int type) - { - - Mat retVal = new Mat(n_eye(rows, cols, type)); - - return retVal; + public static Mat eye(int rows, int cols, int type) { + return new Mat(n_eye(rows, cols, type)); } // @@ -578,12 +398,8 @@ public class Mat { // // javadoc: Mat::eye(size, type) - public static Mat eye(Size size, int type) - { - - Mat retVal = new Mat(n_eye(size.width, size.height, type)); - - return retVal; + public static Mat eye(Size size, int type) { + return new Mat(n_eye(size.width, size.height, type)); } // @@ -591,21 +407,13 @@ public class Mat { // // javadoc: Mat::inv(method) - public Mat inv(int method) - { - - Mat retVal = new Mat(n_inv(nativeObj, method)); - - return retVal; + public Mat inv(int method) { + return new Mat(n_inv(nativeObj, method)); } // javadoc: Mat::inv() - public Mat inv() - { - - Mat retVal = new Mat(n_inv(nativeObj)); - - return retVal; + public Mat inv() { + return new Mat(n_inv(nativeObj)); } // @@ -613,12 +421,8 @@ public class Mat { // // javadoc: Mat::isContinuous() - public boolean isContinuous() - { - - boolean retVal = n_isContinuous(nativeObj); - - return retVal; + public boolean isContinuous() { + return n_isContinuous(nativeObj); } // @@ -626,12 +430,8 @@ public class Mat { // // javadoc: Mat::isSubmatrix() - public boolean isSubmatrix() - { - - boolean retVal = n_isSubmatrix(nativeObj); - - return retVal; + public boolean isSubmatrix() { + return n_isSubmatrix(nativeObj); } // @@ -639,14 +439,18 @@ public class Mat { // // javadoc: Mat::locateROI(wholeSize, ofs) - public void locateROI(Size wholeSize, Point ofs) - { + public void locateROI(Size wholeSize, Point ofs) { double[] wholeSize_out = new double[2]; double[] ofs_out = new double[2]; locateROI_0(nativeObj, wholeSize_out, ofs_out); - if(wholeSize!=null){ wholeSize.width = wholeSize_out[0]; wholeSize.height = wholeSize_out[1]; } - if(ofs!=null){ ofs.x = ofs_out[0]; ofs.y = ofs_out[1]; } - return; + if (wholeSize != null) { + wholeSize.width = wholeSize_out[0]; + wholeSize.height = wholeSize_out[1]; + } + if (ofs != null) { + ofs.x = ofs_out[0]; + ofs.y = ofs_out[1]; + } } // @@ -654,21 +458,13 @@ public class Mat { // // javadoc: Mat::mul(m, scale) - public Mat mul(Mat m, double scale) - { - - Mat retVal = new Mat(n_mul(nativeObj, m.nativeObj, scale)); - - return retVal; + public Mat mul(Mat m, double scale) { + return new Mat(n_mul(nativeObj, m.nativeObj, scale)); } // javadoc: Mat::mul(m) - public Mat mul(Mat m) - { - - Mat retVal = new 
Mat(n_mul(nativeObj, m.nativeObj)); - - return retVal; + public Mat mul(Mat m) { + return new Mat(n_mul(nativeObj, m.nativeObj)); } // @@ -676,12 +472,8 @@ public class Mat { // // javadoc: Mat::ones(rows, cols, type) - public static Mat ones(int rows, int cols, int type) - { - - Mat retVal = new Mat(n_ones(rows, cols, type)); - - return retVal; + public static Mat ones(int rows, int cols, int type) { + return new Mat(n_ones(rows, cols, type)); } // @@ -689,12 +481,8 @@ public class Mat { // // javadoc: Mat::ones(size, type) - public static Mat ones(Size size, int type) - { - - Mat retVal = new Mat(n_ones(size.width, size.height, type)); - - return retVal; + public static Mat ones(Size size, int type) { + return new Mat(n_ones(size.width, size.height, type)); } // @@ -702,12 +490,8 @@ public class Mat { // // javadoc: Mat::ones(sizes, type) - public static Mat ones(int[] sizes, int type) - { - - Mat retVal = new Mat(n_ones(sizes.length, sizes, type)); - - return retVal; + public static Mat ones(int[] sizes, int type) { + return new Mat(n_ones(sizes.length, sizes, type)); } // @@ -715,12 +499,8 @@ public class Mat { // // javadoc: Mat::push_back(m) - public void push_back(Mat m) - { - + public void push_back(Mat m) { n_push_back(nativeObj, m.nativeObj); - - return; } // @@ -728,12 +508,8 @@ public class Mat { // // javadoc: Mat::release() - public void release() - { - + public void release() { n_release(nativeObj); - - return; } // @@ -741,21 +517,13 @@ public class Mat { // // javadoc: Mat::reshape(cn, rows) - public Mat reshape(int cn, int rows) - { - - Mat retVal = new Mat(n_reshape(nativeObj, cn, rows)); - - return retVal; + public Mat reshape(int cn, int rows) { + return new Mat(n_reshape(nativeObj, cn, rows)); } // javadoc: Mat::reshape(cn) - public Mat reshape(int cn) - { - - Mat retVal = new Mat(n_reshape(nativeObj, cn)); - - return retVal; + public Mat reshape(int cn) { + return new Mat(n_reshape(nativeObj, cn)); } // @@ -763,11 +531,8 @@ public class Mat { // // javadoc: Mat::reshape(cn, newshape) - public Mat reshape(int cn, int[] newshape) - { - Mat retVal = new Mat(n_reshape_1(nativeObj, cn, newshape.length, newshape)); - - return retVal; + public Mat reshape(int cn, int[] newshape) { + return new Mat(n_reshape_1(nativeObj, cn, newshape.length, newshape)); } // @@ -775,12 +540,8 @@ public class Mat { // // javadoc: Mat::row(y) - public Mat row(int y) - { - - Mat retVal = new Mat(n_row(nativeObj, y)); - - return retVal; + public Mat row(int y) { + return new Mat(n_row(nativeObj, y)); } // @@ -788,12 +549,8 @@ public class Mat { // // javadoc: Mat::rowRange(startrow, endrow) - public Mat rowRange(int startrow, int endrow) - { - - Mat retVal = new Mat(n_rowRange(nativeObj, startrow, endrow)); - - return retVal; + public Mat rowRange(int startrow, int endrow) { + return new Mat(n_rowRange(nativeObj, startrow, endrow)); } // @@ -801,12 +558,8 @@ public class Mat { // // javadoc: Mat::rowRange(r) - public Mat rowRange(Range r) - { - - Mat retVal = new Mat(n_rowRange(nativeObj, r.start, r.end)); - - return retVal; + public Mat rowRange(Range r) { + return new Mat(n_rowRange(nativeObj, r.start, r.end)); } // @@ -814,12 +567,8 @@ public class Mat { // // javadoc: Mat::rows() - public int rows() - { - - int retVal = n_rows(nativeObj); - - return retVal; + public int rows() { + return n_rows(nativeObj); } // @@ -827,12 +576,8 @@ public class Mat { // // javadoc: Mat::operator =(s) - public Mat setTo(Scalar s) - { - - Mat retVal = new Mat(n_setTo(nativeObj, s.val[0], s.val[1], s.val[2], 
s.val[3])); - - return retVal; + public Mat setTo(Scalar s) { + return new Mat(n_setTo(nativeObj, s.val[0], s.val[1], s.val[2], s.val[3])); } // @@ -840,12 +585,8 @@ public class Mat { // // javadoc: Mat::setTo(value, mask) - public Mat setTo(Scalar value, Mat mask) - { - - Mat retVal = new Mat(n_setTo(nativeObj, value.val[0], value.val[1], value.val[2], value.val[3], mask.nativeObj)); - - return retVal; + public Mat setTo(Scalar value, Mat mask) { + return new Mat(n_setTo(nativeObj, value.val[0], value.val[1], value.val[2], value.val[3], mask.nativeObj)); } // @@ -853,21 +594,13 @@ public class Mat { // // javadoc: Mat::setTo(value, mask) - public Mat setTo(Mat value, Mat mask) - { - - Mat retVal = new Mat(n_setTo(nativeObj, value.nativeObj, mask.nativeObj)); - - return retVal; + public Mat setTo(Mat value, Mat mask) { + return new Mat(n_setTo(nativeObj, value.nativeObj, mask.nativeObj)); } // javadoc: Mat::setTo(value) - public Mat setTo(Mat value) - { - - Mat retVal = new Mat(n_setTo(nativeObj, value.nativeObj)); - - return retVal; + public Mat setTo(Mat value) { + return new Mat(n_setTo(nativeObj, value.nativeObj)); } // @@ -875,12 +608,8 @@ public class Mat { // // javadoc: Mat::size() - public Size size() - { - - Size retVal = new Size(n_size(nativeObj)); - - return retVal; + public Size size() { + return new Size(n_size(nativeObj)); } // @@ -888,11 +617,8 @@ public class Mat { // // javadoc: Mat::size(int i) - public int size(int i) - { - int retVal = n_size_i(nativeObj, i); - - return retVal; + public int size(int i) { + return n_size_i(nativeObj, i); } // @@ -900,21 +626,13 @@ public class Mat { // // javadoc: Mat::step1(i) - public long step1(int i) - { - - long retVal = n_step1(nativeObj, i); - - return retVal; + public long step1(int i) { + return n_step1(nativeObj, i); } // javadoc: Mat::step1() - public long step1() - { - - long retVal = n_step1(nativeObj); - - return retVal; + public long step1() { + return n_step1(nativeObj); } // @@ -923,12 +641,8 @@ public class Mat { // // javadoc: Mat::operator()(rowStart, rowEnd, colStart, colEnd) - public Mat submat(int rowStart, int rowEnd, int colStart, int colEnd) - { - - Mat retVal = new Mat(n_submat_rr(nativeObj, rowStart, rowEnd, colStart, colEnd)); - - return retVal; + public Mat submat(int rowStart, int rowEnd, int colStart, int colEnd) { + return new Mat(n_submat_rr(nativeObj, rowStart, rowEnd, colStart, colEnd)); } // @@ -936,12 +650,8 @@ public class Mat { // // javadoc: Mat::operator()(rowRange, colRange) - public Mat submat(Range rowRange, Range colRange) - { - - Mat retVal = new Mat(n_submat_rr(nativeObj, rowRange.start, rowRange.end, colRange.start, colRange.end)); - - return retVal; + public Mat submat(Range rowRange, Range colRange) { + return new Mat(n_submat_rr(nativeObj, rowRange.start, rowRange.end, colRange.start, colRange.end)); } // @@ -949,12 +659,8 @@ public class Mat { // // javadoc: Mat::operator()(ranges[]) - public Mat submat(Range[] ranges) - { - - Mat retVal = new Mat(n_submat_ranges(nativeObj, ranges)); - - return retVal; + public Mat submat(Range[] ranges) { + return new Mat(n_submat_ranges(nativeObj, ranges)); } // @@ -962,12 +668,8 @@ public class Mat { // // javadoc: Mat::operator()(roi) - public Mat submat(Rect roi) - { - - Mat retVal = new Mat(n_submat(nativeObj, roi.x, roi.y, roi.width, roi.height)); - - return retVal; + public Mat submat(Rect roi) { + return new Mat(n_submat(nativeObj, roi.x, roi.y, roi.width, roi.height)); } // @@ -975,12 +677,8 @@ public class Mat { // // javadoc: Mat::t() - 
public Mat t() - { - - Mat retVal = new Mat(n_t(nativeObj)); - - return retVal; + public Mat t() { + return new Mat(n_t(nativeObj)); } // @@ -988,12 +686,8 @@ public class Mat { // // javadoc: Mat::total() - public long total() - { - - long retVal = n_total(nativeObj); - - return retVal; + public long total() { + return n_total(nativeObj); } // @@ -1001,12 +695,8 @@ public class Mat { // // javadoc: Mat::type() - public int type() - { - - int retVal = n_type(nativeObj); - - return retVal; + public int type() { + return n_type(nativeObj); } // @@ -1014,12 +704,8 @@ public class Mat { // // javadoc: Mat::zeros(rows, cols, type) - public static Mat zeros(int rows, int cols, int type) - { - - Mat retVal = new Mat(n_zeros(rows, cols, type)); - - return retVal; + public static Mat zeros(int rows, int cols, int type) { + return new Mat(n_zeros(rows, cols, type)); } // @@ -1027,12 +713,8 @@ public class Mat { // // javadoc: Mat::zeros(size, type) - public static Mat zeros(Size size, int type) - { - - Mat retVal = new Mat(n_zeros(size.width, size.height, type)); - - return retVal; + public static Mat zeros(Size size, int type) { + return new Mat(n_zeros(size.width, size.height, type)); } // @@ -1040,12 +722,8 @@ public class Mat { // // javadoc: Mat::zeros(sizes, type) - public static Mat zeros(int[] sizes, int type) - { - - Mat retVal = new Mat(n_zeros(sizes.length, sizes, type)); - - return retVal; + public static Mat zeros(int[] sizes, int type) { + return new Mat(n_zeros(sizes.length, sizes, type)); } @Override diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index a10621366c..7c7ae154c7 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -755,8 +755,7 @@ int64 getTickCount(void) return (int64)mach_absolute_time(); #else struct timeval tv; - struct timezone tz; - gettimeofday( &tv, &tz ); + gettimeofday(&tv, NULL); return (int64)tv.tv_sec*1000000 + tv.tv_usec; #endif } diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 15ace206c8..9bc4981cda 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -7,11 +7,15 @@ #include "test_intrin128.simd_declarations.hpp" #undef CV_CPU_DISPATCH_MODES_ALL - #include "opencv2/core/cv_cpu_dispatch.h" #include "test_intrin256.simd.hpp" #include "test_intrin256.simd_declarations.hpp" +#undef CV_CPU_DISPATCH_MODES_ALL +#include "opencv2/core/cv_cpu_dispatch.h" +#include "test_intrin512.simd.hpp" +#include "test_intrin512.simd_declarations.hpp" + #ifdef _MSC_VER # pragma warning(disable:4702) // unreachable code #endif @@ -30,6 +34,11 @@ namespace opencv_test { namespace hal { throw SkipTestException("SIMD256 (" #cpu_opt ") is not available or disabled"); \ } while(0) +#define DISPATCH_SIMD512(fn, cpu_opt) do { \ + CV_CPU_CALL_ ## cpu_opt ## _(fn, ()); \ + throw SkipTestException("SIMD512 (" #cpu_opt ") is not available or disabled"); \ +} while(0) + #define DEFINE_SIMD_TESTS(simd_size, cpu_opt) \ TEST(hal_intrin ## simd_size, uint8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint8, cpu_opt); } \ TEST(hal_intrin ## simd_size, int8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_int8, cpu_opt); } \ @@ -67,6 +76,9 @@ DEFINE_SIMD_TESTS(128, AVX) #if defined CV_CPU_DISPATCH_COMPILE_AVX2 || defined CV_CPU_BASELINE_COMPILE_AVX2 DEFINE_SIMD_TESTS(128, AVX2) #endif +#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX +DEFINE_SIMD_TESTS(128, AVX512_SKX) +#endif TEST(hal_intrin128, 
float16x8_FP16) { @@ -91,6 +103,10 @@ namespace intrin256 { DEFINE_SIMD_TESTS(256, AVX2) #endif +#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX +DEFINE_SIMD_TESTS(256, AVX512_SKX) +#endif + TEST(hal_intrin256, float16x16_FP16) { //CV_CPU_CALL_FP16_(test_hal_intrin_float16, ()); @@ -101,4 +117,19 @@ TEST(hal_intrin256, float16x16_FP16) } // namespace intrin256 +namespace intrin512 { + +#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX + DEFINE_SIMD_TESTS(512, AVX512_SKX) +#endif + +TEST(hal_intrin512, float16x32_FP16) +{ + CV_CPU_CALL_AVX512_SKX_(test_hal_intrin_float16, ()); + throw SkipTestException("Unsupported hardware: FP16 is not available"); +} + + +} // namespace intrin512 + }} // namespace \ No newline at end of file diff --git a/modules/core/test/test_intrin512.simd.hpp b/modules/core/test/test_intrin512.simd.hpp new file mode 100644 index 0000000000..0e941bc189 --- /dev/null +++ b/modules/core/test/test_intrin512.simd.hpp @@ -0,0 +1,23 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +#if !defined CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY && \ + !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS // TODO? C++ fallback implementation for SIMD512 + +#define CV__SIMD_FORCE_WIDTH 512 +#include "opencv2/core/hal/intrin.hpp" +#undef CV__SIMD_FORCE_WIDTH + +#if CV_SIMD_WIDTH != 64 +#error "Invalid build configuration" +#endif + +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +namespace opencv_test { namespace hal { namespace intrin512 { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +#include "test_intrin_utils.hpp" + +CV_CPU_OPTIMIZATION_NAMESPACE_END +}}} //namespace diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 3cd1145985..177d61f9c7 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -772,6 +772,9 @@ template struct TheTest EXPECT_EQ((LaneType)1, v_reduce_min(a)); EXPECT_EQ((LaneType)R::nlanes, v_reduce_max(a)); EXPECT_EQ((LaneType)((1 + R::nlanes)*R::nlanes/2), v_reduce_sum(a)); + dataA[0] += R::nlanes; + R an = dataA; + EXPECT_EQ((LaneType)2, v_reduce_min(an)); return *this; } @@ -808,7 +811,9 @@ template struct TheTest R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE; EXPECT_EQ(2, v_signmask(a)); +#if CV_SIMD_WIDTH <= 32 EXPECT_EQ(2 | (1 << (R::nlanes / 2)) | (1 << (R::nlanes - 1)), v_signmask(b)); +#endif EXPECT_EQ(false, v_check_all(a)); EXPECT_EQ(false, v_check_all(b)); diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index df92ed7b76..f5daa27fcd 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -213,8 +213,7 @@ PERF_TEST_P_(DNNTestNetwork, EAST_text_detection) PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16) { - if (backend == DNN_BACKEND_HALIDE || - (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)) + if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", "", Mat(cv::Size(320, 240), CV_32FC3)); } diff --git a/modules/dnn/src/caffe/caffe_importer.cpp b/modules/dnn/src/caffe/caffe_importer.cpp index 21392f7c8a..26374ce698 100644 --- a/modules/dnn/src/caffe/caffe_importer.cpp +++ b/modules/dnn/src/caffe/caffe_importer.cpp @@ -388,6 +388,27 @@ public: 
layerParams.blobs[1].setTo(1); // std } } + else if (type == "Axpy") + { + CV_Assert_N(layer.bottom_size() == 3, layer.top_size() == 1); + + std::string scaleName = name + "/scale"; + int repetitions = layerCounter[scaleName]++; + if (repetitions) { + scaleName += String("_") + toString(repetitions); + } + + LayerParams scaleParams; + scaleParams.set("axis", 1); + scaleParams.set("has_bias", false); + int scaleId = dstNet.addLayer(scaleName, "Scale", scaleParams); + addInput(layer.bottom(2), scaleId, 0, dstNet); + addInput(layer.bottom(0), scaleId, 1, dstNet); + addOutput(layer, scaleId, 0); + net.mutable_layer(li)->set_bottom(0, layer.top(0)); + net.mutable_layer(li)->mutable_bottom()->RemoveLast(); + type = "Eltwise"; + } else if ("ConvolutionDepthwise" == type) { type = "Convolution"; diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index c3a68a2a42..7907047067 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -122,7 +122,7 @@ public: else { ieLayer.setType("Split"); - ieLayer.getParameters()["axis"] = input->dims.size() - 1; + ieLayer.getParameters()["axis"] = (size_t)0; ieLayer.getParameters()["out_sizes"] = input->dims[0]; } std::vector shape(input->dims); diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 95f5b57de6..8467251263 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -62,7 +62,7 @@ namespace dnn class BaseConvolutionLayerImpl : public ConvolutionLayer { public: - bool newWeightAndBias; + bool fusedWeights, fusedBias; std::vector weightsMultipliers; BaseConvolutionLayerImpl(const LayerParams ¶ms) { @@ -91,7 +91,8 @@ public: CV_Assert(adjustPad.width < stride.width && adjustPad.height < stride.height); } - newWeightAndBias = false; + fusedWeights = false; + fusedBias = false; } virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE @@ -134,6 +135,8 @@ public: } pad = Size(pads_begin[1], pads_begin[0]); } + fusedWeights = false; + fusedBias = false; } bool hasBias() const @@ -156,6 +159,8 @@ public: if (!w.empty() || !b.empty()) { fuseWeights(w, b); + fusedWeights = fusedWeights || !w.empty(); + fusedBias = fusedBias || (hasBias() && !w.empty()) || !b.empty(); return true; } return false; @@ -216,7 +221,6 @@ public: std::vector biasvec; std::vector reluslope; Ptr activ; - bool fusedBias; #ifdef HAVE_OPENCL Ptr > convolutionOp; @@ -227,7 +231,6 @@ public: #endif ConvolutionLayerImpl(const LayerParams ¶ms) : BaseConvolutionLayerImpl(params) { - fusedBias = false; #ifdef HAVE_OPENCL newActiv = false; activType = OCL4DNN_CONV_FUSED_ACTIV_NONE; @@ -413,9 +416,6 @@ public: for (int i = 0; i < outCn; ++i) biasvec[i] += b.at(i); } - - newWeightAndBias = !w.empty() || !b.empty(); - fusedBias = hasBias() || !b.empty(); biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1]; } @@ -549,12 +549,12 @@ public: InferenceEngine::Layout::NCDHW; auto ieWeights = wrapToInfEngineBlob(blobs[0], layout); - if (newWeightAndBias) + if (fusedWeights) { if (weightsMat.isContinuous()) { - Mat fusedWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size); - ieWeights = wrapToInfEngineBlob(fusedWeights, layout); + Mat cvWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size); + ieWeights = wrapToInfEngineBlob(cvWeights, layout); } else { @@ -564,8 +564,8 @@ public: ieWeights->allocate(); Mat newWeights = 
infEngineBlobToMat(ieWeights).reshape(1, outCn); - Mat fusedWeights = weightsMat.colRange(0, newWeights.cols); - fusedWeights.copyTo(newWeights); + Mat cvWeights = weightsMat.colRange(0, newWeights.cols); + cvWeights.copyTo(newWeights); } } InferenceEngine::Blob::Ptr ieBiases; @@ -1089,17 +1089,18 @@ public: } } - if ( newWeightAndBias ) + if (fusedWeights) { weightsMat.copyTo(umat_blobs[0]); - if ( fusedBias ) - { - if ( umat_blobs.size() < 2 ) - umat_blobs.resize(2); - umat_blobs[1] = UMat(biasvec, true); - } - convolutionOp->setBias(fusedBias || hasBias()); - newWeightAndBias = false; + fusedWeights = false; + } + if (fusedBias) + { + if ( umat_blobs.size() < 2 ) + umat_blobs.resize(2); + umat_blobs[1] = UMat(biasvec, true); + convolutionOp->setBias(true); + fusedBias = false; } if ( newActiv ) @@ -1144,7 +1145,7 @@ public: return convolutionOp->Forward(inpMat, inputs.size() == 2 ? inputs[1] : UMat(), umat_blobs[0], - (hasBias() || fusedBias) ? umat_blobs[1] : UMat(), + umat_blobs.size() > 1 ? umat_blobs[1] : UMat(), outMat, batch_size); } @@ -1249,16 +1250,34 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE + const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout + const int group = numOutput / outGroupCn; + if (backendId == DNN_BACKEND_INFERENCE_ENGINE) { if (kernel_size.size() == 3) CV_Error(Error::StsNotImplemented, "Unsupported deconvolution3D layer"); if (INF_ENGINE_RELEASE >= 2018050000 && (adjustPad.height || adjustPad.width)) - return false; + { + if (padMode.empty()) + { + if (preferableTarget != DNN_TARGET_CPU && group != 1) + { + if ((adjustPad.height && pad.height) || (adjustPad.width && pad.width)) + return false; + } + return pad.width >= adjustPad.width && pad.height >= adjustPad.height; + } + else if (padMode == "SAME") + { + return kernel.width >= pad.width + 1 + adjustPad.width && + kernel.height >= pad.height + 1 + adjustPad.height; + } + else if (padMode == "VALID") + return false; + } - const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout - const int group = numOutput / outGroupCn; if (group != 1) { return preferableTarget == DNN_TARGET_CPU; @@ -1376,8 +1395,6 @@ public: { cv::add(biasesMat, b.reshape(1, numOutput), biasesMat); } - - newWeightAndBias = !w.empty() || !b.empty(); } class MatMulInvoker : public ParallelLoopBody @@ -1645,14 +1662,15 @@ public: if (umat_weights.empty()) { - if (newWeightAndBias) - { + if (fusedWeights) weightsMat.copyTo(umat_weights); + else + transpose(blobs[0].reshape(1, inpCn), umat_weights); + + if (fusedBias) biasesMat.copyTo(umat_biases); - } else { - transpose(blobs[0].reshape(1, inpCn), umat_weights); if (hasBias()) blobs[1].reshape(1, outCn).copyTo(umat_biases); else @@ -1852,6 +1870,19 @@ public: virtual Ptr initInfEngine(const std::vector > &) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE + auto ieWeights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW); + if (fusedWeights) + { + ieWeights = InferenceEngine::make_shared_blob( + InferenceEngine::Precision::FP32, InferenceEngine::Layout::OIHW, + ieWeights->dims()); + ieWeights->allocate(); + + int inpCn = blobs[0].size[0]; + Mat newWeights = infEngineBlobToMat(ieWeights).reshape(1, inpCn); + transpose(weightsMat, newWeights); + } + #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout const int group = numOutput / outGroupCn; @@ -1862,14 +1893,23 @@ public: ieLayer.setStrides(strides); ieLayer.setDilation(dilations); 
ieLayer.setPaddingsBegin(pads_begin); - ieLayer.setPaddingsEnd(pads_end); + + if (padMode.empty()) + { + ieLayer.setPaddingsEnd({pads_end[0] - adjust_pads[0], pads_end[1] - adjust_pads[1]}); + } + else if (padMode == "SAME") + { + ieLayer.setPaddingsEnd({kernel_size[0] - pads_begin[0] - 1 - adjust_pads[0], + kernel_size[1] - pads_begin[1] - 1 - adjust_pads[1]}); + } ieLayer.setGroup((size_t)group); ieLayer.setOutDepth((size_t)numOutput); InferenceEngine::Builder::Layer l = ieLayer; - addConstantData("weights", wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW), l); + addConstantData("weights", ieWeights, l); if (hasBias()) - addConstantData("biases", wrapToInfEngineBlob(blobs[1], {(size_t)numOutput}, InferenceEngine::Layout::C), l); + addConstantData("biases", wrapToInfEngineBlob(biasesMat, {(size_t)numOutput}, InferenceEngine::Layout::C), l); return Ptr(new InfEngineBackendNode(l)); #else const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index ed6da9e1a4..7ea89abdf0 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -143,7 +143,7 @@ public: CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 4, ""); CV_CheckTypeEQ(dst.type(), CV_32FC1, ""); CV_Assert(dst.isContinuous()); CV_Assert(coeffs.empty() || coeffs.size() == (size_t)nsrcs); - for( int i = 0; i > nsrcs; i++ ) + for( int i = 0; i < nsrcs; i++ ) { CV_Assert(srcs[i].size == dst.size && srcs[i].type() == dst.type() && diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp index 8e57d557db..8270f90d7e 100644 --- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp +++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp @@ -601,7 +601,7 @@ public: class UpsamplingKerasSubgraph : public Subgraph { public: - UpsamplingKerasSubgraph() + UpsamplingKerasSubgraph(const std::string& type) { int input = addNodeToMatch(""); int shape = addNodeToMatch("Shape", input); @@ -611,8 +611,8 @@ public: int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2); int factors = addNodeToMatch("Const"); int mul = addNodeToMatch("Mul", strided_slice, factors); - addNodeToMatch("ResizeNearestNeighbor", input, mul); - setFusedNode("ResizeNearestNeighbor", input, factors); + addNodeToMatch(type, input, mul); + setFusedNode(type, input, factors); } virtual void finalize(tensorflow::GraphDef& net, tensorflow::NodeDef* fusedNode, @@ -707,7 +707,8 @@ void simplifySubgraphs(tensorflow::GraphDef& net) subgraphs.push_back(Ptr(new DeconvolutionValidKerasSubgraph())); subgraphs.push_back(Ptr(new DeconvolutionSameKerasSubgraph())); subgraphs.push_back(Ptr(new ResizeBilinearSubgraph())); - subgraphs.push_back(Ptr(new UpsamplingKerasSubgraph())); + subgraphs.push_back(Ptr(new UpsamplingKerasSubgraph("ResizeNearestNeighbor"))); + subgraphs.push_back(Ptr(new UpsamplingKerasSubgraph("ResizeBilinear"))); subgraphs.push_back(Ptr(new SoftMaxSlimSubgraph())); subgraphs.push_back(Ptr(new SoftMaxSlimV2Subgraph())); subgraphs.push_back(Ptr(new ReshapeAsShapeSubgraph())); @@ -752,6 +753,8 @@ void RemoveIdentityOps(tensorflow::GraphDef& net) tensorflow::NodeDef* layer = net.mutable_node(li); for (int input_id = 0; input_id < layer->input_size(); input_id++) { String input_op_name = layer->input(input_id); + input_op_name = input_op_name.substr(input_op_name.find('^') + 1, + input_op_name.rfind(':')); IdentityOpsMap::iterator it 
= identity_ops.find(input_op_name); if (it != identity_ops.end()) { diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 834741965c..aa60b48541 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -405,8 +405,9 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16) Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false); // Output image has values in range [-143.526, 148.539]. float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.4 : 4e-5; - float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7.28 : 2e-3; + float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7.45 : 2e-3; processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", inp, "", "", l1, lInf); + expectNoFallbacksFromIE(net); } INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets(true, true, false, true)); diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index 43236ff37f..b9f07e5c76 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -83,17 +83,17 @@ TEST(Test_Caffe, memory_read) const string proto = findDataFile("dnn/bvlc_googlenet.prototxt", false); const string model = findDataFile("dnn/bvlc_googlenet.caffemodel", false); - string dataProto; - ASSERT_TRUE(readFileInMemory(proto, dataProto)); - string dataModel; - ASSERT_TRUE(readFileInMemory(model, dataModel)); + std::vector<char> dataProto; + readFileContent(proto, dataProto); + std::vector<char> dataModel; + readFileContent(model, dataModel); - Net net = readNetFromCaffe(dataProto.c_str(), dataProto.size()); + Net net = readNetFromCaffe(dataProto.data(), dataProto.size()); net.setPreferableBackend(DNN_BACKEND_OPENCV); ASSERT_FALSE(net.empty()); - Net net2 = readNetFromCaffe(dataProto.c_str(), dataProto.size(), - dataModel.c_str(), dataModel.size()); + Net net2 = readNetFromCaffe(dataProto.data(), dataProto.size(), + dataModel.data(), dataModel.size()); ASSERT_FALSE(net2.empty()); } @@ -109,6 +109,49 @@ TEST(Test_Caffe, read_googlenet) ASSERT_FALSE(net.empty()); } +TEST_P(Test_Caffe_nets, Axpy) +{ + if (backend == DNN_BACKEND_INFERENCE_ENGINE) + throw SkipTestException(""); + + String proto = _tf("axpy.prototxt"); + Net net = readNetFromCaffe(proto); + + checkBackend(); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + + int size[] = {1, 2, 3, 4}; + int scale_size[] = {1, 2, 1, 1}; + Mat scale(4, &scale_size[0], CV_32F); + Mat shift(4, &size[0], CV_32F); + Mat inp(4, &size[0], CV_32F); + randu(scale, -1.0f, 1.0f); + randu(shift, -1.0f, 1.0f); + randu(inp, -1.0f, 1.0f); + + net.setInput(scale, "scale"); + net.setInput(shift, "shift"); + net.setInput(inp, "data"); + + Mat out = net.forward(); + + Mat ref(4, &size[0], inp.type()); + for (int i = 0; i < inp.size[1]; i++) { + for (int h = 0; h < inp.size[2]; h++) { + for (int w = 0; w < inp.size[3]; w++) { + int idx[] = {0, i, h, w}; + int scale_idx[] = {0, i, 0, 0}; + ref.at<float>(idx) = inp.at<float>(idx) * scale.at<float>(scale_idx) + + shift.at<float>(idx); + } + } + } + float l1 = (target == DNN_TARGET_OPENCL_FP16) ? 2e-4 : 1e-5; + float lInf = (target == DNN_TARGET_OPENCL_FP16) ?
1e-3 : 1e-4; + normAssert(ref, out, "", l1, lInf); +} + typedef testing::TestWithParam<tuple<bool, Target> > Reproducibility_AlexNet; TEST_P(Reproducibility_AlexNet, Accuracy) { @@ -124,13 +167,13 @@ TEST_P(Reproducibility_AlexNet, Accuracy) const string model = findDataFile("dnn/bvlc_alexnet.caffemodel", false); if (readFromMemory) { - string dataProto; - ASSERT_TRUE(readFileInMemory(proto, dataProto)); - string dataModel; - ASSERT_TRUE(readFileInMemory(model, dataModel)); + std::vector<char> dataProto; + readFileContent(proto, dataProto); + std::vector<char> dataModel; + readFileContent(model, dataModel); - net = readNetFromCaffe(dataProto.c_str(), dataProto.size(), - dataModel.c_str(), dataModel.size()); + net = readNetFromCaffe(dataProto.data(), dataProto.size(), + dataModel.data(), dataModel.size()); } else net = readNetFromCaffe(proto, model); diff --git a/modules/dnn/test/test_common.hpp b/modules/dnn/test/test_common.hpp index 5897e4d92f..0e2509eb6a 100644 --- a/modules/dnn/test/test_common.hpp +++ b/modules/dnn/test/test_common.hpp @@ -59,7 +59,7 @@ void normAssertDetections( double confThreshold = 0.0, double scores_diff = 1e-5, double boxes_iou_diff = 1e-4); -bool readFileInMemory(const std::string& filename, std::string& content); +void readFileContent(const std::string& filename, CV_OUT std::vector<char>& content); #ifdef HAVE_INF_ENGINE bool validateVPUType(); diff --git a/modules/dnn/test/test_common.impl.hpp b/modules/dnn/test/test_common.impl.hpp index 6914af2bcb..9098413e27 100644 --- a/modules/dnn/test/test_common.impl.hpp +++ b/modules/dnn/test/test_common.impl.hpp @@ -160,23 +160,21 @@ void normAssertDetections( testBoxes, comment, confThreshold, scores_diff, boxes_iou_diff); } -bool readFileInMemory(const std::string& filename, std::string& content) +void readFileContent(const std::string& filename, CV_OUT std::vector<char>& content) { - std::ios::openmode mode = std::ios::in | std::ios::binary; + const std::ios::openmode mode = std::ios::in | std::ios::binary; std::ifstream ifs(filename.c_str(), mode); - if (!ifs.is_open()) - return false; + ASSERT_TRUE(ifs.is_open()); content.clear(); ifs.seekg(0, std::ios::end); - content.reserve(ifs.tellg()); + const size_t sz = ifs.tellg(); + content.resize(sz); ifs.seekg(0, std::ios::beg); - content.assign((std::istreambuf_iterator<char>(ifs)), - std::istreambuf_iterator<char>()); - - return true; + ifs.read((char*)content.data(), sz); + ASSERT_FALSE(ifs.fail()); } diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index ffc71b66c4..683c22e691 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -93,11 +93,11 @@ TEST(Test_Darknet, read_yolo_voc_stream) } // Import from bytes array.
{ - std::string cfg, weights; - readFileInMemory(cfgFile, cfg); - readFileInMemory(weightsFile, weights); + std::vector<char> cfg, weights; + readFileContent(cfgFile, cfg); + readFileContent(weightsFile, weights); - Net net = readNetFromDarknet(&cfg[0], cfg.size(), &weights[0], weights.size()); + Net net = readNetFromDarknet(cfg.data(), cfg.size(), weights.data(), weights.size()); net.setInput(inp); net.setPreferableBackend(DNN_BACKEND_OPENCV); Mat out = net.forward(); diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp index 62915b8f3f..6950ad0731 100644 --- a/modules/dnn/test/test_halide_layers.cpp +++ b/modules/dnn/test/test_halide_layers.cpp @@ -159,10 +159,6 @@ TEST_P(Deconvolution, Accuracy) Backend backendId = get<0>(get<7>(GetParam())); Target targetId = get<1>(get<7>(GetParam())); - if (backendId == DNN_BACKEND_INFERENCE_ENGINE && (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_MYRIAD) && - dilation.width == 2 && dilation.height == 2) - throw SkipTestException(""); - #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018040000) if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_CPU && hasBias && group != 1) @@ -216,7 +212,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Deconvolution, Combine( /*in size*/ Values(Size(5, 6)), /*kernel*/ Values(Size(3, 1), Size(1, 3)), /*pad*/ Values(Size(1, 0), Size(0, 1)), -/*dilation*/ Values(Size(1, 1), Size(2, 2)), +/*dilation*/ Values(Size(1, 1)), /*stride, adj. pad*/ Values(Vec4i(1,1, 0,0), Vec4i(2,2, 1,0), Vec4i(1,2, 0,1)), /*has bias*/ Bool(), dnnBackendsAndTargetsWithHalide() diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index f926a43f97..9de4603ce5 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -352,7 +352,7 @@ TEST_P(Test_ONNX_nets, ResNet18v1) applyTestTag(CV_TEST_TAG_MEMORY_512MB); // output range: [-16; 22], after Softmax [0, 0.51] - testONNXModels("resnet18v1", pb, default_l1, default_lInf, true); + testONNXModels("resnet18v1", pb, default_l1, default_lInf, true, target != DNN_TARGET_MYRIAD); } TEST_P(Test_ONNX_nets, ResNet50v1) { @@ -360,7 +360,7 @@ TEST_P(Test_ONNX_nets, ResNet50v1) applyTestTag(CV_TEST_TAG_MEMORY_512MB); // output range: [-67; 75], after Softmax [0, 0.98] - testONNXModels("resnet50v1", pb, default_l1, default_lInf, true); + testONNXModels("resnet50v1", pb, default_l1, default_lInf, true, target != DNN_TARGET_MYRIAD); } TEST_P(Test_ONNX_nets, ResNet101_DUC_HDC) @@ -477,7 +477,7 @@ TEST_P(Test_ONNX_nets, DenseNet121) applyTestTag(CV_TEST_TAG_MEMORY_512MB); // output range: [-87; 138], after Softmax [0; 1] - testONNXModels("densenet121", pb, default_l1, default_lInf, true); + testONNXModels("densenet121", pb, default_l1, default_lInf, true, target != DNN_TARGET_MYRIAD); } TEST_P(Test_ONNX_nets, Inception_v1) @@ -497,6 +497,61 @@ TEST_P(Test_ONNX_nets, Shufflenet) testONNXModels("shufflenet", pb); } +TEST_P(Test_ONNX_nets, Resnet34_kinetics) +{ + if (backend != DNN_BACKEND_INFERENCE_ENGINE || target != DNN_TARGET_CPU) + throw SkipTestException("Only DLIE backend on CPU is supported"); + + String onnxmodel = findDataFile("dnn/resnet-34_kinetics.onnx"); + Mat image0 = imread(findDataFile("dnn/dog416.png")); + Mat image1 = imread(findDataFile("dnn/street.png")); + + Mat ref0 = blobFromNPY(_tf("data/output_kinetics0.npy")); + Mat ref1 = blobFromNPY(_tf("data/output_kinetics1.npy")); + + std::vector<Mat> images_0(16, image0); + std::vector<Mat> images_1(16,
image1); + Mat blob0 = blobFromImages(images_0, 1.0, Size(112, 112), Scalar(114.7748, 107.7354, 99.4750), true, true); + Mat blob1 = blobFromImages(images_1, 1.0, Size(112, 112), Scalar(114.7748, 107.7354, 99.4750), true, true); + + Net permute; + LayerParams lp; + int order[] = {1, 0, 2, 3}; + lp.set("order", DictValue::arrayInt(&order[0], 4)); + permute.addLayerToPrev("perm", "Permute", lp); + + permute.setInput(blob0); + Mat input0 = permute.forward().clone(); + + permute.setInput(blob1); + Mat input1 = permute.forward().clone(); + + int dims[] = {1, 3, 16, 112, 112}; + input0 = input0.reshape(0, 5, &dims[0]); + input1 = input1.reshape(0, 5, &dims[0]); + + Net net = readNetFromONNX(onnxmodel); + ASSERT_FALSE(net.empty()); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + + // output range [-5, 11] + float l1 = 0.0013; + float lInf = 0.009; + + checkBackend(&input0, &ref0); + net.setInput(input0); + Mat out = net.forward().clone(); + normAssert(ref0, out, "", l1, lInf); + + checkBackend(&input1, &ref1); + net.setInput(input1); + out = net.forward().clone(); + normAssert(ref1, out, "", l1, lInf); + + expectNoFallbacksFromIE(net); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index afe9287dd0..c495af8cfe 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -96,17 +96,17 @@ public: if (memoryLoad) { // Load files into a memory buffers - string dataModel; - ASSERT_TRUE(readFileInMemory(netPath, dataModel)); + std::vector<char> dataModel; + readFileContent(netPath, dataModel); - string dataConfig; + std::vector<char> dataConfig; if (hasText) { - ASSERT_TRUE(readFileInMemory(netConfig, dataConfig)); + readFileContent(netConfig, dataConfig); } - net = readNetFromTensorflow(dataModel.c_str(), dataModel.size(), - dataConfig.c_str(), dataConfig.size()); + net = readNetFromTensorflow(dataModel.data(), dataModel.size(), + dataConfig.data(), dataConfig.size()); } else net = readNetFromTensorflow(netPath, netConfig); @@ -186,6 +186,7 @@ TEST_P(Test_TensorFlow_layers, batch_norm) runTensorFlowNet("unfused_batch_norm_no_gamma"); runTensorFlowNet("mvn_batch_norm"); runTensorFlowNet("mvn_batch_norm_1x1"); + runTensorFlowNet("switch_identity"); } TEST_P(Test_TensorFlow_layers, batch_norm3D) diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index 981c9401b6..f417a45d7e 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -172,10 +172,6 @@ TEST_P(Test_Torch_layers, run_depth_concat) TEST_P(Test_Torch_layers, run_deconv) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) - throw SkipTestException("Test is disabled for OpenVINO 2018R4"); -#endif runTorchNet("net_deconv"); } @@ -398,10 +394,10 @@ TEST_P(Test_Torch_nets, ENet_accuracy) // -model models/instance_norm/feathers.t7 TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000) +#if defined INF_ENGINE_RELEASE if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) - throw SkipTestException("Test is disabled for OpenVINO <= 2018R5 + MyriadX target"); + throw SkipTestException("Test is disabled for MyriadX
target"); #endif checkBackend(); diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 4d48500016..8325b2414b 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1061,10 +1061,16 @@ cvFindNextContour( CvContourScanner scanner ) } else { - v_uint8 v_prev = vx_setall_u8((uchar)prev); - for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) +#if CV_SIMD_WIDTH > 16 + v_uint8 vx_prev = vx_setall_u8((uchar)prev); + while (x <= width - v_uint8::nlanes && + v_check_all(vx_load((uchar*)(img + x)) == vx_prev)) + x += v_uint8::nlanes; +#endif + v_uint8x16 v_prev = v_setall_u8((uchar)prev); + for (; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes) { - unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(img + x)) != v_prev); + unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(img + x)) != v_prev); if (mask) { p = img[(x += cv::trailingZeros32(mask))]; @@ -1328,10 +1334,16 @@ CvLinkedRunPoint; inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) { #if CV_SIMD - v_uint8 v_zero = vx_setzero_u8(); - for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) +#if CV_SIMD_WIDTH > 16 + v_uint8 vx_zero = vx_setzero_u8(); + while (j <= img_size.width - v_uint8::nlanes && + v_check_all(vx_load((uchar*)(src_data + j)) == vx_zero)) + j += v_uint8::nlanes; +#endif + v_uint8x16 v_zero = v_setzero_u8(); + for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes) { - unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) != v_zero); + unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) != v_zero); if (mask) { j += cv::trailingZeros32(mask); @@ -1353,10 +1365,16 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) } else { - v_uint8 v_zero = vx_setzero_u8(); +#if CV_SIMD_WIDTH > 16 + v_uint8 vx_zero = vx_setzero_u8(); + while (j <= img_size.width - v_uint8::nlanes && + v_check_all(vx_load((uchar*)(src_data + j)) != vx_zero)) + j += v_uint8::nlanes; +#endif + v_uint8x16 v_zero = v_setzero_u8(); for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) { - unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) == v_zero); + unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) == v_zero); if (mask) { j += cv::trailingZeros32(mask); diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index e7b1d50151..b81c7d5002 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -436,6 +436,9 @@ struct RemapNoVec #if CV_SIMD128 +typedef unsigned short CV_DECL_ALIGNED(1) unaligned_ushort; +typedef int CV_DECL_ALIGNED(1) unaligned_int; + struct RemapVec_8u { int operator()( const Mat& _src, void* _dst, const short* XY, @@ -461,8 +464,8 @@ struct RemapVec_8u { \ v_uint8x16 rrggbb, dummy; \ v_uint16x8 rrggbb8, dummy8; \ - v_uint8x16 rgb0 = v_reinterpret_as_u8(v_int32x4(*(int*)(p), 0, 0, 0)); \ - v_uint8x16 rgb1 = v_reinterpret_as_u8(v_int32x4(*(int*)(p + 3), 0, 0, 0)); \ + v_uint8x16 rgb0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \ + v_uint8x16 rgb1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + 3), 0, 0, 0)); \ v_zip(rgb0, rgb1, rrggbb, dummy); \ v_expand(rrggbb, rrggbb8, dummy8); \ result = v_reinterpret_as_s16(rrggbb8); \ @@ -480,15 +483,15 @@ struct RemapVec_8u CV_DbgAssert(p <= src_limit_8bytes); \ v_uint8x16 rrggbbaa, dummy; \ v_uint16x8 rrggbbaa8, dummy8; \ - v_uint8x16 rgba0 = 
v_reinterpret_as_u8(v_int32x4(*(int*)(p), 0, 0, 0)); \ - v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(int*)(p + v_int32x4::nlanes), 0, 0, 0)); \ + v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \ + v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + v_int32x4::nlanes), 0, 0, 0)); \ v_zip(rgba0, rgba1, rrggbbaa, dummy); \ v_expand(rrggbbaa, rrggbbaa8, dummy8); \ result = v_reinterpret_as_s16(rrggbbaa8); \ } #define CV_PICK_AND_PACK4(base,offset) \ - v_uint16x8(*(ushort*)(base + offset[0]), *(ushort*)(base + offset[1]), \ - *(ushort*)(base + offset[2]), *(ushort*)(base + offset[3]), \ + v_uint16x8(*(unaligned_ushort*)(base + offset[0]), *(unaligned_ushort*)(base + offset[1]), \ + *(unaligned_ushort*)(base + offset[2]), *(unaligned_ushort*)(base + offset[3]), \ 0, 0, 0, 0) if( cn == 1 ) diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 2882f26341..ab0d3fe89f 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -2148,6 +2148,7 @@ public: v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bl = s0 + s3; gl = s1 + s4; rl = s2 + s5; #elif CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; #endif @@ -2167,6 +2168,7 @@ public: v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bh = s0 + s3; gh = s1 + s4; rh = s2 + s5; #elif CV_SIMD_WIDTH == 64 + v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; #endif diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp index 8531fc61f4..2052b02e41 100755 --- a/modules/imgproc/src/sumpixels.cpp +++ b/modules/imgproc/src/sumpixels.cpp @@ -127,7 +127,7 @@ struct Integral_SIMD { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_int32 el4l, el4h; -#if CV_AVX2 +#if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2)); vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4)); vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8)); @@ -138,7 +138,7 @@ struct Integral_SIMD #else el8 += v_rotate_left<1>(el8); el8 += v_rotate_left<2>(el8); -#if CV_SIMD_WIDTH == 32 +#if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 el8 += v_rotate_left<8>(el8); @@ -194,7 +194,7 @@ struct Integral_SIMD { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float32 el4l, el4h; -#if CV_AVX2 +#if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2)); vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4)); vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8)); @@ -205,7 +205,7 @@ struct Integral_SIMD #else el8 += v_rotate_left<1>(el8); el8 += v_rotate_left<2>(el8); -#if CV_SIMD_WIDTH == 32 +#if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 el8 += v_rotate_left<8>(el8); diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index 6cbce64007..57032dbe84 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -231,7 +231,8 @@ class ClassInfo(GeneralInfo): return Template("CLASS $namespace::$classpath.$name : $base").substitute(**self.__dict__) def getAllImports(self, module): - 
return ["import %s;" % c for c in sorted(self.imports) if not c.startswith('org.opencv.'+module)] + return ["import %s;" % c for c in sorted(self.imports) if not c.startswith('org.opencv.'+module) + and (not c.startswith('java.lang.') or c.count('.') != 2)] def addImports(self, ctype): if ctype in type_dict: @@ -294,8 +295,8 @@ class ClassInfo(GeneralInfo): self.cpp_code.close() def generateJavaCode(self, m, M): - return Template(self.j_code.getvalue() + "\n\n" + \ - self.jn_code.getvalue() + "\n}\n").substitute(\ + return Template(self.j_code.getvalue() + "\n\n" + + self.jn_code.getvalue() + "\n}\n").substitute( module = m, name = self.name, jname = self.jname, @@ -694,7 +695,7 @@ class JavaWrapperGenerator(object): jn_args.append ( ArgInfo([ "double[]", "%s_out" % a.name, "", [], "" ]) ) jni_args.append ( ArgInfo([ "double[]", "%s_out" % a.name, "", [], "" ]) ) j_prologue.append( "double[] %s_out = new double[%i];" % (a.name, len(fields)) ) - c_epilogue.append( \ + c_epilogue.append( "jdouble tmp_%(n)s[%(cnt)i] = {%(args)s}; env->SetDoubleArrayRegion(%(n)s_out, 0, %(cnt)i, tmp_%(n)s);" % { "n" : a.name, "cnt" : len(fields), "args" : ", ".join(["(jdouble)" + a.name + f[1] for f in fields]) } ) if type_dict[a.ctype]["j_type"] in ('bool', 'int', 'long', 'float', 'double'): @@ -733,10 +734,10 @@ class JavaWrapperGenerator(object): # private java NATIVE method decl # e.g. # private static native void add_0(long src1, long src2, long dst, long mask, int dtype); - jn_code.write( Template(\ - " private static native $type $name($args);\n").substitute(\ - type = type_dict[fi.ctype].get("jn_type", "double[]"), \ - name = fi.jname + '_' + str(suffix_counter), \ + jn_code.write( Template( + " private static native $type $name($args);\n").substitute( + type = type_dict[fi.ctype].get("jn_type", "double[]"), + name = fi.jname + '_' + str(suffix_counter), args = ", ".join(["%s %s" % (type_dict[a.ctype]["jn_type"], normalize_field_name(a.name)) for a in jn_args]) ) ); @@ -763,9 +764,9 @@ class JavaWrapperGenerator(object): ret_type = fi.ctype if fi.ctype.endswith('*'): ret_type = ret_type[:-1] - ret_val = type_dict[ret_type]["j_type"] + " retVal = " + ret_val = type_dict[ret_type]["j_type"] + " retVal = " if j_epilogue else "return " tail = "" - ret = "return retVal;" + ret = "return retVal;" if j_epilogue else "" if "v_type" in type_dict[ret_type]: j_type = type_dict[ret_type]["j_type"] if type_dict[ret_type]["v_type"] in ("Mat", "vector_Mat"): @@ -776,70 +777,77 @@ class JavaWrapperGenerator(object): ret_val = "Mat retValMat = new Mat(" j_prologue.append( j_type + ' retVal = new Array' + j_type+'();') j_epilogue.append('Converters.Mat_to_' + ret_type + '(retValMat, retVal);') + ret = "return retVal;" elif ret_type.startswith("Ptr_"): - ret_val = type_dict[fi.ctype]["j_type"] + " retVal = " + type_dict[ret_type]["j_type"] + ".__fromPtr__(" + constructor = type_dict[ret_type]["j_type"] + ".__fromPtr__("; + if j_epilogue: + ret_val = type_dict[fi.ctype]["j_type"] + " retVal = " + constructor + else: + ret_val = "return " + constructor tail = ")" elif ret_type == "void": ret_val = "" - ret = "return;" + ret = "" elif ret_type == "": # c-tor if fi.classname and ci.base: - ret_val = "super( " - tail = " )" + ret_val = "super(" + tail = ")" else: ret_val = "nativeObj = " - ret = "return;" + ret = "" elif self.isWrapped(ret_type): # wrapped class - ret_val = type_dict[ret_type]["j_type"] + " retVal = new " + self.getClass(ret_type).jname + "(" + constructor = self.getClass(ret_type).jname + "("; + if 
j_epilogue: + ret_val = type_dict[ret_type]["j_type"] + " retVal = new " + constructor + else: + ret_val = "return new " + constructor tail = ")" elif "jn_type" not in type_dict[ret_type]: - ret_val = type_dict[fi.ctype]["j_type"] + " retVal = new " + type_dict[ret_type]["j_type"] + "(" + constructor = type_dict[ret_type]["j_type"] + "("; + if j_epilogue: + ret_val = type_dict[fi.ctype]["j_type"] + " retVal = new " + constructor + else: + ret_val = "return new " + constructor tail = ")" static = "static" if fi.classname: static = fi.static - j_code.write( Template(\ -""" public $static $j_type $j_name($j_args) - { - $prologue - $ret_val$jn_name($jn_args_call)$tail; - $epilogue - $ret + j_code.write( Template( +""" public $static$j_type $j_name($j_args) {$prologue + $ret_val$jn_name($jn_args_call)$tail;$epilogue$ret } """ - ).substitute(\ - ret = ret, \ - ret_val = ret_val, \ - tail = tail, \ - prologue = "\n ".join(j_prologue), \ - epilogue = "\n ".join(j_epilogue), \ - static=static, \ - j_type=type_dict[fi.ctype]["j_type"], \ - j_name=fi.jname, \ - j_args=", ".join(j_args), \ - jn_name=fi.jname + '_' + str(suffix_counter), \ - jn_args_call=", ".join( [a.name for a in jn_args] ),\ + ).substitute( + ret = "\n " + ret if ret else "", + ret_val = ret_val, + tail = tail, + prologue = "\n " + "\n ".join(j_prologue) if j_prologue else "", + epilogue = "\n " + "\n ".join(j_epilogue) if j_epilogue else "", + static = static + " " if static else "", + j_type=type_dict[fi.ctype]["j_type"], + j_name=fi.jname, + j_args=", ".join(j_args), + jn_name=fi.jname + '_' + str(suffix_counter), + jn_args_call=", ".join( [a.name for a in jn_args] ), ) ) # cpp part: # jni_func(..) { _retval_ = cv_func(..); return _retval_; } - ret = "return _retval_;" + ret = "return _retval_;" if c_epilogue else "" default = "return 0;" if fi.ctype == "void": - ret = "return;" - default = "return;" + ret = "" + default = "" elif not fi.ctype: # c-tor ret = "return (jlong) _retval_;" elif "v_type" in type_dict[fi.ctype]: # c-tor if type_dict[fi.ctype]["v_type"] in ("Mat", "vector_Mat"): ret = "return (jlong) _retval_;" - else: # returned as jobject - ret = "return _retval_;" elif fi.ctype in ['String', 'string']: ret = "return env->NewStringUTF(_retval_.c_str());" default = 'return env->NewStringUTF("");' @@ -862,21 +870,21 @@ class JavaWrapperGenerator(object): name = prop_name + ";//" cvname = fi.fullName(isCPP=True) - retval = self.fullTypeName(fi.ctype) + " _retval_ = " + retval = self.fullTypeName(fi.ctype) + " _retval_ = " if ret else "return " if fi.ctype == "void": retval = "" elif fi.ctype == "String": - retval = "cv::" + retval + retval = "cv::" + self.fullTypeName(fi.ctype) + " _retval_ = " elif fi.ctype == "string": - retval = "std::" + retval + retval = "std::string _retval_ = " elif "v_type" in type_dict[fi.ctype]: # vector is returned retval = type_dict[fi.ctype]['jni_var'] % {"n" : '_ret_val_vector_'} + " = " if type_dict[fi.ctype]["v_type"] in ("Mat", "vector_Mat"): c_epilogue.append("Mat* _retval_ = new Mat();") c_epilogue.append(fi.ctype+"_to_Mat(_ret_val_vector_, *_retval_);") else: - c_epilogue.append("jobject _retval_ = " + fi.ctype + "_to_List(env, _ret_val_vector_);") - if len(fi.classname)>0: + c_epilogue.append("return " + fi.ctype + "_to_List(env, _ret_val_vector_);") + if fi.classname: if not fi.ctype: # c-tor retval = fi.fullClass(isCPP=True) + "* _retval_ = " cvname = "new " + fi.fullClass(isCPP=True) @@ -884,9 +892,9 @@ class JavaWrapperGenerator(object): cvname = fi.fullName(isCPP=True) else: 
cvname = ("me->" if not self.isSmartClass(ci) else "(*me)->") + name - c_prologue.append(\ - "%(cls)s* me = (%(cls)s*) self; //TODO: check for NULL" \ - % { "cls" : self.smartWrap(ci, fi.fullClass(isCPP=True))} \ + c_prologue.append( + "%(cls)s* me = (%(cls)s*) self; //TODO: check for NULL" + % { "cls" : self.smartWrap(ci, fi.fullClass(isCPP=True))} ) cvargs = [] for a in args: @@ -909,7 +917,7 @@ class JavaWrapperGenerator(object): rtype = type_dict[fi.ctype].get("jni_type", "jdoubleArray") clazz = ci.jname - cpp_code.write ( Template( \ + cpp_code.write ( Template( """ ${namespace} @@ -920,37 +928,34 @@ JNIEXPORT $rtype JNICALL Java_org_opencv_${module}_${clazz}_$fname { static const char method_name[] = "$module::$fname()"; try { - LOGD("%s", method_name); - $prologue - $retval$cvname( $cvargs ); - $epilogue$ret + LOGD("%s", method_name);$prologue + $retval$cvname($cvargs);$epilogue$ret } catch(const std::exception &e) { throwJavaException(env, &e, method_name); } catch (...) { throwJavaException(env, 0, method_name); - } - $default + }$default } -""" ).substitute( \ - rtype = rtype, \ - module = self.module.replace('_', '_1'), \ - clazz = clazz.replace('_', '_1'), \ - fname = (fi.jname + '_' + str(suffix_counter)).replace('_', '_1'), \ - args = ", ".join(["%s %s" % (type_dict[a.ctype].get("jni_type"), a.name) for a in jni_args]), \ - argst = ", ".join([type_dict[a.ctype].get("jni_type") for a in jni_args]), \ - prologue = "\n ".join(c_prologue), \ - epilogue = " ".join(c_epilogue) + ("\n " if c_epilogue else ""), \ - ret = ret, \ - cvname = cvname, \ - cvargs = ", ".join(cvargs), \ - default = default, \ - retval = retval, \ +""" ).substitute( + rtype = rtype, + module = self.module.replace('_', '_1'), + clazz = clazz.replace('_', '_1'), + fname = (fi.jname + '_' + str(suffix_counter)).replace('_', '_1'), + args = ", ".join(["%s %s" % (type_dict[a.ctype].get("jni_type"), a.name) for a in jni_args]), + argst = ", ".join([type_dict[a.ctype].get("jni_type") for a in jni_args]), + prologue = "\n " + "\n ".join(c_prologue) if c_prologue else "", + epilogue = "\n " + "\n ".join(c_epilogue) if c_epilogue else "", + ret = "\n " + ret if ret else "", + cvname = cvname, + cvargs = " " + ", ".join(cvargs) + " " if cvargs else "", + default = "\n " + default if default else "", + retval = retval, namespace = ('using namespace ' + ci.namespace.replace('.', '::') + ';') if ci.namespace else '' ) ) - # adding method signature to dictionarry + # adding method signature to dictionary j_signatures.append(j_signature) # processing args with default values @@ -1047,7 +1052,7 @@ JNIEXPORT $rtype JNICALL Java_org_opencv_${module}_${clazz}_$fname """ ) # native support for java finalize() - ci.cpp_code.write( \ + ci.cpp_code.write( """ // // native support for java finalize() diff --git a/samples/data/dnn/action_recongnition_kinetics.txt b/samples/data/dnn/action_recongnition_kinetics.txt new file mode 100644 index 0000000000..cdaafcb141 --- /dev/null +++ b/samples/data/dnn/action_recongnition_kinetics.txt @@ -0,0 +1,400 @@ +abseiling +air drumming +answering questions +applauding +applying cream +archery +arm wrestling +arranging flowers +assembling computer +auctioning +baby waking up +baking cookies +balloon blowing +bandaging +barbequing +bartending +beatboxing +bee keeping +belly dancing +bench pressing +bending back +bending metal +biking through snow +blasting sand +blowing glass +blowing leaves +blowing nose +blowing out candles +bobsledding +bookbinding +bouncing on trampoline +bowling +braiding 
hair +breading or breadcrumbing +breakdancing +brush painting +brushing hair +brushing teeth +building cabinet +building shed +bungee jumping +busking +canoeing or kayaking +capoeira +carrying baby +cartwheeling +carving pumpkin +catching fish +catching or throwing baseball +catching or throwing frisbee +catching or throwing softball +celebrating +changing oil +changing wheel +checking tires +cheerleading +chopping wood +clapping +clay pottery making +clean and jerk +cleaning floor +cleaning gutters +cleaning pool +cleaning shoes +cleaning toilet +cleaning windows +climbing a rope +climbing ladder +climbing tree +contact juggling +cooking chicken +cooking egg +cooking on campfire +cooking sausages +counting money +country line dancing +cracking neck +crawling baby +crossing river +crying +curling hair +cutting nails +cutting pineapple +cutting watermelon +dancing ballet +dancing charleston +dancing gangnam style +dancing macarena +deadlifting +decorating the christmas tree +digging +dining +disc golfing +diving cliff +dodgeball +doing aerobics +doing laundry +doing nails +drawing +dribbling basketball +drinking +drinking beer +drinking shots +driving car +driving tractor +drop kicking +drumming fingers +dunking basketball +dying hair +eating burger +eating cake +eating carrots +eating chips +eating doughnuts +eating hotdog +eating ice cream +eating spaghetti +eating watermelon +egg hunting +exercising arm +exercising with an exercise ball +extinguishing fire +faceplanting +feeding birds +feeding fish +feeding goats +filling eyebrows +finger snapping +fixing hair +flipping pancake +flying kite +folding clothes +folding napkins +folding paper +front raises +frying vegetables +garbage collecting +gargling +getting a haircut +getting a tattoo +giving or receiving award +golf chipping +golf driving +golf putting +grinding meat +grooming dog +grooming horse +gymnastics tumbling +hammer throw +headbanging +headbutting +high jump +high kick +hitting baseball +hockey stop +holding snake +hopscotch +hoverboarding +hugging +hula hooping +hurdling +hurling (sport) +ice climbing +ice fishing +ice skating +ironing +javelin throw +jetskiing +jogging +juggling balls +juggling fire +juggling soccer ball +jumping into pool +jumpstyle dancing +kicking field goal +kicking soccer ball +kissing +kitesurfing +knitting +krumping +laughing +laying bricks +long jump +lunge +making a cake +making a sandwich +making bed +making jewelry +making pizza +making snowman +making sushi +making tea +marching +massaging back +massaging feet +massaging legs +massaging person's head +milking cow +mopping floor +motorcycling +moving furniture +mowing lawn +news anchoring +opening bottle +opening present +paragliding +parasailing +parkour +passing American football (in game) +passing American football (not in game) +peeling apples +peeling potatoes +petting animal (not cat) +petting cat +picking fruit +planting trees +plastering +playing accordion +playing badminton +playing bagpipes +playing basketball +playing bass guitar +playing cards +playing cello +playing chess +playing clarinet +playing controller +playing cricket +playing cymbals +playing didgeridoo +playing drums +playing flute +playing guitar +playing harmonica +playing harp +playing ice hockey +playing keyboard +playing kickball +playing monopoly +playing organ +playing paintball +playing piano +playing poker +playing recorder +playing saxophone +playing squash or racquetball +playing tennis +playing trombone +playing trumpet +playing ukulele +playing violin +playing 
volleyball +playing xylophone +pole vault +presenting weather forecast +pull ups +pumping fist +pumping gas +punching bag +punching person (boxing) +push up +pushing car +pushing cart +pushing wheelchair +reading book +reading newspaper +recording music +riding a bike +riding camel +riding elephant +riding mechanical bull +riding mountain bike +riding mule +riding or walking with horse +riding scooter +riding unicycle +ripping paper +robot dancing +rock climbing +rock scissors paper +roller skating +running on treadmill +sailing +salsa dancing +sanding floor +scrambling eggs +scuba diving +setting table +shaking hands +shaking head +sharpening knives +sharpening pencil +shaving head +shaving legs +shearing sheep +shining shoes +shooting basketball +shooting goal (soccer) +shot put +shoveling snow +shredding paper +shuffling cards +side kick +sign language interpreting +singing +situp +skateboarding +ski jumping +skiing (not slalom or crosscountry) +skiing crosscountry +skiing slalom +skipping rope +skydiving +slacklining +slapping +sled dog racing +smoking +smoking hookah +snatch weight lifting +sneezing +sniffing +snorkeling +snowboarding +snowkiting +snowmobiling +somersaulting +spinning poi +spray painting +spraying +springboard diving +squat +sticking tongue out +stomping grapes +stretching arm +stretching leg +strumming guitar +surfing crowd +surfing water +sweeping floor +swimming backstroke +swimming breast stroke +swimming butterfly stroke +swing dancing +swinging legs +swinging on something +sword fighting +tai chi +taking a shower +tango dancing +tap dancing +tapping guitar +tapping pen +tasting beer +tasting food +testifying +texting +throwing axe +throwing ball +throwing discus +tickling +tobogganing +tossing coin +tossing salad +training dog +trapezing +trimming or shaving beard +trimming trees +triple jump +tying bow tie +tying knot (not on a tie) +tying tie +unboxing +unloading truck +using computer +using remote controller (not gaming) +using segway +vault +waiting in line +walking the dog +washing dishes +washing feet +washing hair +washing hands +water skiing +water sliding +watering plants +waxing back +waxing chest +waxing eyebrows +waxing legs +weaving basket +welding +whistling +windsurfing +wrapping present +wrestling +writing +yawning +yoga +zumba diff --git a/samples/dnn/action_recognition.py b/samples/dnn/action_recognition.py new file mode 100644 index 0000000000..f2f37303fb --- /dev/null +++ b/samples/dnn/action_recognition.py @@ -0,0 +1,82 @@ +import os +import numpy as np +import cv2 as cv +import argparse +from common import findFile + +parser = argparse.ArgumentParser(description='Use this script to run action recognition using 3D ResNet34', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--input', '-i', help='Path to input video file. 
Skip this argument to capture frames from a camera.') +parser.add_argument('--model', required=True, help='Path to model.') +parser.add_argument('--classes', default=findFile('action_recongnition_kinetics.txt'), help='Path to classes list.') + +# To get net download original repository https://github.com/kenshohara/video-classification-3d-cnn-pytorch +# For correct ONNX export modify file: video-classification-3d-cnn-pytorch/models/resnet.py +# change +# - def downsample_basic_block(x, planes, stride): +# - out = F.avg_pool3d(x, kernel_size=1, stride=stride) +# - zero_pads = torch.Tensor(out.size(0), planes - out.size(1), +# - out.size(2), out.size(3), +# - out.size(4)).zero_() +# - if isinstance(out.data, torch.cuda.FloatTensor): +# - zero_pads = zero_pads.cuda() +# - +# - out = Variable(torch.cat([out.data, zero_pads], dim=1)) +# - return out + +# To +# + def downsample_basic_block(x, planes, stride): +# + out = F.avg_pool3d(x, kernel_size=1, stride=stride) +# + out = F.pad(out, (0, 0, 0, 0, 0, 0, 0, int(planes - out.size(1)), 0, 0), "constant", 0) +# + return out + +# To ONNX export use torch.onnx.export(model, inputs, model_name) + +def get_class_names(path): + class_names = [] + with open(path) as f: + for row in f: + class_names.append(row[:-1]) + return class_names + +def classify_video(video_path, net_path): + SAMPLE_DURATION = 16 + SAMPLE_SIZE = 112 + mean = (114.7748, 107.7354, 99.4750) + class_names = get_class_names(args.classes) + + net = cv.dnn.readNet(net_path) + net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE) + net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU) + + winName = 'Deep learning image classification in OpenCV' + cv.namedWindow(winName, cv.WINDOW_AUTOSIZE) + cap = cv.VideoCapture(video_path) + while cv.waitKey(1) < 0: + frames = [] + for _ in range(SAMPLE_DURATION): + hasFrame, frame = cap.read() + if not hasFrame: + exit(0) + frames.append(frame) + + inputs = cv.dnn.blobFromImages(frames, 1, (SAMPLE_SIZE, SAMPLE_SIZE), mean, True, crop=True) + inputs = np.transpose(inputs, (1, 0, 2, 3)) + inputs = np.expand_dims(inputs, axis=0) + net.setInput(inputs) + outputs = net.forward() + class_pred = np.argmax(outputs) + label = class_names[class_pred] + + for frame in frames: + labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1) + cv.rectangle(frame, (0, 10 - labelSize[1]), + (labelSize[0], 10 + baseLine), (255, 255, 255), cv.FILLED) + cv.putText(frame, label, (0, 10), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0)) + cv.imshow(winName, frame) + if cv.waitKey(1) & 0xFF == ord('q'): + break + +if __name__ == "__main__": + args, _ = parser.parse_known_args() + classify_video(args.input if args.input else 0, args.model) diff --git a/samples/python/tutorial_code/core/mat_mask_operations/mat_mask_operations.py b/samples/python/tutorial_code/core/mat_mask_operations/mat_mask_operations.py index 3b9385020e..ccbed22c38 100644 --- a/samples/python/tutorial_code/core/mat_mask_operations/mat_mask_operations.py +++ b/samples/python/tutorial_code/core/mat_mask_operations/mat_mask_operations.py @@ -69,7 +69,7 @@ def main(argv): dst0 = sharpen(src) - t = (time.time() - t) / 1000 + t = (time.time() - t) print("Hand written function time passed in seconds: %s" % t) cv.imshow("Output", dst0) @@ -86,7 +86,7 @@ def main(argv): # ddepth = -1, means destination image has depth same as input image ## [filter2D] - t = (time.time() - t) / 1000 + t = (time.time() - t) print("Built-in filter2D time passed in seconds: %s" % t) cv.imshow("Output", dst1)
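The comment block in samples/dnn/action_recognition.py above points to the upstream video-classification-3d-cnn-pytorch repository and to torch.onnx.export for producing resnet-34_kinetics.onnx. A minimal sketch of that export step, not part of this patch, is shown below; it assumes the upstream repository is on PYTHONPATH, and the resnet34() constructor arguments are assumptions rather than values taken from this change.

# Sketch only: assumes kenshohara/video-classification-3d-cnn-pytorch is importable;
# in practice a pretrained Kinetics checkpoint would be loaded into the model before export (omitted here).
import torch
from models import resnet  # upstream repo module; constructor args below are assumptions

model = resnet.resnet34(sample_size=112, sample_duration=16, num_classes=400)
model.eval()

# Dummy clip matching the 5-D input blob {1, 3, 16, 112, 112} used by the sample and the new test
inputs = torch.rand(1, 3, 16, 112, 112)
torch.onnx.export(model, inputs, 'resnet-34_kinetics.onnx')

With the exported model, the new sample can then be run as, for example: python action_recognition.py --model resnet-34_kinetics.onnx --input video.mp4 (where video.mp4 is a placeholder input file).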