diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 9c680bf116..e33269b4b8 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -3559,6 +3559,10 @@ CV_EXPORTS MatExpr operator + (const Mat& m, const MatExpr& e);
 CV_EXPORTS MatExpr operator + (const MatExpr& e, const Scalar& s);
 CV_EXPORTS MatExpr operator + (const Scalar& s, const MatExpr& e);
 CV_EXPORTS MatExpr operator + (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator + (const Mat& a, const Matx<_Tp, m, n>& b) { return a + Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator + (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) + b; }
 
 CV_EXPORTS MatExpr operator - (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator - (const Mat& a, const Scalar& s);
@@ -3568,6 +3572,10 @@ CV_EXPORTS MatExpr operator - (const Mat& m, const MatExpr& e);
 CV_EXPORTS MatExpr operator - (const MatExpr& e, const Scalar& s);
 CV_EXPORTS MatExpr operator - (const Scalar& s, const MatExpr& e);
 CV_EXPORTS MatExpr operator - (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator - (const Mat& a, const Matx<_Tp, m, n>& b) { return a - Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator - (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) - b; }
 
 CV_EXPORTS MatExpr operator - (const Mat& m);
 CV_EXPORTS MatExpr operator - (const MatExpr& e);
@@ -3580,6 +3588,10 @@ CV_EXPORTS MatExpr operator * (const Mat& m, const MatExpr& e);
 CV_EXPORTS MatExpr operator * (const MatExpr& e, double s);
 CV_EXPORTS MatExpr operator * (double s, const MatExpr& e);
 CV_EXPORTS MatExpr operator * (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator * (const Mat& a, const Matx<_Tp, m, n>& b) { return a * Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator * (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) * b; }
 
 CV_EXPORTS MatExpr operator / (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator / (const Mat& a, double s);
@@ -3589,52 +3601,100 @@ CV_EXPORTS MatExpr operator / (const Mat& m, const MatExpr& e);
 CV_EXPORTS MatExpr operator / (const MatExpr& e, double s);
 CV_EXPORTS MatExpr operator / (double s, const MatExpr& e);
 CV_EXPORTS MatExpr operator / (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator / (const Mat& a, const Matx<_Tp, m, n>& b) { return a / Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator / (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) / b; }
 
 CV_EXPORTS MatExpr operator < (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator < (const Mat& a, double s);
 CV_EXPORTS MatExpr operator < (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator < (const Mat& a, const Matx<_Tp, m, n>& b) { return a < Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator < (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) < b; }
 
 CV_EXPORTS MatExpr operator <= (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator <= (const Mat& a, double s);
 CV_EXPORTS MatExpr operator <= (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator <= (const Mat& a, const Matx<_Tp, m, n>& b) { return a <= Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator <= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) <= b; }
 
 CV_EXPORTS MatExpr operator == (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator == (const Mat& a, double s);
 CV_EXPORTS MatExpr operator == (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator == (const Mat& a, const Matx<_Tp, m, n>& b) { return a == Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator == (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) == b; }
 
 CV_EXPORTS MatExpr operator != (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator != (const Mat& a, double s);
 CV_EXPORTS MatExpr operator != (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator != (const Mat& a, const Matx<_Tp, m, n>& b) { return a != Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator != (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) != b; }
 
 CV_EXPORTS MatExpr operator >= (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator >= (const Mat& a, double s);
 CV_EXPORTS MatExpr operator >= (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator >= (const Mat& a, const Matx<_Tp, m, n>& b) { return a >= Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator >= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) >= b; }
 
 CV_EXPORTS MatExpr operator > (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator > (const Mat& a, double s);
 CV_EXPORTS MatExpr operator > (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator > (const Mat& a, const Matx<_Tp, m, n>& b) { return a > Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator > (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) > b; }
 
 CV_EXPORTS MatExpr operator & (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator & (const Mat& a, const Scalar& s);
 CV_EXPORTS MatExpr operator & (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator & (const Mat& a, const Matx<_Tp, m, n>& b) { return a & Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator & (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) & b; }
 
 CV_EXPORTS MatExpr operator | (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator | (const Mat& a, const Scalar& s);
 CV_EXPORTS MatExpr operator | (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator | (const Mat& a, const Matx<_Tp, m, n>& b) { return a | Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator | (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) | b; }
 
 CV_EXPORTS MatExpr operator ^ (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator ^ (const Mat& a, const Scalar& s);
 CV_EXPORTS MatExpr operator ^ (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator ^ (const Mat& a, const Matx<_Tp, m, n>& b) { return a ^ Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator ^ (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) ^ b; }
 
 CV_EXPORTS MatExpr operator ~(const Mat& m);
 
 CV_EXPORTS MatExpr min(const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr min(const Mat& a, double s);
 CV_EXPORTS MatExpr min(double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr min (const Mat& a, const Matx<_Tp, m, n>& b) { return min(a, Mat(b)); }
+template<typename _Tp, int m, int n> static inline
+MatExpr min (const Matx<_Tp, m, n>& a, const Mat& b) { return min(Mat(a), b); }
 
 CV_EXPORTS MatExpr max(const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr max(const Mat& a, double s);
 CV_EXPORTS MatExpr max(double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr max (const Mat& a, const Matx<_Tp, m, n>& b) { return max(a, Mat(b)); }
+template<typename _Tp, int m, int n> static inline
+MatExpr max (const Matx<_Tp, m, n>& a, const Mat& b) { return max(Mat(a), b); }
 
 /** @brief Calculates an absolute value of each matrix element.
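For context, a minimal usage sketch of what the new mixed Mat/Matx overloads above enable (illustrative only, not part of the patch; it also relies on the augmented-assignment overloads added to operations.hpp in the next file of this diff):

    #include <opencv2/core.hpp>
    using namespace cv;

    int main()
    {
        Mat A = Mat::zeros(3, 1, CV_64FC1);
        Matx31d b(1, 2, 3);   // Matx<double, 3, 1>
        Vec3d c(3, 4, 5);     // Vec derives from Matx, so it binds to the same overloads

        Mat sum  = A + b;     // element-wise: the Matx operand is wrapped as Mat(b)
        Mat mask = (A < b);   // per-element comparison, produces an 8-bit mask
        A += c;               // in-place form, from the CV_MAT_AUG_OPERATOR_TN additions below
        return 0;
    }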
diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp
index a352048a49..0e0db4072f 100644
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -258,48 +258,67 @@ Matx<_Tp, n, l> Matx<_Tp, m, n>::solve(const Matx<_Tp, m, l>& rhs, int method) c
     template<typename _Tp> CV_MAT_AUG_OPERATOR1(op, cvop, A, B) \
     template<typename _Tp> CV_MAT_AUG_OPERATOR1(op, cvop, const A, B)
 
+#define CV_MAT_AUG_OPERATOR_TN(op, cvop, A) \
+    template<typename _Tp, int m, int n> static inline A& operator op (A& a, const Matx<_Tp,m,n>& b) { cvop; return a; } \
+    template<typename _Tp, int m, int n> static inline const A& operator op (const A& a, const Matx<_Tp,m,n>& b) { cvop; return a; }
+
 CV_MAT_AUG_OPERATOR (+=, cv::add(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (+=, cv::add(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(+=, cv::add(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(+=, cv::add(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(+=, cv::add(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(+=, cv::add(a,Mat(b),a), Mat)
+CV_MAT_AUG_OPERATOR_TN(+=, cv::add(a,Mat(b),a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (-=, cv::subtract(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (-=, cv::subtract(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(-=, cv::subtract(a,Mat(b),a), Mat)
+CV_MAT_AUG_OPERATOR_TN(-=, cv::subtract(a,Mat(b),a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat, Mat)
 CV_MAT_AUG_OPERATOR_T(*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat_<_Tp>, Mat_<_Tp>)
 CV_MAT_AUG_OPERATOR (*=, a.convertTo(a, -1, b), Mat, double)
 CV_MAT_AUG_OPERATOR_T(*=, a.convertTo(a, -1, b), Mat_<_Tp>, double)
+CV_MAT_AUG_OPERATOR_TN(*=, cv::gemm(a, Mat(b), 1, Mat(), 0, a, 0), Mat)
+CV_MAT_AUG_OPERATOR_TN(*=, cv::gemm(a, Mat(b), 1, Mat(), 0, a, 0), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (/=, cv::divide(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR_T(/=, cv::divide(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(/=, cv::divide(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
 CV_MAT_AUG_OPERATOR (/=, a.convertTo((Mat&)a, -1, 1./b), Mat, double)
 CV_MAT_AUG_OPERATOR_T(/=, a.convertTo((Mat&)a, -1, 1./b), Mat_<_Tp>, double)
+CV_MAT_AUG_OPERATOR_TN(/=, cv::divide(a, Mat(b), a), Mat)
+CV_MAT_AUG_OPERATOR_TN(/=, cv::divide(a, Mat(b), a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (&=, cv::bitwise_and(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (&=, cv::bitwise_and(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(&=, cv::bitwise_and(a, Mat(b), a), Mat)
+CV_MAT_AUG_OPERATOR_TN(&=, cv::bitwise_and(a, Mat(b), a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (|=, cv::bitwise_or(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (|=, cv::bitwise_or(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(|=, cv::bitwise_or(a, Mat(b), a), Mat)
+CV_MAT_AUG_OPERATOR_TN(|=, cv::bitwise_or(a, Mat(b), a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (^=, cv::bitwise_xor(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (^=, cv::bitwise_xor(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(^=, cv::bitwise_xor(a, Mat(b), a), Mat)
+CV_MAT_AUG_OPERATOR_TN(^=, cv::bitwise_xor(a, Mat(b), a), Mat_<_Tp>)
 
+#undef CV_MAT_AUG_OPERATOR_TN
 #undef CV_MAT_AUG_OPERATOR_T
 #undef CV_MAT_AUG_OPERATOR
 #undef CV_MAT_AUG_OPERATOR1
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index e0a2c99991..aea6f229ac 100644
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@@ -69,6 +69,7 @@ protected:
     bool TestVec();
     bool TestMatxMultiplication();
     bool TestMatxElementwiseDivison();
+    bool TestMatMatxCastSum();
     bool TestSubMatAccess();
     bool TestExp();
     bool TestSVD();
@@ -885,6 +886,74 @@ bool CV_OperationsTest::TestMatxMultiplication()
     return true;
 }
 
+bool CV_OperationsTest::TestMatMatxCastSum()
+{
+    try
+    {
+        Mat ref1 = (Mat_<double>(3, 1) << 1, 2, 3);
+        Mat ref2 = (Mat_<double>(3, 1) << 3, 4, 5);
+        Mat ref3 = Mat::ones(3, 1, CV_64FC1);
+
+        Mat mat = Mat::zeros(3, 1, CV_64FC1);
+
+        Mat tst1 = ref1.clone();
+        Mat_<double> tst2 = ref2.clone();
+        Matx<double, 3, 1> tst3(1, 2, 3);
+        Vec3d tst4(3, 4, 5);
+        Scalar tst5(1, 2, 3);
+        Mat res;
+
+        res = mat + tst1;
+        CHECK_DIFF_FLT(res, ref1);
+        res = mat + tst2;
+        CHECK_DIFF_FLT(res, ref2);
+        res = mat + tst3;
+        CHECK_DIFF_FLT(res, ref1);
+        res = mat + tst4;
+        CHECK_DIFF_FLT(res, ref2);
+
+        res = mat + tst5;
+        CHECK_DIFF_FLT(res, ref3);
+        res = mat + 1;
+        CHECK_DIFF_FLT(res, ref3);
+
+        cv::add(mat, tst1, res);
+        CHECK_DIFF_FLT(res, ref1);
+        cv::add(mat, tst2, res);
+        CHECK_DIFF_FLT(res, ref2);
+        cv::add(mat, tst3, res);
+        CHECK_DIFF_FLT(res, ref1);
+        cv::add(mat, tst4, res);
+        CHECK_DIFF_FLT(res, ref2);
+
+        cv::add(mat, tst5, res);
+        CHECK_DIFF_FLT(res, ref3);
+        cv::add(mat, 1, res);
+        CHECK_DIFF_FLT(res, ref3);
+
+        res = mat.clone(); res += tst1;
+        CHECK_DIFF_FLT(res, ref1);
+        res = mat.clone(); res += tst2;
+        CHECK_DIFF_FLT(res, ref2);
+        res = mat.clone(); res += tst3;
+        CHECK_DIFF_FLT(res, ref1);
+        res = mat.clone(); res += tst4;
+        CHECK_DIFF_FLT(res, ref2);
+
+        res = mat.clone(); res += tst5;
+        CHECK_DIFF_FLT(res, ref3);
+        res = mat.clone(); res += 1;
+        CHECK_DIFF_FLT(res, ref3);
+    }
+    catch (const test_excep& e)
+    {
+        ts->printf(cvtest::TS::LOG, "%s\n", e.s.c_str());
+        ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
+        return false;
+    }
+    return true;
+}
+
 bool CV_OperationsTest::TestMatxElementwiseDivison()
 {
     try
@@ -1135,6 +1204,9 @@ void CV_OperationsTest::run( int /* start_from */)
     if (!TestMatxElementwiseDivison())
         return;
 
+    if (!TestMatMatxCastSum())
+        return;
+
     if (!TestSubMatAccess())
         return;
 
diff --git a/modules/features2d/src/draw.cpp b/modules/features2d/src/draw.cpp
index e791596476..84fb0aca39 100644
--- a/modules/features2d/src/draw.cpp
+++ b/modules/features2d/src/draw.cpp
@@ -95,9 +95,9 @@ void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, In
 
     if( !(flags & DrawMatchesFlags::DRAW_OVER_OUTIMG) )
     {
-        if( image.type() == CV_8UC3 )
+        if (image.type() == CV_8UC3 || image.type() == CV_8UC4)
         {
-            image.copyTo( outImage );
+            image.copyTo(outImage);
         }
         else if( image.type() == CV_8UC1 )
         {
@@ -105,7 +105,7 @@ void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, In
         }
         else
         {
-            CV_Error( Error::StsBadArg, "Incorrect type of input image.\n" );
+            CV_Error( Error::StsBadArg, "Incorrect type of input image: " + typeToString(image.type()) );
         }
     }
 
@@ -122,6
+122,25 @@ void drawKeypoints( InputArray image, const std::vector& keypoints, In } } +static void _prepareImage(InputArray src, const Mat& dst) +{ + CV_CheckType(src.type(), src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4, "Unsupported source image"); + CV_CheckType(dst.type(), dst.type() == CV_8UC3 || dst.type() == CV_8UC4, "Unsupported destination image"); + const int src_cn = src.channels(); + const int dst_cn = dst.channels(); + + if (src_cn == dst_cn) + src.copyTo(dst); + else if (src_cn == 1) + cvtColor(src, dst, dst_cn == 3 ? COLOR_GRAY2BGR : COLOR_GRAY2BGRA); + else if (src_cn == 3 && dst_cn == 4) + cvtColor(src, dst, COLOR_BGR2BGRA); + else if (src_cn == 4 && dst_cn == 3) + cvtColor(src, dst, COLOR_BGRA2BGR); + else + CV_Error(Error::StsInternal, ""); +} + static void _prepareImgAndDrawKeypoints( InputArray img1, const std::vector& keypoints1, InputArray img2, const std::vector& keypoints2, InputOutputArray _outImg, Mat& outImg1, Mat& outImg2, @@ -140,21 +159,16 @@ static void _prepareImgAndDrawKeypoints( InputArray img1, const std::vector(11, 11) << + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 15, 54, 15, 1, 1, 1, 1, + 1, 1, 1, 76, 217, 217, 221, 81, 1, 1, 1, + 1, 1, 100, 224, 111, 57, 115, 225, 101, 1, 1, + 1, 44, 215, 100, 1, 1, 1, 101, 214, 44, 1, + 1, 54, 212, 57, 1, 1, 1, 55, 212, 55, 1, + 1, 40, 215, 104, 1, 1, 1, 105, 215, 40, 1, + 1, 1, 102, 221, 111, 55, 115, 222, 103, 1, 1, + 1, 1, 1, 76, 218, 217, 220, 81, 1, 1, 1, + 1, 1, 1, 1, 15, 55, 15, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + Mat res; + cvtColor(ref, res, (cn == 4) ? COLOR_GRAY2BGRA : COLOR_GRAY2BGR); + return res; +} + +typedef testing::TestWithParam Features2D_drawKeypoints; +TEST_P(Features2D_drawKeypoints, Accuracy) +{ + const int cn = CV_MAT_CN(GetParam()); + Mat inpImg(11, 11, GetParam(), Scalar(1, 1, 1, 255)), outImg; + + std::vector keypoints(1, KeyPoint(5, 5, 1)); + drawKeypoints(inpImg, keypoints, outImg, Scalar::all(255)); + ASSERT_EQ(outImg.channels(), (cn == 4) ? 4 : 3); + + Mat ref_ = getReference_DrawKeypoint(cn); + EXPECT_EQ(0, cv::norm(outImg, ref_, NORM_INF)); +} +INSTANTIATE_TEST_CASE_P(/**/, Features2D_drawKeypoints, Values(CV_8UC1, CV_8UC3, CV_8UC4)); + +typedef testing::TestWithParam > Features2D_drawMatches; +TEST_P(Features2D_drawMatches, Accuracy) +{ + Mat inpImg1(11, 11, get<0>(GetParam()), Scalar(1, 1, 1, 255)); + Mat inpImg2(11, 11, get<1>(GetParam()), Scalar(2, 2, 2, 255)), outImg2, outImg; + + std::vector keypoints(1, KeyPoint(5, 5, 1)); + + // Get outImg2 using drawKeypoints assuming that it works correctly (see the test above). + drawKeypoints(inpImg2, keypoints, outImg2, Scalar::all(255)); + ASSERT_EQ(outImg2.channels(), (inpImg2.channels() == 4) ? 4 : 3); + + // Merge both references. 
+ const int cn = max(3, max(inpImg1.channels(), inpImg2.channels())); + if (cn == 4 && outImg2.channels() == 3) + cvtColor(outImg2, outImg2, COLOR_BGR2BGRA); + Mat ref_ = getReference_DrawKeypoint(cn); + Mat concattedRef; + hconcat(ref_, outImg2, concattedRef); + + std::vector matches; + drawMatches(inpImg1, keypoints, inpImg2, keypoints, matches, outImg, + Scalar::all(255), Scalar::all(255)); + ASSERT_EQ(outImg.channels(), cn); + + EXPECT_EQ(0, cv::norm(outImg, concattedRef, NORM_INF)); +} +INSTANTIATE_TEST_CASE_P(/**/, Features2D_drawMatches, Combine( + Values(CV_8UC1, CV_8UC3, CV_8UC4), + Values(CV_8UC1, CV_8UC3, CV_8UC4) +)); + +}} // namespace diff --git a/modules/highgui/src/window_w32.cpp b/modules/highgui/src/window_w32.cpp index 0a386e50f3..81f7eb6ab2 100644 --- a/modules/highgui/src/window_w32.cpp +++ b/modules/highgui/src/window_w32.cpp @@ -66,6 +66,7 @@ #include #include "opencv2/highgui.hpp" #include +#include "opencv2/core/opengl.hpp" #endif static const char* trackbar_text = @@ -1144,20 +1145,20 @@ static void icvUpdateWindowPos( CvWindow* window ) { RECT rmw, rw = icvCalcWindowRect(window ); MoveWindow(window->hwnd, rw.left, rw.top, - rw.right - rw.left + 1, rw.bottom - rw.top + 1, FALSE); + rw.right - rw.left, rw.bottom - rw.top, FALSE); GetClientRect(window->hwnd, &rw); GetWindowRect(window->frame, &rmw); // Resize the mainhWnd window in order to make the bitmap fit into the child window MoveWindow(window->frame, rmw.left, rmw.top, - rmw.right - rmw.left + size.cx - rw.right + rw.left, - rmw.bottom - rmw.top + size.cy - rw.bottom + rw.top, TRUE ); + size.cx + (rmw.right - rmw.left) - (rw.right - rw.left), + size.cy + (rmw.bottom - rmw.top) - (rw.bottom - rw.top), TRUE ); } } rect = icvCalcWindowRect(window); MoveWindow(window->hwnd, rect.left, rect.top, - rect.right - rect.left + 1, - rect.bottom - rect.top + 1, TRUE ); + rect.right - rect.left, + rect.bottom - rect.top, TRUE ); } CV_IMPL void @@ -1263,18 +1264,18 @@ CV_IMPL void cvResizeWindow(const char* name, int width, int height ) { rw = icvCalcWindowRect(window); MoveWindow(window->hwnd, rw.left, rw.top, - rw.right - rw.left + 1, rw.bottom - rw.top + 1, FALSE); + rw.right - rw.left, rw.bottom - rw.top, FALSE); GetClientRect(window->hwnd, &rw); GetWindowRect(window->frame, &rmw); // Resize the mainhWnd window in order to make the bitmap fit into the child window MoveWindow(window->frame, rmw.left, rmw.top, - rmw.right - rmw.left + width - rw.right + rw.left, - rmw.bottom - rmw.top + height - rw.bottom + rw.top, TRUE); + width + (rmw.right - rmw.left) - (rw.right - rw.left), + height + (rmw.bottom - rmw.top) - (rw.bottom - rw.top), TRUE); } rect = icvCalcWindowRect(window); MoveWindow(window->hwnd, rect.left, rect.top, - rect.right - rect.left + 1, rect.bottom - rect.top + 1, TRUE); + rect.right - rect.left, rect.bottom - rect.top, TRUE); __END__; } @@ -1421,7 +1422,20 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam ) GetClientRect( window->hwnd, &rect ); SIZE size = {0,0}; - icvGetBitmapData( window, &size, 0, 0 ); +#ifdef HAVE_OPENGL + if (window->useGl) + { + cv::ogl::Texture2D* texObj = static_cast(window->glDrawData); + size.cx = texObj->cols(); + size.cy = texObj->rows(); + } + else + { + icvGetBitmapData(window, &size, 0, 0); + } +#else + icvGetBitmapData(window, &size, 0, 0); +#endif window->on_mouse( event, pt.x*size.cx/MAX(rect.right - rect.left,1), pt.y*size.cy/MAX(rect.bottom - rect.top,1), flags, @@ -1561,8 +1575,8 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM 
wParam, LPARAM RECT rect = icvCalcWindowRect(window); pos->x = rect.left; pos->y = rect.top; - pos->cx = rect.right - rect.left + 1; - pos->cy = rect.bottom - rect.top + 1; + pos->cx = rect.right - rect.left; + pos->cy = rect.bottom - rect.top; } break; @@ -1615,7 +1629,21 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM SIZE size = {0, 0}; GetClientRect( window->hwnd, &rect ); + +#ifdef HAVE_OPENGL + if (window->useGl) + { + cv::ogl::Texture2D* texObj = static_cast(window->glDrawData); + size.cx = texObj->cols(); + size.cy = texObj->rows(); + } + else + { + icvGetBitmapData(window, &size, 0, 0); + } +#else icvGetBitmapData( window, &size, 0, 0 ); +#endif window->on_mouse( event, pt.x*size.cx/MAX(rect.right - rect.left,1), pt.y*size.cy/MAX(rect.bottom - rect.top,1), flags, diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 342421e134..550fdffdb9 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -587,6 +587,7 @@ struct RowVec_8u32s i += v_uint32::nlanes; } } + vx_cleanup(); return i; } @@ -1083,6 +1084,7 @@ struct SymmRowSmallVec_8u32s } } + vx_cleanup(); return i; } @@ -1106,6 +1108,8 @@ struct SymmColumnVec_32s8u int operator()(const uchar** _src, uchar* dst, int width) const { int _ksize = kernel.rows + kernel.cols - 1; + if( _ksize == 1 ) + return 0; int ksize2 = _ksize/2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; @@ -1115,9 +1119,8 @@ struct SymmColumnVec_32s8u v_float32 d4 = vx_setall_f32(delta); if( symmetrical ) { - if (_ksize == 1) - return 0; v_float32 f0 = vx_setall_f32(ky[0]); + v_float32 f1 = vx_setall_f32(ky[1]); for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { const int* S = src[0] + i; @@ -1125,11 +1128,17 @@ struct SymmColumnVec_32s8u v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4); v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4); - for( k = 1; k <= ksize2; k++ ) + const int* S0 = src[1] + i; + const int* S1 = src[-1] + i; + s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); + s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); + s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) + vx_load(S1 + 2 * v_int32::nlanes)), f1, s2); + s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) + vx_load(S1 + 3 * v_int32::nlanes)), f1, s3); + for( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); - const int* S0 = src[k] + i; - const int* S1 = src[-k] + i; + S0 = src[k] + i; + S1 = src[-k] + i; s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2); @@ -1142,11 +1151,15 @@ struct SymmColumnVec_32s8u const int* S = src[0] + i; v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); - for( k = 1; k <= ksize2; k++ ) + const int* S0 = src[1] + i; + const int* S1 = src[-1] + i; + s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); + s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); + for( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); - const int* S0 = src[k] + i; - const int* S1 = src[-k] + i; + S0 = src[k] + i; + S1 = 
src[-k] + i; s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); } @@ -1160,7 +1173,8 @@ struct SymmColumnVec_32s8u #endif { v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta)); - for( k = 1; k <= ksize2; k++ ) + s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0); + for( k = 2; k <= ksize2; k++ ) s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); v_int32x4 s32 = v_round(s0); v_int16x8 s16 = v_pack(s32, s32); @@ -1170,17 +1184,20 @@ struct SymmColumnVec_32s8u } else { + v_float32 f1 = vx_setall_f32(ky[1]); for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { - v_float32 s0 = d4; - v_float32 s1 = d4; - v_float32 s2 = d4; - v_float32 s3 = d4; - for ( k = 1; k <= ksize2; k++ ) + const int* S0 = src[1] + i; + const int* S1 = src[-1] + i; + v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); + v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) - vx_load(S1 + 2 * v_int32::nlanes)), f1, d4); + v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) - vx_load(S1 + 3 * v_int32::nlanes)), f1, d4); + for ( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); - const int* S0 = src[k] + i; - const int* S1 = src[-k] + i; + S0 = src[k] + i; + S1 = src[-k] + i; s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2); @@ -1190,13 +1207,15 @@ struct SymmColumnVec_32s8u } if( i <= width - v_uint16::nlanes ) { - v_float32 s0 = d4; - v_float32 s1 = d4; - for ( k = 1; k <= ksize2; k++ ) + const int* S0 = src[1] + i; + const int* S1 = src[-1] + i; + v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); + for ( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); - const int* S0 = src[k] + i; - const int* S1 = src[-k] + i; + S0 = src[k] + i; + S1 = src[-k] + i; s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); } @@ -1209,8 +1228,8 @@ struct SymmColumnVec_32s8u if( i <= width - v_int32x4::nlanes ) #endif { - v_float32x4 s0 = v_setall_f32(delta); - for (k = 1; k <= ksize2; k++) + v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta)); + for (k = 2; k <= ksize2; k++) s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); v_int32x4 s32 = v_round(s0); v_int16x8 s16 = v_pack(s32, s32); @@ -1219,6 +1238,7 @@ struct SymmColumnVec_32s8u } } + vx_cleanup(); return i; } @@ -1250,57 +1270,104 @@ struct SymmColumnSmallVec_32s16s short* dst = (short*)_dst; v_float32 df4 = vx_setall_f32(delta); - v_int32 d4 = v_round(df4); + int d = cvRound(delta); + v_int16 d8 = vx_setall_s16((short)d); if( symmetrical ) { if( ky[0] == 2 && ky[1] == 1 ) { - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { + v_int32 s0 = 
vx_load(S1 + i); + v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); + v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); + v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8); + v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2), + vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8); + } + if( i <= width - v_int16::nlanes ) { v_int32 sl = vx_load(S1 + i); v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh))); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8); + i += v_int16::nlanes; } if( i <= width - v_int32::nlanes ) { v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s)); + v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s)); i += v_int32::nlanes; } } else if( ky[0] == -2 && ky[1] == 1 ) { - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { + v_int32 s0 = vx_load(S1 + i); + v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); + v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); + v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0), + vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8); + v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2), + vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8); + } + if( i <= width - v_int16::nlanes ) { v_int32 sl = vx_load(S1 + i); v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh))); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8); + i += v_int16::nlanes; } if( i <= width - v_int32::nlanes ) { v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s)); + v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s)); i += v_int32::nlanes; } } +#if CV_NEON else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) ) { v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_int32 d4 = vx_setall_s32(d); + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); + v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)), + 
v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4)))); + } + if( i <= width - v_int16::nlanes ) + { + v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), + v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); + i += v_int16::nlanes; + } if( i <= width - v_int32::nlanes ) { v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); i += v_int32::nlanes; } } +#endif else { v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); + v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4))))); + } + if( i <= width - v_int16::nlanes ) + { + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); + i += v_int16::nlanes; + } if( i <= width - v_int32::nlanes ) { v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); @@ -1314,20 +1381,38 @@ struct SymmColumnSmallVec_32s16s { if( ky[1] < 0 ) std::swap(S0, S2); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4)); + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { + v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); + v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8); + } + if( i <= width - v_int16::nlanes ) + { + v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); + i += v_int16::nlanes; + } if( i <= width - v_int32::nlanes ) { - v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4); + v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d)); i += v_int32::nlanes; } } else { v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), 
v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); + v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)), + v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4)))); + } + if( i <= width - v_int16::nlanes ) + { + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), + v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); + i += v_int16::nlanes; + } if( i <= width - v_int32::nlanes ) { v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4))); @@ -1336,6 +1421,7 @@ struct SymmColumnSmallVec_32s16s } } + vx_cleanup(); return i; } @@ -1362,19 +1448,43 @@ struct RowVec_16s32f const float* _kx = kernel.ptr(); width *= cn; - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { + const short* src = (const short*)_src + i; + v_float32 s0 = vx_setzero_f32(); + v_float32 s1 = vx_setzero_f32(); + v_float32 s2 = vx_setzero_f32(); + v_float32 s3 = vx_setzero_f32(); + for( k = 0; k < _ksize; k++, src += cn ) + { + v_float32 f = vx_setall_f32(_kx[k]); + v_int16 xl = vx_load(src); + v_int16 xh = vx_load(src + v_int16::nlanes); + s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0); + s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1); + s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2); + s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - v_int16::nlanes ) { const short* src = (const short*)_src + i; v_float32 s0 = vx_setzero_f32(); v_float32 s1 = vx_setzero_f32(); for( k = 0; k < _ksize; k++, src += cn ) { + v_float32 f = vx_setall_f32(_kx[k]); v_int16 x = vx_load(src); - s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0); - s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1); + s0 = v_muladd(v_cvt_f32(v_expand_low(x)), f, s0); + s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1); } v_store(dst + i, s0); v_store(dst + i + v_float32::nlanes, s1); + i += v_int16::nlanes; } if( i <= width - v_float32::nlanes ) { @@ -1385,6 +1495,7 @@ struct RowVec_16s32f v_store(dst + i, s0); i += v_float32::nlanes; } + vx_cleanup(); return i; } @@ -1406,6 +1517,8 @@ struct SymmColumnVec_32f16s int operator()(const uchar** _src, uchar* _dst, int width) const { int _ksize = kernel.rows + kernel.cols - 1; + if( _ksize == 1 ) + return 0; int ksize2 = _ksize / 2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; @@ -1416,25 +1529,49 @@ struct SymmColumnVec_32f16s v_float32 d4 = vx_setall_f32(delta); if( symmetrical ) { - if (_ksize == 1) - return 0; v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_float32 k1 = vx_setall_f32(ky[1]); + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - for( k = 1; k <= ksize2; k++ ) + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); + s0 = 
v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); + s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); + s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2); + s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3); + for( k = 2; k <= ksize2; k++ ) { - v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + } + if( i <= width - v_int16::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); + s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); + s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); + for( k = 2; k <= ksize2; k++ ) + { + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + } + v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + i += v_int16::nlanes; } if( i <= width - v_float32::nlanes ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - for( k = 1; k <= ksize2; k++ ) + s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); + for( k = 2; k <= ksize2; k++ ) s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); v_pack_store(dst + i, v_round(s0)); i += v_float32::nlanes; @@ -1442,28 +1579,48 @@ struct SymmColumnVec_32f16s } else { - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_float32 k1 = vx_setall_f32(ky[1]); + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) { - v_float32 s0 = d4; - v_float32 s1 = d4; - for( k = 1; k <= ksize2; k++ ) + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); + v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); + for( k = 2; k <= ksize2; k++ ) { - v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s2 = v_muladd(vx_load(src[k] + i + 
2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + } + if( i <= width - v_int16::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + for( k = 2; k <= ksize2; k++ ) + { + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + } + v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + i += v_int16::nlanes; } if( i <= width - v_float32::nlanes ) { - v_float32 s0 = d4; - for( k = 1; k <= ksize2; k++ ) + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + for( k = 2; k <= ksize2; k++ ) s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); v_pack_store(dst + i, v_round(s0)); i += v_float32::nlanes; } } + vx_cleanup(); return i; } @@ -1505,6 +1662,7 @@ struct RowVec_32f } #endif int _ksize = kernel.rows + kernel.cols - 1; + CV_DbgAssert(_ksize > 0); const float* src0 = (const float*)_src; float* dst = (float*)_dst; const float* _kx = kernel.ptr(); @@ -1516,14 +1674,55 @@ struct RowVec_32f if (haveAVX2) return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); #endif - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + v_float32 k0 = vx_setall_f32(_kx[0]); + for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) { const float* src = src0 + i; - v_float32 s0 = vx_setzero_f32(); - for( k = 0; k < _ksize; k++, src += cn ) + v_float32 s0 = vx_load(src) * k0; + v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; + v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0; + v_float32 s3 = vx_load(src + 3*v_float32::nlanes) * k0; + src += cn; + for( k = 1; k < _ksize; k++, src += cn ) + { + v_float32 k1 = vx_setall_f32(_kx[k]); + s0 = v_muladd(vx_load(src), k1, s0); + s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); + s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2); + s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - 2*v_float32::nlanes ) + { + const float* src = src0 + i; + v_float32 s0 = vx_load(src) * k0; + v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; + src += cn; + for( k = 1; k < _ksize; k++, src += cn ) + { + v_float32 k1 = vx_setall_f32(_kx[k]); + s0 = v_muladd(vx_load(src), k1, s0); + s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + i += 2*v_float32::nlanes; + } + if( i <= width - v_float32::nlanes ) + { + const float* src = src0 + i; + v_float32 s0 = vx_load(src) * k0; + src += cn; + for( k = 1; k < _ksize; k++, src += cn ) s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0); v_store(dst + i, s0); + i += v_float32::nlanes; } + vx_cleanup(); return i; } @@ -1584,6 +1783,8 @@ struct SymmRowSmallVec_32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { int i = 0, _ksize = kernel.rows + kernel.cols - 1; + if( 
_ksize == 1 ) + return 0; float* dst = (float*)_dst; const float* src = (const float*)_src + (_ksize/2)*cn; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; @@ -1592,15 +1793,28 @@ struct SymmRowSmallVec_32f if( symmetrical ) { - if( _ksize == 1 ) - return 0; if( _ksize == 3 ) { if( fabs(kx[0]) == 2 && kx[1] == 1 ) { +#if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(kx[0]); for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn))); +#else + if( kx[0] > 0 ) + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + { + v_float32 x = vx_load(src); + v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x)); + } + else + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + { + v_float32 x = vx_load(src); + v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x)); + } +#endif } else { @@ -1613,9 +1827,17 @@ struct SymmRowSmallVec_32f { if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) { +#if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(-2); for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn))); +#else + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + { + v_float32 x = vx_load(src); + v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x)); + } +#endif } else { @@ -1647,6 +1869,7 @@ struct SymmRowSmallVec_32f } } + vx_cleanup(); return i; } @@ -1688,12 +1911,47 @@ struct SymmColumnVec_32f return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); #endif const v_float32 d4 = vx_setall_f32(delta); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + const v_float32 k0 = vx_setall_f32(ky[0]); + for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) { - v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4); + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); + for( k = 1; k <= ksize2; k++ ) + { + v_float32 k1 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - 2*v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); + for( k = 1; k <= ksize2; k++ ) + { + v_float32 k1 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + i += 2*v_float32::nlanes; 
+ } + if( i <= width - v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); for( k = 1; k <= ksize2; k++ ) s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); v_store(dst + i, s0); + i += v_float32::nlanes; } } else @@ -1702,16 +1960,53 @@ struct SymmColumnVec_32f if (haveAVX2) return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); #endif + CV_DbgAssert(ksize2 > 0); const v_float32 d4 = vx_setall_f32(delta); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + const v_float32 k1 = vx_setall_f32(ky[1]); + for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) { - v_float32 s0 = d4; - for( k = 1; k <= ksize2; k++ ) + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); + v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); + for( k = 2; k <= ksize2; k++ ) + { + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - 2*v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + for( k = 2; k <= ksize2; k++ ) + { + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + i += 2*v_float32::nlanes; + } + if( i <= width - v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + for( k = 2; k <= ksize2; k++ ) s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); v_store(dst + i, s0); + i += v_float32::nlanes; } } + vx_cleanup(); return i; } @@ -1748,9 +2043,24 @@ struct SymmColumnSmallVec_32f { if( fabs(ky[0]) == 2 && ky[1] == 1 ) { +#if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(ky[0]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4)); +#else + if(ky[0] > 0) + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + { + v_float32 x = vx_load(S1 + i); + v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x)); + } + else + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + { + v_float32 x = vx_load(S1 + i); + v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x)); + } +#endif } else { @@ -1776,6 +2086,7 @@ struct 
SymmColumnSmallVec_32f } } + vx_cleanup(); return i; } @@ -1804,19 +2115,27 @@ struct FilterVec_8u int operator()(const uchar** src, uchar* dst, int width) const { + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; int i = 0, k, nz = _nz; v_float32 d4 = vx_setall_f32(delta); + v_float32 f0 = vx_setall_f32(kf[0]); for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { - v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4; - for( k = 0; k < nz; k++ ) + v_uint16 xl, xh; + v_expand(vx_load(src[0] + i), xl, xh); + v_uint32 x0, x1, x2, x3; + v_expand(xl, x0, x1); + v_expand(xh, x2, x3); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); + v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f0, d4); + v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f0, d4); + for( k = 1; k < nz; k++ ) { v_float32 f = vx_setall_f32(kf[k]); - v_uint16 xl, xh; v_expand(vx_load(src[k] + i), xl, xh); - v_uint32 x0, x1, x2, x3; v_expand(xl, x0, x1); v_expand(xh, x2, x3); s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); @@ -1828,11 +2147,13 @@ struct FilterVec_8u } if( i <= width - v_uint16::nlanes ) { - v_float32 s0 = d4, s1 = d4; - for( k = 0; k < nz; k++ ) + v_uint32 x0, x1; + v_expand(vx_load_expand(src[0] + i), x0, x1); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); + for( k = 1; k < nz; k++ ) { v_float32 f = vx_setall_f32(kf[k]); - v_uint32 x0, x1; v_expand(vx_load_expand(src[k] + i), x0, x1); s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); @@ -1846,8 +2167,8 @@ struct FilterVec_8u if( i <= width - v_int32x4::nlanes ) #endif { - v_float32x4 s0 = v_setall_f32(delta); - for( k = 0; k < nz; k++ ) + v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta)); + for( k = 1; k < nz; k++ ) s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0); v_int32x4 s32 = v_round(s0); v_int16x8 s16 = v_pack(s32, s32); @@ -1855,6 +2176,7 @@ struct FilterVec_8u i += v_int32x4::nlanes; } + vx_cleanup(); return i; } @@ -1879,18 +2201,24 @@ struct FilterVec_8u16s int operator()(const uchar** src, uchar* _dst, int width) const { + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; short* dst = (short*)_dst; int i = 0, k, nz = _nz; v_float32 d4 = vx_setall_f32(delta); + v_float32 f0 = vx_setall_f32(kf[0]); for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { - v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4; - for( k = 0; k < nz; k++ ) + v_uint16 xl, xh; + v_expand(vx_load(src[0] + i), xl, xh); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f0, d4); + v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f0, d4); + v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f0, d4); + for( k = 1; k < nz; k++ ) { v_float32 f = vx_setall_f32(kf[k]); - v_uint16 xl, xh; v_expand(vx_load(src[k] + i), xl, xh); s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0); s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1); @@ -1902,11 +2230,13 @@ struct FilterVec_8u16s } if( i <= width - v_uint16::nlanes ) { - v_float32 s0 
= d4, s1 = d4; - for( k = 0; k < nz; k++ ) + v_uint16 x = vx_load_expand(src[0] + i); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f0, d4); + for( k = 1; k < nz; k++ ) { v_float32 f = vx_setall_f32(kf[k]); - v_uint16 x = vx_load_expand(src[k] + i); + x = vx_load_expand(src[k] + i); s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0); s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1); } @@ -1915,13 +2245,14 @@ struct FilterVec_8u16s } if( i <= width - v_int32::nlanes ) { - v_float32 s0 = d4; - for( k = 0; k < nz; k++ ) + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4); + for( k = 1; k < nz; k++ ) s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); v_pack_store(dst + i, v_round(s0)); i += v_int32::nlanes; } + vx_cleanup(); return i; } @@ -1950,14 +2281,50 @@ struct FilterVec_32f int i = 0, k, nz = _nz; v_float32 d4 = vx_setall_f32(delta); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + v_float32 f0 = vx_setall_f32(kf[0]); + for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) { - v_float32 s0 = d4; - for( k = 0; k < nz; k++ ) + v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4); + for( k = 1; k < nz; k++ ) + { + v_float32 f1 = vx_setall_f32(kf[k]); + s0 = v_muladd(vx_load(src[k] + i), f1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - 2*v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); + for( k = 1; k < nz; k++ ) + { + v_float32 f1 = vx_setall_f32(kf[k]); + s0 = v_muladd(vx_load(src[k] + i), f1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + i += 2*v_float32::nlanes; + } + if( i <= width - v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); + for( k = 1; k < nz; k++ ) s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0); v_store(dst + i, s0); + i += v_float32::nlanes; } + vx_cleanup(); return i; } diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp index 155c62f342..7749e4a59a 100644 --- a/modules/imgproc/test/test_filter.cpp +++ b/modules/imgproc/test/test_filter.cpp @@ -403,9 +403,9 @@ void CV_FilterTest::get_test_array_types_and_sizes( int test_case_idx, { CV_FilterBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); RNG& rng = ts->get_rng(); - int depth = cvtest::randInt(rng)%3; + int depth = cvtest::randInt(rng)%4; int cn = CV_MAT_CN(types[INPUT][0]); - depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : CV_32F; + depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? 
diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp
index 155c62f342..7749e4a59a 100644
--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -403,9 +403,9 @@ void CV_FilterTest::get_test_array_types_and_sizes( int test_case_idx,
 {
     CV_FilterBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
     RNG& rng = ts->get_rng();
-    int depth = cvtest::randInt(rng)%3;
+    int depth = cvtest::randInt(rng)%4;
     int cn = CV_MAT_CN(types[INPUT][0]);
-    depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : CV_32F;
+    depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_16S : CV_32F;
     types[INPUT][0] = types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_MAKETYPE(depth, cn);
 }
@@ -457,10 +457,11 @@ void CV_DerivBaseTest::get_test_array_types_and_sizes( int test_case_idx,
 {
     RNG& rng = ts->get_rng();
     CV_FilterBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
-    int depth = cvtest::randInt(rng) % 2;
-    depth = depth == 0 ? CV_8U : CV_32F;
+    int depth = cvtest::randInt(rng) % 4;
+    depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_16S : CV_32F;
     types[INPUT][0] = CV_MAKETYPE(depth,1);
-    types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_MAKETYPE(depth==CV_8U?CV_16S:CV_32F,1);
+    int sameDepth = cvtest::randInt(rng) % 2;
+    types[OUTPUT][0] = types[REF_OUTPUT][0] = sameDepth ? depth : CV_MAKETYPE(depth==CV_8U?CV_16S:CV_32F,1);
     _aperture_size = (cvtest::randInt(rng)%5)*2 - 1;
     sizes[INPUT][1] = aperture_size = cvSize(_aperture_size, _aperture_size);
 }
@@ -2211,4 +2212,27 @@ TEST(Imgproc_MedianBlur, hires_regression_13409)
     ASSERT_EQ(0.0, cvtest::norm(dst_hires(Rect(516, 516, 1016, 1016)), dst_ref(Rect(4, 4, 1016, 1016)), NORM_INF));
 }
+
+TEST(Imgproc_Sobel, s16_regression_13506)
+{
+    Mat src = (Mat_<uchar>(8, 16) << 127, 138, 130, 102, 118, 97, 76, 84, 124, 90, 146, 63, 130, 87, 212, 85,
+                                     164, 3, 51, 124, 151, 89, 154, 117, 36, 88, 116, 117, 180, 112, 147, 124,
+                                     63, 50, 115, 103, 83, 148, 106, 79, 213, 106, 135, 53, 79, 106, 122, 112,
+                                     218, 107, 81, 126, 78, 138, 85, 142, 151, 108, 104, 158, 155, 81, 112, 178,
+                                     184, 96, 187, 148, 150, 112, 138, 162, 222, 146, 128, 49, 124, 46, 165, 104,
+                                     119, 164, 77, 144, 186, 98, 106, 148, 155, 157, 160, 151, 156, 149, 43, 122,
+                                     106, 155, 120, 132, 159, 115, 126, 188, 44, 79, 164, 201, 153, 97, 139, 133,
+                                     133, 98, 111, 165, 66, 106, 131, 85, 176, 156, 67, 108, 142, 91, 74, 137);
+    Mat ref = (Mat_<short>(8, 16) << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                     -1020, -796, -489, -469, -247, 317, 760, 1429, 1983, 1384, 254, -459, -899, -1197, -1172, -1058,
+                                     2552, 2340, 1617, 591, 9, 96, 722, 1985, 2746, 1916, 676, 9, -635, -1115, -779, -380,
+                                     3546, 3349, 2838, 2206, 1388, 669, 938, 1880, 2252, 1785, 1083, 606, 180, -298, -464, -418,
+                                     816, 966, 1255, 1652, 1619, 924, 535, 288, 5, 601, 1581, 1870, 1520, 625, -627, -1260,
+                                     -782, -610, -395, -267, -122, -42, -317, -1378, -2293, -1451, 596, 1870, 1679, 763, -69, -394,
+                                     -882, -681, -463, -818, -1167, -732, -463, -1042, -1604, -1592, -1047, -334, -104, -117, 229, 512,
+                                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+    Mat dst;
+    Sobel(src, dst, CV_16S, 0, 1, 5);
+    ASSERT_EQ(0.0, cvtest::norm(dst, ref, NORM_INF));
+}
 }} // namespace
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index cabe515932..5e96822e81 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -351,6 +351,7 @@ void FeaturesMatcher::operator ()(const std::vector<ImageFeatures> &features, std::vector<MatchesInfo> &pairwise_matches,
             if (features[i].keypoints.size() > 0 && features[j].keypoints.size() > 0 && mask_(i, j))
                 near_pairs.push_back(std::make_pair(i, j));
+    pairwise_matches.clear(); // clear history values
     pairwise_matches.resize(num_images * num_images);
     MatchPairsBody body(*this, features, pairwise_matches, near_pairs);
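
The one-line stitching fix above is about std::vector reuse semantics: resize() keeps whatever elements already exist, so when the same FeaturesMatcher is invoked a second time the stale MatchesInfo entries from the first run would survive into the new pass; clearing first guarantees freshly constructed entries. A standalone illustration of just that behaviour, using plain ints instead of MatchesInfo:

    #include <cassert>
    #include <vector>

    int main()
    {
        std::vector<int> v(4, 7);   // pretend these are results of a previous run
        v.resize(4);                // same size: resize() alone keeps the stale values
        assert(v[0] == 7);
        v.clear();                  // clear first ...
        v.resize(4);                // ... and every element is value-initialized again
        assert(v[0] == 0);
        return 0;
    }
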
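
With the 16-bit filter paths in place, Sobel can be asked for a CV_16S result on an 8-bit input directly, which is what the s16_regression_13506 test added earlier pins down against hand-computed values. A small usage sketch (the file name is a placeholder, not taken from this patch):

    #include <opencv2/imgproc.hpp>
    #include <opencv2/imgcodecs.hpp>
    using namespace cv;

    int main()
    {
        Mat gray = imread("input.png", IMREAD_GRAYSCALE);  // 8-bit, single channel
        Mat dy;
        // A 16-bit signed destination keeps the negative responses that CV_8U would clip.
        Sobel(gray, dy, CV_16S, 0, 1, 5);
        return 0;
    }
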
diff --git a/samples/dnn/openpose.cpp b/samples/dnn/openpose.cpp
index b4934d76e4..48e2dc0475 100644
--- a/samples/dnn/openpose.cpp
+++ b/samples/dnn/openpose.cpp
@@ -57,21 +57,26 @@ const int POSE_PAIRS[3][20][2] = {
 int main(int argc, char **argv)
 {
     CommandLineParser parser(argc, argv,
-        "{ h help | false | print this help message }"
-        "{ p proto | | (required) model configuration, e.g. hand/pose.prototxt }"
-        "{ m model | | (required) model weights, e.g. hand/pose_iter_102000.caffemodel }"
-        "{ i image | | (required) path to image file (containing a single person, or hand) }"
-        "{ width | 368 | Preprocess input image by resizing to a specific width. }"
-        "{ height | 368 | Preprocess input image by resizing to a specific height. }"
-        "{ t threshold | 0.1 | threshold or confidence value for the heatmap }"
+        "{ h help | false | print this help message }"
+        "{ p proto | | (required) model configuration, e.g. hand/pose.prototxt }"
+        "{ m model | | (required) model weights, e.g. hand/pose_iter_102000.caffemodel }"
+        "{ i image | | (required) path to image file (containing a single person, or hand) }"
+        "{ d dataset | | specify which dataset the model was trained on (COCO, MPI, HAND) }"
+        "{ width | 368 | Preprocess input image by resizing to a specific width. }"
+        "{ height | 368 | Preprocess input image by resizing to a specific height. }"
+        "{ t threshold | 0.1 | threshold or confidence value for the heatmap }"
+        "{ s scale | 0.003922 | scale for blob }"
     );
     String modelTxt = samples::findFile(parser.get<String>("proto"));
     String modelBin = samples::findFile(parser.get<String>("model"));
     String imageFile = samples::findFile(parser.get<String>("image"));
+    String dataset = parser.get<String>("dataset");
     int W_in = parser.get<int>("width");
     int H_in = parser.get<int>("height");
     float thresh = parser.get<float>("threshold");
+    float scale = parser.get<float>("scale");
+
     if (parser.get<bool>("help") || modelTxt.empty() || modelBin.empty() || imageFile.empty())
     {
         cout << "A sample app to demonstrate human or hand pose detection with a pretrained OpenPose dnn." << endl;
@@ -79,9 +84,18 @@ int main(int argc, char **argv)
         return 0;
     }
-    // read the network model
-    Net net = readNetFromCaffe(modelTxt, modelBin);
+    int midx, npairs, nparts;
+    if (!dataset.compare("COCO")) { midx = 0; npairs = 17; nparts = 18; }
+    else if (!dataset.compare("MPI")) { midx = 1; npairs = 14; nparts = 16; }
+    else if (!dataset.compare("HAND")) { midx = 2; npairs = 20; nparts = 22; }
+    else
+    {
+        std::cerr << "Can't interpret dataset parameter: " << dataset << std::endl;
+        exit(-1);
+    }
+    // read the network model
+    Net net = readNet(modelBin, modelTxt);
     // and the image
     Mat img = imread(imageFile);
     if (img.empty())
@@ -91,39 +105,14 @@ int main(int argc, char **argv)
     }
     // send it through the network
-    Mat inputBlob = blobFromImage(img, 1.0 / 255, Size(W_in, H_in), Scalar(0, 0, 0), false, false);
+    Mat inputBlob = blobFromImage(img, scale, Size(W_in, H_in), Scalar(0, 0, 0), false, false);
     net.setInput(inputBlob);
     Mat result = net.forward();
     // the result is an array of "heatmaps", the probability of a body part being in location x,y
-    int midx, npairs;
-    int nparts = result.size[1];
     int H = result.size[2];
     int W = result.size[3];
-    // find out, which model we have
-    if (nparts == 19)
-    { // COCO body
-        midx = 0;
-        npairs = 17;
-        nparts = 18; // skip background
-    }
-    else if (nparts == 16)
-    { // MPI body
-        midx = 1;
-        npairs = 14;
-    }
-    else if (nparts == 22)
-    { // hand
-        midx = 2;
-        npairs = 20;
-    }
-    else
-    {
-        cerr << "there should be 19 parts for the COCO model, 16 for MPI, or 22 for the hand one, but this model has " << nparts << " parts." << endl;
-        return (0);
-    }
-
     // find the position of the body parts
     vector<Point> points(22);
     for (int n=0; n
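
For reference, the reworked sample now needs the dataset named explicitly on the command line (e.g. --dataset=COCO together with --proto, --model and --image) instead of guessing it from the network's output shape, and the blob scale became the -s parameter. A stripped-down sketch of the same dnn flow outside the sample, with placeholder file names and the sample's default scale:

    #include <opencv2/dnn.hpp>
    #include <opencv2/imgcodecs.hpp>
    using namespace cv;

    int main()
    {
        // readNet() infers the framework (here Caffe) from the file extensions.
        dnn::Net net = dnn::readNet("pose_iter_102000.caffemodel", "pose.prototxt");
        Mat img = imread("person.jpg");                       // placeholder input image
        const float scale = 0.003922f;                        // ~1/255, the -s default
        Mat blob = dnn::blobFromImage(img, scale, Size(368, 368), Scalar(0, 0, 0), false, false);
        net.setInput(blob);
        Mat heatmaps = net.forward();                         // one heat map per body part
        return 0;
    }
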