diff --git a/cmake/OpenCVDetectApacheAnt.cmake b/cmake/OpenCVDetectApacheAnt.cmake
index 989cb4075b..2f8243838e 100644
--- a/cmake/OpenCVDetectApacheAnt.cmake
+++ b/cmake/OpenCVDetectApacheAnt.cmake
@@ -1,3 +1,6 @@
+set(OPENCV_JAVA_SOURCE_VERSION "" CACHE STRING "Java source version (javac Ant target)")
+set(OPENCV_JAVA_TARGET_VERSION "" CACHE STRING "Java target version (javac Ant target)")
+
 file(TO_CMAKE_PATH "$ENV{ANT_DIR}" ANT_DIR_ENV_PATH)
 file(TO_CMAKE_PATH "$ENV{ProgramFiles}" ProgramFiles_ENV_PATH)
 
diff --git a/cmake/OpenCVDetectInferenceEngine.cmake b/cmake/OpenCVDetectInferenceEngine.cmake
index d41f9243b4..8c17497a25 100644
--- a/cmake/OpenCVDetectInferenceEngine.cmake
+++ b/cmake/OpenCVDetectInferenceEngine.cmake
@@ -78,9 +78,9 @@ endif()
 
 if(INF_ENGINE_TARGET)
   if(NOT INF_ENGINE_RELEASE)
-    message(WARNING "InferenceEngine version have not been set, 2018R4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
+    message(WARNING "InferenceEngine version have not been set, 2018R5 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
   endif()
-  set(INF_ENGINE_RELEASE "2018040000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2018R2.0.2 -> 2018020002)")
+  set(INF_ENGINE_RELEASE "2018050000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2018R2.0.2 -> 2018020002)")
   set_target_properties(${INF_ENGINE_TARGET} PROPERTIES
     INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
   )
diff --git a/doc/tutorials/dnn/dnn_android/dnn_android.markdown b/doc/tutorials/dnn/dnn_android/dnn_android.markdown
index 5dd6e2d664..58bda98e89 100644
--- a/doc/tutorials/dnn/dnn_android/dnn_android.markdown
+++ b/doc/tutorials/dnn/dnn_android/dnn_android.markdown
@@ -12,7 +12,7 @@ Tutorial was written for the following versions of corresponding software:
 
 - Download and install Android Studio from https://developer.android.com/studio.
 
-- Get the latest pre-built OpenCV for Android release from https://github.com/opencv/opencv/releases and unpack it (for example, `opencv-3.4.4-android-sdk.zip`).
+- Get the latest pre-built OpenCV for Android release from https://github.com/opencv/opencv/releases and unpack it (for example, `opencv-3.4.5-android-sdk.zip`).
 
 - Download MobileNet object detection model from https://github.com/chuanqi305/MobileNet-SSD. We need a configuration file `MobileNetSSD_deploy.prototxt` and weights `MobileNetSSD_deploy.caffemodel`.
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 19de221005..c3797d67c1 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1278,6 +1278,16 @@ OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15)
 OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
 OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
 
+inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
+{
+    return a * b + c;
+}
+
+inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
+{
+    return v_fma(a, b, c);
+}
+
 inline v_float32x8 v_invsqrt(const v_float32x8& x)
 {
     v_float32x8 half = x * v256_setall_f32(0.5);
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 1e2adef89a..9371d12c97 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -750,6 +750,7 @@ CV__DNN_INLINE_NS_BEGIN
      *  @brief Reads a network model stored in <a href="http://torch.ch">Torch7</a> framework's format.
      *  @param model    path to the file, dumped from Torch by using torch.save() function.
      *  @param isBinary specifies whether the network was serialized in ascii mode or binary.
+     *  @param evaluate specifies testing phase of network. If true, it's similar to evaluate() method in Torch.
      *  @returns Net object.
      *
      *  @note Ascii mode of Torch serializer is more preferable, because binary mode extensively use `long` type of C language,
@@ -771,7 +772,7 @@ CV__DNN_INLINE_NS_BEGIN
      *
      * Also some equivalents of these classes from cunn, cudnn, and fbcunn may be successfully imported.
      */
-     CV_EXPORTS_W Net readNetFromTorch(const String &model, bool isBinary = true);
+     CV_EXPORTS_W Net readNetFromTorch(const String &model, bool isBinary = true, bool evaluate = true);
 
      /**
       * @brief Read deep learning network represented in one of the supported formats.
diff --git a/modules/dnn/include/opencv2/dnn/version.hpp b/modules/dnn/include/opencv2/dnn/version.hpp
index 7d0f125ed2..b41efdae1b 100644
--- a/modules/dnn/include/opencv2/dnn/version.hpp
+++ b/modules/dnn/include/opencv2/dnn/version.hpp
@@ -6,7 +6,7 @@
 #define OPENCV_DNN_VERSION_HPP
 
 /// Use with major OpenCV version only.
-#define OPENCV_DNN_API_VERSION 20181205
+#define OPENCV_DNN_API_VERSION 20181221
 
 #if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_INLINE_NS
 #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index a32cbecee9..93dd5f05f6 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -116,9 +116,15 @@ public:
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
+#ifdef HAVE_INF_ENGINE
         if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+            return !zeroDev && eps <= 1e-7f;
+#else
             return !zeroDev && (preferableTarget == DNN_TARGET_CPU || eps <= 1e-7f);
+#endif
         else
+#endif  // HAVE_INF_ENGINE
             return backendId == DNN_BACKEND_OPENCV;
     }
 
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index b690a1f756..b62366c51b 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -420,31 +420,30 @@ void ONNXImporter::populateNet(Net dstNet)
         }
         else if (layer_type == "Sub")
         {
-            Mat blob = (-1.0f) * getBlob(node_proto, constBlobs, 1);
-            blob = blob.reshape(1, 1);
+            Mat blob = getBlob(node_proto, constBlobs, 1);
             if (blob.total() == 1) {
                 layerParams.type = "Power";
-                layerParams.set("shift", blob.at<float>(0));
+                layerParams.set("shift", -blob.at<float>(0));
             }
             else {
                 layerParams.type = "Scale";
                 layerParams.set("has_bias", true);
-                layerParams.blobs.push_back(blob);
+                layerParams.blobs.push_back(-1.0f * blob.reshape(1, 1));
             }
         }
         else if (layer_type == "Div")
         {
             Mat blob = getBlob(node_proto, constBlobs, 1);
             CV_Assert_N(blob.type() == CV_32F, blob.total());
-            divide(1.0, blob, blob);
             if (blob.total() == 1)
             {
-                layerParams.set("scale", blob.at<float>(0));
+                layerParams.set("scale", 1.0f / blob.at<float>(0));
                 layerParams.type = "Power";
             }
             else
             {
                 layerParams.type = "Scale";
+                divide(1.0, blob, blob);
                 layerParams.blobs.push_back(blob);
                 layerParams.set("bias_term", false);
             }
diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp
index 69d4944d51..118e525d97 100644
--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@@ -26,10 +26,11 @@
 #define INF_ENGINE_RELEASE_2018R2 2018020000
 #define INF_ENGINE_RELEASE_2018R3 2018030000
 #define INF_ENGINE_RELEASE_2018R4 2018040000
+#define INF_ENGINE_RELEASE_2018R5 2018050000
 
 #ifndef INF_ENGINE_RELEASE
-#warning("IE version have not been provided via command-line. Using 2018R4 by default")
-#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2018R4
+#warning("IE version have not been provided via command-line. Using 2018R5 by default")
+#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2018R5
 #endif
 
 #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))
diff --git a/modules/dnn/src/torch/torch_importer.cpp b/modules/dnn/src/torch/torch_importer.cpp
index 6c19093805..b9af28feee 100644
--- a/modules/dnn/src/torch/torch_importer.cpp
+++ b/modules/dnn/src/torch/torch_importer.cpp
@@ -129,13 +129,15 @@ struct TorchImporter
     Module *rootModule;
     Module *curModule;
     int moduleCounter;
+    bool testPhase;
 
-    TorchImporter(String filename, bool isBinary)
+    TorchImporter(String filename, bool isBinary, bool evaluate)
     {
         CV_TRACE_FUNCTION();
 
         rootModule = curModule = NULL;
         moduleCounter = 0;
+        testPhase = evaluate;
 
         file = cv::Ptr<THFile>(THDiskFile_new(filename, "r", 0), THFile_free);
         CV_Assert(file && THFile_isOpened(file));
@@ -680,7 +682,8 @@ struct TorchImporter
                     layerParams.blobs.push_back(tensorParams["bias"].second);
                 }
 
-                if (nnName == "InstanceNormalization")
+                bool trainPhase = scalarParams.get<bool>("train", false);
+                if (nnName == "InstanceNormalization" || (trainPhase && !testPhase))
                 {
                     cv::Ptr<Module> mvnModule(new Module(nnName));
                     mvnModule->apiType = "MVN";
@@ -1243,18 +1246,18 @@ struct TorchImporter
 
 Mat readTorchBlob(const String &filename, bool isBinary)
 {
-    TorchImporter importer(filename, isBinary);
+    TorchImporter importer(filename, isBinary, true);
     importer.readObject();
     CV_Assert(importer.tensors.size() == 1);
 
     return importer.tensors.begin()->second;
 }
 
-Net readNetFromTorch(const String &model, bool isBinary)
+Net readNetFromTorch(const String &model, bool isBinary, bool evaluate)
 {
     CV_TRACE_FUNCTION();
 
-    TorchImporter importer(model, isBinary);
+    TorchImporter importer(model, isBinary, evaluate);
     Net net;
     importer.populateNet(net);
     return net;
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index cf901a2aed..3a64d6485b 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -226,9 +226,9 @@ TEST_P(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
 TEST_P(DNNTestNetwork, OpenFace)
 {
 #if defined(INF_ENGINE_RELEASE)
-#if INF_ENGINE_RELEASE < 2018030000
+#if (INF_ENGINE_RELEASE < 2018030000 || INF_ENGINE_RELEASE == 2018050000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-        throw SkipTestException("Test is enabled starts from OpenVINO 2018R3");
+        throw SkipTestException("");
 #elif INF_ENGINE_RELEASE < 2018040000
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
         throw SkipTestException("Test is enabled starts from OpenVINO 2018R4");
diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp
index db718eb2c5..0d9e508f84 100644
--- a/modules/dnn/test/test_ie_models.cpp
+++ b/modules/dnn/test/test_ie_models.cpp
@@ -190,6 +190,14 @@ TEST_P(DNNTestOpenVINO, models)
                                          modelName == "landmarks-regression-retail-0009" ||
                                           modelName == "semantic-segmentation-adas-0001")))
         throw SkipTestException("");
+#elif INF_ENGINE_RELEASE == 2018050000
+    if (modelName == "single-image-super-resolution-0063" ||
+        modelName == "single-image-super-resolution-1011" ||
+        modelName == "single-image-super-resolution-1021" ||
+        (target == DNN_TARGET_OPENCL_FP16 && modelName == "face-reidentification-retail-0095") ||
+        (target == DNN_TARGET_MYRIAD && (modelName == "license-plate-recognition-barrier-0001" ||
+                                         modelName == "semantic-segmentation-adas-0001")))
+        throw SkipTestException("");
 #endif
 #endif
 
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 62e625f03c..4ccefd28a9 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -295,6 +295,10 @@ TEST_P(Test_Caffe_layers, Eltwise)
 {
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
         throw SkipTestException("");
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL)
+        throw SkipTestException("Test is disabled for OpenVINO 2018R5");
+#endif
     testLayerUsingCaffeModels("layer_eltwise");
 }
 
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 36e7450892..deccbfb0eb 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -164,6 +164,8 @@ TEST_P(Test_ONNX_layers, MultyInputs)
 
 TEST_P(Test_ONNX_layers, DynamicReshape)
 {
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
     testONNXModels("dynamic_reshape");
 }
 
@@ -249,6 +251,10 @@ TEST_P(Test_ONNX_nets, VGG16)
     else if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) {
         lInf = 1.2e-4;
     }
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
+        l1 = 0.131;
+#endif
     testONNXModels("vgg16", pb, l1, lInf);
 }
 
@@ -327,7 +333,7 @@ TEST_P(Test_ONNX_nets, CNN_MNIST)
 TEST_P(Test_ONNX_nets, MobileNet_v2)
 {
     // output range: [-166; 317]
-    const double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.38 : 7e-5;
+    const double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.4 : 7e-5;
     const double lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 2.87 : 5e-4;
     testONNXModels("mobilenetv2", pb, l1, lInf);
 }
@@ -350,7 +356,17 @@ TEST_P(Test_ONNX_nets, LResNet100E_IR)
 
 TEST_P(Test_ONNX_nets, Emotion_ferplus)
 {
-    testONNXModels("emotion_ferplus", pb);
+    double l1 = default_l1;
+    double lInf = default_lInf;
+    // Output values are in range [-2.01109, 2.11111]
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+        l1 = 0.007;
+    else if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
+    {
+        l1 = 0.021;
+        lInf = 0.034;
+    }
+    testONNXModels("emotion_ferplus", pb, l1, lInf);
 }
 
 TEST_P(Test_ONNX_nets, Inception_v2)
@@ -371,6 +387,10 @@ TEST_P(Test_ONNX_nets, DenseNet121)
 
 TEST_P(Test_ONNX_nets, Inception_v1)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+        throw SkipTestException("");
+#endif
     testONNXModels("inception_v1", pb);
 }
 
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 7c53f8a93f..ce4997cd4e 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -241,6 +241,10 @@ TEST_P(Test_TensorFlow_layers, unfused_flatten)
 
 TEST_P(Test_TensorFlow_layers, leaky_relu)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL)
+        throw SkipTestException("");
+#endif
     runTensorFlowNet("leaky_relu_order1");
     runTensorFlowNet("leaky_relu_order2");
     runTensorFlowNet("leaky_relu_order3");
@@ -383,6 +387,10 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN)
 
 TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("Unstable test case");
+#endif
     checkBackend();
     std::string proto = findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pbtxt", false);
     std::string model = findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pb", false);
@@ -560,6 +568,10 @@ TEST_P(Test_TensorFlow_layers, slice)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE &&
         (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
         throw SkipTestException("");
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+        throw SkipTestException("");
+#endif
     runTensorFlowNet("slice_4d");
 }
 
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index 7fa0dc47ef..c63cf26e45 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -73,7 +73,7 @@ class Test_Torch_layers : public DNNTestLayer
 {
 public:
     void runTorchNet(const String& prefix, String outLayerName = "",
-                     bool check2ndBlob = false, bool isBinary = false,
+                     bool check2ndBlob = false, bool isBinary = false, bool evaluate = true,
                      double l1 = 0.0, double lInf = 0.0)
     {
         String suffix = (isBinary) ? ".dat" : ".txt";
@@ -84,7 +84,7 @@ public:
 
         checkBackend(backend, target, &inp, &outRef);
 
-        Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary);
+        Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary, evaluate);
         ASSERT_FALSE(net.empty());
 
         net.setPreferableBackend(backend);
@@ -114,7 +114,7 @@ TEST_P(Test_Torch_layers, run_convolution)
     // Output reference values are in range [23.4018, 72.0181]
     double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.08 : default_l1;
     double lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.42 : default_lInf;
-    runTorchNet("net_conv", "", false, true, l1, lInf);
+    runTorchNet("net_conv", "", false, true, true, l1, lInf);
 }
 
 TEST_P(Test_Torch_layers, run_pool_max)
@@ -147,7 +147,7 @@ TEST_P(Test_Torch_layers, run_reshape)
 TEST_P(Test_Torch_layers, run_reshape_single_sample)
 {
     // Reference output values in range [14.4586, 18.4492].
-    runTorchNet("net_reshape_single_sample", "", false, false,
+    runTorchNet("net_reshape_single_sample", "", false, false, true,
                 (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.0073 : default_l1,
                 (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.025 : default_lInf);
 }
@@ -166,7 +166,7 @@ TEST_P(Test_Torch_layers, run_concat)
 
 TEST_P(Test_Torch_layers, run_depth_concat)
 {
-    runTorchNet("net_depth_concat", "", false, true, 0.0,
+    runTorchNet("net_depth_concat", "", false, true, true, 0.0,
                 target == DNN_TARGET_OPENCL_FP16 ? 0.021 : 0.0);
 }
 
@@ -182,6 +182,7 @@ TEST_P(Test_Torch_layers, run_deconv)
 TEST_P(Test_Torch_layers, run_batch_norm)
 {
     runTorchNet("net_batch_norm", "", false, true);
+    runTorchNet("net_batch_norm_train", "", false, true, false);
 }
 
 TEST_P(Test_Torch_layers, net_prelu)
@@ -216,7 +217,7 @@ TEST_P(Test_Torch_layers, net_conv_gemm_lrn)
 {
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
         throw SkipTestException("");
-    runTorchNet("net_conv_gemm_lrn", "", false, true,
+    runTorchNet("net_conv_gemm_lrn", "", false, true, true,
                 target == DNN_TARGET_OPENCL_FP16 ? 0.046 : 0.0,
                 target == DNN_TARGET_OPENCL_FP16 ? 0.023 : 0.0);
 }
@@ -266,9 +267,9 @@ class Test_Torch_nets : public DNNTestLayer {};
 
 TEST_P(Test_Torch_nets, OpenFace_accuracy)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE < 2018030000
+#if defined(INF_ENGINE_RELEASE) && (INF_ENGINE_RELEASE < 2018030000 || INF_ENGINE_RELEASE == 2018050000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-        throw SkipTestException("Test is enabled starts from OpenVINO 2018R3");
+        throw SkipTestException("");
 #endif
     checkBackend();
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
@@ -389,6 +390,10 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
 //   -model models/instance_norm/feathers.t7
 TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+        throw SkipTestException("");
+#endif
     checkBackend();
     std::string models[] = {"dnn/fast_neural_style_eccv16_starry_night.t7",
                             "dnn/fast_neural_style_instance_norm_feathers.t7"};
diff --git a/modules/features2d/src/keypoint.cpp b/modules/features2d/src/keypoint.cpp
index 8b116cbbab..219634e5b4 100644
--- a/modules/features2d/src/keypoint.cpp
+++ b/modules/features2d/src/keypoint.cpp
@@ -77,7 +77,7 @@ void KeyPointsFilter::retainBest(std::vector<KeyPoint>& keypoints, int n_points)
             return;
         }
         //first use nth element to partition the keypoints into the best and worst.
-        std::nth_element(keypoints.begin(), keypoints.begin() + n_points, keypoints.end(), KeypointResponseGreater());
+        std::nth_element(keypoints.begin(), keypoints.begin() + n_points - 1, keypoints.end(), KeypointResponseGreater());
         //this is the boundary response, and in the case of FAST may be ambiguous
         float ambiguous_response = keypoints[n_points - 1].response;
         //use std::partition to grab all of the keypoints with the boundary response.
diff --git a/modules/features2d/test/test_utils.cpp b/modules/features2d/test/test_utils.cpp
new file mode 100644
index 0000000000..78febd353a
--- /dev/null
+++ b/modules/features2d/test/test_utils.cpp
@@ -0,0 +1,38 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+TEST(Features2D_KeypointUtils, retainBest_issue_12594)
+{
+    const size_t N = 9;
+
+    // Construct 4-way tie for 3rd highest - correct answer for "3 best" is 6
+    const float no_problem[] = { 5.0f, 4.0f, 1.0f, 2.0f, 0.0f, 3.0f, 3.0f, 3.0f, 3.0f };
+
+    // Same set, different order that exposes partial sort property of std::nth_element
+    // Note: the problem case may depend on your particular implementation of STL
+    const float problem[] = { 3.0f, 3.0f, 3.0f, 3.0f, 4.0f, 5.0f, 0.0f, 1.0f, 2.0f };
+
+    const size_t NBEST  = 3u;
+    const size_t ANSWER = 6u;
+
+    std::vector<cv::KeyPoint> sorted_cv(N);
+    std::vector<cv::KeyPoint> unsorted_cv(N);
+
+    for (size_t i = 0; i < N; ++i)
+    {
+        sorted_cv[i].response   = no_problem[i];
+        unsorted_cv[i].response = problem[i];
+    }
+
+    cv::KeyPointsFilter::retainBest(sorted_cv, NBEST);
+    cv::KeyPointsFilter::retainBest(unsorted_cv, NBEST);
+
+    EXPECT_EQ(ANSWER, sorted_cv.size());
+    EXPECT_EQ(ANSWER, unsorted_cv.size());
+}
+
+}} // namespace
diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 6da66a1f0d..342421e134 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -44,6 +44,7 @@
 #include "opencv2/core/opencl/ocl_defs.hpp"
 #include "opencl_kernels_imgproc.hpp"
 #include "hal_replacement.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include "filter.hpp"
 
 
@@ -477,7 +478,7 @@ struct FilterNoVec
 };
 
 
-#if CV_SSE2
+#if CV_SIMD
 
 ///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////
 
@@ -502,9 +503,6 @@ struct RowVec_8u32s
 
     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
         int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
         int* dst = (int*)_dst;
         const int* _kx = kernel.ptr<int>();
@@ -512,52 +510,81 @@ struct RowVec_8u32s
 
         if( smallValues )
         {
-            __m128i z = _mm_setzero_si128();
-            for( ; i <= width - 8; i += 8 )
+            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
             {
                 const uchar* src = _src + i;
-                __m128i s0 = z, s1 = z;
-
-                for( k = 0; k < _ksize; k++, src += cn )
+                v_int32 s0 = vx_setzero_s32();
+                v_int32 s1 = vx_setzero_s32();
+                v_int32 s2 = vx_setzero_s32();
+                v_int32 s3 = vx_setzero_s32();
+                k = 0;
+                for (; k <= _ksize - 2; k += 2, src += 2 * cn)
                 {
-                    __m128i f = _mm_cvtsi32_si128(_kx[k]);
-                    f = _mm_shuffle_epi32(f, 0);
-
-                    __m128i x0 = _mm_loadl_epi64((const __m128i*)src);
-                    x0 = _mm_unpacklo_epi8(x0, z);
-
-                    __m128i x1 = _mm_unpackhi_epi16(x0, z);
-                    x0 = _mm_unpacklo_epi16(x0, z);
-
-                    x0 = _mm_madd_epi16(x0, f);
-                    x1 = _mm_madd_epi16(x1, f);
-
-                    s0 = _mm_add_epi32(s0, x0);
-                    s1 = _mm_add_epi32(s1, x1);
+                    v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+                    v_uint8 x0, x1;
+                    v_zip(vx_load(src), vx_load(src + cn), x0, x1);
+                    s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
+                    s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
+                    s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
+                    s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
                 }
-
-                _mm_store_si128((__m128i*)(dst + i), s0);
-                _mm_store_si128((__m128i*)(dst + i + 4), s1);
+                if (k < _ksize)
+                {
+                    v_int32 f = vx_setall_s32(_kx[k]);
+                    v_uint16 x0, x1;
+                    v_expand(vx_load(src), x0, x1);
+                    s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
+                    s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
+                    s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
+                    s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
+                }
+                v_store(dst + i, s0);
+                v_store(dst + i + v_int32::nlanes, s1);
+                v_store(dst + i + 2*v_int32::nlanes, s2);
+                v_store(dst + i + 3*v_int32::nlanes, s3);
             }
-
-            if( i <= width - 4 )
+            if( i <= width - v_uint16::nlanes )
             {
                 const uchar* src = _src + i;
-                __m128i s0 = z;
-
-                for( k = 0; k < _ksize; k++, src += cn )
+                v_int32 s0 = vx_setzero_s32();
+                v_int32 s1 = vx_setzero_s32();
+                k = 0;
+                for( ; k <= _ksize - 2; k += 2, src += 2*cn )
                 {
-                    __m128i f = _mm_cvtsi32_si128(_kx[k]);
-                    f = _mm_shuffle_epi32(f, 0);
-
-                    __m128i x0 = _mm_cvtsi32_si128(*(const int*)src);
-                    x0 = _mm_unpacklo_epi8(x0, z);
-                    x0 = _mm_unpacklo_epi16(x0, z);
-                    x0 = _mm_madd_epi16(x0, f);
-                    s0 = _mm_add_epi32(s0, x0);
+                    v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+                    v_uint16 x0, x1;
+                    v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1);
+                    s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
+                    s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
                 }
-                _mm_store_si128((__m128i*)(dst + i), s0);
-                i += 4;
+                if( k < _ksize )
+                {
+                    v_int32 f = vx_setall_s32(_kx[k]);
+                    v_uint32 x0, x1;
+                    v_expand(vx_load_expand(src), x0, x1);
+                    s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
+                    s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
+                }
+                v_store(dst + i, s0);
+                v_store(dst + i + v_int32::nlanes, s1);
+                i += v_uint16::nlanes;
+            }
+            if( i <= width - v_uint32::nlanes )
+            {
+                v_int32 d = vx_setzero_s32();
+                k = 0;
+                const uchar* src = _src + i;
+                for (; k <= _ksize - 2; k += 2, src += 2*cn)
+                {
+                    v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+                    v_uint32 x0, x1;
+                    v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1);
+                    d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f));
+                }
+                if (k < _ksize)
+                    d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k])));
+                v_store(dst + i, d);
+                i += v_uint32::nlanes;
             }
         }
         return i;
@@ -590,9 +617,6 @@ struct SymmRowSmallVec_8u32s
 
     int operator()(const uchar* src, uchar* _dst, int width, int cn) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
         int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
         int* dst = (int*)_dst;
         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
@@ -603,7 +627,6 @@ struct SymmRowSmallVec_8u32s
         src += (_ksize/2)*cn;
         width *= cn;
 
-        __m128i z = _mm_setzero_si128();
         if( symmetrical )
         {
             if( _ksize == 1 )
@@ -611,143 +634,276 @@ struct SymmRowSmallVec_8u32s
             if( _ksize == 3 )
             {
                 if( kx[0] == 2 && kx[1] == 1 )
-                    for( ; i <= width - 16; i += 16, src += 16 )
+                {
+                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
                     {
-                        __m128i x0, x1, x2, y0, y1, y2;
-                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
-                        x1 = _mm_loadu_si128((__m128i*)src);
-                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
-                        y0 = _mm_unpackhi_epi8(x0, z);
-                        x0 = _mm_unpacklo_epi8(x0, z);
-                        y1 = _mm_unpackhi_epi8(x1, z);
-                        x1 = _mm_unpacklo_epi8(x1, z);
-                        y2 = _mm_unpackhi_epi8(x2, z);
-                        x2 = _mm_unpacklo_epi8(x2, z);
-                        x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
-                        y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
-                        _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
-                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
-                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
-                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
+                        v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                        v_expand(vx_load(src - cn), x0l, x0h);
+                        v_expand(vx_load(src), x1l, x1h);
+                        v_expand(vx_load(src + cn), x2l, x2h);
+                        x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l));
+                        x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h));
+                        v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l)));
+                        v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l)));
+                        v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h)));
+                        v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h)));
                     }
+                    if( i <= width - v_uint16::nlanes )
+                    {
+                        v_uint16 x = vx_load_expand(src);
+                        x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)));
+                        v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x)));
+                        v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x)));
+                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                    }
+                    if( i <= width - v_uint32::nlanes )
+                    {
+                        v_uint32 x = vx_load_expand_q(src);
+                        x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn);
+                        v_store(dst + i, v_reinterpret_as_s32(x));
+                        i += v_uint32::nlanes;
+                    }
+                }
                 else if( kx[0] == -2 && kx[1] == 1 )
-                    for( ; i <= width - 16; i += 16, src += 16 )
+                {
+                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
                     {
-                        __m128i x0, x1, x2, y0, y1, y2;
-                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
-                        x1 = _mm_loadu_si128((__m128i*)src);
-                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
-                        y0 = _mm_unpackhi_epi8(x0, z);
-                        x0 = _mm_unpacklo_epi8(x0, z);
-                        y1 = _mm_unpackhi_epi8(x1, z);
-                        x1 = _mm_unpacklo_epi8(x1, z);
-                        y2 = _mm_unpackhi_epi8(x2, z);
-                        x2 = _mm_unpacklo_epi8(x2, z);
-                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
-                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
-                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
-                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
-                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
-                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+                        v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                        v_expand(vx_load(src - cn), x0l, x0h);
+                        v_expand(vx_load(src), x1l, x1h);
+                        v_expand(vx_load(src + cn), x2l, x2h);
+                        x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
+                        x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
+                        v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
+                        v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
+                        v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
+                        v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
                     }
+                    if( i <= width - v_uint16::nlanes )
+                    {
+                        v_uint16 x = vx_load_expand(src);
+                        x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x));
+                        v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
+                        v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
+                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                    }
+                    if( i <= width - v_uint32::nlanes )
+                    {
+                        v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
+                        x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x);
+                        v_store(dst + i, x);
+                        i += v_uint32::nlanes;
+                    }
+                }
                 else
                 {
-                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
-                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
-                    k1 = _mm_packs_epi32(k1, k1);
-
-                    for( ; i <= width - 8; i += 8, src += 8 )
+                    v_int16 k0 = vx_setall_s16((short)kx[0]);
+                    v_int16 k1 = vx_setall_s16((short)kx[1]);
+                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
                     {
-                        __m128i x0 = _mm_loadl_epi64((__m128i*)(src - cn));
-                        __m128i x1 = _mm_loadl_epi64((__m128i*)src);
-                        __m128i x2 = _mm_loadl_epi64((__m128i*)(src + cn));
+                        v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                        v_expand(vx_load(src - cn), x0l, x0h);
+                        v_expand(vx_load(src), x1l, x1h);
+                        v_expand(vx_load(src + cn), x2l, x2h);
 
-                        x0 = _mm_unpacklo_epi8(x0, z);
-                        x1 = _mm_unpacklo_epi8(x1, z);
-                        x2 = _mm_unpacklo_epi8(x2, z);
-                        __m128i x3 = _mm_unpacklo_epi16(x0, x2);
-                        __m128i x4 = _mm_unpackhi_epi16(x0, x2);
-                        __m128i x5 = _mm_unpacklo_epi16(x1, z);
-                        __m128i x6 = _mm_unpackhi_epi16(x1, z);
-                        x3 = _mm_madd_epi16(x3, k1);
-                        x4 = _mm_madd_epi16(x4, k1);
-                        x5 = _mm_madd_epi16(x5, k0);
-                        x6 = _mm_madd_epi16(x6, k0);
-                        x3 = _mm_add_epi32(x3, x5);
-                        x4 = _mm_add_epi32(x4, x6);
+                        v_int32 dl, dh;
+                        v_int16 x0, x1;
+                        v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh);
+                        v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1);
+                        dl += v_dotprod(x0, k1);
+                        dh += v_dotprod(x1, k1);
+                        v_store(dst + i, dl);
+                        v_store(dst + i + v_int32::nlanes, dh);
 
-                        _mm_store_si128((__m128i*)(dst + i), x3);
-                        _mm_store_si128((__m128i*)(dst + i + 4), x4);
+                        v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh);
+                        v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1);
+                        dl += v_dotprod(x0, k1);
+                        dh += v_dotprod(x1, k1);
+                        v_store(dst + i + 2*v_int32::nlanes, dl);
+                        v_store(dst + i + 3*v_int32::nlanes, dh);
+                    }
+                    if ( i <= width - v_uint16::nlanes )
+                    {
+                        v_int32 dl, dh;
+                        v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh);
+                        v_int16 x0, x1;
+                        v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1);
+                        dl += v_dotprod(x0, k1);
+                        dh += v_dotprod(x1, k1);
+                        v_store(dst + i, dl);
+                        v_store(dst + i + v_int32::nlanes, dh);
+                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                    }
+                    if ( i <= width - v_uint32::nlanes )
+                    {
+                        v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1])));
+                        i += v_uint32::nlanes;
                     }
                 }
             }
             else if( _ksize == 5 )
             {
                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                    for( ; i <= width - 16; i += 16, src += 16 )
+                {
+                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
                     {
-                        __m128i x0, x1, x2, y0, y1, y2;
-                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
-                        x1 = _mm_loadu_si128((__m128i*)src);
-                        x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
-                        y0 = _mm_unpackhi_epi8(x0, z);
-                        x0 = _mm_unpacklo_epi8(x0, z);
-                        y1 = _mm_unpackhi_epi8(x1, z);
-                        x1 = _mm_unpacklo_epi8(x1, z);
-                        y2 = _mm_unpackhi_epi8(x2, z);
-                        x2 = _mm_unpacklo_epi8(x2, z);
-                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
-                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
-                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
-                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
-                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
-                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+                        v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                        v_expand(vx_load(src - 2*cn), x0l, x0h);
+                        v_expand(vx_load(src), x1l, x1h);
+                        v_expand(vx_load(src + 2*cn), x2l, x2h);
+                        x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
+                        x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
+                        v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
+                        v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
+                        v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
+                        v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
                     }
+                    if( i <= width - v_uint16::nlanes )
+                    {
+                        v_uint16 x = vx_load_expand(src);
+                        x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), v_add_wrap(x, x));
+                        v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
+                        v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
+                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                    }
+                    if( i <= width - v_uint32::nlanes )
+                    {
+                        v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
+                        x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x);
+                        v_store(dst + i, x);
+                        i += v_uint32::nlanes;
+                    }
+                }
                 else
                 {
-                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
-                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
-                            k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
-                    k1 = _mm_packs_epi32(k1, k1);
-                    k2 = _mm_packs_epi32(k2, k2);
-
-                    for( ; i <= width - 8; i += 8, src += 8 )
+                    v_int16 k0 = vx_setall_s16((short)(kx[0]));
+                    v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
+                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
                     {
-                        __m128i x0 = _mm_loadl_epi64((__m128i*)src);
+                        v_int32 x0, x1, x2, x3;
+                        v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
+                        v_int16 xl, xh;
 
-                        x0 = _mm_unpacklo_epi8(x0, z);
-                        __m128i x1 = _mm_unpacklo_epi16(x0, z);
-                        __m128i x2 = _mm_unpackhi_epi16(x0, z);
-                        x1 = _mm_madd_epi16(x1, k0);
-                        x2 = _mm_madd_epi16(x2, k0);
+                        v_expand(vx_load(src), x0l, x0h);
+                        v_mul_expand(v_reinterpret_as_s16(x0l), k0, x0, x1);
+                        v_mul_expand(v_reinterpret_as_s16(x0h), k0, x2, x3);
 
-                        __m128i x3 = _mm_loadl_epi64((__m128i*)(src - cn));
-                        __m128i x4 = _mm_loadl_epi64((__m128i*)(src + cn));
+                        v_expand(vx_load(src - cn), x0l, x0h);
+                        v_expand(vx_load(src + cn), x1l, x1h);
+                        v_expand(vx_load(src - 2*cn), x2l, x2h);
+                        v_expand(vx_load(src + 2*cn), x3l, x3h);
+                        v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh);
+                        x0 += v_dotprod(xl, k12);
+                        x1 += v_dotprod(xh, k12);
+                        v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh);
+                        x2 += v_dotprod(xl, k12);
+                        x3 += v_dotprod(xh, k12);
 
-                        x3 = _mm_unpacklo_epi8(x3, z);
-                        x4 = _mm_unpacklo_epi8(x4, z);
-                        __m128i x5 = _mm_unpacklo_epi16(x3, x4);
-                        __m128i x6 = _mm_unpackhi_epi16(x3, x4);
-                        x5 = _mm_madd_epi16(x5, k1);
-                        x6 = _mm_madd_epi16(x6, k1);
-                        x1 = _mm_add_epi32(x1, x5);
-                        x2 = _mm_add_epi32(x2, x6);
-
-                        x3 = _mm_loadl_epi64((__m128i*)(src - cn*2));
-                        x4 = _mm_loadl_epi64((__m128i*)(src + cn*2));
-
-                        x3 = _mm_unpacklo_epi8(x3, z);
-                        x4 = _mm_unpacklo_epi8(x4, z);
-                        x5 = _mm_unpacklo_epi16(x3, x4);
-                        x6 = _mm_unpackhi_epi16(x3, x4);
-                        x5 = _mm_madd_epi16(x5, k2);
-                        x6 = _mm_madd_epi16(x6, k2);
-                        x1 = _mm_add_epi32(x1, x5);
-                        x2 = _mm_add_epi32(x2, x6);
-
-                        _mm_store_si128((__m128i*)(dst + i), x1);
-                        _mm_store_si128((__m128i*)(dst + i + 4), x2);
+                        v_store(dst + i, x0);
+                        v_store(dst + i + v_int32::nlanes, x1);
+                        v_store(dst + i + 2*v_int32::nlanes, x2);
+                        v_store(dst + i + 3*v_int32::nlanes, x3);
                     }
+                    if( i <= width - v_uint16::nlanes )
+                    {
+                        v_int32 x1, x2;
+                        v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2);
+
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh);
+                        x1 += v_dotprod(xl, k12);
+                        x2 += v_dotprod(xh, k12);
+
+                        v_store(dst + i, x1);
+                        v_store(dst + i + v_int32::nlanes, x2);
+                        i += v_uint16::nlanes, src += v_uint16::nlanes;
+                    }
+                    if( i <= width - v_uint32::nlanes )
+                    {
+                        v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]),
+                                         v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]),
+                                                  v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2]))));
+                        i += v_uint32::nlanes;
+                    }
+                }
+            }
+            else
+            {
+                v_int16 k0 = vx_setall_s16((short)(kx[0]));
+                for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                {
+                    v_uint8 v_src = vx_load(src);
+                    v_int32 s0, s1, s2, s3;
+                    v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1);
+                    v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3);
+                    for (k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn)
+                    {
+                        v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+
+                        v_uint8 v_src0 = vx_load(src - j);
+                        v_uint8 v_src1 = vx_load(src - j - cn);
+                        v_uint8 v_src2 = vx_load(src + j);
+                        v_uint8 v_src3 = vx_load(src + j + cn);
+
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh);
+                        s0 += v_dotprod(xl, k12);
+                        s1 += v_dotprod(xh, k12);
+                        v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh);
+                        s2 += v_dotprod(xl, k12);
+                        s3 += v_dotprod(xh, k12);
+                    }
+                    if( k < _ksize / 2 + 1 )
+                    {
+                        v_int16 k1 = vx_setall_s16((short)(kx[k]));
+
+                        v_uint8 v_src0 = vx_load(src - j);
+                        v_uint8 v_src1 = vx_load(src + j);
+
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh);
+                        s0 += v_dotprod(xl, k1);
+                        s1 += v_dotprod(xh, k1);
+                        v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh);
+                        s2 += v_dotprod(xl, k1);
+                        s3 += v_dotprod(xh, k1);
+                    }
+                    v_store(dst + i, s0);
+                    v_store(dst + i + v_int32::nlanes, s1);
+                    v_store(dst + i + 2*v_int32::nlanes, s2);
+                    v_store(dst + i + 3*v_int32::nlanes, s3);
+                }
+                if( i <= width - v_uint16::nlanes )
+                {
+                    v_int32 s0, s1;
+                    v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
+                    for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn)
+                    {
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + vx_load_expand(src + j + cn)), xl, xh);
+                        v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16)));
+                        s0 += v_dotprod(xl, k12);
+                        s1 += v_dotprod(xh, k12);
+                    }
+                    if ( k < _ksize / 2 + 1 )
+                    {
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh);
+                        v_int16 k1 = vx_setall_s16((short)(kx[k]));
+                        s0 += v_dotprod(xl, k1);
+                        s1 += v_dotprod(xh, k1);
+                    }
+                    v_store(dst + i, s0);
+                    v_store(dst + i + v_int32::nlanes, s1);
+                    i += v_uint16::nlanes; src += v_uint16::nlanes;
+                }
+                if( i <= width - v_uint32::nlanes )
+                {
+                    v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+                    for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn )
+                        s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0);
+                    v_store(dst + i, s0);
+                    i += v_uint32::nlanes;
                 }
             }
         }
@@ -756,111 +912,175 @@ struct SymmRowSmallVec_8u32s
             if( _ksize == 3 )
             {
                 if( kx[0] == 0 && kx[1] == 1 )
-                    for( ; i <= width - 16; i += 16, src += 16 )
+                {
+                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
                     {
-                        __m128i x0, x1, y0;
-                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
-                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
-                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
-                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
-                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
-                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
-                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
-                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+                        v_uint16 x0l, x0h, x2l, x2h;
+                        v_expand(vx_load(src - cn), x0l, x0h);
+                        v_expand(vx_load(src + cn), x2l, x2h);
+                        v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l));
+                        v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h));
+                        v_store(dst + i, v_expand_low(dl));
+                        v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
+                        v_store(dst + i + 2*v_int32::nlanes, v_expand_low(dh));
+                        v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh));
                     }
+                    if( i <= width - v_uint16::nlanes )
+                    {
+                        v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn)));
+                        v_store(dst + i, v_expand_low(dl));
+                        v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
+                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                    }
+                    if (i <= width - v_uint32::nlanes)
+                    {
+                        v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)));
+                        i += v_uint32::nlanes;
+                    }
+                }
                 else
                 {
-                    __m128i k0 = _mm_set_epi32(-kx[1], kx[1], -kx[1], kx[1]);
-                    k0 = _mm_packs_epi32(k0, k0);
-
-                    for( ; i <= width - 16; i += 16, src += 16 )
+                    v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16)));
+                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
                     {
-                        __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn));
-                        __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn));
-
-                        __m128i x2 = _mm_unpacklo_epi8(x0, z);
-                        __m128i x3 = _mm_unpacklo_epi8(x1, z);
-                        __m128i x4 = _mm_unpackhi_epi8(x0, z);
-                        __m128i x5 = _mm_unpackhi_epi8(x1, z);
-                        __m128i x6 = _mm_unpacklo_epi16(x2, x3);
-                        __m128i x7 = _mm_unpacklo_epi16(x4, x5);
-                        __m128i x8 = _mm_unpackhi_epi16(x2, x3);
-                        __m128i x9 = _mm_unpackhi_epi16(x4, x5);
-                        x6 = _mm_madd_epi16(x6, k0);
-                        x7 = _mm_madd_epi16(x7, k0);
-                        x8 = _mm_madd_epi16(x8, k0);
-                        x9 = _mm_madd_epi16(x9, k0);
-
-                        _mm_store_si128((__m128i*)(dst + i), x6);
-                        _mm_store_si128((__m128i*)(dst + i + 4), x8);
-                        _mm_store_si128((__m128i*)(dst + i + 8), x7);
-                        _mm_store_si128((__m128i*)(dst + i + 12), x9);
+                        v_uint16 x0l, x0h, x2l, x2h;
+                        v_expand(vx_load(src - cn), x0l, x0h);
+                        v_expand(vx_load(src + cn), x2l, x2h);
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh);
+                        v_store(dst + i, v_dotprod(xl, k0));
+                        v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
+                        v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh);
+                        v_store(dst + i + 2*v_int32::nlanes, v_dotprod(xl, k0));
+                        v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0));
+                    }
+                    if( i <= width - v_uint16::nlanes )
+                    {
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh);
+                        v_store(dst + i, v_dotprod(xl, k0));
+                        v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
+                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                    }
+                    if (i <= width - v_uint32::nlanes)
+                    {
+                        v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1])));
+                        i += v_uint32::nlanes;
                     }
                 }
             }
             else if( _ksize == 5 )
             {
-                __m128i k0 = _mm_loadl_epi64((__m128i*)(kx + 1));
-                k0 = _mm_unpacklo_epi64(k0, k0);
-                k0 = _mm_packs_epi32(k0, k0);
-
-                for( ; i <= width - 16; i += 16, src += 16 )
+                v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
+                for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
                 {
-                    __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn));
-                    __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn));
-
-                    __m128i x2 = _mm_unpackhi_epi8(x0, z);
-                    __m128i x3 = _mm_unpackhi_epi8(x1, z);
-                    x0 = _mm_unpacklo_epi8(x0, z);
-                    x1 = _mm_unpacklo_epi8(x1, z);
-                    __m128i x5 = _mm_sub_epi16(x2, x3);
-                    __m128i x4 = _mm_sub_epi16(x0, x1);
-
-                    __m128i x6 = _mm_loadu_si128((__m128i*)(src + cn * 2));
-                    __m128i x7 = _mm_loadu_si128((__m128i*)(src - cn * 2));
-
-                    __m128i x8 = _mm_unpackhi_epi8(x6, z);
-                    __m128i x9 = _mm_unpackhi_epi8(x7, z);
-                    x6 = _mm_unpacklo_epi8(x6, z);
-                    x7 = _mm_unpacklo_epi8(x7, z);
-                    __m128i x11 = _mm_sub_epi16(x8, x9);
-                    __m128i x10 = _mm_sub_epi16(x6, x7);
-
-                    __m128i x13 = _mm_unpackhi_epi16(x5, x11);
-                    __m128i x12 = _mm_unpackhi_epi16(x4, x10);
-                    x5 = _mm_unpacklo_epi16(x5, x11);
-                    x4 = _mm_unpacklo_epi16(x4, x10);
-                    x5 = _mm_madd_epi16(x5, k0);
-                    x4 = _mm_madd_epi16(x4, k0);
-                    x13 = _mm_madd_epi16(x13, k0);
-                    x12 = _mm_madd_epi16(x12, k0);
-
-                    _mm_store_si128((__m128i*)(dst + i), x4);
-                    _mm_store_si128((__m128i*)(dst + i + 4), x12);
-                    _mm_store_si128((__m128i*)(dst + i + 8), x5);
-                    _mm_store_si128((__m128i*)(dst + i + 12), x13);
+                    v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
+                    v_expand(vx_load(src - cn), x0l, x0h);
+                    v_expand(vx_load(src - 2*cn), x1l, x1h);
+                    v_expand(vx_load(src + cn), x2l, x2h);
+                    v_expand(vx_load(src + 2*cn), x3l, x3h);
+                    v_int16 x0, x1;
+                    v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1);
+                    v_store(dst + i, v_dotprod(x0, k0));
+                    v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
+                    v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1);
+                    v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0));
+                    v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0));
+                }
+                if( i <= width - v_uint16::nlanes )
+                {
+                    v_int16 x0, x1;
+                    v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))),
+                          v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1);
+                    v_store(dst + i, v_dotprod(x0, k0));
+                    v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
+                    i += v_uint16::nlanes; src += v_uint16::nlanes;
+                }
+                if( i <= width - v_uint32::nlanes )
+                {
+                    v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]),
+                                             (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2])));
+                    i += v_uint32::nlanes;
                 }
             }
-        }
-
-        src -= (_ksize/2)*cn;
-        kx -= _ksize/2;
-        for( ; i <= width - 4; i += 4, src += 4 )
-        {
-            __m128i s0 = z;
-
-            for( k = j = 0; k < _ksize; k++, j += cn )
+            else
             {
-                __m128i f = _mm_cvtsi32_si128(kx[k]);
-                f = _mm_shuffle_epi32(f, 0);
+                v_int16 k0 = vx_setall_s16((short)(kx[0]));
+                for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                {
+                    v_uint8 v_src = vx_load(src);
+                    v_int32 s0, s1, s2, s3;
+                    v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1);
+                    v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3);
+                    for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn )
+                    {
+                        v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
 
-                __m128i x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
-                x0 = _mm_unpacklo_epi8(x0, z);
-                x0 = _mm_unpacklo_epi16(x0, z);
-                x0 = _mm_madd_epi16(x0, f);
-                s0 = _mm_add_epi32(s0, x0);
+                        v_uint8 v_src0 = vx_load(src - j);
+                        v_uint8 v_src1 = vx_load(src - j - cn);
+                        v_uint8 v_src2 = vx_load(src + j);
+                        v_uint8 v_src3 = vx_load(src + j + cn);
+
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh);
+                        s0 += v_dotprod(xl, k12);
+                        s1 += v_dotprod(xh, k12);
+                        v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh);
+                        s2 += v_dotprod(xl, k12);
+                        s3 += v_dotprod(xh, k12);
+                    }
+                    if( k < _ksize / 2 + 1 )
+                    {
+                        v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16)));
+                        v_uint8 v_src0 = vx_load(src - j);
+                        v_uint8 v_src1 = vx_load(src + j);
+
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh);
+                        s0 += v_dotprod(xl, k12);
+                        s1 += v_dotprod(xh, k12);
+                        v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh);
+                        s2 += v_dotprod(xl, k12);
+                        s3 += v_dotprod(xh, k12);
+                    }
+                    v_store(dst + i, s0);
+                    v_store(dst + i + v_int32::nlanes, s1);
+                    v_store(dst + i + 2*v_int32::nlanes, s2);
+                    v_store(dst + i + 3*v_int32::nlanes, s3);
+                }
+                if( i <= width - v_uint16::nlanes )
+                {
+                    v_int32 s0, s1;
+                    v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
+                    for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn )
+                    {
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh);
+                        v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+                        s0 += v_dotprod(xl, k12);
+                        s1 += v_dotprod(xh, k12);
+                    }
+                    if( k < _ksize / 2 + 1 )
+                    {
+                        v_int16 k1 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16)));
+                        v_int16 xl, xh;
+                        v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh);
+                        s0 += v_dotprod(xl, k1);
+                        s1 += v_dotprod(xh, k1);
+                    }
+                    v_store(dst + i, s0);
+                    v_store(dst + i + v_int32::nlanes, s1);
+                    i += v_uint16::nlanes; src += v_uint16::nlanes;
+                }
+                if( i <= width - v_uint32::nlanes )
+                {
+                    v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+                    for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn)
+                        s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0);
+                    v_store(dst + i, s0);
+                    i += v_uint32::nlanes;
+                }
             }
-            _mm_store_si128((__m128i*)(dst + i), s0);
         }
 
         return i;
@@ -885,129 +1105,117 @@ struct SymmColumnVec_32s8u
 
     int operator()(const uchar** _src, uchar* dst, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
-        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+        int _ksize = kernel.rows + kernel.cols - 1;
+        int ksize2 = _ksize/2;
         const float* ky = kernel.ptr<float>() + ksize2;
         int i = 0, k;
         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
         const int** src = (const int**)_src;
-        const __m128i *S, *S2;
-        __m128 d4 = _mm_set1_ps(delta);
 
+        v_float32 d4 = vx_setall_f32(delta);
         if( symmetrical )
         {
-            for( ; i <= width - 16; i += 16 )
+            if (_ksize == 1)
+                return 0;
+            v_float32 f0 = vx_setall_f32(ky[0]);
+            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
             {
-                __m128 f = _mm_load_ss(ky);
-                f = _mm_shuffle_ps(f, f, 0);
-                __m128 s0, s1, s2, s3;
-                __m128i x0, x1;
-                S = (const __m128i*)(src[0] + i);
-                s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
-                s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
-                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
-                s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
-                s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
-                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
-                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
-
+                const int* S = src[0] + i;
+                v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
+                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
+                v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4);
+                v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4);
                 for( k = 1; k <= ksize2; k++ )
                 {
-                    S = (const __m128i*)(src[k] + i);
-                    S2 = (const __m128i*)(src[-k] + i);
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
-                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                    x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
-                    x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
-                    x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
-                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
+                    v_float32 f = vx_setall_f32(ky[k]);
+                    const int* S0 = src[k] + i;
+                    const int* S1 = src[-k] + i;
+                    s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
+                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
+                    s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2);
+                    s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3);
                 }
-
-                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-                x0 = _mm_packus_epi16(x0, x1);
-                _mm_storeu_si128((__m128i*)(dst + i), x0);
+                v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
             }
-
-            for( ; i <= width - 4; i += 4 )
+            if( i <= width - v_uint16::nlanes )
             {
-                __m128 f = _mm_load_ss(ky);
-                f = _mm_shuffle_ps(f, f, 0);
-                __m128i x0;
-                __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
-                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-
+                const int* S = src[0] + i;
+                v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
+                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
                 for( k = 1; k <= ksize2; k++ )
                 {
-                    S = (const __m128i*)(src[k] + i);
-                    S2 = (const __m128i*)(src[-k] + i);
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
-                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
+                    v_float32 f = vx_setall_f32(ky[k]);
+                    const int* S0 = src[k] + i;
+                    const int* S1 = src[-k] + i;
+                    s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
+                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
                 }
-
-                x0 = _mm_cvtps_epi32(s0);
-                x0 = _mm_packs_epi32(x0, x0);
-                x0 = _mm_packus_epi16(x0, x0);
-                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+                v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+                i += v_uint16::nlanes;
+            }
+#if CV_SIMD_WIDTH > 16
+            while( i <= width - v_int32x4::nlanes )
+#else
+            if( i <= width - v_int32x4::nlanes )
+#endif
+            {
+                v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta));
+                for( k = 1; k <= ksize2; k++ )
+                    s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
+                v_int32x4 s32 = v_round(s0);
+                v_int16x8 s16 = v_pack(s32, s32);
+                *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+                i += v_int32x4::nlanes;
             }
         }
         else
         {
-            for( ; i <= width - 16; i += 16 )
+            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
             {
-                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-                __m128i x0, x1;
-
-                for( k = 1; k <= ksize2; k++ )
+                v_float32 s0 = d4;
+                v_float32 s1 = d4;
+                v_float32 s2 = d4;
+                v_float32 s3 = d4;
+                for ( k = 1; k <= ksize2; k++ )
                 {
-                    S = (const __m128i*)(src[k] + i);
-                    S2 = (const __m128i*)(src[-k] + i);
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
-                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                    x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
-                    x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
-                    x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
-                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
+                    v_float32 f = vx_setall_f32(ky[k]);
+                    const int* S0 = src[k] + i;
+                    const int* S1 = src[-k] + i;
+                    s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
+                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
+                    s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2);
+                    s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3);
                 }
-
-                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-                x0 = _mm_packus_epi16(x0, x1);
-                _mm_storeu_si128((__m128i*)(dst + i), x0);
+                v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
             }
-
-            for( ; i <= width - 4; i += 4 )
+            if( i <= width - v_uint16::nlanes )
             {
-                __m128 f, s0 = d4;
-                __m128i x0;
-
-                for( k = 1; k <= ksize2; k++ )
+                v_float32 s0 = d4;
+                v_float32 s1 = d4;
+                for ( k = 1; k <= ksize2; k++ )
                 {
-                    S = (const __m128i*)(src[k] + i);
-                    S2 = (const __m128i*)(src[-k] + i);
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
-                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
+                    v_float32 f = vx_setall_f32(ky[k]);
+                    const int* S0 = src[k] + i;
+                    const int* S1 = src[-k] + i;
+                    s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
+                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
                 }
-
-                x0 = _mm_cvtps_epi32(s0);
-                x0 = _mm_packs_epi32(x0, x0);
-                x0 = _mm_packus_epi16(x0, x0);
-                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+                v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+                i += v_uint16::nlanes;
+            }
+#if CV_SIMD_WIDTH > 16
+            while( i <= width - v_int32x4::nlanes )
+#else
+            if( i <= width - v_int32x4::nlanes )
+#endif
+            {
+                v_float32x4 s0 = v_setall_f32(delta);
+                for (k = 1; k <= ksize2; k++)
+                    s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
+                v_int32x4 s32 = v_round(s0);
+                v_int16x8 s16 = v_pack(s32, s32);
+                *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+                i += v_int32x4::nlanes;
             }
         }
 
@@ -1033,9 +1241,6 @@ struct SymmColumnSmallVec_32s16s
 
     int operator()(const uchar** _src, uchar* _dst, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
         const float* ky = kernel.ptr<float>() + ksize2;
         int i = 0;
@@ -1043,66 +1248,63 @@ struct SymmColumnSmallVec_32s16s
         const int** src = (const int**)_src;
         const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
         short* dst = (short*)_dst;
-        __m128 df4 = _mm_set1_ps(delta);
-        __m128i d4 = _mm_cvtps_epi32(df4);
 
+        v_float32 df4 = vx_setall_f32(delta);
+        v_int32 d4 = v_round(df4);
         if( symmetrical )
         {
             if( ky[0] == 2 && ky[1] == 1 )
             {
-                for( ; i <= width - 8; i += 8 )
+                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
                 {
-                    __m128i s0, s1, s2, s3, s4, s5;
-                    s0 = _mm_load_si128((__m128i*)(S0 + i));
-                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
-                    s2 = _mm_load_si128((__m128i*)(S1 + i));
-                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
-                    s4 = _mm_load_si128((__m128i*)(S2 + i));
-                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
-                    s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
-                    s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
-                    s0 = _mm_add_epi32(s0, d4);
-                    s1 = _mm_add_epi32(s1, d4);
-                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+                    v_int32 sl = vx_load(S1 + i);
+                    v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
+                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh)));
+                }
+                if( i <= width - v_int32::nlanes )
+                {
+                    v_int32 s = vx_load(S1 + i);
+                    v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s));
+                    i += v_int32::nlanes;
                 }
             }
             else if( ky[0] == -2 && ky[1] == 1 )
             {
-                for( ; i <= width - 8; i += 8 )
+                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
                 {
-                    __m128i s0, s1, s2, s3, s4, s5;
-                    s0 = _mm_load_si128((__m128i*)(S0 + i));
-                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
-                    s2 = _mm_load_si128((__m128i*)(S1 + i));
-                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
-                    s4 = _mm_load_si128((__m128i*)(S2 + i));
-                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
-                    s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
-                    s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
-                    s0 = _mm_add_epi32(s0, d4);
-                    s1 = _mm_add_epi32(s1, d4);
-                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+                    v_int32 sl = vx_load(S1 + i);
+                    v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
+                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh)));
+                }
+                if( i <= width - v_int32::nlanes )
+                {
+                    v_int32 s = vx_load(S1 + i);
+                    v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s));
+                    i += v_int32::nlanes;
+                }
+            }
+            else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) )
+            {
+                v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]);
+                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                    v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
+                                            v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
+                if( i <= width - v_int32::nlanes )
+                {
+                    v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
+                    i += v_int32::nlanes;
                 }
             }
             else
             {
-                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
-                for( ; i <= width - 8; i += 8 )
+                v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
+                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
+                                            v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
+                if( i <= width - v_int32::nlanes )
                 {
-                    __m128 s0, s1;
-                    s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
-                    s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
-                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
-                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
-                    __m128i x0, x1;
-                    x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
-                                       _mm_load_si128((__m128i*)(S2 + i)));
-                    x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
-                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
-                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                    _mm_storeu_si128((__m128i*)(dst + i), x0);
+                    v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))));
+                    i += v_int32::nlanes;
                 }
             }
         }
@@ -1112,33 +1314,24 @@ struct SymmColumnSmallVec_32s16s
             {
                 if( ky[1] < 0 )
                     std::swap(S0, S2);
-                for( ; i <= width - 8; i += 8 )
+                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                    v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4));
+                if( i <= width - v_int32::nlanes )
                 {
-                    __m128i s0, s1, s2, s3;
-                    s0 = _mm_load_si128((__m128i*)(S2 + i));
-                    s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
-                    s2 = _mm_load_si128((__m128i*)(S0 + i));
-                    s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
-                    s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
-                    s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
-                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+                    v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
+                    i += v_int32::nlanes;
                 }
             }
             else
             {
-                __m128 k1 = _mm_set1_ps(ky[1]);
-                for( ; i <= width - 8; i += 8 )
+                v_float32 k1 = vx_setall_f32(ky[1]);
+                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)),
+                                            v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
+                if( i <= width - v_int32::nlanes )
                 {
-                    __m128 s0 = df4, s1 = df4;
-                    __m128i x0, x1;
-                    x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)),
-                                       _mm_load_si128((__m128i*)(S0 + i)));
-                    x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)),
-                                       _mm_load_si128((__m128i*)(S0 + i + 4)));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
-                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                    _mm_storeu_si128((__m128i*)(dst + i), x0);
+                    v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)));
+                    i += v_int32::nlanes;
                 }
             }
         }
@@ -1156,188 +1349,118 @@ struct SymmColumnSmallVec_32s16s
 
 struct RowVec_16s32f
 {
-    RowVec_16s32f() { sse2_supported = false; }
+    RowVec_16s32f() {}
     RowVec_16s32f( const Mat& _kernel )
     {
         kernel = _kernel;
-        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
     }
 
     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
     {
-        if( !sse2_supported )
-            return 0;
-
         int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
         float* dst = (float*)_dst;
         const float* _kx = kernel.ptr<float>();
         width *= cn;
 
-        for( ; i <= width - 8; i += 8 )
+        for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
         {
             const short* src = (const short*)_src + i;
-            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+            v_float32 s0 = vx_setzero_f32();
+            v_float32 s1 = vx_setzero_f32();
             for( k = 0; k < _ksize; k++, src += cn )
             {
-                f = _mm_load_ss(_kx+k);
-                f = _mm_shuffle_ps(f, f, 0);
-
-                __m128i x0i = _mm_loadu_si128((const __m128i*)src);
-                __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
-                x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
-                x0 = _mm_cvtepi32_ps(x0i);
-                x1 = _mm_cvtepi32_ps(x1i);
-                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+                v_int16 x = vx_load(src);
+                s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0);
+                s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1);
             }
-            _mm_store_ps(dst + i, s0);
-            _mm_store_ps(dst + i + 4, s1);
+            v_store(dst + i, s0);
+            v_store(dst + i + v_float32::nlanes, s1);
+        }
+        if( i <= width - v_float32::nlanes )
+        {
+            const short* src = (const short*)_src + i;
+            v_float32 s0 = vx_setzero_f32();
+            for( k = 0; k < _ksize; k++, src += cn )
+                s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0);
+            v_store(dst + i, s0);
+            i += v_float32::nlanes;
         }
         return i;
     }
 
     Mat kernel;
-    bool sse2_supported;
 };
 
 
 struct SymmColumnVec_32f16s
 {
-    SymmColumnVec_32f16s() { symmetryType=0; delta = 0; sse2_supported = false; }
+    SymmColumnVec_32f16s() { symmetryType=0; delta = 0; }
     SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
     {
         symmetryType = _symmetryType;
         kernel = _kernel;
         delta = (float)_delta;
         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
     }
 
     int operator()(const uchar** _src, uchar* _dst, int width) const
     {
-        if( !sse2_supported )
-            return 0;
-
-        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+        int _ksize = kernel.rows + kernel.cols - 1;
+        int ksize2 = _ksize / 2;
         const float* ky = kernel.ptr<float>() + ksize2;
         int i = 0, k;
         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
         const float** src = (const float**)_src;
-        const float *S, *S2;
         short* dst = (short*)_dst;
-        __m128 d4 = _mm_set1_ps(delta);
 
+        v_float32 d4 = vx_setall_f32(delta);
         if( symmetrical )
         {
-            for( ; i <= width - 16; i += 16 )
+            if (_ksize == 1)
+                return 0;
+            v_float32 k0 = vx_setall_f32(ky[0]);
+            for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
             {
-                __m128 f = _mm_load_ss(ky);
-                f = _mm_shuffle_ps(f, f, 0);
-                __m128 s0, s1, s2, s3;
-                __m128 x0, x1;
-                S = src[0] + i;
-                s0 = _mm_load_ps(S);
-                s1 = _mm_load_ps(S+4);
-                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
-                s2 = _mm_load_ps(S+8);
-                s3 = _mm_load_ps(S+12);
-                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
-                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
-
+                v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
+                v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
                 for( k = 1; k <= ksize2; k++ )
                 {
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
-                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                    v_float32 k1 = vx_setall_f32(ky[k]);
+                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
+                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                 }
-
-                __m128i s0i = _mm_cvtps_epi32(s0);
-                __m128i s1i = _mm_cvtps_epi32(s1);
-                __m128i s2i = _mm_cvtps_epi32(s2);
-                __m128i s3i = _mm_cvtps_epi32(s3);
-
-                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
-                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+                v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
             }
-
-            for( ; i <= width - 4; i += 4 )
+            if( i <= width - v_float32::nlanes )
             {
-                __m128 f = _mm_load_ss(ky);
-                f = _mm_shuffle_ps(f, f, 0);
-                __m128 x0, s0 = _mm_load_ps(src[0] + i);
-                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-
+                v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                 for( k = 1; k <= ksize2; k++ )
-                {
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                }
-
-                __m128i s0i = _mm_cvtps_epi32(s0);
-                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                v_pack_store(dst + i, v_round(s0));
+                i += v_float32::nlanes;
             }
         }
         else
         {
-            for( ; i <= width - 16; i += 16 )
+            for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
             {
-                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-                __m128 x0, x1;
-                S = src[0] + i;
-
+                v_float32 s0 = d4;
+                v_float32 s1 = d4;
                 for( k = 1; k <= ksize2; k++ )
                 {
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
-                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                    v_float32 k1 = vx_setall_f32(ky[k]);
+                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0);
+                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                 }
-
-                __m128i s0i = _mm_cvtps_epi32(s0);
-                __m128i s1i = _mm_cvtps_epi32(s1);
-                __m128i s2i = _mm_cvtps_epi32(s2);
-                __m128i s3i = _mm_cvtps_epi32(s3);
-
-                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
-                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+                v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
             }
-
-            for( ; i <= width - 4; i += 4 )
+            if( i <= width - v_float32::nlanes )
             {
-                __m128 f, x0, s0 = d4;
-
+                v_float32 s0 = d4;
                 for( k = 1; k <= ksize2; k++ )
-                {
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
-                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                }
-
-                __m128i s0i = _mm_cvtps_epi32(s0);
-                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                v_pack_store(dst + i, v_round(s0));
+                i += v_float32::nlanes;
             }
         }
 
@@ -1347,7 +1470,6 @@ struct SymmColumnVec_32f16s
     int symmetryType;
     float delta;
     Mat kernel;
-    bool sse2_supported;
 };
 
 
@@ -1357,7 +1479,6 @@ struct RowVec_32f
 {
     RowVec_32f()
     {
-        haveSSE = checkHardwareSupport(CV_CPU_SSE);
         haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
 #if defined USE_IPP_SEP_FILTERS
         bufsz = -1;
@@ -1367,7 +1488,6 @@ struct RowVec_32f
     RowVec_32f( const Mat& _kernel )
     {
         kernel = _kernel;
-        haveSSE = checkHardwareSupport(CV_CPU_SSE);
         haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
 #if defined USE_IPP_SEP_FILTERS
         bufsz = -1;
@@ -1389,9 +1509,6 @@ struct RowVec_32f
         float* dst = (float*)_dst;
         const float* _kx = kernel.ptr<float>();
 
-        if( !haveSSE )
-            return 0;
-
         int i = 0, k;
         width *= cn;
 
@@ -1399,27 +1516,18 @@ struct RowVec_32f
         if (haveAVX2)
             return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize);
 #endif
-        for( ; i <= width - 8; i += 8 )
+        for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
         {
             const float* src = src0 + i;
-            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+            v_float32 s0 = vx_setzero_f32();
             for( k = 0; k < _ksize; k++, src += cn )
-            {
-                f = _mm_set1_ps(_kx[k]);
-
-                x0 = _mm_loadu_ps(src);
-                x1 = _mm_loadu_ps(src + 4);
-                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-            }
-            _mm_store_ps(dst + i, s0);
-            _mm_store_ps(dst + i + 4, s1);
+                s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0);
+            v_store(dst + i, s0);
         }
         return i;
     }
 
     Mat kernel;
-    bool haveSSE;
     bool haveAVX2;
 #if defined USE_IPP_SEP_FILTERS
 private:
@@ -1475,9 +1583,6 @@ struct SymmRowSmallVec_32f
 
     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
         float* dst = (float*)_dst;
         const float* src = (const float*)_src + (_ksize/2)*cn;
@@ -1491,101 +1596,32 @@ struct SymmRowSmallVec_32f
                 return 0;
             if( _ksize == 3 )
             {
-                if( kx[0] == 2 && kx[1] == 1 )
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        __m128 x0, x1, x2, y0, y1, y2;
-                        x0 = _mm_loadu_ps(src - cn);
-                        x1 = _mm_loadu_ps(src);
-                        x2 = _mm_loadu_ps(src + cn);
-                        y0 = _mm_loadu_ps(src - cn + 4);
-                        y1 = _mm_loadu_ps(src + 4);
-                        y2 = _mm_loadu_ps(src + cn + 4);
-                        x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
-                        y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
-                        _mm_store_ps(dst + i, x0);
-                        _mm_store_ps(dst + i + 4, y0);
-                    }
-                else if( kx[0] == -2 && kx[1] == 1 )
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        __m128 x0, x1, x2, y0, y1, y2;
-                        x0 = _mm_loadu_ps(src - cn);
-                        x1 = _mm_loadu_ps(src);
-                        x2 = _mm_loadu_ps(src + cn);
-                        y0 = _mm_loadu_ps(src - cn + 4);
-                        y1 = _mm_loadu_ps(src + 4);
-                        y2 = _mm_loadu_ps(src + cn + 4);
-                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
-                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
-                        _mm_store_ps(dst + i, x0);
-                        _mm_store_ps(dst + i + 4, y0);
-                    }
+                if( fabs(kx[0]) == 2 && kx[1] == 1 )
+                {
+                    v_float32 k0 = vx_setall_f32(kx[0]);
+                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                        v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn)));
+                }
                 else
                 {
-                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        __m128 x0, x1, x2, y0, y1, y2;
-                        x0 = _mm_loadu_ps(src - cn);
-                        x1 = _mm_loadu_ps(src);
-                        x2 = _mm_loadu_ps(src + cn);
-                        y0 = _mm_loadu_ps(src - cn + 4);
-                        y1 = _mm_loadu_ps(src + 4);
-                        y2 = _mm_loadu_ps(src + cn + 4);
-
-                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
-                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
-                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
-                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
-                        _mm_store_ps(dst + i, x0);
-                        _mm_store_ps(dst + i + 4, y0);
-                    }
+                    v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]);
+                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                        v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1));
                 }
             }
             else if( _ksize == 5 )
             {
                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        __m128 x0, x1, x2, y0, y1, y2;
-                        x0 = _mm_loadu_ps(src - cn*2);
-                        x1 = _mm_loadu_ps(src);
-                        x2 = _mm_loadu_ps(src + cn*2);
-                        y0 = _mm_loadu_ps(src - cn*2 + 4);
-                        y1 = _mm_loadu_ps(src + 4);
-                        y2 = _mm_loadu_ps(src + cn*2 + 4);
-                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
-                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
-                        _mm_store_ps(dst + i, x0);
-                        _mm_store_ps(dst + i + 4, y0);
-                    }
+                {
+                    v_float32 k0 = vx_setall_f32(-2);
+                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                        v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn)));
+                }
                 else
                 {
-                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        __m128 x0, x1, x2, y0, y1, y2;
-                        x0 = _mm_loadu_ps(src - cn);
-                        x1 = _mm_loadu_ps(src);
-                        x2 = _mm_loadu_ps(src + cn);
-                        y0 = _mm_loadu_ps(src - cn + 4);
-                        y1 = _mm_loadu_ps(src + 4);
-                        y2 = _mm_loadu_ps(src + cn + 4);
-
-                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
-                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
-                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
-                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
-
-                        x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
-                        y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
-                        x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
-                        y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
-
-                        _mm_store_ps(dst + i, x0);
-                        _mm_store_ps(dst + i + 4, y0);
-                    }
+                    v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
+                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                        v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)));
                 }
             }
         }
@@ -1594,58 +1630,20 @@ struct SymmRowSmallVec_32f
             if( _ksize == 3 )
             {
                 if( kx[0] == 0 && kx[1] == 1 )
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        __m128 x0, x2, y0, y2;
-                        x0 = _mm_loadu_ps(src + cn);
-                        x2 = _mm_loadu_ps(src - cn);
-                        y0 = _mm_loadu_ps(src + cn + 4);
-                        y2 = _mm_loadu_ps(src - cn + 4);
-                        x0 = _mm_sub_ps(x0, x2);
-                        y0 = _mm_sub_ps(y0, y2);
-                        _mm_store_ps(dst + i, x0);
-                        _mm_store_ps(dst + i + 4, y0);
-                    }
+                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                        v_store(dst + i, vx_load(src + cn) - vx_load(src - cn));
                 else
                 {
-                    __m128 k1 = _mm_set1_ps(kx[1]);
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        __m128 x0, x2, y0, y2;
-                        x0 = _mm_loadu_ps(src + cn);
-                        x2 = _mm_loadu_ps(src - cn);
-                        y0 = _mm_loadu_ps(src + cn + 4);
-                        y2 = _mm_loadu_ps(src - cn + 4);
-
-                        x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
-                        y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
-                        _mm_store_ps(dst + i, x0);
-                        _mm_store_ps(dst + i + 4, y0);
-                    }
+                    v_float32 k1 = vx_setall_f32(kx[1]);
+                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                        v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1);
                 }
             }
             else if( _ksize == 5 )
             {
-                __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
-                for( ; i <= width - 8; i += 8, src += 8 )
-                {
-                    __m128 x0, x2, y0, y2;
-                    x0 = _mm_loadu_ps(src + cn);
-                    x2 = _mm_loadu_ps(src - cn);
-                    y0 = _mm_loadu_ps(src + cn + 4);
-                    y2 = _mm_loadu_ps(src - cn + 4);
-
-                    x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
-                    y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
-
-                    x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
-                    y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
-                    x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
-                    y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
-
-                    _mm_store_ps(dst + i, x0);
-                    _mm_store_ps(dst + i + 4, y0);
-                }
+                v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
+                for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                    v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1));
             }
         }
 
@@ -1661,7 +1659,6 @@ struct SymmColumnVec_32f
 {
     SymmColumnVec_32f() {
         symmetryType=0;
-        haveSSE = checkHardwareSupport(CV_CPU_SSE);
         haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
         delta = 0;
     }
@@ -1670,22 +1667,17 @@ struct SymmColumnVec_32f
         symmetryType = _symmetryType;
         kernel = _kernel;
         delta = (float)_delta;
-        haveSSE = checkHardwareSupport(CV_CPU_SSE);
         haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
     }
 
     int operator()(const uchar** _src, uchar* _dst, int width) const
     {
-        if( !haveSSE )
-            return 0;
-
         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
         const float* ky = kernel.ptr<float>() + ksize2;
         int i = 0, k;
         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
         const float** src = (const float**)_src;
-        const float *S, *S2;
         float* dst = (float*)_dst;
 
         if( symmetrical )
@@ -1695,59 +1687,13 @@ struct SymmColumnVec_32f
             if (haveAVX2)
                 return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2);
 #endif
-            const __m128 d4 = _mm_set1_ps(delta);
-            for( ; i <= width - 16; i += 16 )
+            const v_float32 d4 = vx_setall_f32(delta);
+            for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
             {
-                __m128 f = _mm_set1_ps(ky[0]);
-                __m128 s0, s1, s2, s3;
-                __m128 x0, x1;
-                S = src[0] + i;
-                s0 = _mm_load_ps(S);
-                s1 = _mm_load_ps(S+4);
-                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
-                s2 = _mm_load_ps(S+8);
-                s3 = _mm_load_ps(S+12);
-                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
-                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
-
+                v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4);
                 for( k = 1; k <= ksize2; k++ )
-                {
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-                    f = _mm_set1_ps(ky[k]);
-                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
-                }
-
-                _mm_storeu_ps(dst + i, s0);
-                _mm_storeu_ps(dst + i + 4, s1);
-                _mm_storeu_ps(dst + i + 8, s2);
-                _mm_storeu_ps(dst + i + 12, s3);
-            }
-
-            for( ; i <= width - 4; i += 4 )
-            {
-                __m128 f = _mm_set1_ps(ky[0]);
-                __m128 x0, s0 = _mm_load_ps(src[0] + i);
-                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-
-                for( k = 1; k <= ksize2; k++ )
-                {
-                    f = _mm_set1_ps(ky[k]);
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                }
-
-                _mm_storeu_ps(dst + i, s0);
+                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                v_store(dst + i, s0);
             }
         }
         else
@@ -1756,46 +1702,13 @@ struct SymmColumnVec_32f
             if (haveAVX2)
                 return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
 #endif
-            const __m128 d4 = _mm_set1_ps(delta);
-            for( ; i <= width - 16; i += 16 )
+            const v_float32 d4 = vx_setall_f32(delta);
+            for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
             {
-                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-                __m128 x0, x1;
-                S = src[0] + i;
-
+                v_float32 s0 = d4;
                 for( k = 1; k <= ksize2; k++ )
-                {
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-                    f = _mm_set1_ps(ky[k]);
-                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
-                }
-
-                _mm_storeu_ps(dst + i, s0);
-                _mm_storeu_ps(dst + i + 4, s1);
-                _mm_storeu_ps(dst + i + 8, s2);
-                _mm_storeu_ps(dst + i + 12, s3);
-            }
-
-            for( ; i <= width - 4; i += 4 )
-            {
-                __m128 f, x0, s0 = d4;
-
-                for( k = 1; k <= ksize2; k++ )
-                {
-                    f = _mm_set1_ps(ky[k]);
-                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                }
-
-                _mm_storeu_ps(dst + i, s0);
+                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                v_store(dst + i, s0);
             }
         }
 
@@ -1805,7 +1718,6 @@ struct SymmColumnVec_32f
     int symmetryType;
     float delta;
     Mat kernel;
-    bool haveSSE;
     bool haveAVX2;
 };
 
@@ -1823,9 +1735,6 @@ struct SymmColumnSmallVec_32f
 
     int operator()(const uchar** _src, uchar* _dst, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
         const float* ky = kernel.ptr<float>() + ksize2;
         int i = 0;
@@ -1833,65 +1742,21 @@ struct SymmColumnSmallVec_32f
         const float** src = (const float**)_src;
         const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
         float* dst = (float*)_dst;
-        __m128 d4 = _mm_set1_ps(delta);
 
+        v_float32 d4 = vx_setall_f32(delta);
         if( symmetrical )
         {
-            if( ky[0] == 2 && ky[1] == 1 )
+            if( fabs(ky[0]) == 2 && ky[1] == 1 )
             {
-                for( ; i <= width - 8; i += 8 )
-                {
-                    __m128 s0, s1, s2, s3, s4, s5;
-                    s0 = _mm_load_ps(S0 + i);
-                    s1 = _mm_load_ps(S0 + i + 4);
-                    s2 = _mm_load_ps(S1 + i);
-                    s3 = _mm_load_ps(S1 + i + 4);
-                    s4 = _mm_load_ps(S2 + i);
-                    s5 = _mm_load_ps(S2 + i + 4);
-                    s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
-                    s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
-                    s0 = _mm_add_ps(s0, d4);
-                    s1 = _mm_add_ps(s1, d4);
-                    _mm_storeu_ps(dst + i, s0);
-                    _mm_storeu_ps(dst + i + 4, s1);
-                }
-            }
-            else if( ky[0] == -2 && ky[1] == 1 )
-            {
-                for( ; i <= width - 8; i += 8 )
-                {
-                    __m128 s0, s1, s2, s3, s4, s5;
-                    s0 = _mm_load_ps(S0 + i);
-                    s1 = _mm_load_ps(S0 + i + 4);
-                    s2 = _mm_load_ps(S1 + i);
-                    s3 = _mm_load_ps(S1 + i + 4);
-                    s4 = _mm_load_ps(S2 + i);
-                    s5 = _mm_load_ps(S2 + i + 4);
-                    s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
-                    s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
-                    s0 = _mm_add_ps(s0, d4);
-                    s1 = _mm_add_ps(s1, d4);
-                    _mm_storeu_ps(dst + i, s0);
-                    _mm_storeu_ps(dst + i + 4, s1);
-                }
+                v_float32 k0 = vx_setall_f32(ky[0]);
+                for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                    v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4));
             }
             else
             {
-                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
-                for( ; i <= width - 8; i += 8 )
-                {
-                    __m128 s0, s1, x0, x1;
-                    s0 = _mm_load_ps(S1 + i);
-                    s1 = _mm_load_ps(S1 + i + 4);
-                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
-                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
-                    x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
-                    x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
-                    _mm_storeu_ps(dst + i, s0);
-                    _mm_storeu_ps(dst + i + 4, s1);
-                }
+                v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
+                for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                    v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
             }
         }
         else
@@ -1900,32 +1765,14 @@ struct SymmColumnSmallVec_32f
             {
                 if( ky[1] < 0 )
                     std::swap(S0, S2);
-                for( ; i <= width - 8; i += 8 )
-                {
-                    __m128 s0, s1, s2, s3;
-                    s0 = _mm_load_ps(S2 + i);
-                    s1 = _mm_load_ps(S2 + i + 4);
-                    s2 = _mm_load_ps(S0 + i);
-                    s3 = _mm_load_ps(S0 + i + 4);
-                    s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
-                    s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
-                    _mm_storeu_ps(dst + i, s0);
-                    _mm_storeu_ps(dst + i + 4, s1);
-                }
+                for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                    v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
             }
             else
             {
-                __m128 k1 = _mm_set1_ps(ky[1]);
-                for( ; i <= width - 8; i += 8 )
-                {
-                    __m128 s0 = d4, s1 = d4, x0, x1;
-                    x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
-                    x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
-                    _mm_storeu_ps(dst + i, s0);
-                    _mm_storeu_ps(dst + i + 4, s1);
-                }
+                v_float32 k1 = vx_setall_f32(ky[1]);
+                for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                    v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4));
             }
         }
 
@@ -1957,63 +1804,55 @@ struct FilterVec_8u
 
     int operator()(const uchar** src, uchar* dst, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
         const float* kf = (const float*)&coeffs[0];
         int i = 0, k, nz = _nz;
-        __m128 d4 = _mm_set1_ps(delta);
 
-        for( ; i <= width - 16; i += 16 )
+        v_float32 d4 = vx_setall_f32(delta);
+        for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
         {
-            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-            __m128i x0, x1, z = _mm_setzero_si128();
-
+            v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
             for( k = 0; k < nz; k++ )
             {
-                __m128 f = _mm_load_ss(kf+k), t0, t1;
-                f = _mm_shuffle_ps(f, f, 0);
-
-                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
-                x1 = _mm_unpackhi_epi8(x0, z);
-                x0 = _mm_unpacklo_epi8(x0, z);
-
-                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
-                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-
-                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
-                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
-                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
-                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
+                v_float32 f = vx_setall_f32(kf[k]);
+                v_uint16 xl, xh;
+                v_expand(vx_load(src[k] + i), xl, xh);
+                v_uint32 x0, x1, x2, x3;
+                v_expand(xl, x0, x1);
+                v_expand(xh, x2, x3);
+                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
+                s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
+                s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f, s2);
+                s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f, s3);
             }
-
-            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-            x0 = _mm_packus_epi16(x0, x1);
-            _mm_storeu_si128((__m128i*)(dst + i), x0);
+            v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
         }
-
-        for( ; i <= width - 4; i += 4 )
+        if( i <= width - v_uint16::nlanes )
         {
-            __m128 s0 = d4;
-            __m128i x0, z = _mm_setzero_si128();
-
+            v_float32 s0 = d4, s1 = d4;
             for( k = 0; k < nz; k++ )
             {
-                __m128 f = _mm_load_ss(kf+k), t0;
-                f = _mm_shuffle_ps(f, f, 0);
-
-                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
-                x0 = _mm_unpacklo_epi8(x0, z);
-                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
+                v_float32 f = vx_setall_f32(kf[k]);
+                v_uint32 x0, x1;
+                v_expand(vx_load_expand(src[k] + i), x0, x1);
+                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
+                s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
             }
-
-            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
-            x0 = _mm_packus_epi16(x0, x0);
-            *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+            v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+            i += v_uint16::nlanes;
+        }
+#if CV_SIMD_WIDTH > 16
+        while( i <= width - v_int32x4::nlanes )
+#else
+        if( i <= width - v_int32x4::nlanes )
+#endif
+        {
+            v_float32x4 s0 = v_setall_f32(delta);
+            for( k = 0; k < nz; k++ )
+                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0);
+            v_int32x4 s32 = v_round(s0);
+            v_int16x8 s16 = v_pack(s32, s32);
+            *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+            i += v_int32x4::nlanes;
         }
 
         return i;
@@ -2040,63 +1879,47 @@ struct FilterVec_8u16s
 
     int operator()(const uchar** src, uchar* _dst, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
         const float* kf = (const float*)&coeffs[0];
         short* dst = (short*)_dst;
         int i = 0, k, nz = _nz;
-        __m128 d4 = _mm_set1_ps(delta);
 
-        for( ; i <= width - 16; i += 16 )
+        v_float32 d4 = vx_setall_f32(delta);
+        for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
         {
-            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-            __m128i x0, x1, z = _mm_setzero_si128();
-
+            v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
             for( k = 0; k < nz; k++ )
             {
-                __m128 f = _mm_load_ss(kf+k), t0, t1;
-                f = _mm_shuffle_ps(f, f, 0);
-
-                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
-                x1 = _mm_unpackhi_epi8(x0, z);
-                x0 = _mm_unpacklo_epi8(x0, z);
-
-                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
-                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-
-                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
-                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
-                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
-                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
+                v_float32 f = vx_setall_f32(kf[k]);
+                v_uint16 xl, xh;
+                v_expand(vx_load(src[k] + i), xl, xh);
+                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0);
+                s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1);
+                s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f, s2);
+                s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3);
             }
-
-            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-            _mm_storeu_si128((__m128i*)(dst + i), x0);
-            _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
+            v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+            v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
         }
-
-        for( ; i <= width - 4; i += 4 )
+        if( i <= width - v_uint16::nlanes )
         {
-            __m128 s0 = d4;
-            __m128i x0, z = _mm_setzero_si128();
-
+            v_float32 s0 = d4, s1 = d4;
             for( k = 0; k < nz; k++ )
             {
-                __m128 f = _mm_load_ss(kf+k), t0;
-                f = _mm_shuffle_ps(f, f, 0);
-
-                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
-                x0 = _mm_unpacklo_epi8(x0, z);
-                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
+                v_float32 f = vx_setall_f32(kf[k]);
+                v_uint16 x = vx_load_expand(src[k] + i);
+                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0);
+                s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1);
             }
-
-            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
-            _mm_storel_epi64((__m128i*)(dst + i), x0);
+            v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+            i += v_uint16::nlanes;
+        }
+        if( i <= width - v_int32::nlanes )
+        {
+            v_float32 s0 = d4;
+            for( k = 0; k < nz; k++ )
+                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0);
+            v_pack_store(dst + i, v_round(s0));
+            i += v_int32::nlanes;
         }
 
         return i;
@@ -2121,54 +1944,18 @@ struct FilterVec_32f
 
     int operator()(const uchar** _src, uchar* _dst, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
         const float* kf = (const float*)&coeffs[0];
         const float** src = (const float**)_src;
         float* dst = (float*)_dst;
         int i = 0, k, nz = _nz;
-        __m128 d4 = _mm_set1_ps(delta);
 
-        for( ; i <= width - 16; i += 16 )
+        v_float32 d4 = vx_setall_f32(delta);
+        for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
         {
-            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-
+            v_float32 s0 = d4;
             for( k = 0; k < nz; k++ )
-            {
-                __m128 f = _mm_load_ss(kf+k), t0, t1;
-                f = _mm_shuffle_ps(f, f, 0);
-                const float* S = src[k] + i;
-
-                t0 = _mm_loadu_ps(S);
-                t1 = _mm_loadu_ps(S + 4);
-                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-
-                t0 = _mm_loadu_ps(S + 8);
-                t1 = _mm_loadu_ps(S + 12);
-                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
-                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
-            }
-
-            _mm_storeu_ps(dst + i, s0);
-            _mm_storeu_ps(dst + i + 4, s1);
-            _mm_storeu_ps(dst + i + 8, s2);
-            _mm_storeu_ps(dst + i + 12, s3);
-        }
-
-        for( ; i <= width - 4; i += 4 )
-        {
-            __m128 s0 = d4;
-
-            for( k = 0; k < nz; k++ )
-            {
-                __m128 f = _mm_load_ss(kf+k), t0;
-                f = _mm_shuffle_ps(f, f, 0);
-                t0 = _mm_loadu_ps(src[k] + i);
-                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-            }
-            _mm_storeu_ps(dst + i, s0);
+                s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0);
+            v_store(dst + i, s0);
         }
 
         return i;
@@ -2179,847 +1966,6 @@ struct FilterVec_32f
     float delta;
 };
 
-
-#elif CV_NEON
-
-struct SymmRowSmallVec_8u32s
-{
-    SymmRowSmallVec_8u32s() { smallValues = false; }
-    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
-    {
-        kernel = _kernel;
-        symmetryType = _symmetryType;
-        smallValues = true;
-        int k, ksize = kernel.rows + kernel.cols - 1;
-        for( k = 0; k < ksize; k++ )
-        {
-            int v = kernel.ptr<int>()[k];
-            if( v < SHRT_MIN || v > SHRT_MAX )
-            {
-                smallValues = false;
-                break;
-            }
-        }
-    }
-
-    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
-    {
-         if( !checkHardwareSupport(CV_CPU_NEON) )
-             return 0;
-
-        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
-        int* dst = (int*)_dst;
-        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-        const int* kx = kernel.ptr<int>() + _ksize/2;
-        if( !smallValues )
-            return 0;
-
-        src += (_ksize/2)*cn;
-        width *= cn;
-
-        if( symmetrical )
-        {
-            if( _ksize == 1 )
-                return 0;
-            if( _ksize == 3 )
-            {
-                if( kx[0] == 2 && kx[1] == 1 )
-                {
-                    uint16x8_t zq = vdupq_n_u16(0);
-
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        uint8x8_t x0, x1, x2;
-                        x0 = vld1_u8( (uint8_t *) (src - cn) );
-                        x1 = vld1_u8( (uint8_t *) (src) );
-                        x2 = vld1_u8( (uint8_t *) (src + cn) );
-
-                        uint16x8_t y0, y1, y2;
-                        y0 = vaddl_u8(x0, x2);
-                        y1 = vshll_n_u8(x1, 1);
-                        y2 = vaddq_u16(y0, y1);
-
-                        uint16x8x2_t str;
-                        str.val[0] = y2; str.val[1] = zq;
-                        vst2q_u16( (uint16_t *) (dst + i), str );
-                    }
-                }
-                else if( kx[0] == -2 && kx[1] == 1 )
-                    return 0;
-                else
-                {
-                    int32x4_t k32 = vdupq_n_s32(0);
-                    k32 = vld1q_lane_s32(kx, k32, 0);
-                    k32 = vld1q_lane_s32(kx + 1, k32, 1);
-
-                    int16x4_t k = vqmovn_s32(k32);
-
-                    uint8x8_t z = vdup_n_u8(0);
-
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        uint8x8_t x0, x1, x2;
-                        x0 = vld1_u8( (uint8_t *) (src - cn) );
-                        x1 = vld1_u8( (uint8_t *) (src) );
-                        x2 = vld1_u8( (uint8_t *) (src + cn) );
-
-                        int16x8_t y0, y1;
-                        int32x4_t y2, y3;
-                        y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
-                        y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
-                        y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
-                        y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
-                        y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
-                        y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);
-
-                        vst1q_s32((int32_t *)(dst + i), y2);
-                        vst1q_s32((int32_t *)(dst + i + 4), y3);
-                    }
-                }
-            }
-            else if( _ksize == 5 )
-            {
-                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                    return 0;
-                else
-                {
-                    int32x4_t k32 = vdupq_n_s32(0);
-                    k32 = vld1q_lane_s32(kx, k32, 0);
-                    k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                    k32 = vld1q_lane_s32(kx + 2, k32, 2);
-
-                    int16x4_t k = vqmovn_s32(k32);
-
-                    uint8x8_t z = vdup_n_u8(0);
-
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        uint8x8_t x0, x1, x2, x3, x4;
-                        x0 = vld1_u8( (uint8_t *) (src - cn) );
-                        x1 = vld1_u8( (uint8_t *) (src) );
-                        x2 = vld1_u8( (uint8_t *) (src + cn) );
-
-                        int16x8_t y0, y1;
-                        int32x4_t accl, acch;
-                        y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
-                        y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
-                        accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
-                        accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
-                        acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
-                        acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);
-
-                        int16x8_t y2;
-                        x3 = vld1_u8( (uint8_t *) (src - cn*2) );
-                        x4 = vld1_u8( (uint8_t *) (src + cn*2) );
-                        y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
-                        accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
-                        acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);
-
-                        vst1q_s32((int32_t *)(dst + i), accl);
-                        vst1q_s32((int32_t *)(dst + i + 4), acch);
-                    }
-                }
-            }
-        }
-        else
-        {
-            if( _ksize == 3 )
-            {
-                if( kx[0] == 0 && kx[1] == 1 )
-                {
-                    uint8x8_t z = vdup_n_u8(0);
-
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        uint8x8_t x0, x1;
-                        x0 = vld1_u8( (uint8_t *) (src - cn) );
-                        x1 = vld1_u8( (uint8_t *) (src + cn) );
-
-                        int16x8_t y0;
-                        y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
-                                vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-
-                        vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
-                        vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
-                    }
-                }
-                else
-                {
-                    int32x4_t k32 = vdupq_n_s32(0);
-                    k32 = vld1q_lane_s32(kx + 1, k32, 1);
-
-                    int16x4_t k = vqmovn_s32(k32);
-
-                    uint8x8_t z = vdup_n_u8(0);
-
-                    for( ; i <= width - 8; i += 8, src += 8 )
-                    {
-                        uint8x8_t x0, x1;
-                        x0 = vld1_u8( (uint8_t *) (src - cn) );
-                        x1 = vld1_u8( (uint8_t *) (src + cn) );
-
-                        int16x8_t y0;
-                        int32x4_t y1, y2;
-                        y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
-                            vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-                        y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
-                        y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);
-
-                        vst1q_s32((int32_t *)(dst + i), y1);
-                        vst1q_s32((int32_t *)(dst + i + 4), y2);
-                    }
-                }
-            }
-            else if( _ksize == 5 )
-            {
-                int32x4_t k32 = vdupq_n_s32(0);
-                k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                k32 = vld1q_lane_s32(kx + 2, k32, 2);
-
-                int16x4_t k = vqmovn_s32(k32);
-
-                uint8x8_t z = vdup_n_u8(0);
-
-                for( ; i <= width - 8; i += 8, src += 8 )
-                {
-                    uint8x8_t x0, x1;
-                    x0 = vld1_u8( (uint8_t *) (src - cn) );
-                    x1 = vld1_u8( (uint8_t *) (src + cn) );
-
-                    int32x4_t accl, acch;
-                    int16x8_t y0;
-                    y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
-                        vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-                    accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
-                    acch = vmull_lane_s16(vget_high_s16(y0), k, 1);
-
-                    uint8x8_t x2, x3;
-                    x2 = vld1_u8( (uint8_t *) (src - cn*2) );
-                    x3 = vld1_u8( (uint8_t *) (src + cn*2) );
-
-                    int16x8_t y1;
-                    y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
-                        vreinterpretq_s16_u16(vaddl_u8(x2, z)));
-                    accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
-                    acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);
-
-                    vst1q_s32((int32_t *)(dst + i), accl);
-                    vst1q_s32((int32_t *)(dst + i + 4), acch);
-                }
-            }
-        }
-
-        return i;
-    }
-
-    Mat kernel;
-    int symmetryType;
-    bool smallValues;
-};
-
-
-struct SymmColumnVec_32s8u
-{
-    SymmColumnVec_32s8u() { symmetryType=0; }
-    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
-    {
-        symmetryType = _symmetryType;
-        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-        delta = (float)(_delta/(1 << _bits));
-        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-    }
-
-    int operator()(const uchar** _src, uchar* dst, int width) const
-    {
-         if( !checkHardwareSupport(CV_CPU_NEON) )
-             return 0;
-
-        int _ksize = kernel.rows + kernel.cols - 1;
-        int ksize2 = _ksize / 2;
-        const float* ky = kernel.ptr<float>() + ksize2;
-        int i = 0, k;
-        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-        const int** src = (const int**)_src;
-        const int *S, *S2;
-
-        float32x4_t d4 = vdupq_n_f32(delta);
-
-        if( symmetrical )
-        {
-            if( _ksize == 1 )
-                return 0;
-
-
-            float32x2_t k32;
-            k32 = vdup_n_f32(0);
-            k32 = vld1_lane_f32(ky, k32, 0);
-            k32 = vld1_lane_f32(ky + 1, k32, 1);
-
-            for( ; i <= width - 8; i += 8 )
-            {
-                float32x4_t accl, acch;
-                float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;
-
-                S = src[0] + i;
-
-                f0l = vcvtq_f32_s32( vld1q_s32(S) );
-                f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-
-                S = src[1] + i;
-                S2 = src[-1] + i;
-
-                f1l = vcvtq_f32_s32( vld1q_s32(S) );
-                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
-                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-
-                accl = acch = d4;
-                accl = vmlaq_lane_f32(accl, f0l, k32, 0);
-                acch = vmlaq_lane_f32(acch, f0h, k32, 0);
-                accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
-                acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);
-
-                for( k = 2; k <= ksize2; k++ )
-                {
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-
-                    float32x4_t f3l, f3h, f4l, f4h;
-                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
-                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
-                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-
-                    accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
-                    acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);
-                }
-
-                int32x4_t s32l, s32h;
-                s32l = vcvtq_s32_f32(accl);
-                s32h = vcvtq_s32_f32(acch);
-
-                int16x4_t s16l, s16h;
-                s16l = vqmovn_s32(s32l);
-                s16h = vqmovn_s32(s32h);
-
-                uint8x8_t u8;
-                u8 =  vqmovun_s16(vcombine_s16(s16l, s16h));
-
-                vst1_u8((uint8_t *)(dst + i), u8);
-            }
-        }
-        else
-        {
-            float32x2_t k32;
-            k32 = vdup_n_f32(0);
-            k32 = vld1_lane_f32(ky + 1, k32, 1);
-
-            for( ; i <= width - 8; i += 8 )
-            {
-                float32x4_t accl, acch;
-                float32x4_t f1l, f1h, f2l, f2h;
-
-                S = src[1] + i;
-                S2 = src[-1] + i;
-
-                f1l = vcvtq_f32_s32( vld1q_s32(S) );
-                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
-                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-
-                accl = acch = d4;
-                accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
-                acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);
-
-                for( k = 2; k <= ksize2; k++ )
-                {
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-
-                    float32x4_t f3l, f3h, f4l, f4h;
-                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
-                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
-                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-
-                    accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
-                    acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);
-                }
-
-                int32x4_t s32l, s32h;
-                s32l = vcvtq_s32_f32(accl);
-                s32h = vcvtq_s32_f32(acch);
-
-                int16x4_t s16l, s16h;
-                s16l = vqmovn_s32(s32l);
-                s16h = vqmovn_s32(s32h);
-
-                uint8x8_t u8;
-                u8 =  vqmovun_s16(vcombine_s16(s16l, s16h));
-
-                vst1_u8((uint8_t *)(dst + i), u8);
-            }
-        }
-
-        return i;
-    }
-
-    int symmetryType;
-    float delta;
-    Mat kernel;
-};
-
-
-struct SymmColumnSmallVec_32s16s
-{
-    SymmColumnSmallVec_32s16s() { symmetryType=0; }
-    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
-    {
-        symmetryType = _symmetryType;
-        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-        delta = (float)(_delta/(1 << _bits));
-        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-    }
-
-    int operator()(const uchar** _src, uchar* _dst, int width) const
-    {
-         if( !checkHardwareSupport(CV_CPU_NEON) )
-             return 0;
-
-        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
-        const float* ky = kernel.ptr<float>() + ksize2;
-        int i = 0;
-        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-        const int** src = (const int**)_src;
-        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
-        short* dst = (short*)_dst;
-        float32x4_t df4 = vdupq_n_f32(delta);
-        int32x4_t d4 = vcvtq_s32_f32(df4);
-
-        if( symmetrical )
-        {
-            if( ky[0] == 2 && ky[1] == 1 )
-            {
-                for( ; i <= width - 4; i += 4 )
-                {
-                    int32x4_t x0, x1, x2;
-                    x0 = vld1q_s32((int32_t const *)(S0 + i));
-                    x1 = vld1q_s32((int32_t const *)(S1 + i));
-                    x2 = vld1q_s32((int32_t const *)(S2 + i));
-
-                    int32x4_t y0, y1, y2, y3;
-                    y0 = vaddq_s32(x0, x2);
-                    y1 = vqshlq_n_s32(x1, 1);
-                    y2 = vaddq_s32(y0, y1);
-                    y3 = vaddq_s32(y2, d4);
-
-                    int16x4_t t;
-                    t = vqmovn_s32(y3);
-
-                    vst1_s16((int16_t *)(dst + i), t);
-                }
-            }
-            else if( ky[0] == -2 && ky[1] == 1 )
-            {
-                for( ; i <= width - 4; i += 4 )
-                {
-                    int32x4_t x0, x1, x2;
-                    x0 = vld1q_s32((int32_t const *)(S0 + i));
-                    x1 = vld1q_s32((int32_t const *)(S1 + i));
-                    x2 = vld1q_s32((int32_t const *)(S2 + i));
-
-                    int32x4_t y0, y1, y2, y3;
-                    y0 = vaddq_s32(x0, x2);
-                    y1 = vqshlq_n_s32(x1, 1);
-                    y2 = vsubq_s32(y0, y1);
-                    y3 = vaddq_s32(y2, d4);
-
-                    int16x4_t t;
-                    t = vqmovn_s32(y3);
-
-                    vst1_s16((int16_t *)(dst + i), t);
-                }
-            }
-            else if( ky[0] == 10 && ky[1] == 3 )
-            {
-                for( ; i <= width - 4; i += 4 )
-                {
-                    int32x4_t x0, x1, x2, x3;
-                    x0 = vld1q_s32((int32_t const *)(S0 + i));
-                    x1 = vld1q_s32((int32_t const *)(S1 + i));
-                    x2 = vld1q_s32((int32_t const *)(S2 + i));
-
-                    x3 = vaddq_s32(x0, x2);
-
-                    int32x4_t y0;
-                    y0 = vmlaq_n_s32(d4, x1, 10);
-                    y0 = vmlaq_n_s32(y0, x3, 3);
-
-                    int16x4_t t;
-                    t = vqmovn_s32(y0);
-
-                    vst1_s16((int16_t *)(dst + i), t);
-                }
-            }
-            else
-            {
-                float32x2_t k32 = vdup_n_f32(0);
-                k32 = vld1_lane_f32(ky, k32, 0);
-                k32 = vld1_lane_f32(ky + 1, k32, 1);
-
-                for( ; i <= width - 4; i += 4 )
-                {
-                    int32x4_t x0, x1, x2, x3, x4;
-                    x0 = vld1q_s32((int32_t const *)(S0 + i));
-                    x1 = vld1q_s32((int32_t const *)(S1 + i));
-                    x2 = vld1q_s32((int32_t const *)(S2 + i));
-
-                    x3 = vaddq_s32(x0, x2);
-
-                    float32x4_t s0, s1, s2;
-                    s0 = vcvtq_f32_s32(x1);
-                    s1 = vcvtq_f32_s32(x3);
-                    s2 = vmlaq_lane_f32(df4, s0, k32, 0);
-                    s2 = vmlaq_lane_f32(s2, s1, k32, 1);
-
-                    x4 = vcvtq_s32_f32(s2);
-
-                    int16x4_t x5;
-                    x5 = vqmovn_s32(x4);
-
-                    vst1_s16((int16_t *)(dst + i), x5);
-                }
-            }
-        }
-        else
-        {
-            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
-            {
-                if( ky[1] < 0 )
-                    std::swap(S0, S2);
-                for( ; i <= width - 4; i += 4 )
-                {
-                    int32x4_t x0, x1;
-                    x0 = vld1q_s32((int32_t const *)(S0 + i));
-                    x1 = vld1q_s32((int32_t const *)(S2 + i));
-
-                    int32x4_t y0, y1;
-                    y0 = vsubq_s32(x1, x0);
-                    y1 = vqaddq_s32(y0, d4);
-
-                    int16x4_t t;
-                    t = vqmovn_s32(y1);
-
-                    vst1_s16((int16_t *)(dst + i), t);
-                }
-            }
-            else
-            {
-                float32x2_t k32 = vdup_n_f32(0);
-                k32 = vld1_lane_f32(ky + 1, k32, 1);
-
-                for( ; i <= width - 4; i += 4 )
-                {
-                    int32x4_t x0, x1, x2, x3;
-                    x0 = vld1q_s32((int32_t const *)(S0 + i));
-                    x1 = vld1q_s32((int32_t const *)(S2 + i));
-
-                    x2 = vsubq_s32(x1, x0);
-
-                    float32x4_t s0, s1;
-                    s0 = vcvtq_f32_s32(x2);
-                    s1 = vmlaq_lane_f32(df4, s0, k32, 1);
-
-                    x3 = vcvtq_s32_f32(s1);
-
-                    int16x4_t x4;
-                    x4 = vqmovn_s32(x3);
-
-                    vst1_s16((int16_t *)(dst + i), x4);
-                }
-            }
-        }
-
-        return i;
-    }
-
-    int symmetryType;
-    float delta;
-    Mat kernel;
-};
-
-
-struct SymmColumnVec_32f16s
-{
-    SymmColumnVec_32f16s() { symmetryType=0; }
-    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
-    {
-        symmetryType = _symmetryType;
-        kernel = _kernel;
-        delta = (float)_delta;
-        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-         neon_supported = checkHardwareSupport(CV_CPU_NEON);
-    }
-
-    int operator()(const uchar** _src, uchar* _dst, int width) const
-    {
-         if( !neon_supported )
-             return 0;
-
-        int _ksize = kernel.rows + kernel.cols - 1;
-        int ksize2 = _ksize / 2;
-        const float* ky = kernel.ptr<float>() + ksize2;
-        int i = 0, k;
-        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-        const float** src = (const float**)_src;
-        const float *S, *S2;
-        short* dst = (short*)_dst;
-
-        float32x4_t d4 = vdupq_n_f32(delta);
-
-        if( symmetrical )
-        {
-            if( _ksize == 1 )
-                return 0;
-
-
-            float32x2_t k32;
-            k32 = vdup_n_f32(0);
-            k32 = vld1_lane_f32(ky, k32, 0);
-            k32 = vld1_lane_f32(ky + 1, k32, 1);
-
-            for( ; i <= width - 8; i += 8 )
-            {
-                float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
-                float32x4_t accl, acch;
-
-                S = src[0] + i;
-
-                x0l = vld1q_f32(S);
-                x0h = vld1q_f32(S + 4);
-
-                S = src[1] + i;
-                S2 = src[-1] + i;
-
-                x1l = vld1q_f32(S);
-                x1h = vld1q_f32(S + 4);
-                x2l = vld1q_f32(S2);
-                x2h = vld1q_f32(S2 + 4);
-
-                accl = acch = d4;
-                accl = vmlaq_lane_f32(accl, x0l, k32, 0);
-                acch = vmlaq_lane_f32(acch, x0h, k32, 0);
-                accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
-                acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
-
-                for( k = 2; k <= ksize2; k++ )
-                {
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-
-                    float32x4_t x3l, x3h, x4l, x4h;
-                    x3l = vld1q_f32(S);
-                    x3h = vld1q_f32(S + 4);
-                    x4l = vld1q_f32(S2);
-                    x4h = vld1q_f32(S2 + 4);
-
-                    accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
-                    acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
-                }
-
-                int32x4_t s32l, s32h;
-                s32l = vcvtq_s32_f32(accl);
-                s32h = vcvtq_s32_f32(acch);
-
-                int16x4_t s16l, s16h;
-                s16l = vqmovn_s32(s32l);
-                s16h = vqmovn_s32(s32h);
-
-                vst1_s16((int16_t *)(dst + i), s16l);
-                vst1_s16((int16_t *)(dst + i + 4), s16h);
-            }
-        }
-        else
-        {
-            float32x2_t k32;
-            k32 = vdup_n_f32(0);
-            k32 = vld1_lane_f32(ky + 1, k32, 1);
-
-            for( ; i <= width - 8; i += 8 )
-            {
-                float32x4_t x1l, x1h, x2l, x2h;
-                float32x4_t accl, acch;
-
-                S = src[1] + i;
-                S2 = src[-1] + i;
-
-                x1l = vld1q_f32(S);
-                x1h = vld1q_f32(S + 4);
-                x2l = vld1q_f32(S2);
-                x2h = vld1q_f32(S2 + 4);
-
-                accl = acch = d4;
-                accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
-                acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);
-
-                for( k = 2; k <= ksize2; k++ )
-                {
-                    S = src[k] + i;
-                    S2 = src[-k] + i;
-
-                    float32x4_t x3l, x3h, x4l, x4h;
-                    x3l = vld1q_f32(S);
-                    x3h = vld1q_f32(S + 4);
-                    x4l = vld1q_f32(S2);
-                    x4h = vld1q_f32(S2 + 4);
-
-                    accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
-                    acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
-                }
-
-                int32x4_t s32l, s32h;
-                s32l = vcvtq_s32_f32(accl);
-                s32h = vcvtq_s32_f32(acch);
-
-                int16x4_t s16l, s16h;
-                s16l = vqmovn_s32(s32l);
-                s16h = vqmovn_s32(s32h);
-
-                vst1_s16((int16_t *)(dst + i), s16l);
-                vst1_s16((int16_t *)(dst + i + 4), s16h);
-            }
-        }
-
-        return i;
-    }
-
-    int symmetryType;
-    float delta;
-    Mat kernel;
-    bool neon_supported;
-};
-
-
-struct SymmRowSmallVec_32f
-{
-    SymmRowSmallVec_32f() {}
-    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
-    {
-        kernel = _kernel;
-        symmetryType = _symmetryType;
-    }
-
-    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
-    {
-         if( !checkHardwareSupport(CV_CPU_NEON) )
-             return 0;
-
-        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
-        float* dst = (float*)_dst;
-        const float* src = (const float*)_src + (_ksize/2)*cn;
-        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-        const float* kx = kernel.ptr<float>() + _ksize/2;
-        width *= cn;
-
-        if( symmetrical )
-        {
-            if( _ksize == 1 )
-                return 0;
-            if( _ksize == 3 )
-            {
-                if( kx[0] == 2 && kx[1] == 1 )
-                    return 0;
-                else if( kx[0] == -2 && kx[1] == 1 )
-                    return 0;
-                else
-                {
-                    return 0;
-                }
-            }
-            else if( _ksize == 5 )
-            {
-                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                    return 0;
-                else
-                {
-                    float32x2_t k0, k1;
-                    k0 = k1 = vdup_n_f32(0);
-                    k0 = vld1_lane_f32(kx + 0, k0, 0);
-                    k0 = vld1_lane_f32(kx + 1, k0, 1);
-                    k1 = vld1_lane_f32(kx + 2, k1, 0);
-
-                    for( ; i <= width - 4; i += 4, src += 4 )
-                    {
-                        float32x4_t x0, x1, x2, x3, x4;
-                        x0 = vld1q_f32(src);
-                        x1 = vld1q_f32(src - cn);
-                        x2 = vld1q_f32(src + cn);
-                        x3 = vld1q_f32(src - cn*2);
-                        x4 = vld1q_f32(src + cn*2);
-
-                        float32x4_t y0;
-                        y0 = vmulq_lane_f32(x0, k0, 0);
-                        y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
-                        y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);
-
-                        vst1q_f32(dst + i, y0);
-                    }
-                }
-            }
-        }
-        else
-        {
-            if( _ksize == 3 )
-            {
-                if( kx[0] == 0 && kx[1] == 1 )
-                    return 0;
-                else
-                {
-                    return 0;
-                }
-            }
-            else if( _ksize == 5 )
-            {
-                float32x2_t k;
-                k = vdup_n_f32(0);
-                k = vld1_lane_f32(kx + 1, k, 0);
-                k = vld1_lane_f32(kx + 2, k, 1);
-
-                for( ; i <= width - 4; i += 4, src += 4 )
-                {
-                    float32x4_t x0, x1, x2, x3;
-                    x0 = vld1q_f32(src - cn);
-                    x1 = vld1q_f32(src + cn);
-                    x2 = vld1q_f32(src - cn*2);
-                    x3 = vld1q_f32(src + cn*2);
-
-                    float32x4_t y0;
-                    y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
-                    y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);
-
-                    vst1q_f32(dst + i, y0);
-                }
-            }
-        }
-
-        return i;
-    }
-
-    Mat kernel;
-    int symmetryType;
-};
-
-
-typedef RowNoVec RowVec_8u32s;
-typedef RowNoVec RowVec_16s32f;
-typedef RowNoVec RowVec_32f;
-typedef ColumnNoVec SymmColumnVec_32f;
-typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
-typedef FilterNoVec FilterVec_8u;
-typedef FilterNoVec FilterVec_8u16s;
-typedef FilterNoVec FilterVec_32f;
-
-
 #else
 
 typedef RowNoVec RowVec_8u32s;
@@ -4655,15 +3601,9 @@ static bool dftFilter2D(int stype, int dtype, int kernel_type,
                         double delta, int borderType)
 {
     {
-#if CV_SSE2
         int sdepth = CV_MAT_DEPTH(stype);
         int ddepth = CV_MAT_DEPTH(dtype);
-        int dft_filter_size = ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3) ? 130 : 50;
-#else
-        CV_UNUSED(stype);
-        CV_UNUSED(dtype);
-        int dft_filter_size = 50;
-#endif
+        int dft_filter_size = checkHardwareSupport(CV_CPU_SSE3) && ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) ? 130 : 50;
         if (kernel_width * kernel_height < dft_filter_size)
             return false;
     }
diff --git a/modules/imgproc/src/median_blur.cpp b/modules/imgproc/src/median_blur.cpp
index 9f5a9ba7d5..567d197695 100644
--- a/modules/imgproc/src/median_blur.cpp
+++ b/modules/imgproc/src/median_blur.cpp
@@ -282,10 +282,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                         for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
                         {
 #if CV_SIMD256
-                            v_fine += v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
+                            v_fine = v_fine + v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
 #elif CV_SIMD128
-                            v_finel += v_load(px + 16 * MIN(luc[c][k], n - 1)    ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
-                            v_fineh += v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8);
+                            v_finel = v_finel + v_load(px + 16 * MIN(luc[c][k], n - 1)    ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
+                            v_fineh = v_fineh + v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8);
 #else
                             for (int ind = 0; ind < 16; ++ind)
                                 H[c].fine[k][ind] += px[16 * MIN(luc[c][k], n - 1) + ind] - px[16 * MAX(luc[c][k] - 2 * r - 1, 0) + ind];
@@ -321,10 +321,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     CV_Assert( b < 16 );
                 }
             }
-#if CV_SIMD
-            vx_cleanup();
-#endif
         }
+#if CV_SIMD
+        vx_cleanup();
+#endif
     }
 
 #undef HOP
diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp
index 10b77ca790..155c62f342 100644
--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -2200,4 +2200,15 @@ TEST(Imgproc_Filter2D, dftFilter2d_regression_10683)
 
     EXPECT_LE(cvtest::norm(dst, expected, NORM_INF), 2);
 }
+
+TEST(Imgproc_MedianBlur, hires_regression_13409)
+{
+    Mat src(2048, 2048, CV_8UC1), dst_hires, dst_ref;
+    randu(src, 0, 256);
+
+    medianBlur(src, dst_hires, 9);
+    medianBlur(src(Rect(512, 512, 1024, 1024)), dst_ref, 9);
+
+    ASSERT_EQ(0.0, cvtest::norm(dst_hires(Rect(516, 516, 1016, 1016)), dst_ref(Rect(4, 4, 1016, 1016)), NORM_INF));
+}
 }} // namespace
diff --git a/modules/java/jar/CMakeLists.txt b/modules/java/jar/CMakeLists.txt
index d34a8e26bf..0e16e3b5eb 100644
--- a/modules/java/jar/CMakeLists.txt
+++ b/modules/java/jar/CMakeLists.txt
@@ -18,6 +18,13 @@ set(depends gen_opencv_java_source "${OPENCV_DEPHELPER}/gen_opencv_java_source")
 ocv_copyfiles_add_target(${the_module}_jar_source_copy JAVA_SRC_COPY "Copy Java(JAR) source files" ${depends})
 set(depends ${the_module}_jar_source_copy "${OPENCV_DEPHELPER}/${the_module}_jar_source_copy")
 
+if(OPENCV_JAVA_SOURCE_VERSION)
+  set(OPENCV_ANT_JAVAC_EXTRA_ATTRS "${OPENCV_ANT_JAVAC_EXTRA_ATTRS} source=\"${OPENCV_JAVA_SOURCE_VERSION}\"")
+endif()
+if(OPENCV_JAVA_TARGET_VERSION)
+  set(OPENCV_ANT_JAVAC_EXTRA_ATTRS "${OPENCV_ANT_JAVAC_EXTRA_ATTRS} target=\"${OPENCV_JAVA_TARGET_VERSION}\"")
+endif()
+
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/build.xml.in" "${OPENCV_JAVA_DIR}/build.xml" @ONLY)
 list(APPEND depends "${OPENCV_JAVA_DIR}/build.xml")
 
diff --git a/modules/java/jar/build.xml.in b/modules/java/jar/build.xml.in
index d4f01931b2..41b55b76fc 100644
--- a/modules/java/jar/build.xml.in
+++ b/modules/java/jar/build.xml.in
@@ -11,7 +11,7 @@
     <!-- This is to make a jar with a source attachment, for e.g. easy -->
     <!-- navigation in Eclipse. See this question: -->
     <!-- http://stackoverflow.com/questions/3584968/ant-how-to-compile-jar-that-includes-source-attachment -->
-    <javac sourcepath="" srcdir="java" destdir="build/classes" debug="on" includeantruntime="false" >
+    <javac sourcepath="" srcdir="java" destdir="build/classes" debug="on" includeantruntime="false" @OPENCV_ANT_JAVAC_EXTRA_ATTRS@ >
       <include name="**/*.java"/>
       <compilerarg line="-encoding utf-8"/>
     </javac>
diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp
index 9a719e44f2..ec5d4007a3 100644
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@@ -782,6 +782,9 @@ bool QRCodeDetector::detect(InputArray in, OutputArray points) const
     Mat inarr = in.getMat();
     CV_Assert(!inarr.empty());
     CV_Assert(inarr.depth() == CV_8U);
+    if (inarr.cols <= 20 || inarr.rows <= 20)
+        return false;  // image data is not enough for providing reliable results
+
     int incn = inarr.channels();
     if( incn == 3 || incn == 4 )
     {
@@ -1054,6 +1057,8 @@ std::string QRCodeDetector::decode(InputArray in, InputArray points,
     Mat inarr = in.getMat();
     CV_Assert(!inarr.empty());
     CV_Assert(inarr.depth() == CV_8U);
+    if (inarr.cols <= 20 || inarr.rows <= 20)
+        return cv::String();  // image data is not enough for providing reliable results
 
     int incn = inarr.channels();
     if( incn == 3 || incn == 4 )
@@ -1092,6 +1097,8 @@ std::string QRCodeDetector::detectAndDecode(InputArray in,
     Mat inarr = in.getMat();
     CV_Assert(!inarr.empty());
     CV_Assert(inarr.depth() == CV_8U);
+    if (inarr.cols <= 20 || inarr.rows <= 20)
+        return cv::String();  // image data is not enough for providing reliable results
 
     int incn = inarr.channels();
     if( incn == 3 || incn == 4 )
diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp
index 930abb1bbf..f34564d41d 100644
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@@ -237,6 +237,11 @@ make & enjoy!
 #include <sys/videoio.h>
 #endif
 
+// https://github.com/opencv/opencv/issues/13335
+#ifndef V4L2_CID_ISO_SENSITIVITY
+#define V4L2_CID_ISO_SENSITIVITY (V4L2_CID_CAMERA_CLASS_BASE+23)
+#endif
+
 /* Defaults - If your board can do better, set it here.  Set for the most common type inputs. */
 #define DEFAULT_V4L_WIDTH  640
 #define DEFAULT_V4L_HEIGHT 480
@@ -1757,7 +1762,7 @@ bool CvCaptureCAM_V4L::icvSetFrameSize(int _width, int _height)
     if (_width > 0)
         width_set = _width;
 
-    if (height > 0)
+    if (_height > 0)
         height_set = _height;
 
     /* two subsequent calls setting WIDTH and HEIGHT will change
diff --git a/modules/videoio/test/test_camera.cpp b/modules/videoio/test/test_camera.cpp
index eb6fb60c52..ac115b1d60 100644
--- a/modules/videoio/test/test_camera.cpp
+++ b/modules/videoio/test/test_camera.cpp
@@ -11,7 +11,7 @@
 
 namespace opencv_test { namespace {
 
-static void test_readFrames(/*const*/ VideoCapture& capture, const int N = 100)
+static void test_readFrames(/*const*/ VideoCapture& capture, const int N = 100, Mat* lastFrame = NULL)
 {
     Mat frame;
     int64 time0 = cv::getTickCount();
@@ -26,6 +26,7 @@ static void test_readFrames(/*const*/ VideoCapture& capture, const int N = 100)
     }
     int64 time1 = cv::getTickCount();
     printf("Processed %d frames on %.2f FPS\n", N, (N * cv::getTickFrequency()) / (time1 - time0 + 1));
+    if (lastFrame) *lastFrame = frame.clone();
 }
 
 TEST(DISABLED_VideoIO_Camera, basic)
@@ -69,4 +70,39 @@ TEST(DISABLED_VideoIO_Camera, dshow_avermedia_capture)
     capture.release();
 }
 
+TEST(DISABLED_VideoIO_Camera, validate_V4L2_FrameSize)
+{
+    VideoCapture capture(CAP_V4L2);
+    ASSERT_TRUE(capture.isOpened());
+    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
+    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+    std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+    int fourcc = (int)capture.get(CAP_PROP_FOURCC);
+    std::cout << "FOURCC code: " << cv::format("0x%8x", fourcc) << std::endl;
+    test_readFrames(capture, 30);
+
+    EXPECT_TRUE(capture.set(CAP_PROP_FRAME_WIDTH, 640));
+    EXPECT_TRUE(capture.set(CAP_PROP_FRAME_HEIGHT, 480));
+    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+    std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+    Mat frame640x480;
+    test_readFrames(capture, 30, &frame640x480);
+    EXPECT_EQ(640, frame640x480.cols);
+    EXPECT_EQ(480, frame640x480.rows);
+
+    EXPECT_TRUE(capture.set(CAP_PROP_FRAME_WIDTH, 1280));
+    EXPECT_TRUE(capture.set(CAP_PROP_FRAME_HEIGHT, 720));
+    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+    std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+    Mat frame1280x720;
+    test_readFrames(capture, 30, &frame1280x720);
+    EXPECT_EQ(1280, frame1280x720.cols);
+    EXPECT_EQ(720, frame1280x720.rows);
+
+    capture.release();
+}
+
 }} // namespace
diff --git a/platforms/android/service/engine/AndroidManifest.xml b/platforms/android/service/engine/AndroidManifest.xml
index 04f9ca568a..660152ed29 100644
--- a/platforms/android/service/engine/AndroidManifest.xml
+++ b/platforms/android/service/engine/AndroidManifest.xml
@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
     package="org.opencv.engine"
-    android:versionCode="344@ANDROID_PLATFORM_ID@"
-    android:versionName="3.44">
+    android:versionCode="345@ANDROID_PLATFORM_ID@"
+    android:versionName="3.45">
 
     <uses-sdk android:minSdkVersion="@ANDROID_NATIVE_API_LEVEL@" android:targetSdkVersion="22"/>
     <uses-feature android:name="android.hardware.touchscreen" android:required="false"/>
diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java b/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
index 1b810029d4..850bad5349 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
+++ b/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
@@ -137,7 +137,7 @@ public class OpenCVEngineService extends Service {
 
             @Override
             public int getEngineVersion() throws RemoteException {
-                int version = 3440;
+                int version = 3450;
                 try {
                     version = getPackageManager().getPackageInfo(getPackageName(), 0).versionCode;
                 } catch (NameNotFoundException e) {
diff --git a/platforms/android/service/readme.txt b/platforms/android/service/readme.txt
index c9c2c66096..b77adbca79 100644
--- a/platforms/android/service/readme.txt
+++ b/platforms/android/service/readme.txt
@@ -12,7 +12,7 @@ manually using adb tool:
 
     adb install <path-to-OpenCV-sdk>/apk/OpenCV_<version>_Manager_<app_version>_<platform>.apk
 
-Example: OpenCV_3.4.4-dev_Manager_3.44_armeabi-v7a.apk
+Example: OpenCV_3.4.5-dev_Manager_3.45_armeabi-v7a.apk
 
 Use the list of platforms below to determine proper OpenCV Manager package for your device: