Merge remote-tracking branch 'upstream/3.4' into merge-3.4

2025-01-18 06:03:15 +08:00 · 2020-08-14 17:23:24 +00:00 · 2020-08-14 17:23:24 +00:00 · b45273eccb
commit b45273eccb
parent 949fe93d5a 68f527267b
15 changed files with 1422 additions and 468 deletions
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@ -414,6 +414,29 @@ void Mat::copyTo( OutputArray _dst, InputArray _mask ) const
        copymask(ptrs[0], 0, ptrs[2], 0, ptrs[1], 0, sz, &esz);
 }

+
+// Decides whether assigning scalar `s` to `mat` can be done with a plain memset().
+//
+// mat        - destination matrix; only its depth() and channels() are inspected.
+// s          - scalar being assigned; val[0..channels-1] are the per-channel values.
+// fill_value - receives val[0] saturated to the matrix depth. It is written whenever the
+//              depth is CV_8U or CV_8S, even if the function then returns false.
+//
+// Returns true only for 1-byte depths with 1..4 channels whose scalar components are all equal.
+static bool can_apply_memset(const Mat &mat, const Scalar &s, int &fill_value)
+{
+    // memset() writes single bytes, so only 1-byte depths qualify.
+    switch (mat.depth())
+    {
+    case CV_8U: fill_value = saturate_cast<uchar>( s.val[0] ); break;
+    case CV_8S: fill_value = saturate_cast<schar>( s.val[0] ); break;
+    default: return false;
+    }
+
+    // Check that all channel values are the same. The doubles are compared directly:
+    // reading them through an int64 pointer would violate the strict aliasing rule.
+    // NaN never compares equal, so such scalars simply take the generic path.
+    const double* v = s.val;
+    switch (mat.channels())
+    {
+    case 1: return true;
+    case 2: return (v[0] == v[1]);
+    case 3: return (v[0] == v[1] && v[1] == v[2]);
+    case 4: return (v[0] == v[1] && v[1] == v[2] && v[2] == v[3]);
+    default: return false;
+    }
+}
+
 Mat& Mat::operator = (const Scalar& s)
 {
    CV_INSTRUMENT_REGION();
@ -434,6 +457,14 @@ Mat& Mat::operator = (const Scalar& s)
    }
    else
    {
+        int fill_value = 0;
+        if ( can_apply_memset(*this, s, fill_value) )
+        {
+            for (size_t i = 0; i < it.nplanes; i++, ++it)
+                memset(dptr, fill_value, elsize);
+            return *this;
+        }
+
        if( it.nplanes > 0 )
        {
            double scalar[12];
--- a/modules/core/src/cuda/gpu_mat.cu
+++ b/modules/core/src/cuda/gpu_mat.cu
@ -561,7 +561,7 @@ void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& stream) co
        {convertToNoScale<double, uchar>, convertToNoScale<double, schar>, convertToNoScale<double, ushort>, convertToNoScale<double, short>, convertToNoScale<double, int>, convertToNoScale<double, float>, 0}
    };

-    funcs[sdepth][ddepth](reshape(1), dst.reshape(1), stream);
+    funcs[sdepth][ddepth](src.reshape(1), dst.reshape(1), stream);
 }

 void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& stream) const
@ -591,7 +591,7 @@ void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, doub
        {convertToScale<double, uchar>, convertToScale<double, schar>, convertToScale<double, ushort>, convertToScale<double, short>, convertToScale<double, int>, convertToScale<double, float>, convertToScale<double, double>}
    };

-    funcs[sdepth][ddepth](reshape(1), dst.reshape(1), alpha, beta, stream);
+    funcs[sdepth][ddepth](src.reshape(1), dst.reshape(1), alpha, beta, stream);
 }

 void cv::cuda::convertFp16(InputArray _src, OutputArray _dst, Stream& stream)
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -6464,6 +6464,9 @@ struct Image2D::Impl
            CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");

        cl_context context = (cl_context)Context::getDefault().ptr();
+        if (!context)
+            return false;
+
        // Figure out how many formats are supported by this context.
        cl_uint numFormats = 0;
        cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@ -119,7 +119,6 @@ public:
            CV_CheckEQ(inputs.size(), (size_t)2, "");
            numOutput = inputs[1].back();
            cAxis = inputs[0].size() - 1;
-            CV_CheckEQ(numOutput, inputs[0][cAxis - 1], "");
            int dims = inputs[0].size();
            CV_CheckEQ(inputs[1].size(), (size_t)dims, "");
            CV_CheckGE(dims, 2, "");
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@ -108,6 +108,8 @@ public:
                type = AVE;
            else if (pool == "stochastic")
                type = STOCHASTIC;
+            else if (pool == "sum")
+                type = SUM;
            else
                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");

@ -209,7 +211,7 @@ public:
                return type == MAX || type == AVE;
            }
            else
-                return type != STOCHASTIC;
+                return type != STOCHASTIC && type != SUM;
        }
 #endif
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
@ -304,7 +306,7 @@ public:
                maxPooling(inputs[0], outputs[0], mask);
                break;
            }
-            case AVE:
+            case AVE: case SUM:
                CV_Assert_N(inputs.size() == 1, outputs.size() == 1);
                avePooling(inputs[0], outputs[0]);
                break;
@ -513,7 +515,7 @@ public:
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
-        CV_Assert_N((inputs.size() == 1 && (type == MAX || type == AVE)) || inputs.size() == 2, nodes.size() == inputs.size());
+        CV_Assert_N((inputs.size() == 1 && (type == MAX || type == AVE || type == SUM)) || inputs.size() == 2, nodes.size() == inputs.size());
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;

        ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
@ -528,6 +530,19 @@ public:
                            exclude_pad, rounding_type, pad_type);
            return Ptr<BackendNode>(new InfEngineNgraphNode(ave_pool));
        }
+        else if (type == SUM) {
+            ngraph::Shape inpShape = ieInpNode->get_shape();
+            CV_Assert(inpShape.size() == 2 + kernel_size.size());
+            std::vector<int64_t> axes;
+            for (size_t i = 0; i < kernel_size.size(); i++)
+            {
+                if (inpShape[2 + i] == kernel_size[i])
+                    axes.push_back(2 + i);
+            }
+            auto reduction_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes.size()}, axes);
+            auto reduce_sum = std::make_shared<ngraph::op::v1::ReduceSum>(ieInpNode, reduction_axes, true);
+            return Ptr<BackendNode>(new InfEngineNgraphNode(reduce_sum));
+        }
        else if (type == MAX) {
            auto max_pool = std::make_shared<ngraph::op::v1::MaxPool>(ieInpNode, ngraph::Strides(strides),
                            ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
@ -887,7 +902,7 @@ public:
                            }
                        }
                    }
-                else if (poolingType == AVE)
+                else if (poolingType == AVE || poolingType == SUM)
                {
                    for( ; x0 < x1; ++x0)
                    {
@ -898,7 +913,7 @@ public:
                        xend = min(xend, inp_width);
                        float inv_kernel_area = avePoolPaddedArea ? xdelta * ydelta * ddelta :
                                                ((dend - dstart) * (yend - ystart) * (xend - xstart));
-                        inv_kernel_area = 1.0 / inv_kernel_area;
+                        inv_kernel_area = poolingType == AVE ? 1.0 / inv_kernel_area : 1.0;
 #if CV_SIMD128
                        if( isPool2D && xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_l + kernel_w < inp_width )
                        {
@ -1243,6 +1258,7 @@ private:
        MAX,
        AVE,
        STOCHASTIC,
+        SUM,
        ROI,   // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf
        PSROI  // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf
    };
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@ -262,6 +262,24 @@ public:
    }
 };

+// Pattern registered by the constructor (nodes are added in exactly this order):
+//   "" (inp), "" (shapeSrc), ConstantOfShape(shapeSrc), Constant, Mul(ConstantOfShape, Constant),
+//   Constant (shapeConst), Equal(shapeConst, Mul), Constant, Where(Equal, ConstantOfShape, Constant),
+//   Expand(inp, Where)
+// The matched chain is fused into a single Expand(inp, shapeConst).
+// NOTE(review): the empty op name presumably matches any producer node - confirm in Subgraph.
+class ExpandSubgraph : public Subgraph
+{
+public:
+    ExpandSubgraph()
+    {
+        const int inp = addNodeToMatch("");
+        const int shapeSrc = addNodeToMatch("");
+        const int constOfShape = addNodeToMatch("ConstantOfShape", shapeSrc);
+        const int mulConst = addNodeToMatch("Constant");
+        const int product = addNodeToMatch("Mul", constOfShape, mulConst);
+        const int shapeConst = addNodeToMatch("Constant");
+        const int equal = addNodeToMatch("Equal", shapeConst, product);
+        // Third input of Where; it has to be registered before the Where node that consumes it.
+        const int whereConst = addNodeToMatch("Constant");
+        const int whereNode = addNodeToMatch("Where", equal, constOfShape, whereConst);
+        addNodeToMatch("Expand", inp, whereNode);
+        setFusedNode("Expand", inp, shapeConst);
+    }
+};
+
 class MulCastSubgraph : public Subgraph
 {
 public:
@ -459,6 +477,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
    subgraphs.push_back(makePtr<NormalizeSubgraph3>());
    subgraphs.push_back(makePtr<BatchNormalizationSubgraph1>());
    subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>());
+    subgraphs.push_back(makePtr<ExpandSubgraph>());

    simplifySubgraphs(Ptr<ImportGraphWrapper>(new ONNXGraphWrapper(net)), subgraphs);
 }
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@ -387,26 +387,42 @@ void ONNXImporter::populateNet(Net dstNet)
            layerParams.set("ceil_mode", layerParams.has("pad_mode"));
            layerParams.set("ave_pool_padded_area", framework_name == "pytorch");
        }
-        else if (layer_type == "GlobalAveragePool" || layer_type == "GlobalMaxPool" || layer_type == "ReduceMean")
+        else if (layer_type == "GlobalAveragePool" || layer_type == "GlobalMaxPool" ||
+                layer_type == "ReduceMean" || layer_type == "ReduceSum")
        {
            CV_Assert(node_proto.input_size() == 1);
            layerParams.type = "Pooling";
-            layerParams.set("pool", layer_type == "GlobalMaxPool"? "MAX" : "AVE");
+            String pool;
+            if (layer_type == "GlobalMaxPool")
+                pool = "MAX";
+            else if (layer_type == "ReduceSum")
+                pool = "SUM";
+            else
+                pool = "AVE";
+            layerParams.set("pool", pool);
            layerParams.set("global_pooling", layer_type == "GlobalAveragePool" || layer_type == "GlobalMaxPool");
-
-            if (layer_type == "ReduceMean")
+            if (layer_type == "ReduceMean" || layer_type == "ReduceSum")
            {
-                if (layerParams.get<int>("keepdims") == 0 || !layerParams.has("axes"))
-                    CV_Error(Error::StsNotImplemented, "Unsupported mode of ReduceMean operation.");
+                if (!layerParams.has("axes"))
+                    CV_Error(Error::StsNotImplemented, "Unsupported mode of " + layer_type + " operation.");

                MatShape inpShape = outShapes[node_proto.input(0)];
                DictValue axes = layerParams.get("axes");
+                bool keepdims = layerParams.get<int>("keepdims");
+                // Shape the reduced blob is reshaped to: reduced axes become 1 (keepdims)
+                // or are dropped entirely (keepdims == 0).
+                MatShape targetShape = inpShape;
+                // Mark the reduced axes first. Erasing inside the axes loop would shift the
+                // positions of the axes still pending, so a second erase could remove the
+                // wrong dimension or run past the end of the shape.
+                std::vector<bool> isReduced(inpShape.size(), false);
+                for (int i = 0; i < axes.size(); i++) {
+                    int axis = clamp(axes.get<int>(i), inpShape.size());
+                    CV_Assert(axis >= 0 && axis < (int)inpShape.size());
+                    isReduced[axis] = true;
+                }
+                // Walk from the last axis down so an erase never moves an axis yet to be visited.
+                for (int axis = (int)inpShape.size() - 1; axis >= 0; axis--) {
+                    if (!isReduced[axis])
+                        continue;
+                    if (keepdims) {
+                        targetShape[axis] = 1;
+                    } else {
+                        targetShape.erase(targetShape.begin() + axis);
+                    }
+                }
+
                if (inpShape.size() == 3 && axes.size() <= 2)
                {
-                    int axis = axes.get<int>(0);
+                    int axis = clamp(axes.get<int>(0), inpShape.size());
                    CV_CheckNE(axis, 0, "");
-                    outShapes[layerParams.name] = inpShape;
-                    outShapes[layerParams.name][axis] = 1;

                    LayerParams reshapeLp;
                    reshapeLp.name = layerParams.name + "/reshape";
@ -426,13 +442,12 @@ void ONNXImporter::populateNet(Net dstNet)
                    avgLp.name = layerParams.name + "/avg";
                    avgLp.type = "Pooling";
                    CV_Assert(layer_id.find(avgLp.name) == layer_id.end());
-                    avgLp.set("pool", "ave");
+                    avgLp.set("pool", pool);
                    if (axes.size() == 2)
                    {
-                        CV_CheckEQ(axes.get<int>(0), 1, "Unsupported ReduceMean mode");
-                        CV_CheckEQ(axes.get<int>(1), 2, "Unsupported ReduceMean mode");
+                        CV_CheckEQ(clamp(axes.get<int>(0), inpShape.size()), 1, ("Unsupported " + layer_type  + " mode").c_str());
+                        CV_CheckEQ(clamp(axes.get<int>(1), inpShape.size()), 2, ("Unsupported " + layer_type  + " mode").c_str());
                        avgLp.set("global_pooling", true);
-                        outShapes[layerParams.name][axes.get<int>(1)] = 1;
                    }
                    else
                    {
@ -443,28 +458,33 @@ void ONNXImporter::populateNet(Net dstNet)
                    node_proto.set_input(0, reshapeLp.name);
                    node_proto.set_output(0, avgLp.name);
                    addLayer(dstNet, avgLp, node_proto, layer_id, outShapes);
-
-                    layerParams.type = "Flatten";
-                    layerParams.set("axis", 0);
-                    layerParams.set("end_axis", 1);
-
-                    node_proto.set_input(0, avgLp.name);
-                    node_proto.set_output(0, layerParams.name);
                }
                else
                {
                    if (inpShape.size() != 4 && inpShape.size() != 5)
-                    CV_Error(Error::StsNotImplemented, "Unsupported input shape of reduce_mean operation.");
+                        CV_Error(Error::StsNotImplemented, "Unsupported input shape of " + layer_type + " operation.");

                    CV_Assert(axes.size() <= inpShape.size() - 2);
                    std::vector<int> kernel_size(inpShape.size() - 2, 1);
                    for (int i = 0; i < axes.size(); i++) {
-                        int axis = axes.get<int>(i);
+                        int axis = clamp(axes.get<int>(i), inpShape.size());
                        CV_Assert_N(axis >= 2 + i, axis < inpShape.size());
                        kernel_size[axis - 2] = inpShape[axis];
                    }
-                    layerParams.set("kernel_size", DictValue::arrayInt(&kernel_size[0], kernel_size.size()));
+                    LayerParams poolLp = layerParams;
+                    poolLp.name = layerParams.name + "/avg";
+                    CV_Assert(layer_id.find(poolLp.name) == layer_id.end());
+                    poolLp.set("kernel_size", DictValue::arrayInt(&kernel_size[0], kernel_size.size()));
+
+                    node_proto.set_output(0, poolLp.name);
+                    addLayer(dstNet, poolLp, node_proto, layer_id, outShapes);
                }
+
+                layerParams.type = "Reshape";
+                layerParams.set("dim", DictValue::arrayInt(&targetShape[0], targetShape.size()));
+
+                node_proto.set_input(0, node_proto.output(0));
+                node_proto.set_output(0, layerParams.name);
            }
        }
        else if (layer_type == "Slice")
@ -641,6 +661,17 @@ void ONNXImporter::populateNet(Net dstNet)
                    {
                        layerParams.type = "Scale";
                        layerParams.set("bias_term", true);
+                        int axis = 1;
+                        for (int i = 0; i < graph_proto.initializer_size(); i++)
+                        {
+                            opencv_onnx::TensorProto tensor_proto = graph_proto.initializer(i);
+                            if (tensor_proto.name() == node_proto.input(const_blob_id))
+                            {
+                                axis = inpShape.size() - tensor_proto.dims_size();
+                                break;
+                            }
+                        }
+                        layerParams.set("axis", axis);
                        blob = blob.reshape(1, 1);
                        layerParams.blobs.push_back((isSub ? -1 : 1) * blob);
                    }
@ -911,13 +942,20 @@ void ONNXImporter::populateNet(Net dstNet)
            CV_Assert(node_proto.input_size() == 2);
            layerParams.type = "InnerProduct";
            layerParams.set("bias_term", false);
+            CV_Assert(constBlobs.find(node_proto.input(0)) == constBlobs.end());
+            int firstInpDims = outShapes[node_proto.input(0)].size();
+            int secondInpDims;

            if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
            {
                Mat blob = getBlob(node_proto, constBlobs, 1);
+                secondInpDims = blob.dims;
                layerParams.blobs.push_back(blob.t());
                layerParams.set("num_output", layerParams.blobs[0].size[0]);
+            } else {
+                secondInpDims = outShapes[node_proto.input(1)].size();
            }
+            layerParams.set("axis", firstInpDims - secondInpDims + 1);
        }
        else if (layer_type == "Mul" || layer_type == "Div")
        {
@ -983,15 +1021,10 @@ void ONNXImporter::populateNet(Net dstNet)
            {
                Mat inp0 = getBlob(node_proto, constBlobs, 0);
                Mat inp1 = getBlob(node_proto, constBlobs, 1);
-                if (inp0.size != inp1.size)
+                if (inp0.size != inp1.size && inp1.total() != 1)
                    CV_Error(Error::StsNotImplemented, "Constant multiply with different shapes");

-                Mat out;
-                if (isDiv)
-                    divide(inp0, inp1, out);
-                else
-                    multiply(inp0, inp1, out);
-
+                Mat out = isDiv ? inp0 / inp1 : inp0.mul(inp1);
                out = out.reshape(1, inp0.dims, inp0.size);
                out.dims = inp0.dims;  // to workaround dims == 1
                addConstant(layerParams.name, out, constBlobs, outShapes);
@ -1162,9 +1195,45 @@ void ONNXImporter::populateNet(Net dstNet)
            Mat newShapeMat = getBlob(node_proto, constBlobs, 1);
            MatShape targetShape(newShapeMat.ptr<int>(), newShapeMat.ptr<int>() + newShapeMat.total());

-            shapeIt = outShapes.find(node_proto.input(0));
-            CV_Assert(shapeIt != outShapes.end());
-            MatShape inpShape = shapeIt->second;
+            MatShape inpShape;
+            bool haveVariables = constBlobs.find(node_proto.input(0)) == constBlobs.end();
+            if (haveVariables)
+            {
+                shapeIt = outShapes.find(node_proto.input(0));
+                CV_Assert(shapeIt != outShapes.end());
+                inpShape = shapeIt->second;
+            }
+            else
+            {
+                inpShape = shape(getBlob(node_proto, constBlobs, 0));
+            }
+
+            String srcName = node_proto.input(0);
+            // Unsqueeze and repeat along new axis
+            if (targetShape.size() == inpShape.size() + 1)
+            {
+                for (int i = 0; i < targetShape.size(); i++)
+                {
+                    if (targetShape[i] == -1 && i < inpShape.size())
+                        targetShape[i] = inpShape[i];
+                    else if (i < inpShape.size() && targetShape[i] != inpShape[i])
+                        inpShape.insert(inpShape.begin() + i, 1);
+                }
+                if (haveVariables)
+                {
+                    LayerParams reshapeLp;
+                    reshapeLp.name = layerParams.name + "/reshape";
+                    reshapeLp.type = "Reshape";
+                    CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
+                    reshapeLp.set("dim", DictValue::arrayInt(&inpShape[0], inpShape.size()));
+
+                    opencv_onnx::NodeProto proto;
+                    proto.add_input(node_proto.input(0));
+                    proto.add_output(reshapeLp.name);
+                    addLayer(dstNet, reshapeLp, proto, layer_id, outShapes);
+                    srcName = reshapeLp.name;
+                }
+            }
            CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims");

            std::vector<int> broadcast_axes;
@ -1179,6 +1248,19 @@ void ONNXImporter::populateNet(Net dstNet)
                }
            }

+            if (!haveVariables)
+            {
+                if (broadcast_axes.size() != 1)
+                    CV_Error(Error::StsNotImplemented, "Expand op doesn't support multiple axes for constant input");
+
+                Mat input = getBlob(node_proto, constBlobs, 0);
+                input = input.reshape(0, total(inpShape, 0, broadcast_axes[0]));
+                Mat output = cv::repeat(input, 1, targetShape[broadcast_axes[0]]);
+                output = output.reshape(0, targetShape);
+                addConstant(layerParams.name, output, constBlobs, outShapes);
+                continue;
+            }
+
            if (broadcast_axes.size() == 2 &&
                broadcast_axes[0] == broadcast_axes[1] - 1 && broadcast_axes[1] == inpShape.size() - 1)
            {
@ -1213,6 +1295,7 @@ void ONNXImporter::populateNet(Net dstNet)
                    CV_Assert(layer_id.find(copyLP.name) == layer_id.end());
                    input_names.push_back(copyLP.name);

+                    node_proto.set_input(0, srcName);
                    node_proto.set_output(0, copyLP.name);
                    addLayer(dstNet, copyLP, node_proto, layer_id, outShapes);
                }
@ -1223,6 +1306,7 @@ void ONNXImporter::populateNet(Net dstNet)
                }
                layerParams.set("axis", broadcast_axes[0]);
                layerParams.type = "Concat";
+                node_proto.set_output(0, layerParams.name);
            }
            else
                CV_Error(Error::StsNotImplemented, "Unsupported Expand op");
@ -1395,6 +1479,7 @@ void ONNXImporter::populateNet(Net dstNet)

                    inpShape.erase(inpShape.begin() + axis);
                    layerParams.type = "Reshape";
+                    layerParams.set("axis", 0);
                    layerParams.set("dim", DictValue::arrayInt(&inpShape[0], inpShape.size()));
                    node_proto.set_input(0, sliceLp.name);
                }
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@ -2067,7 +2067,7 @@ void TFImporter::populateNet(Net dstNet)
            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
            connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
        }
-        else if (type == "Mean")
+        else if (type == "Mean" || type == "Sum")
        {
            // Computes the mean of elements across dimensions of a tensor.
            // If keepdims is false (default) reduces input_tensor along the dimensions given in axis,
@ -2116,7 +2116,7 @@ void TFImporter::populateNet(Net dstNet)
                LayerParams avgLp;
                std::string avgName = name + "/avg";
                CV_Assert(layer_id.find(avgName) == layer_id.end());
-                avgLp.set("pool", "ave");
+                avgLp.set("pool", type == "Mean" ? "ave" : "sum");
                // pooling kernel H x 1
                avgLp.set("global_pooling_h", true);
                avgLp.set("kernel_w", 1);
@ -2153,11 +2153,44 @@ void TFImporter::populateNet(Net dstNet)
                layer_id[name] = id;
                connect(layer_id, dstNet, Pin(avgName), id, 0);
                connect(layer_id, dstNet, Pin(layerShapeName), id, 1);
+            } else if (indices.total() == 1) {
+                int axis = toNCHW(indices.at<int>(0));
+                if (axis == 2 || axis == 3)
+                {
+                    layerParams.set("pool", type == "Mean" ? "ave" : "sum");
+                    layerParams.set(axis == 2 ? "kernel_w" : "kernel_h", 1);
+                    layerParams.set(axis == 2 ? "global_pooling_h" : "global_pooling_w", true);
+                    int id = dstNet.addLayer(name, "Pooling", layerParams);
+                    layer_id[name] = id;
+                    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+
+                    if (!keepDims)
+                    {
+                        // To keep correct order after squeeze dims we first need to change layout from NCHW to NHWC
+                        LayerParams permLP;
+                        int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
+                        permLP.set("order", DictValue::arrayInt<int*>(order, 4));
+                        std::string permName = name + "/nchw";
+                        CV_Assert(layer_id.find(permName) == layer_id.end());
+                        int permId = dstNet.addLayer(permName, "Permute", permLP);
+                        layer_id[permName] = permId;
+                        connect(layer_id, dstNet, Pin(name), permId, 0);
+
+                        LayerParams squeezeLp;
+                        std::string squeezeName = name + "/squeeze";
+                        CV_Assert(layer_id.find(squeezeName) == layer_id.end());
+                        squeezeLp.set("axis", indices.at<int>(0));
+                        squeezeLp.set("end_axis", indices.at<int>(0) + 1);
+                        int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp);
+                        layer_id[squeezeName] = squeezeId;
+                        connect(layer_id, dstNet, Pin(permName), squeezeId, 0);
+                    }
+                }
            } else {
                if (indices.total() != 2 || indices.at<int>(0) != 1 || indices.at<int>(1) != 2)
-                    CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean operation.");
+                    CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean or reduce_sum operation.");

-                layerParams.set("pool", "ave");
+                layerParams.set("pool", type == "Mean" ? "ave" : "sum");
                layerParams.set("global_pooling", true);
                int id = dstNet.addLayer(name, "Pooling", layerParams);
                layer_id[name] = id;
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@ -786,6 +786,8 @@ TEST_P(Test_Darknet_layers, connected)

 TEST_P(Test_Darknet_layers, relu)
 {
+     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
    testDarknetLayer("relu");
 }

--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@ -2098,4 +2098,436 @@ TEST_P(Layer_Test_BatchNorm, fusion)

 INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_BatchNorm, dnnBackendsAndTargets());

+// Shared helpers for the layer-fusion tests below: builders for the default
+// convolution / activation / eltwise layers, the routine that compares a fused
+// run against an unfused reference run, and the parameter generators used by
+// the INSTANTIATE_TEST_CASE_P calls.
+class TestLayerFusion : public DNNTestLayer {
+public:
+    // Fills convParams with a 3x3 "Convolution" layer named "convolution".
+    //   in_channels - number of input channels (second dimension of the weights blob)
+    //   num_filters - value of "num_output" (first dimension of the weights blob)
+    //   bias_term   - when true a second, 1 x num_filters blob is appended as bias
+    // Padding is kernel / 2 in both directions. Weights are random in
+    // +/- 1 / (in_channels * kernel_h * kernel_w), bias is random in [-1, 1].
+    static void makeDefaultTestConvolutionLayer(LayerParams& convParams, int in_channels, int num_filters, bool bias_term)
+    {
+        const int kernel_h = 3, kernel_w = 3;
+        const int pad_h = kernel_h / 2, pad_w = kernel_w / 2;
+
+        convParams.set("kernel_h", kernel_h);
+        convParams.set("kernel_w", kernel_w);
+        convParams.set("pad_h", pad_h);
+        convParams.set("pad_w", pad_w);
+        convParams.set("num_output", num_filters);
+        convParams.set("bias_term", bias_term);
+        convParams.type = "Convolution";
+        convParams.name = "convolution";
+
+        float conv_init_magnitude = 1.0f / in_channels / kernel_h / kernel_w;
+        int weightsShape[] = {num_filters, in_channels, kernel_h, kernel_w};
+        Mat weights(4, &weightsShape[0], CV_32F);
+        randu(weights, -conv_init_magnitude, conv_init_magnitude);
+        convParams.blobs.push_back(weights);
+        if (bias_term)
+        {
+            Mat bias(1, num_filters, CV_32F);
+            randu(bias, -1.0f, 1.0f);
+            convParams.blobs.push_back(bias);
+        }
+    }
+
+    // Fills activationParams with a layer of the given type named "activation".
+    // Only "ReLU", "Power", "ReLU6" and "ChannelsPReLU" receive extra settings;
+    // every other type is created with no parameters. in_channels is used only
+    // for the 1 x in_channels random scales blob of "ChannelsPReLU".
+    static void makeDefaultTestActivationLayer(LayerParams& activationParams, const std::string& type, int in_channels)
+    {
+        activationParams.type = type;
+        activationParams.name = "activation";
+        if (activationParams.type == "ReLU")
+            activationParams.set("negative_slope", 0.1f);
+        else if (activationParams.type == "Power")
+        {
+            activationParams.set("power", 2.0f);
+            activationParams.set("scale", 0.5f);
+            activationParams.set("shift", 0.3f);
+        }
+        else if (activationParams.type == "ReLU6")
+        {
+            activationParams.set("min_value", -1.0f);
+            activationParams.set("max_value", 1.0f);
+        }
+        else if (activationParams.type == "ChannelsPReLU")
+        {
+            Mat scales(1, in_channels, CV_32F);
+            randu(scales, -1.0f, 1.0f);
+            activationParams.blobs.push_back(scales);
+        }
+    }
+
+    // Fills eltwiseParams with an "Eltwise" layer named "eltwise" performing op.
+    // When withCoefficients is true the two-element "coeff" array {0.3, 0.5} is set.
+    static void makeDefaultTestEltwiseLayer(LayerParams& eltwiseParams, const std::string& op, bool withCoefficients)
+    {
+        eltwiseParams.type = "Eltwise";
+        eltwiseParams.name = "eltwise";
+        eltwiseParams.set("operation", op);
+        if (withCoefficients)
+        {
+            float coeff[] = {0.3f, 0.5f};
+            eltwiseParams.set("coeff", DictValue::arrayReal<float*>(coeff, 2));
+        }
+    }
+
+    // Runs net twice on input and compares the outputs:
+    //   1. reference run: fusion disabled, DNN_BACKEND_OPENCV / DNN_TARGET_CPU;
+    //      every reported layer timing must be non-zero;
+    //   2. test run: fusion enabled on backendId / targetId; a layer's timing is
+    //      expected to be exactly 0 if and only if its id is in expectedFusedLayers.
+    // l1 / lInf are the thresholds passed to normAssert; a value of 0.0 selects
+    // the default obtained from DNNTestLayer::getDefaultThresholds.
+    static void test(Mat& input, Net& net, Backend backendId, Target targetId, std::vector<int> expectedFusedLayers = std::vector<int>(), double l1 = 0.0, double lInf = 0.0)
+    {
+        DNNTestLayer::checkBackend(backendId, targetId);
+
+        net.enableFusion(false);
+        net.setPreferableBackend(DNN_BACKEND_OPENCV);
+        net.setPreferableTarget(DNN_TARGET_CPU);
+        net.setInput(input);
+        Mat outputReference = net.forward().clone();
+        std::vector<double> refTimings;
+        net.getPerfProfile(refTimings);
+        // size_t index: avoids comparing a signed int against vector::size()
+        for (size_t i = 0; i < refTimings.size(); i++)
+        {
+            CV_Assert(refTimings[i] != 0.0);
+        }
+
+        net.enableFusion(true);
+        net.setPreferableBackend(backendId);
+        net.setPreferableTarget(targetId);
+        net.setInput(input);
+        Mat outputTest = net.forward().clone();
+        std::vector<double> testTimings;
+        net.getPerfProfile(testTimings);
+        for (size_t i = 0; i < testTimings.size(); i++)
+        {
+            // timing entry i is matched against layer id i + 1
+            const int layerId = static_cast<int>(i) + 1;
+            if(std::find(expectedFusedLayers.begin(), expectedFusedLayers.end(), layerId) != expectedFusedLayers.end())
+            {
+                EXPECT_EQ(testTimings[i], 0.0);
+            }
+            else
+            {
+                EXPECT_NE(testTimings[i], 0.0);
+            }
+        }
+
+        // double ref_max_value, ref_min_value;
+        // minMaxLoc(outputReference.reshape(1, 1), &ref_min_value, &ref_max_value);
+        // std::cout << "reference range: " << ref_min_value << ' ' << ref_max_value << std::endl;
+
+        double default_l1, default_lInf;
+        DNNTestLayer::getDefaultThresholds(backendId, targetId, &default_l1, &default_lInf);
+        if (l1 == 0.0)
+            l1 = default_l1;
+        if (lInf == 0.0)
+            lInf = default_lInf;
+        normAssert(outputReference, outputTest, "", l1, lInf);
+    }
+
+    // Eltwise operations the fusion tests are instantiated with.
+    static testing::internal::ParamGenerator<std::string> eltwiseOpList()
+    {
+        // TODO: automate list generation
+        return Values("sum", "max", "prod", "div");
+    }
+
+    // Activation layer types the fusion tests are instantiated with.
+    static testing::internal::ParamGenerator<std::string> activationLayersList()
+    {
+        // TODO: automate list generation
+        return Values("ReLU", "ReLU6", "ChannelsPReLU", "TanH", "Swish", "Mish", "Sigmoid", "ELU", "AbsVal", "BNLL", "Power");
+    }
+
+    // Backend/target pairs the fusion tests are instantiated with.
+    static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargetsForFusionTests()
+    {
+        return dnnBackendsAndTargets(false, false, true, false, false, false); // OCV OpenCL + OCV CPU
+    }
+};
+
+// Parameter tuple: <convolution has a bias blob, activation type, (backend, target)>.
+typedef TestWithParam<tuple<bool, std::string, tuple<Backend, Target> > > ConvolutionActivationFusion;
+TEST_P(ConvolutionActivationFusion, Accuracy)
+{
+    // Topology under test (a plain chain):
+    //     input -> convolution -> activation -> output
+
+    const bool bias_term = get<0>(GetParam());
+    const std::string actType = get<1>(GetParam());
+    const tuple<Backend, Target> backendAndTarget = get<2>(GetParam());
+    const Backend backendId = get<0>(backendAndTarget);
+    const Target targetId = get<1>(backendAndTarget);
+    const bool ocvBackend = (backendId == DNN_BACKEND_OPENCV);
+    const bool ocvOpenCL = ocvBackend && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16);
+
+    // 4-D CV_32F input filled by randu(input, 1, 2)
+    const int batch_size = 2, in_channels = 16;
+    const int in_height = 16, in_width = 16;
+    int inputShape[] = {batch_size, in_channels, in_height, in_width};
+    Mat input(4, inputShape, CV_32F);
+    randu(input, 1.0f, 2.0f);
+
+    // the convolution keeps the channel count (num_filters == in_channels)
+    LayerParams convParams;
+    TestLayerFusion::makeDefaultTestConvolutionLayer(convParams, in_channels, in_channels, bias_term);
+
+    LayerParams activationParams;
+    TestLayerFusion::makeDefaultTestActivationLayer(activationParams, actType, in_channels);
+
+    if (ocvOpenCL)
+    {
+        // bug: https://github.com/opencv/opencv/issues/17964
+        if (actType == "Power")
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+
+        // bug: https://github.com/opencv/opencv/issues/17953
+        if (actType == "ChannelsPReLU" && !bias_term)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+    }
+
+    Net net;
+    const int convId = net.addLayer(convParams.name, convParams.type, convParams);
+    const int activId = net.addLayerToPrev(activationParams.name, activationParams.type, activationParams);
+    net.connect(0, 0, convId, 0);
+
+    // On the OpenCV CPU target every activation is expected to be fused;
+    // on the OpenCV OpenCL targets only the types listed here.
+    const bool fusedOnCPU = ocvBackend && targetId == DNN_TARGET_CPU;
+    const bool fusedOnOpenCL = ocvOpenCL &&
+        (actType == "ReLU" || actType == "ChannelsPReLU" || actType == "ReLU6" || actType == "TanH" || actType == "Power");
+
+    std::vector<int> expectedFusedLayers;
+    if (fusedOnCPU || fusedOnOpenCL)
+        expectedFusedLayers.push_back(activId);
+
+    TestLayerFusion::test(input, net, backendId, targetId, expectedFusedLayers);
+}
+INSTANTIATE_TEST_CASE_P(TestLayerFusion, ConvolutionActivationFusion, Combine(
+    /* bias */             testing::Bool(),
+    /* activation */       TestLayerFusion::activationLayersList(),
+    /* backend, target */  TestLayerFusion::dnnBackendsAndTargetsForFusionTests()
+));
+
+// Parameter tuple: <convolution has a bias blob, eltwise op, eltwise uses coefficients, (backend, target)>.
+typedef TestWithParam<tuple<bool, std::string, bool, tuple<Backend, Target> > > ConvolutionEltwiseFusion;
+TEST_P(ConvolutionEltwiseFusion, Accuracy)
+{
+    // Topology under test:
+    //     output = eltwise(convolution(input), input)
+    // i.e. the eltwise layer takes the convolution result on input 0 and the
+    // untouched network input on input 1.
+
+    const bool bias_term = get<0>(GetParam());
+    const std::string eltwiseOp = get<1>(GetParam());
+    const bool weightedEltwise = get<2>(GetParam());
+    const tuple<Backend, Target> backendAndTarget = get<3>(GetParam());
+
+    const int batch_size = 2, in_channels = 16;
+    const int in_height = 16, in_width = 16;
+    int inputShape[] = {batch_size, in_channels, in_height, in_width};
+    Mat input(4, inputShape, CV_32F);
+    randu(input, 1.0f, 2.0f); // avoid small values to test eltwise div
+
+    LayerParams convParams;
+    TestLayerFusion::makeDefaultTestConvolutionLayer(convParams, in_channels, in_channels, bias_term);
+
+    // coefficients are only exercised together with "sum"
+    if (weightedEltwise && eltwiseOp != "sum")
+        throw SkipTestException("weighted eltwise not supported");
+
+    LayerParams eltwiseParams;
+    TestLayerFusion::makeDefaultTestEltwiseLayer(eltwiseParams, eltwiseOp, weightedEltwise);
+
+    Net net;
+    const int convId = net.addLayer(convParams.name, convParams.type, convParams);
+    const int eltwiseId = net.addLayer(eltwiseParams.name, eltwiseParams.type, eltwiseParams);
+    net.connect(0, 0, convId, 0);
+    net.connect(convId, 0, eltwiseId, 0);
+    net.connect(0, 0, eltwiseId, 1);
+
+    // no fused layers are expected here: test() is called with its default (empty) list
+    const Backend backendId = get<0>(backendAndTarget);
+    const Target targetId = get<1>(backendAndTarget);
+    TestLayerFusion::test(input, net, backendId, targetId);
+}
+INSTANTIATE_TEST_CASE_P(TestLayerFusion, ConvolutionEltwiseFusion, Combine(
+    /* bias */              testing::Bool(),
+    /* eltwise op */        TestLayerFusion::eltwiseOpList(),
+    /* eltwise weighted */  testing::Bool(),
+    /* backend, target */   TestLayerFusion::dnnBackendsAndTargetsForFusionTests()
+));
+
+// Parameter tuple: <convolution has a bias blob, eltwise op, eltwise weighted flag,
+//                   activation type, (backend, target)>.
+typedef TestWithParam<tuple<bool, std::string, bool, std::string, tuple<Backend, Target> > > ConvolutionEltwiseActivationFusion;
+TEST_P(ConvolutionEltwiseActivationFusion, Accuracy)
+{
+    // Topology under test:
+    //     output = activation(eltwise(convolution(input), input))
+    // The eltwise layer takes the convolution result on input 0 and the
+    // untouched network input on input 1.
+
+    const bool bias_term = get<0>(GetParam());
+    const std::string eltwiseOp = get<1>(GetParam());
+    const bool weightedEltwise = get<2>(GetParam());
+    const std::string actType = get<3>(GetParam());
+    const tuple<Backend, Target> backendAndTarget = get<4>(GetParam());
+    const Backend backendId = get<0>(backendAndTarget);
+    const Target targetId = get<1>(backendAndTarget);
+    const bool ocvBackend = (backendId == DNN_BACKEND_OPENCV);
+    const bool ocvOpenCL = ocvBackend && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16);
+
+    const int batch_size = 2, in_channels = 16;
+    const int in_height = 16, in_width = 16;
+    int inputShape[] = {batch_size, in_channels, in_height, in_width};
+    Mat input(4, inputShape, CV_32F);
+    randu(input, 1.0f, 2.0f); // avoid small values to test eltwise div
+
+    LayerParams convParams;
+    TestLayerFusion::makeDefaultTestConvolutionLayer(convParams, in_channels, in_channels, bias_term);
+
+    if (weightedEltwise && eltwiseOp != "sum")
+        throw SkipTestException("weighted eltwise not supported");
+
+    // NOTE(review): the eltwise layer is always built without coefficients, so the
+    // weighted=true instances of "sum" repeat the weighted=false ones - confirm
+    // whether passing weightedEltwise was intended.
+    LayerParams eltwiseParams;
+    TestLayerFusion::makeDefaultTestEltwiseLayer(eltwiseParams, eltwiseOp, false);
+
+    LayerParams activationParams;
+    TestLayerFusion::makeDefaultTestActivationLayer(activationParams, actType, in_channels);
+
+    if (ocvOpenCL)
+    {
+        // bug: https://github.com/opencv/opencv/issues/17945
+        if (eltwiseOp != "sum")
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+
+        // bug: https://github.com/opencv/opencv/issues/17953
+        if (eltwiseOp == "sum" && actType == "ChannelsPReLU" && !bias_term)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+
+        // bug: https://github.com/opencv/opencv/issues/17964
+        if (actType == "Power")
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+    }
+
+    Net net;
+    const int convId = net.addLayer(convParams.name, convParams.type, convParams);
+    const int eltwiseId = net.addLayer(eltwiseParams.name, eltwiseParams.type, eltwiseParams);
+    const int activId = net.addLayer(activationParams.name, activationParams.type, activationParams);
+    net.connect(0, 0, convId, 0);
+    net.connect(convId, 0, eltwiseId, 0);
+    net.connect(0, 0, eltwiseId, 1);
+    net.connect(eltwiseId, 0, activId, 0);
+
+    std::vector<int> expectedFusedLayers;
+    if (ocvBackend && targetId == DNN_TARGET_CPU)
+    {
+        // activation is fused with eltwise layer
+        expectedFusedLayers.push_back(activId);
+    }
+    else if (ocvOpenCL && (actType == "ReLU" || actType == "ChannelsPReLU" || actType == "Power"))
+    {
+        // for these activations both the eltwise and the activation layer are expected to be fused
+        expectedFusedLayers.push_back(eltwiseId);
+        expectedFusedLayers.push_back(activId);
+    }
+
+    TestLayerFusion::test(input, net, backendId, targetId, expectedFusedLayers);
+}
+INSTANTIATE_TEST_CASE_P(TestLayerFusion, ConvolutionEltwiseActivationFusion, Combine(
+    /* bias */              testing::Bool(),
+    /* eltwise op */        TestLayerFusion::eltwiseOpList(),
+    /* eltwise weighted */  testing::Bool(),
+    /* activation */        TestLayerFusion::activationLayersList(),
+    /* backend, target */   TestLayerFusion::dnnBackendsAndTargetsForFusionTests()
+));
+
+// Parameter tuple: <convolution has a bias blob, activation type, eltwise op,
+//                   eltwise weighted flag, (backend, target)>.
+typedef TestWithParam<tuple<bool, std::string, std::string, bool, tuple<Backend, Target> > > ConvolutionActivationEltwiseFusion;
+TEST_P(ConvolutionActivationEltwiseFusion, Accuracy)
+{
+    // Topology under test:
+    //     output = eltwise(activation(convolution(input)), input)
+    // The eltwise layer (operation taken from the test parameters) receives the
+    // activation result on input 0 and the untouched network input on input 1.
+
+    const bool bias_term = get<0>(GetParam());
+    const std::string actType = get<1>(GetParam());
+    const std::string eltwiseOp = get<2>(GetParam());
+    const bool weightedEltwise = get<3>(GetParam());
+    const tuple<Backend, Target> backendAndTarget = get<4>(GetParam());
+    const Backend backendId = get<0>(backendAndTarget);
+    const Target targetId = get<1>(backendAndTarget);
+    const bool ocvBackend = (backendId == DNN_BACKEND_OPENCV);
+    const bool ocvOpenCL = ocvBackend && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16);
+
+    const int batch_size = 2, in_channels = 16;
+    const int in_height = 16, in_width = 16;
+    int inputShape[] = {batch_size, in_channels, in_height, in_width};
+    Mat input(4, inputShape, CV_32F);
+    randu(input, 1.0f, 2.0f); // avoid small values to test eltwise div
+
+    LayerParams convParams;
+    TestLayerFusion::makeDefaultTestConvolutionLayer(convParams, in_channels, in_channels, bias_term);
+
+    LayerParams activationParams;
+    TestLayerFusion::makeDefaultTestActivationLayer(activationParams, actType, in_channels);
+
+    if (weightedEltwise && eltwiseOp != "sum")
+        throw SkipTestException("weighted eltwise not supported");
+
+    // NOTE(review): the eltwise layer is always built without coefficients, so the
+    // weighted=true instances of "sum" repeat the weighted=false ones - confirm
+    // whether passing weightedEltwise was intended.
+    LayerParams eltwiseParams;
+    TestLayerFusion::makeDefaultTestEltwiseLayer(eltwiseParams, eltwiseOp, false);
+
+    if (ocvOpenCL)
+    {
+        // bug: https://github.com/opencv/opencv/issues/17964
+        if (actType == "Power")
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+
+        // bug: https://github.com/opencv/opencv/issues/17953
+        if (actType == "ChannelsPReLU" && !bias_term)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+    }
+
+    Net net;
+    const int convId = net.addLayer(convParams.name, convParams.type, convParams);
+    const int activId = net.addLayer(activationParams.name, activationParams.type, activationParams);
+    const int eltwiseId = net.addLayer(eltwiseParams.name, eltwiseParams.type, eltwiseParams);
+    net.connect(0, 0, convId, 0);
+    net.connect(convId, 0, activId, 0);
+    net.connect(activId, 0, eltwiseId, 0);
+    net.connect(0, 0, eltwiseId, 1);
+
+    // Only the activation is expected to be fused (into the convolution):
+    // always on the OpenCV CPU target, and for the listed types on OpenCV OpenCL.
+    const bool fusedOnCPU = ocvBackend && targetId == DNN_TARGET_CPU;
+    const bool fusedOnOpenCL = ocvOpenCL &&
+        (actType == "ReLU" || actType == "ChannelsPReLU" || actType == "ReLU6" || actType == "TanH" || actType == "Power");
+
+    std::vector<int> expectedFusedLayers;
+    if (fusedOnCPU || fusedOnOpenCL)
+        expectedFusedLayers.push_back(activId);
+
+    TestLayerFusion::test(input, net, backendId, targetId, expectedFusedLayers);
+}
+INSTANTIATE_TEST_CASE_P(TestLayerFusion, ConvolutionActivationEltwiseFusion, Combine(
+    /* bias */              testing::Bool(),
+    /* activation */        TestLayerFusion::activationLayersList(),
+    /* eltwise op */        TestLayerFusion::eltwiseOpList(),
+    /* eltwise weighted */  testing::Bool(),
+    /* backend, target */   TestLayerFusion::dnnBackendsAndTargetsForFusionTests()
+));
+
 }} // namespace
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@ -270,6 +270,11 @@ TEST_P(Test_ONNX_layers, ReduceMean)
    testONNXModels("reduce_mean_axis2");
 }

+TEST_P(Test_ONNX_layers, ReduceSum)
+{
+    testONNXModels("reduce_sum"); // single case: hands the "reduce_sum" name to the shared testONNXModels helper
+}
+
 TEST_P(Test_ONNX_layers, ReduceMean3D)
 {
    if (backend == DNN_BACKEND_CUDA)
@ -436,10 +441,20 @@ TEST_P(Test_ONNX_layers, MatMul)
    testONNXModels("matmul_4d");
 }

+TEST_P(Test_ONNX_layers, MatMulAdd)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) // NN Builder 2019 backend gets its skip tag
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) // OpenCV backend on the FP16 OpenCL target gets its skip tag
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    testONNXModels("matmul_add"); // remaining configurations run the "matmul_add" case (assuming applyTestTag skips the test - TODO confirm)
+}
+
 TEST_P(Test_ONNX_layers, Expand)
 {
    testONNXModels("expand_batch");
    testONNXModels("expand_channels");
+    testONNXModels("expand_neg_batch"); // third Expand case; going by its name it covers a negative batch value - TODO confirm against the test data
 }

 TEST_P(Test_ONNX_layers, ExpandHW)
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@ -128,6 +128,13 @@ TEST_P(Test_TensorFlow_layers, reduce_mean)
    runTensorFlowNet("global_pool_by_axis");
 }

+TEST_P(Test_TensorFlow_layers, reduce_sum)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) // NN Builder 2019 backend gets its skip tag
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+    runTensorFlowNet("sum_pool_by_axis"); // hands the "sum_pool_by_axis" name to the shared runTensorFlowNet helper
+}
+
 TEST_P(Test_TensorFlow_layers, conv_single_conv)
 {
    runTensorFlowNet("single_conv");
@ -354,6 +361,11 @@ TEST_P(Test_TensorFlow_layers, pooling_reduce_mean)
    runTensorFlowNet("reduce_mean");  // an average pooling over all spatial dimensions.
 }

+TEST_P(Test_TensorFlow_layers, pooling_reduce_sum)
+{
+    runTensorFlowNet("reduce_sum");  // a SUM pooling over all spatial dimensions (sum counterpart of the pooling_reduce_mean test).
+}
+
 TEST_P(Test_TensorFlow_layers, max_pool_grad)
 {
    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
--- a/modules/flann/include/opencv2/flann/kmeans_index.h
+++ b/modules/flann/include/opencv2/flann/kmeans_index.h
@ -452,14 +452,10 @@ public:
            root_[i] = pool_.allocate<KMeansNode>();
            std::memset(root_[i], 0, sizeof(KMeansNode));

-            if(is_kdtree_distance::val || is_vector_space_distance::val) {
-                computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
-                computeClustering(root_[i], indices_[i], (int)size_, branching_,0);
-            }
-            else {
-                computeBitfieldNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
-                computeBitfieldClustering(root_[i], indices_[i], (int)size_, branching_,0);
-            }
+            Distance* dummy = NULL;
+            computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_, dummy);
+
+            computeClustering(root_[i], indices_[i], (int)size_, branching_,0);
        }
    }

@ -818,6 +814,413 @@ private:
    }


+    template<typename DistType>
+    void computeNodeStatistics(KMeansNodePtr node, int* indices,
+                               unsigned int indices_length,
+                               const DistType* identifier)
+    {   // Generic overload: selected for any distance type that has no dedicated non-template overload.
+        (void)identifier; // tag argument: only its type matters, for overload selection
+        computeNodeStatistics(node, indices, indices_length); // forwards to the three-argument computeNodeStatistics
+    }
+
+    void computeNodeStatistics(KMeansNodePtr node, int* indices,
+                               unsigned int indices_length,
+                               const cvflann::HammingLUT* identifier)
+    {   // Overload for cvflann::HammingLUT: routes to the bitfield statistics.
+        (void)identifier; // tag argument: only its type matters, for overload selection
+        computeBitfieldNodeStatistics(node, indices, indices_length);
+    }
+
+    void computeNodeStatistics(KMeansNodePtr node, int* indices,
+                               unsigned int indices_length,
+                               const cvflann::Hamming<unsigned char>* identifier)
+    {   // Overload for cvflann::Hamming<unsigned char>: routes to the bitfield statistics.
+        (void)identifier; // tag argument: only its type matters, for overload selection
+        computeBitfieldNodeStatistics(node, indices, indices_length);
+    }
+
+    void computeNodeStatistics(KMeansNodePtr node, int* indices,
+                               unsigned int indices_length,
+                               const cvflann::Hamming2<unsigned char>* identifier)
+    {   // Overload for cvflann::Hamming2<unsigned char>: routes to the bitfield statistics.
+        (void)identifier; // tag argument: only its type matters, for overload selection
+        computeBitfieldNodeStatistics(node, indices, indices_length);
+    }
+
+
+    void refineClustering(int* indices, int indices_length, int branching, CentersType** centers,
+                          std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
+    {   /* Refines an existing cluster assignment with repeated "recompute centers /
+         * reassign points" rounds, stopping once a round changes nothing or after
+         * iterations_ rounds.
+         *   indices, indices_length - dataset rows taking part in the clustering
+         *   branching               - number of clusters
+         *   centers                 - out: centers[c] receives a newly allocated array
+         *                             of veclen_ values holding the final center of c
+         *   radiuses                - out: per-cluster maximum of the distances seen in
+         *                             the last reassignment pass
+         *   belongs_to, count       - in/out: cluster of each point / size of each cluster
+         */
+        cv::AutoBuffer<double> dcenters_buf(branching*veclen_); // backing storage for the centers, kept as double while iterating
+        Matrix<double> dcenters(dcenters_buf.data(), branching, veclen_); // branching x veclen_ view over that buffer
+
+        bool converged = false;
+        int iteration = 0;
+        while (!converged && iteration<iterations_) {
+            converged = true; // cleared again below if any point moves or an empty cluster is repaired
+            iteration++;
+
+            // compute the new cluster centers as the mean of the points currently assigned to each cluster
+            for (int i=0; i<branching; ++i) {
+                memset(dcenters[i],0,sizeof(double)*veclen_);
+                radiuses[i] = 0;
+            }
+            for (int i=0; i<indices_length; ++i) {
+                ElementType* vec = dataset_[indices[i]];
+                double* center = dcenters[belongs_to[i]];
+                for (size_t k=0; k<veclen_; ++k) {
+                    center[k] += vec[k];
+                }
+            }
+            for (int i=0; i<branching; ++i) {
+                int cnt = count[i];
+                for (size_t k=0; k<veclen_; ++k) {
+                    dcenters[i][k] /= cnt; // NOTE(review): relies on count[i] != 0 here - presumably guaranteed by the caller and the repair step below; confirm
+                }
+            }
+
+            std::vector<int> new_centroids(indices_length);
+            std::vector<DistanceType> sq_dists(indices_length);
+
+            // reassign points to clusters
+            KMeansDistanceComputer<Matrix<double> > invoker(
+                        distance_, dataset_, branching, indices, dcenters, veclen_, new_centroids, sq_dists);
+            parallel_for_(cv::Range(0, (int)indices_length), invoker); // presumably fills new_centroids[] and sq_dists[] for every point; both are read right below
+
+            for (int i=0; i < (int)indices_length; ++i) {
+                DistanceType sq_dist(sq_dists[i]);
+                int new_centroid(new_centroids[i]);
+                if (sq_dist > radiuses[new_centroid]) { // track the largest distance per cluster
+                    radiuses[new_centroid] = sq_dist;
+                }
+                if (new_centroid != belongs_to[i]) {
+                    count[belongs_to[i]]--;
+                    count[new_centroid]++;
+                    belongs_to[i] = new_centroid;
+                    converged = false;
+                }
+            }
+
+            for (int i=0; i<branching; ++i) {
+                // if one cluster converges to an empty cluster,
+                // move an element into that cluster
+                if (count[i]==0) {
+                    int j = (i+1)%branching;
+                    while (count[j]<=1) { // look for a donor cluster that holds at least two points
+                        j = (j+1)%branching;
+                    }
+
+                    for (int k=0; k<indices_length; ++k) {
+                        if (belongs_to[k]==j) {
+                            // for cluster j, we move the furthest element from the center to the empty cluster i
+                            if ( distance_(dataset_[indices[k]], dcenters[j], veclen_) == radiuses[j] ) { // exact equality against the stored maximum
+                                belongs_to[k] = i;
+                                count[j]--;
+                                count[i]++;
+                                break;
+                            }
+                        }
+                    }
+                    converged = false;
+                }
+            }
+        }
+
+       for (int i=0; i<branching; ++i) {
+           centers[i] = new CentersType[veclen_]; // raw allocation handed out through centers[]; not released in this function
+           memoryCounter_ += (int)(veclen_*sizeof(CentersType));
+           for (size_t k=0; k<veclen_; ++k) {
+               centers[i][k] = (CentersType)dcenters[i][k]; // convert the double working copy to CentersType
+           }
+       }
+    }
+
+
+    void refineBitfieldClustering(int* indices, int indices_length, int branching, CentersType** centers,
+                                  std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
+    {   /* Refinement loop for data treated as packed bits: each vector is read as raw
+         * bytes, a per-bit counter is kept for every cluster, and each bit of a center
+         * is set to (int)(0.5 + ones / cluster size). Rounds repeat until one changes
+         * nothing or iterations_ rounds have run.
+         *   indices, indices_length - dataset rows taking part in the clustering
+         *   branching               - number of clusters
+         *   centers                 - out: centers[c] is allocated here (veclen_ values)
+         *                             and rewritten on every round
+         *   radiuses                - out: per-cluster maximum of the distances seen in
+         *                             the last reassignment pass
+         *   belongs_to, count       - in/out: cluster of each point / size of each cluster
+         */
+        for (int i=0; i<branching; ++i) {
+            centers[i] = new CentersType[veclen_]; // raw allocation handed out through centers[]; not released in this function
+            memoryCounter_ += (int)(veclen_*sizeof(CentersType));
+        }
+
+        const unsigned int accumulator_veclen = static_cast<unsigned int>(
+                                                veclen_*sizeof(ElementType)*BITS_PER_CHAR); // number of bits in one vector = number of counters per cluster
+        cv::AutoBuffer<unsigned int> dcenters_buf(branching*accumulator_veclen);
+        Matrix<unsigned int> dcenters(dcenters_buf.data(), branching, accumulator_veclen); // branching x accumulator_veclen view over that buffer
+
+        bool converged = false;
+        int iteration = 0;
+        while (!converged && iteration<iterations_) {
+            converged = true; // cleared again below if any point moves or an empty cluster is repaired
+            iteration++;
+
+            // compute the new cluster centers: count the set bits per position, then take the rounded ratio
+            for (int i=0; i<branching; ++i) {
+                memset(dcenters[i],0,sizeof(unsigned int)*accumulator_veclen);
+                radiuses[i] = 0;
+            }
+            for (int i=0; i<indices_length; ++i) {
+                unsigned char* vec = (unsigned char*)dataset_[indices[i]];
+                unsigned int* dcenter = dcenters[belongs_to[i]];
+                for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) { // byte l contributes to counters k .. k+7
+                    dcenter[k]   += (vec[l])    & 0x01;
+                    dcenter[k+1] += (vec[l]>>1) & 0x01;
+                    dcenter[k+2] += (vec[l]>>2) & 0x01;
+                    dcenter[k+3] += (vec[l]>>3) & 0x01;
+                    dcenter[k+4] += (vec[l]>>4) & 0x01;
+                    dcenter[k+5] += (vec[l]>>5) & 0x01;
+                    dcenter[k+6] += (vec[l]>>6) & 0x01;
+                    dcenter[k+7] += (vec[l]>>7) & 0x01;
+                }
+            }
+            for (int i=0; i<branching; ++i) {
+                double cnt = static_cast<double>(count[i]); // NOTE(review): relies on count[i] != 0 here - presumably guaranteed by the caller and the repair step below; confirm
+                unsigned int* dcenter = dcenters[i];
+                unsigned char* charCenter = (unsigned char*)centers[i];
+                for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) { // re-pack eight rounded ratios into byte l of the center
+                    charCenter[l] = static_cast<unsigned char>(
+                                      (((int)(0.5 + (double)(dcenter[k])   / cnt)))
+                                    | (((int)(0.5 + (double)(dcenter[k+1]) / cnt))<<1)
+                                    | (((int)(0.5 + (double)(dcenter[k+2]) / cnt))<<2)
+                                    | (((int)(0.5 + (double)(dcenter[k+3]) / cnt))<<3)
+                                    | (((int)(0.5 + (double)(dcenter[k+4]) / cnt))<<4)
+                                    | (((int)(0.5 + (double)(dcenter[k+5]) / cnt))<<5)
+                                    | (((int)(0.5 + (double)(dcenter[k+6]) / cnt))<<6)
+                                    | (((int)(0.5 + (double)(dcenter[k+7]) / cnt))<<7));
+                }
+            }
+
+            std::vector<int> new_centroids(indices_length);
+            std::vector<DistanceType> dists(indices_length);
+
+            // reassign points to clusters
+            KMeansDistanceComputer<ElementType**> invoker(
+                        distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
+            parallel_for_(cv::Range(0, (int)indices_length), invoker); // presumably fills new_centroids[] and dists[] for every point; both are read right below
+
+            for (int i=0; i < indices_length; ++i) {
+                DistanceType dist(dists[i]);
+                int new_centroid(new_centroids[i]);
+                if (dist > radiuses[new_centroid]) { // track the largest distance per cluster
+                    radiuses[new_centroid] = dist;
+                }
+                if (new_centroid != belongs_to[i]) {
+                    count[belongs_to[i]]--;
+                    count[new_centroid]++;
+                    belongs_to[i] = new_centroid;
+                    converged = false;
+                }
+            }
+
+            for (int i=0; i<branching; ++i) {
+                // if one cluster converges to an empty cluster,
+                // move an element into that cluster
+                if (count[i]==0) {
+                    int j = (i+1)%branching;
+                    while (count[j]<=1) { // look for a donor cluster that holds at least two points
+                        j = (j+1)%branching;
+                    }
+
+                    for (int k=0; k<indices_length; ++k) {
+                        if (belongs_to[k]==j) {
+                            // for cluster j, we move the furthest element from the center to the empty cluster i
+                            if ( distance_(dataset_[indices[k]], centers[j], veclen_) == radiuses[j] ) { // exact equality against the stored maximum
+                                belongs_to[k] = i;
+                                count[j]--;
+                                count[i]++;
+                                break;
+                            }
+                        }
+                    }
+                    converged = false;
+                }
+            }
+        }
+    }
+
+
+    void computeSubClustering(KMeansNodePtr node, int* indices, int indices_length,
+                              int branching, int level, CentersType** centers,
+                              std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
+    {   /* Creates the `branching` children of `node` from a finished clustering.
+         * For each cluster c the members are moved next to each other inside
+         * indices (and belongs_to), a child node is allocated from pool_ and given
+         * radius, pivot, variance and mean_radius, and computeClustering is called
+         * on the child's slice of indices with level + 1.
+         *   centers[c], radiuses[c], count[c] - center, radius and size of cluster c
+         *   belongs_to[i]                     - cluster of indices[i]; reordered here
+         */
+        // compute kmeans clustering for each of the resulting clusters
+        node->childs = pool_.allocate<KMeansNodePtr>(branching);
+        int start = 0;
+        int end = start;
+        for (int c=0; c<branching; ++c) {
+            int s = count[c];
+
+            DistanceType variance = 0;
+            DistanceType mean_radius =0;
+            for (int i=0; i<indices_length; ++i) {
+                if (belongs_to[i]==c) {
+                    DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_); // distance_ evaluated against ZeroIterator (presumably an all-zero vector - TODO confirm)
+                    variance += d;
+                    mean_radius += static_cast<DistanceType>( sqrt(d) );
+                    std::swap(indices[i],indices[end]); // move the member to position `end`, so cluster c ends up in [start, end)
+                    std::swap(belongs_to[i],belongs_to[end]);
+                    end++;
+                }
+            }
+            variance /= s; // NOTE(review): divides by the cluster size; relies on count[c] != 0 - confirm
+            mean_radius /= s;
+            variance -= distance_(centers[c], ZeroIterator<ElementType>(), veclen_); // mean of d minus the same measure taken on the center
+
+            node->childs[c] = pool_.allocate<KMeansNode>();
+            std::memset(node->childs[c], 0, sizeof(KMeansNode));
+            node->childs[c]->radius = radiuses[c];
+            node->childs[c]->pivot = centers[c]; // the node keeps the pointer; no copy is made
+            node->childs[c]->variance = variance;
+            node->childs[c]->mean_radius = mean_radius;
+            computeClustering(node->childs[c],indices+start, end-start, branching, level+1); // recurse on this cluster's slice
+            start=end;
+        }
+    }
+
+
+    /**
+     * Counterpart of computeSubClustering() where the variance is accumulated
+     * in an unsigned 64 bits integer and the means are rounded to the nearest
+     * integer value before being stored.
+     *
+     * For every cluster, the matching entries of indices (and of belongs_to) are
+     * swapped towards the front of the array, so that each child is handed a
+     * contiguous sub-range of indices.
+     *
+     * Params:
+     *     node = the node whose children are created
+     *     indices = indices of the points belonging to the node (reordered in place)
+     *     indices_length = number of points in the node
+     *     branching = number of clusters, i.e. number of children to create
+     *     level = level of node, the children are clustered at level+1
+     *     centers = clusters centers, centers[c] becomes the pivot of child c
+     *     radiuses = radiuses of clusters, radiuses[c] becomes the radius of child c
+     *     belongs_to = for each position in indices, the id of its cluster (reordered in place)
+     *     count = number of points per cluster
+     */
+    void computeAnyBitfieldSubClustering(KMeansNodePtr node, int* indices, int indices_length,
+                              int branching, int level, CentersType** centers,
+                              std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
+    {
+        // compute kmeans clustering for each of the resulting clusters
+        node->childs = pool_.allocate<KMeansNodePtr>(branching);
+        // [start, end) is the range of indices holding the points of the current cluster
+        int start = 0;
+        int end = start;
+        for (int c=0; c<branching; ++c) {
+            // NOTE(review): s is used as a divisor below; presumably the refinement
+            // step leaves no empty cluster -- confirm
+            int s = count[c];
+
+            unsigned long long variance = 0ull;
+            DistanceType mean_radius =0;
+            for (int i=0; i<indices_length; ++i) {
+                if (belongs_to[i]==c) {
+                    // distance between the point and ZeroIterator (presumably an all-zero vector)
+                    DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_);
+                    // presumably the helpers turn d into its squared / non-squared
+                    // form depending on Distance -- TODO confirm
+                    variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>(d) );
+                    mean_radius += ensureSimpleDistance<Distance>(d);
+                    // move the point (and its cluster id) to the end of the current range
+                    std::swap(indices[i],indices[end]);
+                    std::swap(belongs_to[i],belongs_to[end]);
+                    end++;
+                }
+            }
+            // means over the cluster, the added 0.5 rounds to the nearest integer
+            mean_radius = static_cast<DistanceType>(
+                        0.5f + static_cast<float>(mean_radius) / static_cast<float>(s));
+            variance = static_cast<unsigned long long>(
+                        0.5 + static_cast<double>(variance) / static_cast<double>(s));
+            // NOTE(review): variance is unsigned, this subtraction wraps around when
+            // the center term is larger than the rounded mean -- confirm it is intended
+            variance -= static_cast<unsigned long long>(
+                        ensureSquareDistance<Distance>(
+                            distance_(centers[c], ZeroIterator<ElementType>(), veclen_)));
+
+            // zero-initialise the child before filling its statistics
+            node->childs[c] = pool_.allocate<KMeansNode>();
+            std::memset(node->childs[c], 0, sizeof(KMeansNode));
+            node->childs[c]->radius = radiuses[c];
+            node->childs[c]->pivot = centers[c];
+            node->childs[c]->variance = static_cast<DistanceType>(variance);
+            node->childs[c]->mean_radius = mean_radius;
+            // recurse on the points of cluster c only
+            computeClustering(node->childs[c],indices+start, end-start, branching, level+1);
+            start=end;
+        }
+    }
+
+
+    /**
+     * Generic flavour of the refinement + split step: it is the overload picked
+     * for every DistType that has no dedicated non-template overload.
+     *
+     * Params:
+     *     node = the node to cluster
+     *     indices = indices of the points belonging to the current node
+     *     indices_length = number of points in the current node
+     *     branching = the branching factor to use in the clustering
+     *     level = level of the current node
+     *     centers = clusters centers to compute
+     *     radiuses = radiuses of clusters
+     *     belongs_to = for a given position in indices, the center id it belongs to
+     *     count = array storing the number of indices for a given center id
+     *     identifier = dummy pointer, only its type matters
+     */
+    template<typename DistType>
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const DistType* identifier)
+    {
+        // identifier only drives the overload selection, its value is never read
+        static_cast<void>(identifier);
+
+        // step 1: refine the centers and the assignment of the points
+        refineClustering(indices, indices_length, branching,
+                         centers, radiuses, belongs_to, count);
+
+        // step 2: create the children and recurse into them
+        computeSubClustering(node, indices, indices_length, branching, level,
+                             centers, radiuses, belongs_to, count);
+    }
+
+
+    /**
+     * The methods responsible for doing the recursive hierarchical clustering on
+     * binary vectors.
+     * As some might have heard that KMeans on binary data doesn't make sense,
+     * it's worth a little explanation of why it actually works fairly well. As
+     * with the Hierarchical Clustering algorithm, we seed several centers for the
+     * current node by picking some of its points. Then, in a first pass, each point
+     * of the node is related to its closest center. Now let's have a look at
+     * the 5 central dimensions of the 9 following points:
+     *
+     * xxxxxx11100xxxxx (1)
+     * xxxxxx11010xxxxx (2)
+     * xxxxxx11001xxxxx (3)
+     * xxxxxx10110xxxxx (4)
+     * xxxxxx10101xxxxx (5)
+     * xxxxxx10011xxxxx (6)
+     * xxxxxx01110xxxxx (7)
+     * xxxxxx01101xxxxx (8)
+     * xxxxxx01011xxxxx (9)
+     * sum   _____
+     * of 1: 66555
+     *
+     * Even if the barycenter notion doesn't apply, we can set a center
+     * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
+     * on for these points.
+     *
+     * Note that convergence isn't ensured anymore. In practice, using Gonzales
+     * as seeding algorithm should be fine for getting convergence ("iterations"
+     * value can be set to -1). But with KMeans++ seeding you should definitely
+     * set a maximum number of iterations (but make it higher than the "iterations"
+     * default value of 11).
+     *
+     * Params:
+     *     node = the node to cluster
+     *     indices = indices of the points belonging to the current node
+     *     indices_length = number of points in the current node
+     *     branching = the branching factor to use in the clustering
+     *     level = 0 for the root node, it increases with the subdivision levels
+     *     centers = clusters centers to compute
+     *     radiuses = radiuses of clusters
+     *     belongs_to = lookup table returning, for a given index id, the center id it belongs to
+     *     count = array storing the number of indices for a given center id
+     *     identifier = dummy pointer to an instance of Distance (used to branch correctly among templates)
+     */
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const cvflann::HammingLUT* identifier)
+    {
+        // identifier only drives the overload selection, its value is never read
+        static_cast<void>(identifier);
+
+        // step 1: refine the centers and the assignment of the points
+        refineBitfieldClustering(indices, indices_length, branching,
+                                 centers, radiuses, belongs_to, count);
+
+        // step 2: create the children and recurse into them
+        computeAnyBitfieldSubClustering(node, indices, indices_length, branching, level,
+                                        centers, radiuses, belongs_to, count);
+    }
+
+
+    /**
+     * Same processing as the cvflann::HammingLUT overload, picked when the
+     * dummy pointer is a cvflann::Hamming<unsigned char>*.
+     */
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const cvflann::Hamming<unsigned char>* identifier)
+    {
+        // identifier only drives the overload selection, its value is never read
+        static_cast<void>(identifier);
+
+        // step 1: refine the centers and the assignment of the points
+        refineBitfieldClustering(indices, indices_length, branching,
+                                 centers, radiuses, belongs_to, count);
+
+        // step 2: create the children and recurse into them
+        computeAnyBitfieldSubClustering(node, indices, indices_length, branching, level,
+                                        centers, radiuses, belongs_to, count);
+    }
+
+
+    /**
+     * Same processing as the cvflann::HammingLUT overload, picked when the
+     * dummy pointer is a cvflann::Hamming2<unsigned char>*.
+     */
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const cvflann::Hamming2<unsigned char>* identifier)
+    {
+        // identifier only drives the overload selection, its value is never read
+        static_cast<void>(identifier);
+
+        // step 1: refine the centers and the assignment of the points
+        refineBitfieldClustering(indices, indices_length, branching,
+                                 centers, radiuses, belongs_to, count);
+
+        // step 2: create the children and recurse into them
+        computeAnyBitfieldSubClustering(node, indices, indices_length, branching, level,
+                                        centers, radiuses, belongs_to, count);
+    }
+

    /**
     * The method responsible with actually doing the recursive hierarchical
@ -882,372 +1285,16 @@ private:
            count[belongs_to[i]]++;
        }

-        cv::AutoBuffer<double> dcenters_buf(branching*veclen_);
-        Matrix<double> dcenters(dcenters_buf.data(), branching, veclen_);
-        for (int i=0; i<centers_length; ++i) {
-            ElementType* vec = dataset_[centers_idx[i]];
-            for (size_t k=0; k<veclen_; ++k) {
-                dcenters[i][k] = double(vec[k]);
-            }
-        }
-
-        bool converged = false;
-        int iteration = 0;
-        while (!converged && iteration<iterations_) {
-            converged = true;
-            iteration++;
-
-            // compute the new cluster centers
-            for (int i=0; i<branching; ++i) {
-                memset(dcenters[i],0,sizeof(double)*veclen_);
-                radiuses[i] = 0;
-            }
-            for (int i=0; i<indices_length; ++i) {
-                ElementType* vec = dataset_[indices[i]];
-                double* center = dcenters[belongs_to[i]];
-                for (size_t k=0; k<veclen_; ++k) {
-                    center[k] += vec[k];
-                }
-            }
-            for (int i=0; i<branching; ++i) {
-                int cnt = count[i];
-                for (size_t k=0; k<veclen_; ++k) {
-                    dcenters[i][k] /= cnt;
-                }
-            }
-
-            std::vector<int> new_centroids(indices_length);
-            std::vector<DistanceType> sq_dists(indices_length);
-
-            // reassign points to clusters
-            KMeansDistanceComputer<Matrix<double> > invoker(distance_, dataset_, branching, indices, dcenters, veclen_, new_centroids, sq_dists);
-            parallel_for_(cv::Range(0, (int)indices_length), invoker);
-
-            for (int i=0; i < (int)indices_length; ++i) {
-                DistanceType sq_dist(sq_dists[i]);
-                int new_centroid(new_centroids[i]);
-                if (sq_dist > radiuses[new_centroid]) {
-                    radiuses[new_centroid] = sq_dist;
-                }
-                if (new_centroid != belongs_to[i]) {
-                    count[belongs_to[i]]--;
-                    count[new_centroid]++;
-                    belongs_to[i] = new_centroid;
-                    converged = false;
-                }
-            }
-
-            for (int i=0; i<branching; ++i) {
-                // if one cluster converges to an empty cluster,
-                // move an element into that cluster
-                if (count[i]==0) {
-                    int j = (i+1)%branching;
-                    while (count[j]<=1) {
-                        j = (j+1)%branching;
-                    }
-
-                    for (int k=0; k<indices_length; ++k) {
-                        if (belongs_to[k]==j) {
-                            // for cluster j, we move the furthest element from the center to the empty cluster i
-                            if ( distance_(dataset_[indices[k]], dcenters[j], veclen_) == radiuses[j] ) {
-                                belongs_to[k] = i;
-                                count[j]--;
-                                count[i]++;
-                                break;
-                            }
-                        }
-                    }
-                    converged = false;
-                }
-            }
-
-        }
-
        CentersType** centers = new CentersType*[branching];

-        for (int i=0; i<branching; ++i) {
-            centers[i] = new CentersType[veclen_];
-            memoryCounter_ += (int)(veclen_*sizeof(CentersType));
-            for (size_t k=0; k<veclen_; ++k) {
-                centers[i][k] = (CentersType)dcenters[i][k];
-            }
-        }
-
-
-        // compute kmeans clustering for each of the resulting clusters
-        node->childs = pool_.allocate<KMeansNodePtr>(branching);
-        int start = 0;
-        int end = start;
-        for (int c=0; c<branching; ++c) {
-            int s = count[c];
-
-            DistanceType variance = 0;
-            DistanceType mean_radius =0;
-            for (int i=0; i<indices_length; ++i) {
-                if (belongs_to[i]==c) {
-                    DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_);
-                    variance += d;
-                    mean_radius += static_cast<DistanceType>( sqrt(d) );
-                    std::swap(indices[i],indices[end]);
-                    std::swap(belongs_to[i],belongs_to[end]);
-                    end++;
-                }
-            }
-            variance /= s;
-            mean_radius /= s;
-            variance -= distance_(centers[c], ZeroIterator<ElementType>(), veclen_);
-
-            node->childs[c] = pool_.allocate<KMeansNode>();
-            std::memset(node->childs[c], 0, sizeof(KMeansNode));
-            node->childs[c]->radius = radiuses[c];
-            node->childs[c]->pivot = centers[c];
-            node->childs[c]->variance = variance;
-            node->childs[c]->mean_radius = mean_radius;
-            computeClustering(node->childs[c],indices+start, end-start, branching, level+1);
-            start=end;
-        }
+        Distance* dummy = NULL;
+        refineAndSplitClustering(node, indices, indices_length, branching, level,
+                                 centers, radiuses, belongs_to, count, dummy);

        delete[] centers;
    }


-    /**
-     * The method responsible with doing the recursive hierarchical clustering on
-     * binary vectors.
-     * As some might have heared that KMeans on binary data doesn't make sense,
-     * it's worth a little explanation why it actually fairly works. As
-     * with the Hierarchical Clustering algortihm, we seed several centers for the
-     * current node by picking some of its points. Then in a first pass each point
-     * of the node is then related to its closest center. Now let's have a look at
-     * the 5 central dimensions of the 9 following points:
-     *
-     * xxxxxx11100xxxxx (1)
-     * xxxxxx11010xxxxx (2)
-     * xxxxxx11001xxxxx (3)
-     * xxxxxx10110xxxxx (4)
-     * xxxxxx10101xxxxx (5)
-     * xxxxxx10011xxxxx (6)
-     * xxxxxx01110xxxxx (7)
-     * xxxxxx01101xxxxx (8)
-     * xxxxxx01011xxxxx (9)
-     * sum   _____
-     * of 1: 66555
-     *
-     * Even if the barycenter notion doesn't apply, we can set a center
-     * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
-     * on for these points.
-     *
-     * Note that convergence isn't ensured anymore. In practice, using Gonzales
-     * as seeding algorithm should be fine for getting convergence ("iterations"
-     * value can be set to -1). But with KMeans++ seeding you should definitely
-     * set a maximum number of iterations (but make it higher than the "iterations"
-     * default value of 11).
-     *
-     * Params:
-     *     node = the node to cluster
-     *     indices = indices of the points belonging to the current node
-     *     indices_length = number of points in the current node
-     *     branching = the branching factor to use in the clustering
-     *     level = 0 for the root node, it increases with the subdivision levels
-     */
-    void computeBitfieldClustering(KMeansNodePtr node, int* indices,
-                                   int indices_length, int branching, int level)
-    {
-        node->size = indices_length;
-        node->level = level;
-
-        if (indices_length < branching) {
-            node->indices = indices;
-            std::sort(node->indices,node->indices+indices_length);
-            node->childs = NULL;
-            return;
-        }
-
-        cv::AutoBuffer<int> centers_idx_buf(branching);
-        int* centers_idx = centers_idx_buf.data();
-        int centers_length;
-        (this->*chooseCenters)(branching, indices, indices_length, centers_idx, centers_length);
-
-        if (centers_length<branching) {
-            node->indices = indices;
-            std::sort(node->indices,node->indices+indices_length);
-            node->childs = NULL;
-            return;
-        }
-
-        const unsigned int accumulator_veclen = static_cast<unsigned int>(
-                                                veclen_*sizeof(ElementType)*BITS_PER_CHAR);
-        cv::AutoBuffer<unsigned int> dcenters_buf(branching*accumulator_veclen);
-        Matrix<unsigned int> dcenters(dcenters_buf.data(), branching, accumulator_veclen);
-
-        CentersType** centers = new CentersType*[branching];
-
-        for (int i=0; i<branching; ++i) {
-            centers[i] = new CentersType[veclen_];
-            memoryCounter_ += (int)(veclen_*sizeof(CentersType));
-        }
-
-        std::vector<DistanceType> radiuses(branching);
-        cv::AutoBuffer<int> count_buf(branching);
-        int* count = count_buf.data();
-        for (int i=0; i<branching; ++i) {
-            radiuses[i] = 0;
-            count[i] = 0;
-        }
-
-        //	assign points to clusters
-        cv::AutoBuffer<int> belongs_to_buf(indices_length);
-        int* belongs_to = belongs_to_buf.data();
-        for (int i=0; i<indices_length; ++i) {
-
-            DistanceType dist = distance_(dataset_[indices[i]], dataset_[centers_idx[0]], veclen_);
-            belongs_to[i] = 0;
-            for (int j=1; j<branching; ++j) {
-                DistanceType new_dist = distance_(dataset_[indices[i]], dataset_[centers_idx[j]], veclen_);
-                if (dist>new_dist) {
-                    belongs_to[i] = j;
-                    dist = new_dist;
-                }
-            }
-            if (dist>radiuses[belongs_to[i]]) {
-                radiuses[belongs_to[i]] = dist;
-            }
-            count[belongs_to[i]]++;
-        }
-
-        bool converged = false;
-        int iteration = 0;
-        while (!converged && iteration<iterations_) {
-            converged = true;
-            iteration++;
-
-            // compute the new cluster centers
-            for (int i=0; i<branching; ++i) {
-                memset(dcenters[i],0,sizeof(unsigned int)*accumulator_veclen);
-                radiuses[i] = 0;
-            }
-            for (int i=0; i<indices_length; ++i) {
-                unsigned char* vec = (unsigned char*)dataset_[indices[i]];
-                unsigned int* dcenter = dcenters[belongs_to[i]];
-                for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
-                    dcenter[k]   += (vec[l])    & 0x01;
-                    dcenter[k+1] += (vec[l]>>1) & 0x01;
-                    dcenter[k+2] += (vec[l]>>2) & 0x01;
-                    dcenter[k+3] += (vec[l]>>3) & 0x01;
-                    dcenter[k+4] += (vec[l]>>4) & 0x01;
-                    dcenter[k+5] += (vec[l]>>5) & 0x01;
-                    dcenter[k+6] += (vec[l]>>6) & 0x01;
-                    dcenter[k+7] += (vec[l]>>7) & 0x01;
-                }
-            }
-            for (int i=0; i<branching; ++i) {
-                double cnt = static_cast<double>(count[i]);
-                unsigned int* dcenter = dcenters[i];
-                unsigned char* charCenter = (unsigned char*)centers[i];
-                for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
-                    charCenter[l] = static_cast<unsigned char>(
-                                      (((int)(0.5 + (double)(dcenter[k])   / cnt)))
-                                    | (((int)(0.5 + (double)(dcenter[k+1]) / cnt))<<1)
-                                    | (((int)(0.5 + (double)(dcenter[k+2]) / cnt))<<2)
-                                    | (((int)(0.5 + (double)(dcenter[k+3]) / cnt))<<3)
-                                    | (((int)(0.5 + (double)(dcenter[k+4]) / cnt))<<4)
-                                    | (((int)(0.5 + (double)(dcenter[k+5]) / cnt))<<5)
-                                    | (((int)(0.5 + (double)(dcenter[k+6]) / cnt))<<6)
-                                    | (((int)(0.5 + (double)(dcenter[k+7]) / cnt))<<7));
-                }
-            }
-
-            std::vector<int> new_centroids(indices_length);
-            std::vector<DistanceType> dists(indices_length);
-
-            // reassign points to clusters
-            KMeansDistanceComputer<ElementType**> invoker(distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
-            parallel_for_(cv::Range(0, (int)indices_length), invoker);
-
-            for (int i=0; i < indices_length; ++i) {
-                DistanceType dist(dists[i]);
-                int new_centroid(new_centroids[i]);
-                if (dist > radiuses[new_centroid]) {
-                    radiuses[new_centroid] = dist;
-                }
-                if (new_centroid != belongs_to[i]) {
-                    count[belongs_to[i]]--;
-                    count[new_centroid]++;
-                    belongs_to[i] = new_centroid;
-                    converged = false;
-                }
-            }
-
-            for (int i=0; i<branching; ++i) {
-                // if one cluster converges to an empty cluster,
-                // move an element into that cluster
-                if (count[i]==0) {
-                    int j = (i+1)%branching;
-                    while (count[j]<=1) {
-                        j = (j+1)%branching;
-                    }
-
-                    for (int k=0; k<indices_length; ++k) {
-                        if (belongs_to[k]==j) {
-                            // for cluster j, we move the furthest element from the center to the empty cluster i
-                            if ( distance_(dataset_[indices[k]], centers[j], veclen_) == radiuses[j] ) {
-                                belongs_to[k] = i;
-                                count[j]--;
-                                count[i]++;
-                                break;
-                            }
-                        }
-                    }
-                    converged = false;
-                }
-            }
-
-        }
-
-
-        // compute kmeans clustering for each of the resulting clusters
-        node->childs = pool_.allocate<KMeansNodePtr>(branching);
-        int start = 0;
-        int end = start;
-        for (int c=0; c<branching; ++c) {
-            int s = count[c];
-
-            unsigned long long variance = 0ull;
-            DistanceType mean_radius =0;
-            for (int i=0; i<indices_length; ++i) {
-                if (belongs_to[i]==c) {
-                    DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_);
-                    variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>(d) );
-                    mean_radius += ensureSimpleDistance<Distance>(d);
-                    std::swap(indices[i],indices[end]);
-                    std::swap(belongs_to[i],belongs_to[end]);
-                    end++;
-                }
-            }
-            mean_radius = static_cast<DistanceType>(
-                        0.5f + static_cast<float>(mean_radius) / static_cast<float>(s));
-            variance = static_cast<unsigned long long>(
-                        0.5 + static_cast<double>(variance) / static_cast<double>(s));
-            variance -= static_cast<unsigned long long>(
-                        ensureSquareDistance<Distance>(
-                            distance_(centers[c], ZeroIterator<ElementType>(), veclen_)));
-
-            node->childs[c] = pool_.allocate<KMeansNode>();
-            std::memset(node->childs[c], 0, sizeof(KMeansNode));
-            node->childs[c]->radius = radiuses[c];
-            node->childs[c]->pivot = centers[c];
-            node->childs[c]->variance = static_cast<DistanceType>(variance);
-            node->childs[c]->mean_radius = mean_radius;
-            computeBitfieldClustering(node->childs[c],indices+start, end-start, branching, level+1);
-            start=end;
-        }
-
-        delete[] centers;
-    }
-
-
-
-
    /**
     * Performs one descent in the hierarchical k-means tree. The branches not
     * visited are stored in a priority queue.
--- a/samples/cpp/flann_search_dataset.cpp
+++ b/samples/cpp/flann_search_dataset.cpp
@ -0,0 +1,250 @@
+// flann_search_dataset.cpp
+// Naive program to search a query picture in a dataset illustrating usage of FLANN
+
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <vector>
+#include "opencv2/core.hpp"
+#include "opencv2/core/utils/filesystem.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/features2d.hpp"
+#include "opencv2/flann.hpp"
+
+using namespace cv;
+using std::cout;
+using std::endl;
+
+// Selects the descriptor used by main(): with _ORB_ defined the ORB code paths
+// are compiled (the _SIFT_ code paths take precedence when _SIFT_ is defined).
+#define _ORB_
+
+// Command line options, in the format parsed by cv::CommandLineParser in main().
+const char* keys =
+    "{ help h | | Print help message. }"
+    "{ dataset | | Path to the images folder used as dataset. }"
+    "{ image |   | Path to the image to search for in the dataset. }"
+    "{ save |    | Path and filename where to save the flann structure to. }"
+    "{ load |    | Path and filename where to load the flann structure from. }";
+
+// Number of good matches found for one image of the dataset
+struct img_info {
+    int img_index;                // position of the image among the dataset images
+    unsigned int nbr_of_matches;  // number of matches attributed to that image
+
+    // Both members have to be provided, there is no default construction
+    img_info(int index, unsigned int matches) : img_index(index), nbr_of_matches(matches) {}
+};
+
+
+// Builds a FLANN index over the descriptors of every image of the dataset
+// folder, then (when a query image is given) looks for the dataset image that
+// best matches the query and displays the matches.
+// Returns 0 on success, -1 when the help is shown, an option is invalid or no
+// descriptor could be computed from the dataset.
+int main( int argc, char* argv[] )
+{
+    //-- Test the program options
+    CommandLineParser parser( argc, argv, keys );
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return -1;
+    }
+
+    // The query image is optional: without it the program only builds (and
+    // optionally saves) the flann structure, so it is only read when a path is set
+    const cv::String img_path = parser.get<String>("image");
+    Mat img;
+    if (!img_path.empty())
+    {
+        img = imread( samples::findFile( img_path ), IMREAD_GRAYSCALE );
+        if (img.empty() )
+        {
+            cout << "Could not open the image "<< img_path << endl;
+            return -1;
+        }
+    }
+
+    const cv::String db_path = parser.get<String>("dataset");
+    if (!utils::fs::isDirectory(db_path))
+    {
+        cout << "Dataset folder "<< db_path.c_str() <<" doesn't exist!" << endl;
+        return -1;
+    }
+
+    const cv::String load_db_path = parser.get<String>("load");
+    if ((load_db_path != String()) && (!utils::fs::exists(load_db_path)))
+    {
+        cout << "File " << load_db_path.c_str()
+             << " where to load the flann structure from doesn't exist!" << endl;
+        return -1;
+    }
+
+    const cv::String save_db_path = parser.get<String>("save");
+
+    //-- Step 1: Detect the keypoints using a detector, compute the descriptors
+    //   in the folder containing the images of the dataset
+#ifdef _SIFT_
+    const int nfeatures = 400; //first argument of SIFT::create: number of best features to retain
+    Ptr<Feature2D> detector = SIFT::create( nfeatures );
+#elif defined(_ORB_)
+    Ptr<Feature2D> detector = ORB::create();
+#else
+    cout << "Missing or unknown defined descriptor. "
+            "Only SIFT and ORB are currently interfaced here" << endl;
+    return -1;
+#endif
+
+    std::vector<KeyPoint> db_keypoints;
+    Mat db_descriptors;
+    std::vector<unsigned int> db_images_indice_range; //store the range of indices per image
+    std::vector<int> db_indice_2_image_lut;           //match descriptor indice to its image
+    std::vector<cv::String> db_image_files;           //path of each image that could be loaded
+
+    db_images_indice_range.push_back(0);
+    std::vector<cv::String> files;
+    utils::fs::glob(db_path, cv::String(), files);
+    for (std::vector<cv::String>::iterator itr = files.begin(); itr != files.end(); ++itr)
+    {
+        Mat tmp_img = imread( *itr, IMREAD_GRAYSCALE );
+        if (tmp_img.empty())
+            continue; //not a readable image: it doesn't get an image index
+
+        std::vector<KeyPoint> kpts;
+        Mat descriptors;
+        detector->detectAndCompute( tmp_img, noArray(), kpts, descriptors );
+
+        db_keypoints.insert( db_keypoints.end(), kpts.begin(), kpts.end() );
+        db_descriptors.push_back( descriptors );
+        db_images_indice_range.push_back( db_images_indice_range.back()
+                                          + static_cast<unsigned int>(kpts.size()) );
+        // image indices only count the loaded images, so their paths are kept
+        // apart from the raw glob result
+        db_image_files.push_back( *itr );
+    }
+
+    if (db_descriptors.empty())
+    {
+        cout << "No descriptor could be computed from the dataset folder "
+             << db_path.c_str() << endl;
+        return -1;
+    }
+
+    //-- Set the LUT
+    db_indice_2_image_lut.resize( db_images_indice_range.back() );
+    const int nbr_of_imgs = static_cast<int>( db_images_indice_range.size()-1 );
+    for (int i = 0; i < nbr_of_imgs; ++i)
+    {
+        const unsigned int first_indice = db_images_indice_range[i];
+        const unsigned int last_indice = db_images_indice_range[i+1];
+        std::fill( db_indice_2_image_lut.begin() + first_indice,
+                   db_indice_2_image_lut.begin() + last_indice,
+                   i );
+    }
+
+    //-- Step 2: build the structure storing the descriptors
+#if defined(_SIFT_)
+    cv::Ptr<flann::GenericIndex<cvflann::L2<float> > > index;
+    if (load_db_path != String())
+        index = cv::makePtr<flann::GenericIndex<cvflann::L2<float> > >(db_descriptors,
+                                                             cvflann::SavedIndexParams(load_db_path));
+    else
+        index = cv::makePtr<flann::GenericIndex<cvflann::L2<float> > >(db_descriptors,
+                                                             cvflann::KDTreeIndexParams(4));
+
+#elif defined(_ORB_)
+    cv::Ptr<flann::GenericIndex<cvflann::Hamming<unsigned char> > > index;
+    if (load_db_path != String())
+        index  = cv::makePtr<flann::GenericIndex<cvflann::Hamming<unsigned char> > >
+                (db_descriptors, cvflann::SavedIndexParams(load_db_path));
+    else
+        index  = cv::makePtr<flann::GenericIndex<cvflann::Hamming<unsigned char> > >
+                (db_descriptors, cvflann::LshIndexParams());
+#else
+    cout<< "Descriptor not listed. Set the proper FLANN distance for this descriptor" <<endl;
+    return -1;
+#endif
+    if (save_db_path != String())
+        index->save(save_db_path);
+
+
+    // Return if no query image was set
+    if (img_path.empty())
+        return 0;
+
+    //-- Detect the keypoints and compute the descriptors for the query image
+    std::vector<KeyPoint> img_keypoints;
+    Mat img_descriptors;
+    detector->detectAndCompute( img, noArray(), img_keypoints, img_descriptors );
+
+    // Nothing can be matched when the query image yields no descriptor
+    if (img_descriptors.empty())
+    {
+        cout<<"No good match could be found."<<endl;
+        return 0;
+    }
+
+
+    //-- Step 3: retrieve the descriptors in the dataset matching the ones of the query image
+    // /!\ knnSearch doesn't follow OpenCV standards by not initialising empty Mat properties
+    const int knn = 2;
+    // indices start at -1 so that a slot left unset can be told apart from a real indice
+    Mat indices(img_descriptors.rows, knn, CV_32S, Scalar::all(-1));
+#if defined(_SIFT_)
+#define DIST_TYPE float
+    Mat dists(img_descriptors.rows, knn, CV_32F);
+#elif defined(_ORB_)
+#define DIST_TYPE int
+    Mat dists(img_descriptors.rows, knn, CV_32S);
+#endif
+    index->knnSearch( img_descriptors, indices, dists, knn, cvflann::SearchParams(32) );
+
+    //-- Filter matches using the Lowe's ratio test
+    const float ratio_thresh = 0.7f;
+    std::vector<DMatch> good_matches; //contains
+    std::vector<unsigned int> matches_per_img_histogram( nbr_of_imgs, 0 );
+    const int db_size = static_cast<int>( db_indice_2_image_lut.size() );
+    for (int i = 0; i < dists.rows; ++i)
+    {
+        // guard the LUT accesses against an indice that doesn't designate a
+        // descriptor of the dataset
+        const int indice_in_db = indices.at<int>(i,0);
+        if (indice_in_db < 0 || indice_in_db >= db_size)
+            continue;
+
+        if (dists.at<DIST_TYPE>(i,0) < ratio_thresh * dists.at<DIST_TYPE>(i,1))
+        {
+            DMatch dmatch(i, indice_in_db, db_indice_2_image_lut[indice_in_db],
+                          static_cast<float>(dists.at<DIST_TYPE>(i,0)));
+            good_matches.push_back( dmatch );
+            matches_per_img_histogram[ db_indice_2_image_lut[indice_in_db] ]++;
+        }
+    }
+
+
+    //-- Step 4: find the dataset image with the highest proportion of matches
+    std::multimap<float, img_info> images_infos;
+    for (int i = 0; i < nbr_of_imgs; ++i)
+    {
+        const unsigned int nbr_of_matches = matches_per_img_histogram[i];
+        if (nbr_of_matches < 4) //we need at least 4 points for a homography
+            continue;
+
+        const unsigned int nbr_of_kpts = db_images_indice_range[i+1] - db_images_indice_range[i];
+        const float inverse_proportion_of_retrieved_kpts =
+                static_cast<float>(nbr_of_kpts) / static_cast<float>(nbr_of_matches);
+
+        img_info info(i, nbr_of_matches);
+        images_infos.insert( std::pair<float,img_info>(inverse_proportion_of_retrieved_kpts,
+                                                       info) );
+    }
+
+    if (images_infos.begin() == images_infos.end())
+    {
+        cout<<"No good match could be found."<<endl;
+        return 0;
+    }
+
+    //-- if there are several images with a similar proportion of matches,
+    // select the one with the highest number of matches weighted by the
+    // squared ratio of proportions
+    const float best_matches_proportion = images_infos.begin()->first;
+    float new_matches_proportion = best_matches_proportion;
+    img_info best_img = images_infos.begin()->second;
+
+    std::multimap<float, img_info>::iterator it = images_infos.begin();
+    ++it;
+    while ((it!=images_infos.end()) && (it->first < 1.1*best_matches_proportion))
+    {
+        const float ratio = new_matches_proportion / it->first;
+        if( it->second.nbr_of_matches * (ratio * ratio) > best_img.nbr_of_matches)
+        {
+            new_matches_proportion = it->first;
+            best_img = it->second;
+        }
+        ++it;
+    }
+
+    //-- Step 5: filter goodmatches that belong to the best image match of the dataset
+    std::vector<DMatch> filtered_good_matches;
+    for (std::vector<DMatch>::iterator itr(good_matches.begin()); itr != good_matches.end(); ++itr)
+    {
+        if (itr->imgIdx == best_img.img_index)
+            filtered_good_matches.push_back(*itr);
+    }
+
+    //-- Retrieve the best image match from the dataset
+    // (img_index counts loaded images only, hence the lookup in db_image_files)
+    Mat db_img = imread( db_image_files[best_img.img_index], IMREAD_GRAYSCALE );
+
+    //-- Draw matches
+    Mat img_matches;
+    drawMatches( img, img_keypoints, db_img, db_keypoints, filtered_good_matches, img_matches, Scalar::all(-1),
+                 Scalar::all(-1), std::vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS );
+
+    //-- Show detected matches
+    imshow("Good Matches", img_matches );
+    waitKey();
+
+    return 0;
+}
--- a/samples/dnn/dasiamrpn_tracker.py
+++ b/samples/dnn/dasiamrpn_tracker.py
@ -14,8 +14,8 @@ import argparse
 import sys

 class DaSiamRPNTracker:
-    #initialization of used values, initial bounding box, used network
-    def __init__(self, im, target_pos, target_sz, net, kernel_r1, kernel_cls1):
+    # Initialization of used values, initial bounding box, used network
+    def __init__(self, net="dasiamrpn_model.onnx", kernel_r1="dasiamrpn_kernel_r1.onnx", kernel_cls1="dasiamrpn_kernel_cls1.onnx"):
        self.windowing = "cosine"
        self.exemplar_size = 127
        self.instance_size = 271
@ -28,42 +28,52 @@ class DaSiamRPNTracker:
        self.penalty_k = 0.055
        self.window_influence = 0.42
        self.lr = 0.295
-        self.im_h = im.shape[0]
-        self.im_w = im.shape[1]
-        self.target_pos = target_pos
-        self.target_sz = target_sz
-        self.avg_chans = np.mean(im, axis=(0, 1))
-        self.net = net
        self.score = []
-
-        if ((self.target_sz[0] * self.target_sz[1]) / float(self.im_h * self.im_w)) < 0.004:
-             raise AssertionError("Initializing BB is too small-try to restart tracker with larger BB")
-
-        self.anchor = self.__generate_anchor()
-        wc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
-        hc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
-        s_z = round(np.sqrt(wc_z * hc_z))
-
-        z_crop = self.__get_subwindow_tracking(im, self.exemplar_size, s_z)
-        z_crop = z_crop.transpose(2, 0, 1).reshape(1, 3, 127, 127).astype(np.float32)
-        self.net.setInput(z_crop)
-        z_f = self.net.forward('63')
-        kernel_r1.setInput(z_f)
-        r1 = kernel_r1.forward()
-        kernel_cls1.setInput(z_f)
-        cls1 = kernel_cls1.forward()
-        r1 = r1.reshape(20, 256, 4, 4)
-        cls1 = cls1.reshape(10, 256 , 4, 4)
-        self.net.setParam(self.net.getLayerId('65'), 0, r1)
-        self.net.setParam(self.net.getLayerId('68'), 0, cls1)
-
        if self.windowing == "cosine":
            self.window = np.outer(np.hanning(self.score_size), np.hanning(self.score_size))
        elif self.windowing == "uniform":
            self.window = np.ones((self.score_size, self.score_size))
        self.window = np.tile(self.window.flatten(), self.anchor_num)
+        # Load the network and kernel models
+        self.net = cv.dnn.readNet(net)
+        self.kernel_r1 = cv.dnn.readNet(kernel_r1)
+        self.kernel_cls1 = cv.dnn.readNet(kernel_cls1)

-    #creating anchor for tracking bounding box
+    def init(self, im, init_bb):
+        target_pos, target_sz = np.array([init_bb[0], init_bb[1]]), np.array([init_bb[2], init_bb[3]])
+        self.im_h = im.shape[0]
+        self.im_w = im.shape[1]
+        self.target_pos = target_pos
+        self.target_sz = target_sz
+        self.avg_chans = np.mean(im, axis=(0, 1))
+
+        # The ONNX model was generated from the pre-trained .pth model using
+        # only one state of the network. The exported state is the one for
+        # big bounding boxes, so an assertion is needed to reject bounding
+        # boxes that are too small - the current state of the network cannot
+        # work properly with such small bounding boxes
+        if ((self.target_sz[0] * self.target_sz[1]) / float(self.im_h * self.im_w)) < 0.004:
+            raise AssertionError(
+        "Initializing BB is too small-try to restart tracker with larger BB")
+
+        self.anchor = self.__generate_anchor()
+        wc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
+        hc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
+        s_z = round(np.sqrt(wc_z * hc_z))
+        z_crop = self.__get_subwindow_tracking(im, self.exemplar_size, s_z)
+        z_crop = z_crop.transpose(2, 0, 1).reshape(1, 3, 127, 127).astype(np.float32)
+        self.net.setInput(z_crop)
+        z_f = self.net.forward('63')
+        self.kernel_r1.setInput(z_f)
+        r1 = self.kernel_r1.forward()
+        self.kernel_cls1.setInput(z_f)
+        cls1 = self.kernel_cls1.forward()
+        r1 = r1.reshape(20, 256, 4, 4)
+        cls1 = cls1.reshape(10, 256 , 4, 4)
+        self.net.setParam(self.net.getLayerId('65'), 0, r1)
+        self.net.setParam(self.net.getLayerId('68'), 0, cls1)
+
+    # Creating anchor for tracking bounding box
    def __generate_anchor(self):
        self.anchor = np.zeros((self.anchor_num, 4),  dtype = np.float32)
        size = self.total_stride * self.total_stride
@ -86,8 +96,8 @@ class DaSiamRPNTracker:
        self.anchor[:, 0], self.anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
        return self.anchor

-    #track function
-    def track(self, im):
+    # Function for updating tracker state
+    def update(self, im):
        wc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
        hc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
        s_z = np.sqrt(wc_z * hc_z)
@ -96,7 +106,7 @@ class DaSiamRPNTracker:
        pad = d_search / scale_z
        s_x = round(s_z + 2 * pad)

-        #region preprocessing
+        # Region preprocessing part
        x_crop = self.__get_subwindow_tracking(im, self.instance_size, s_x)
        x_crop = x_crop.transpose(2, 0, 1).reshape(1, 3, 271, 271).astype(np.float32)
        self.score = self.__tracker_eval(x_crop, scale_z)
@ -105,7 +115,12 @@ class DaSiamRPNTracker:
        self.target_sz[0] = max(10, min(self.im_w, self.target_sz[0]))
        self.target_sz[1] = max(10, min(self.im_h, self.target_sz[1]))

-    #update bounding box position
+        cx, cy = self.target_pos
+        w, h = self.target_sz
+        updated_bb = (cx, cy, w, h)
+        return True, updated_bb
+
+    # Function for updating position of the bounding box
    def __tracker_eval(self, x_crop, scale_z):
        target_size = self.target_sz * scale_z
        self.net.setInput(x_crop)
@ -160,7 +175,7 @@ class DaSiamRPNTracker:
        y = e_x / e_x.sum(axis = 0)
        return y

-    #evaluations with cropped image
+    # Reshaping cropped image for using in the model
    def __get_subwindow_tracking(self, im, model_size, original_sz):
        im_sz = im.shape
        c = (original_sz + 1) / 2
@ -171,19 +186,20 @@ class DaSiamRPNTracker:
        left_pad = int(max(0., -context_xmin))
        top_pad = int(max(0., -context_ymin))
        right_pad = int(max(0., context_xmax - im_sz[1] + 1))
-        bottom_pad = int(max(0., context_ymax - im_sz[0] + 1))
+        bot_pad = int(max(0., context_ymax - im_sz[0] + 1))
        context_xmin += left_pad
        context_xmax += left_pad
        context_ymin += top_pad
        context_ymax += top_pad
        r, c, k = im.shape

-        if any([top_pad, bottom_pad, left_pad, right_pad]):
-            te_im = np.zeros((r + top_pad + bottom_pad, c + left_pad + right_pad, k), np.uint8)
+        if any([top_pad, bot_pad, left_pad, right_pad]):
+            te_im = np.zeros((
+                r + top_pad + bot_pad, c + left_pad + right_pad, k), np.uint8)
            te_im[top_pad:top_pad + r, left_pad:left_pad + c, :] = im
            if top_pad:
                te_im[0:top_pad, left_pad:left_pad + c, :] = self.avg_chans
-            if bottom_pad:
+            if bot_pad:
                te_im[r + top_pad:, left_pad:left_pad + c, :] = self.avg_chans
            if left_pad:
                te_im[:, 0:left_pad, :] = self.avg_chans
@ -195,23 +211,22 @@ class DaSiamRPNTracker:

        if not np.array_equal(model_size, original_sz):
            im_patch_original = cv.resize(im_patch_original, (model_size, model_size))
-
        return im_patch_original

-#function for reading paths, bounding box drawing, showing results
+# Sample for using DaSiamRPN tracker
 def main():
    parser = argparse.ArgumentParser(description="Run tracker")
+    parser.add_argument("--input", type=str, help="Full path to input (empty for camera)")
    parser.add_argument("--net", type=str, default="dasiamrpn_model.onnx", help="Full path to onnx model of net")
    parser.add_argument("--kernel_r1", type=str, default="dasiamrpn_kernel_r1.onnx", help="Full path to onnx model of kernel_r1")
    parser.add_argument("--kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx", help="Full path to onnx model of kernel_cls1")
-    parser.add_argument("--input", type=str, help="Full path to input. Do not use if input is camera")
    args = parser.parse_args()
    point1 = ()
    point2 = ()
    mark = True
    drawing = False
    cx, cy, w, h = 0.0, 0.0, 0, 0
-
+    # Function for drawing during the video stream
    def get_bb(event, x, y, flag, param):
        nonlocal point1, point2, cx, cy, w, h, drawing, mark

@ -233,12 +248,7 @@ def main():
            h = abs(point1[1] - point2[1])
            mark = False

-    #loading network`s and kernel`s models
-    net = cv.dnn.readNet(args.net)
-    kernel_r1 = cv.dnn.readNet(args.kernel_r1)
-    kernel_cls1 = cv.dnn.readNet(args.kernel_cls1)
-
-    #initializing bounding box
+    # Creating window for visualization
    cap = cv.VideoCapture(args.input if args.input else 0)
    cv.namedWindow("DaSiamRPN")
    cv.setMouseCallback("DaSiamRPN", get_bb)
@ -257,17 +267,17 @@ def main():
        cv.imshow("DaSiamRPN", twin)
        cv.waitKey(40)

-    target_pos, target_sz = np.array([cx, cy]), np.array([w, h])
-    tracker = DaSiamRPNTracker(frame, target_pos, target_sz, net, kernel_r1, kernel_cls1)
+    init_bb = (cx, cy, w, h)
+    tracker = DaSiamRPNTracker(args.net, args.kernel_r1, args.kernel_cls1)
+    tracker.init(frame, init_bb)

-    #tracking loop
+    # Tracking loop
    while cap.isOpened():
        has_frame, frame = cap.read()
        if not has_frame:
            sys.exit(0)
-        tracker.track(frame)
-        w, h = tracker.target_sz
-        cx, cy = tracker.target_pos
+        _, new_bb = tracker.update(frame)
+        cx, cy, w, h = new_bb
        cv.rectangle(frame, (int(cx - w // 2), int(cy - h // 2)), (int(cx - w // 2) + int(w), int(cy - h // 2) + int(h)),(0, 255, 255), 3)
        cv.imshow("DaSiamRPN", frame)
        key = cv.waitKey(1)