From 8488f2e26524d6bcf476c882e8462792ee11d0ab Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Tue, 24 Apr 2018 18:25:43 +0300 Subject: [PATCH] EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2) --- .../dnn_custom_layers/dnn_custom_layers.md | 22 +-- modules/dnn/include/opencv2/dnn/dnn.hpp | 4 + modules/dnn/src/nms.cpp | 24 +++ .../src/tensorflow/tf_graph_simplifier.cpp | 32 ++++ modules/dnn/src/tensorflow/tf_importer.cpp | 20 +++ modules/dnn/test/test_tf_importer.cpp | 74 +++++++- .../custom_layers.hpp} | 143 +++++++++------ samples/dnn/text_detection.cpp | 169 ++++++++++++++++++ 8 files changed, 412 insertions(+), 76 deletions(-) rename samples/{cpp/tutorial_code/dnn/custom_layers.cpp => dnn/custom_layers.hpp} (81%) create mode 100644 samples/dnn/text_detection.cpp diff --git a/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md b/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md index 5b3f3c7347..f367946620 100644 --- a/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md +++ b/doc/tutorials/dnn/dnn_custom_layers/dnn_custom_layers.md @@ -32,11 +32,11 @@ Unspecified error: Can't create layer "layer_name" of type "MyType" in function To import the model correctly you have to derive a class from cv::dnn::Layer with the following methods: -@snippet dnn/custom_layers.cpp A custom layer interface +@snippet dnn/custom_layers.hpp A custom layer interface And register it before the import: -@snippet dnn/custom_layers.cpp Register a custom layer +@snippet dnn/custom_layers.hpp Register a custom layer @note `MyType` is a type of unimplemented layer from the thrown exception. @@ -44,27 +44,27 @@ Let's see what all the methods do: - Constructor -@snippet dnn/custom_layers.cpp MyLayer::MyLayer +@snippet dnn/custom_layers.hpp MyLayer::MyLayer Retrieves hyper-parameters from cv::dnn::LayerParams. If your layer has trainable weights they will be already stored in the Layer's member cv::dnn::Layer::blobs. - A static method `create` -@snippet dnn/custom_layers.cpp MyLayer::create +@snippet dnn/custom_layers.hpp MyLayer::create This method should create an instance of you layer and return cv::Ptr with it. - Output blobs' shape computation -@snippet dnn/custom_layers.cpp MyLayer::getMemoryShapes +@snippet dnn/custom_layers.hpp MyLayer::getMemoryShapes Returns layer's output shapes depends on input shapes. You may request an extra memory using `internals`. - Run a layer -@snippet dnn/custom_layers.cpp MyLayer::forward +@snippet dnn/custom_layers.hpp MyLayer::forward Implement a layer's logic here. Compute outputs for given inputs. @@ -74,7 +74,7 @@ the second invocation of `forward` will has the same data at `outputs` and `inte - Optional `finalize` method -@snippet dnn/custom_layers.cpp MyLayer::finalize +@snippet dnn/custom_layers.hpp MyLayer::finalize The chain of methods are the following: OpenCV deep learning engine calls `create` method once then it calls `getMemoryShapes` for an every created layer then you @@ -108,11 +108,11 @@ layer { This way our implementation can look like: -@snippet dnn/custom_layers.cpp InterpLayer +@snippet dnn/custom_layers.hpp InterpLayer Next we need to register a new layer type and try to import the model. 
-@snippet dnn/custom_layers.cpp Register InterpLayer +@snippet dnn/custom_layers.hpp Register InterpLayer ## Example: custom layer from TensorFlow This is an example of how to import a network with [tf.image.resize_bilinear](https://www.tensorflow.org/versions/master/api_docs/python/tf/image/resize_bilinear) @@ -185,11 +185,11 @@ Custom layers import from TensorFlow is designed to put all layer's `attr` into cv::dnn::LayerParams but input `Const` blobs into cv::dnn::Layer::blobs. In our case resize's output shape will be stored in layer's `blobs[0]`. -@snippet dnn/custom_layers.cpp ResizeBilinearLayer +@snippet dnn/custom_layers.hpp ResizeBilinearLayer Next we register a layer and try to import the model. -@snippet dnn/custom_layers.cpp Register ResizeBilinearLayer +@snippet dnn/custom_layers.hpp Register ResizeBilinearLayer ## Define a custom layer in Python The following example shows how to customize OpenCV's layers in Python. diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 7f8c7e7499..6ac2f1a7fe 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -826,6 +826,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN CV_OUT std::vector& indices, const float eta = 1.f, const int top_k = 0); + CV_EXPORTS void NMSBoxes(const std::vector& bboxes, const std::vector& scores, + const float score_threshold, const float nms_threshold, + CV_OUT std::vector& indices, + const float eta = 1.f, const int top_k = 0); //! @} CV__DNN_EXPERIMENTAL_NS_END diff --git a/modules/dnn/src/nms.cpp b/modules/dnn/src/nms.cpp index 3adaef165d..2ce1257cad 100644 --- a/modules/dnn/src/nms.cpp +++ b/modules/dnn/src/nms.cpp @@ -8,6 +8,8 @@ #include "precomp.hpp" #include "nms.inl.hpp" +#include + namespace cv { namespace dnn @@ -28,6 +30,28 @@ void NMSBoxes(const std::vector& bboxes, const std::vector& scores, NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rectOverlap); } +static inline float rotatedRectIOU(const RotatedRect& a, const RotatedRect& b) +{ + std::vector inter, hull; + int res = rotatedRectangleIntersection(a, b, inter); + if (inter.empty() || res == INTERSECT_NONE) + return 0.0f; + if (res == INTERSECT_FULL) + return 1.0f; + convexHull(inter, hull); + float interArea = contourArea(hull); + return interArea / (a.size.area() + b.size.area() - interArea); +} + +void NMSBoxes(const std::vector& bboxes, const std::vector& scores, + const float score_threshold, const float nms_threshold, + std::vector& indices, const float eta, const int top_k) +{ + CV_Assert(bboxes.size() == scores.size(), score_threshold >= 0, + nms_threshold >= 0, eta > 0); + NMSFast_(bboxes, scores, score_threshold, nms_threshold, eta, top_k, indices, rotatedRectIOU); +} + CV__DNN_EXPERIMENTAL_NS_END }// dnn }// cv diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp index 677f57ab7d..9208588e65 100644 --- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp +++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp @@ -538,6 +538,37 @@ public: } }; +// In case of resizing by factor. 
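+// The matched pattern computes the target size from the input's own height and
+// width: Shape -> StridedSlice -> Mul(factorY) and Shape -> StridedSlice ->
+// Mul(factorX), packed together and fed to ResizeBilinear. The whole subgraph
+// is fused into a single ResizeBilinear node that keeps the two scale constants
+// as extra inputs (factorY, factorX), which the importer exposes as the layer's blobs.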
+class ResizeBilinearSubgraph : public Subgraph +{ +public: + ResizeBilinearSubgraph() + { + int input = addNodeToMatch(""); + + int shape = addNodeToMatch("Shape", input); + int stack = addNodeToMatch("Const"); + int stack_1 = addNodeToMatch("Const"); + int stack_2 = addNodeToMatch("Const"); + int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2); + int factorY = addNodeToMatch("Const"); + int mul = addNodeToMatch("Mul", strided_slice, factorY); + + shape = addNodeToMatch("Shape", input); + stack = addNodeToMatch("Const"); + stack_1 = addNodeToMatch("Const"); + stack_2 = addNodeToMatch("Const"); + strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2); + int factorX = addNodeToMatch("Const"); + int mul_1 = addNodeToMatch("Mul", strided_slice, factorX); + + int pack = addNodeToMatch("Pack", mul, mul_1); + + addNodeToMatch("ResizeBilinear", input, pack); + setFusedNode("ResizeBilinear", input, factorY, factorX); + } +}; + void simplifySubgraphs(tensorflow::GraphDef& net) { std::vector > subgraphs; @@ -551,6 +582,7 @@ void simplifySubgraphs(tensorflow::GraphDef& net) subgraphs.push_back(Ptr(new L2NormalizeSubgraph())); subgraphs.push_back(Ptr(new DeconvolutionValidKerasSubgraph())); subgraphs.push_back(Ptr(new DeconvolutionSameKerasSubgraph())); + subgraphs.push_back(Ptr(new ResizeBilinearSubgraph())); int numNodes = net.node_size(); std::vector matchedNodesIds; diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 667e573705..efedbceb48 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -767,6 +767,26 @@ void TFImporter::populateNet(Net dstNet) } } } + else if (type == "Sub") + { + bool haveConst = false; + for(int ii = 0; !haveConst && ii < layer.input_size(); ++ii) + { + Pin input = parsePin(layer.input(ii)); + haveConst = value_id.find(input.name) != value_id.end(); + } + CV_Assert(haveConst); + + layerParams.blobs.resize(1); + blobFromTensor(getConstBlob(layer, value_id), layerParams.blobs[0]); + layerParams.blobs[0] *= -1; + + int id = dstNet.addLayer(name, "Shift", layerParams); + layer_id[name] = id; + + // one input only + connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); + } else if (type == "MatMul") { CV_Assert(layer.input_size() == 2); diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 397aadfa08..64cfcb932a 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -373,9 +373,24 @@ public: ResizeBilinearLayer(const LayerParams ¶ms) : Layer(params) { CV_Assert(!params.get("align_corners", false)); - CV_Assert(blobs.size() == 1, blobs[0].type() == CV_32SC1); - outHeight = blobs[0].at(0, 0); - outWidth = blobs[0].at(0, 1); + CV_Assert(!blobs.empty()); + + for (size_t i = 0; i < blobs.size(); ++i) + CV_Assert(blobs[i].type() == CV_32SC1); + + if (blobs.size() == 1) + { + CV_Assert(blobs[0].total() == 2); + outHeight = blobs[0].at(0, 0); + outWidth = blobs[0].at(0, 1); + } + else + { + CV_Assert(blobs.size() == 2, blobs[0].total() == 1, blobs[1].total() == 1); + factorHeight = blobs[0].at(0, 0); + factorWidth = blobs[1].at(0, 0); + outHeight = outWidth = 0; + } } static Ptr create(LayerParams& params) @@ -391,12 +406,21 @@ public: std::vector outShape(4); outShape[0] = inputs[0][0]; // batch size outShape[1] = inputs[0][1]; // number of channels - outShape[2] = outHeight; - outShape[3] = outWidth; + outShape[2] = outHeight != 0 
? outHeight : (inputs[0][2] * factorHeight); + outShape[3] = outWidth != 0 ? outWidth : (inputs[0][3] * factorWidth); outputs.assign(1, outShape); return false; } + virtual void finalize(const std::vector& inputs, std::vector &outputs) CV_OVERRIDE + { + if (!outWidth && !outHeight) + { + outHeight = outputs[0].size[2]; + outWidth = outputs[0].size[3]; + } + } + // This implementation is based on a reference implementation from // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE @@ -447,13 +471,51 @@ private: return x + size[3] * (y + size[2] * (c + size[1] * b)); } - int outWidth, outHeight; + int outWidth, outHeight, factorWidth, factorHeight; }; TEST(Test_TensorFlow, resize_bilinear) { CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer); runTensorFlowNet("resize_bilinear"); + runTensorFlowNet("resize_bilinear_factor"); + LayerFactory::unregisterLayer("ResizeBilinear"); +} + +// inp = cv.imread('opencv_extra/testdata/cv/ximgproc/sources/08.png') +// inp = inp[:,:,[2, 1, 0]].astype(np.float32).reshape(1, 512, 512, 3) +// outs = sess.run([sess.graph.get_tensor_by_name('feature_fusion/Conv_7/Sigmoid:0'), +// sess.graph.get_tensor_by_name('feature_fusion/concat_3:0')], +// feed_dict={'input_images:0': inp}) +// scores = np.ascontiguousarray(outs[0].transpose(0, 3, 1, 2)) +// geometry = np.ascontiguousarray(outs[1].transpose(0, 3, 1, 2)) +// np.save('east_text_detection.scores.npy', scores) +// np.save('east_text_detection.geometry.npy', geometry) +TEST(Test_TensorFlow, EAST_text_detection) +{ + CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer); + std::string netPath = findDataFile("dnn/frozen_east_text_detection.pb", false); + std::string imgPath = findDataFile("cv/ximgproc/sources/08.png", false); + std::string refScoresPath = findDataFile("dnn/east_text_detection.scores.npy", false); + std::string refGeometryPath = findDataFile("dnn/east_text_detection.geometry.npy", false); + + Net net = readNet(findDataFile("dnn/frozen_east_text_detection.pb", false)); + + Mat img = imread(imgPath); + Mat inp = blobFromImage(img, 1.0, Size(), Scalar(123.68, 116.78, 103.94), true, false); + net.setInput(inp); + + std::vector outs; + std::vector outNames(2); + outNames[0] = "feature_fusion/Conv_7/Sigmoid"; + outNames[1] = "feature_fusion/concat_3"; + net.forward(outs, outNames); + + Mat scores = outs[0]; + Mat geometry = outs[1]; + + normAssert(scores, blobFromNPY(refScoresPath), "scores"); + normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", 5e-5, 1e-3); LayerFactory::unregisterLayer("ResizeBilinear"); } diff --git a/samples/cpp/tutorial_code/dnn/custom_layers.cpp b/samples/dnn/custom_layers.hpp similarity index 81% rename from samples/cpp/tutorial_code/dnn/custom_layers.cpp rename to samples/dnn/custom_layers.hpp index 217e53659f..918cc8ae46 100644 --- a/samples/cpp/tutorial_code/dnn/custom_layers.cpp +++ b/samples/dnn/custom_layers.hpp @@ -1,35 +1,8 @@ +#ifndef __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__ +#define __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__ + #include - -//! [A custom layer interface] -class MyLayer : public cv::dnn::Layer -{ -public: - //! [MyLayer::MyLayer] - MyLayer(const cv::dnn::LayerParams ¶ms); - //! [MyLayer::MyLayer] - - //! [MyLayer::create] - static cv::Ptr create(cv::dnn::LayerParams& params); - //! [MyLayer::create] - - //! 
[MyLayer::getMemoryShapes] - virtual bool getMemoryShapes(const std::vector > &inputs, - const int requiredOutputs, - std::vector > &outputs, - std::vector > &internals) const CV_OVERRIDE; - //! [MyLayer::getMemoryShapes] - - //! [MyLayer::forward] - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE; - //! [MyLayer::forward] - - //! [MyLayer::finalize] - virtual void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE; - //! [MyLayer::finalize] - - virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals) CV_OVERRIDE; -}; -//! [A custom layer interface] +#include // getPlane //! [InterpLayer] class InterpLayer : public cv::dnn::Layer @@ -113,15 +86,33 @@ private: //! [InterpLayer] //! [ResizeBilinearLayer] -class ResizeBilinearLayer : public cv::dnn::Layer +class ResizeBilinearLayer CV_FINAL : public cv::dnn::Layer { public: ResizeBilinearLayer(const cv::dnn::LayerParams ¶ms) : Layer(params) { CV_Assert(!params.get("align_corners", false)); - CV_Assert(blobs.size() == 1, blobs[0].type() == CV_32SC1); - outHeight = blobs[0].at(0, 0); - outWidth = blobs[0].at(0, 1); + CV_Assert(!blobs.empty()); + + for (size_t i = 0; i < blobs.size(); ++i) + CV_Assert(blobs[i].type() == CV_32SC1); + + // There are two cases of input blob: a single blob which contains output + // shape and two blobs with scaling factors. + if (blobs.size() == 1) + { + CV_Assert(blobs[0].total() == 2); + outHeight = blobs[0].at(0, 0); + outWidth = blobs[0].at(0, 1); + factorHeight = factorWidth = 0; + } + else + { + CV_Assert(blobs.size() == 2, blobs[0].total() == 1, blobs[1].total() == 1); + factorHeight = blobs[0].at(0, 0); + factorWidth = blobs[1].at(0, 0); + outHeight = outWidth = 0; + } } static cv::Ptr create(cv::dnn::LayerParams& params) @@ -130,25 +121,32 @@ public: } virtual bool getMemoryShapes(const std::vector > &inputs, - const int requiredOutputs, + const int, std::vector > &outputs, - std::vector > &internals) const CV_OVERRIDE + std::vector > &) const CV_OVERRIDE { - CV_UNUSED(requiredOutputs); CV_UNUSED(internals); std::vector outShape(4); outShape[0] = inputs[0][0]; // batch size outShape[1] = inputs[0][1]; // number of channels - outShape[2] = outHeight; - outShape[3] = outWidth; + outShape[2] = outHeight != 0 ? outHeight : (inputs[0][2] * factorHeight); + outShape[3] = outWidth != 0 ? outWidth : (inputs[0][3] * factorWidth); outputs.assign(1, outShape); return false; } + virtual void finalize(const std::vector&, std::vector &outputs) CV_OVERRIDE + { + if (!outWidth && !outHeight) + { + outHeight = outputs[0].size[2]; + outWidth = outputs[0].size[3]; + } + } + // This implementation is based on a reference implementation from // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE + virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &) CV_OVERRIDE { - CV_UNUSED(internals); cv::Mat& inp = *inputs[0]; cv::Mat& out = outputs[0]; const float* inpData = (float*)inp.data; @@ -195,19 +193,54 @@ private: return x + size[3] * (y + size[2] * (c + size[1] * b)); } - int outWidth, outHeight; + int outWidth, outHeight, factorWidth, factorHeight; }; //! [ResizeBilinearLayer] -//! 
[Register a custom layer] -#include // CV_DNN_REGISTER_LAYER_CLASS macro +// +// The folowing code is used only to generate tutorials documentation. +// -int main(int argc, char** argv) +//! [A custom layer interface] +class MyLayer : public cv::dnn::Layer { - CV_DNN_REGISTER_LAYER_CLASS(MyType, MyLayer); +public: + //! [MyLayer::MyLayer] + MyLayer(const cv::dnn::LayerParams ¶ms); + //! [MyLayer::MyLayer] + + //! [MyLayer::create] + static cv::Ptr create(cv::dnn::LayerParams& params); + //! [MyLayer::create] + + //! [MyLayer::getMemoryShapes] + virtual bool getMemoryShapes(const std::vector > &inputs, + const int requiredOutputs, + std::vector > &outputs, + std::vector > &internals) const CV_OVERRIDE; + //! [MyLayer::getMemoryShapes] + + //! [MyLayer::forward] + virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE; + //! [MyLayer::forward] + + //! [MyLayer::finalize] + virtual void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE; + //! [MyLayer::finalize] + + virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals) CV_OVERRIDE; +}; +//! [A custom layer interface] + +//! [Register a custom layer] +#include // CV_DNN_REGISTER_LAYER_CLASS + +static inline void loadNet() +{ + CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer); // ... //! [Register a custom layer] - CV_UNUSED(argc); CV_UNUSED(argv); + //! [Register InterpLayer] CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer); cv::dnn::Net caffeNet = cv::dnn::readNet("/path/to/config.prototxt", "/path/to/weights.caffemodel"); @@ -217,16 +250,8 @@ int main(int argc, char** argv) CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer); cv::dnn::Net tfNet = cv::dnn::readNet("/path/to/graph.pb"); //! [Register ResizeBilinearLayer] + + if (false) loadNet(); // To prevent unused function warning. } -cv::Ptr MyLayer::create(cv::dnn::LayerParams& params) -{ - return cv::Ptr(new MyLayer(params)); -} -MyLayer::MyLayer(const cv::dnn::LayerParams&) {} -bool MyLayer::getMemoryShapes(const std::vector >&, const int, - std::vector >&, - std::vector >&) const { return false; } -void MyLayer::forward(std::vector&, std::vector&, std::vector&) {} -void MyLayer::finalize(const std::vector&, std::vector&) {} -void MyLayer::forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) {} +#endif // __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__ diff --git a/samples/dnn/text_detection.cpp b/samples/dnn/text_detection.cpp new file mode 100644 index 0000000000..48157d8a13 --- /dev/null +++ b/samples/dnn/text_detection.cpp @@ -0,0 +1,169 @@ +#include +#include +#include + +#include "custom_layers.hpp" + +using namespace cv; +using namespace cv::dnn; + +const char* keys = + "{ help h | | Print help message. }" + "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}" + "{ model m | | Path to a binary .pb file contains trained network.}" + "{ width | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }" + "{ height | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }" + "{ thr | 0.5 | Confidence threshold. }" + "{ nms | 0.4 | Non-maximum suppression threshold. }"; + +void decode(const Mat& scores, const Mat& geometry, float scoreThresh, + std::vector& detections, std::vector& confidences); + +int main(int argc, char** argv) +{ + // Parse command line arguments. 
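+    // A typical invocation looks like (the executable name depends on how the sample is built):
+    //   text_detection --model=frozen_east_text_detection.pb --input=image.jpg
+    //                  --width=320 --height=320 --thr=0.5 --nms=0.4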
+    CommandLineParser parser(argc, argv, keys);
+    parser.about("Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of "
+                 "EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)");
+    if (argc == 1 || parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    float confThreshold = parser.get<float>("thr");
+    float nmsThreshold = parser.get<float>("nms");
+    int inpWidth = parser.get<int>("width");
+    int inpHeight = parser.get<int>("height");
+    CV_Assert(parser.has("model"));
+    String model = parser.get<String>("model");
+
+    // Register a custom layer.
+    CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer);
+
+    // Load network.
+    Net net = readNet(model);
+
+    // Open a video file or an image file or a camera stream.
+    VideoCapture cap;
+    if (parser.has("input"))
+        cap.open(parser.get<String>("input"));
+    else
+        cap.open(0);
+
+    static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector";
+    namedWindow(kWinName, WINDOW_NORMAL);
+
+    std::vector<Mat> outs;
+    std::vector<String> outNames(2);
+    outNames[0] = "feature_fusion/Conv_7/Sigmoid";
+    outNames[1] = "feature_fusion/concat_3";
+
+    Mat frame, blob;
+    while (waitKey(1) < 0)
+    {
+        cap >> frame;
+        if (frame.empty())
+        {
+            waitKey();
+            break;
+        }
+
+        blobFromImage(frame, blob, 1.0, Size(inpWidth, inpHeight), Scalar(123.68, 116.78, 103.94), true, false);
+        net.setInput(blob);
+        net.forward(outs, outNames);
+
+        Mat scores = outs[0];
+        Mat geometry = outs[1];
+
+        // Decode predicted bounding boxes.
+        std::vector<RotatedRect> boxes;
+        std::vector<float> confidences;
+        decode(scores, geometry, confThreshold, boxes, confidences);
+
+        // Apply non-maximum suppression procedure.
+        std::vector<int> indices;
+        NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
+
+        // Render detections.
+        Point2f ratio((float)frame.cols / inpWidth, (float)frame.rows / inpHeight);
+        for (size_t i = 0; i < indices.size(); ++i)
+        {
+            RotatedRect& box = boxes[indices[i]];
+
+            Point2f vertices[4];
+            box.points(vertices);
+            for (int j = 0; j < 4; ++j)
+            {
+                vertices[j].x *= ratio.x;
+                vertices[j].y *= ratio.y;
+            }
+            for (int j = 0; j < 4; ++j)
+                line(frame, vertices[j], vertices[(j + 1) % 4], Scalar(0, 255, 0), 1);
+        }
+
+        // Put efficiency information.
+        std::vector<double> layersTimes;
+        double freq = getTickFrequency() / 1000;
+        double t = net.getPerfProfile(layersTimes) / freq;
+        std::string label = format("Inference time: %.2f ms", t);
+        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+
+        imshow(kWinName, frame);
+    }
+    return 0;
+}
+
+void decode(const Mat& scores, const Mat& geometry, float scoreThresh,
+            std::vector<RotatedRect>& detections, std::vector<float>& confidences)
+{
+    detections.clear();
+    CV_Assert(scores.dims == 4, geometry.dims == 4, scores.size[0] == 1,
+              geometry.size[0] == 1, scores.size[1] == 1, geometry.size[1] == 5,
+              scores.size[2] == geometry.size[2], scores.size[3] == geometry.size[3]);
+
+    const int height = scores.size[2];
+    const int width = scores.size[3];
+    const int planeSize = height * width;
+
+    float* scoresData = (float*)scores.data;
+    float* geometryData = (float*)geometry.data;
+    float* x0_data = geometryData;
+    float* x1_data = geometryData + planeSize;
+    float* x2_data = geometryData + planeSize * 2;
+    float* x3_data = geometryData + planeSize * 3;
+    float* anglesData = geometryData + planeSize * 4;
+    for (int y = 0; y < height; ++y)
+    {
+        for (int x = 0; x < width; ++x)
+        {
+            float score = scoresData[x];
+            if (score < scoreThresh)
+                continue;
+
+            // Decode a prediction.
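+            // Each spatial location of the geometry blob stores four distances
+            // from that point to the top, right, bottom and left edges of the
+            // rotated text box (x0_data..x3_data) plus the box rotation angle
+            // (anglesData), so the box height is x0 + x2 and the width is x1 + x3.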
+
+            // Multiply by 4 because feature maps are 4 times smaller than the input image.
+            float offsetX = x * 4.0f, offsetY = y * 4.0f;
+            float angle = anglesData[x];
+            float cosA = std::cos(angle);
+            float sinA = std::sin(angle);
+            float h = x0_data[x] + x2_data[x];
+            float w = x1_data[x] + x3_data[x];
+
+            Point2f offset(offsetX + cosA * x1_data[x] + sinA * x2_data[x],
+                           offsetY - sinA * x1_data[x] + cosA * x2_data[x]);
+            Point2f p1 = Point2f(-sinA * h, -cosA * h) + offset;
+            Point2f p3 = Point2f(-cosA * w, sinA * w) + offset;
+            RotatedRect r(0.5f * (p1 + p3), Size2f(w, h), -angle * 180.0f / (float)CV_PI);
+            detections.push_back(r);
+            confidences.push_back(score);
+        }
+        scoresData += width;
+        x0_data += width;
+        x1_data += width;
+        x2_data += width;
+        x3_data += width;
+        anglesData += width;
+    }
+}