diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index 12a2081685..eabee9877a 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -217,6 +217,16 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow) Mat(cv::Size(300, 300), CV_32FC3)); } +PERF_TEST_P_(DNNTestNetwork, YOLOv3) +{ + if (backend != DNN_BACKEND_DEFAULT) + throw SkipTestException(""); + Mat sample = imread(findDataFile("dnn/dog416.png", false)); + Mat inp; + sample.convertTo(inp, CV_32FC3); + processNet("dnn/yolov3.cfg", "dnn/yolov3.weights", "", inp / 255); +} + const tuple testCases[] = { #ifdef HAVE_HALIDE tuple(DNN_BACKEND_HALIDE, DNN_TARGET_CPU), diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp index 2f109cda6b..71f762a09d 100644 --- a/modules/dnn/src/darknet/darknet_io.cpp +++ b/modules/dnn/src/darknet/darknet_io.cpp @@ -89,6 +89,8 @@ namespace cv { return init_val; } + static const std::string kFirstLayerName = "data"; + class setLayersParams { NetParameter *net; @@ -97,8 +99,8 @@ namespace cv { std::vector fused_layer_names; public: - setLayersParams(NetParameter *_net, std::string _first_layer = "data") : - net(_net), layer_id(0), last_layer(_first_layer) + setLayersParams(NetParameter *_net) : + net(_net), layer_id(0), last_layer(kFirstLayerName) {} void setLayerBlobs(int i, std::vector blobs) @@ -275,7 +277,7 @@ namespace cv { fused_layer_names.push_back(last_layer); } - void setPermute() + void setPermute(bool isDarknetLayer = true) { cv::dnn::LayerParams permute_params; permute_params.name = "Permute-name"; @@ -294,8 +296,11 @@ namespace cv { last_layer = layer_name; net->layers.push_back(lp); - layer_id++; - fused_layer_names.push_back(last_layer); + if (isDarknetLayer) + { + layer_id++; + fused_layer_names.push_back(last_layer); + } } void setRegion(float thresh, int coords, int classes, int anchors, int classfix, int softmax, int softmax_tree, float *biasData) @@ -327,6 +332,85 @@ 
namespace cv { layer_id++; fused_layer_names.push_back(last_layer); } + + void setYolo(int classes, const std::vector& mask, const std::vector& anchors) + { + cv::dnn::LayerParams region_param; + region_param.name = "Region-name"; + region_param.type = "Region"; + + const int numAnchors = mask.size(); + + region_param.set("classes", classes); + region_param.set("anchors", numAnchors); + region_param.set("logistic", true); + + std::vector usedAnchors(numAnchors * 2); + for (int i = 0; i < numAnchors; ++i) + { + usedAnchors[i * 2] = anchors[mask[i] * 2]; + usedAnchors[i * 2 + 1] = anchors[mask[i] * 2 + 1]; + } + + cv::Mat biasData_mat = cv::Mat(1, numAnchors * 2, CV_32F, &usedAnchors[0]).clone(); + region_param.blobs.push_back(biasData_mat); + + darknet::LayerParameter lp; + std::string layer_name = cv::format("yolo_%d", layer_id); + lp.layer_name = layer_name; + lp.layer_type = region_param.type; + lp.layerParams = region_param; + lp.bottom_indexes.push_back(last_layer); + lp.bottom_indexes.push_back(kFirstLayerName); + last_layer = layer_name; + net->layers.push_back(lp); + + layer_id++; + fused_layer_names.push_back(last_layer); + } + + void setShortcut(int from) + { + cv::dnn::LayerParams shortcut_param; + shortcut_param.name = "Shortcut-name"; + shortcut_param.type = "Eltwise"; + + shortcut_param.set("op", "sum"); + + darknet::LayerParameter lp; + std::string layer_name = cv::format("shortcut_%d", layer_id); + lp.layer_name = layer_name; + lp.layer_type = shortcut_param.type; + lp.layerParams = shortcut_param; + lp.bottom_indexes.push_back(fused_layer_names.at(from)); + lp.bottom_indexes.push_back(last_layer); + last_layer = layer_name; + net->layers.push_back(lp); + + layer_id++; + fused_layer_names.push_back(last_layer); + } + + void setUpsample(int scaleFactor) + { + cv::dnn::LayerParams param; + param.name = "Upsample-name"; + param.type = "ResizeNearestNeighbor"; + + param.set("zoom_factor", scaleFactor); + + darknet::LayerParameter lp; + std::string 
layer_name = cv::format("upsample_%d", layer_id); + lp.layer_name = layer_name; + lp.layer_type = param.type; + lp.layerParams = param; + lp.bottom_indexes.push_back(last_layer); + last_layer = layer_name; + net->layers.push_back(lp); + + layer_id++; + fused_layer_names.push_back(last_layer); + } }; std::string escapeString(const std::string &src) @@ -464,7 +548,7 @@ namespace cv { current_channels = 0; for (size_t k = 0; k < layers_vec.size(); ++k) { - layers_vec[k] += layers_counter; + layers_vec[k] = layers_vec[k] > 0 ? layers_vec[k] : (layers_vec[k] + layers_counter); current_channels += net->out_channels_vec[layers_vec[k]]; } @@ -496,9 +580,43 @@ namespace cv { CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size()); - setParams.setPermute(); + setParams.setPermute(false); setParams.setRegion(thresh, coords, classes, num_of_anchors, classfix, softmax, softmax_tree, anchors_vec.data()); } + else if (layer_type == "shortcut") + { + std::string bottom_layer = getParam(layer_params, "from", ""); + CV_Assert(!bottom_layer.empty()); + int from = std::atoi(bottom_layer.c_str()); + + from += layers_counter; + current_channels = net->out_channels_vec[from]; + + setParams.setShortcut(from); + } + else if (layer_type == "upsample") + { + int scaleFactor = getParam(layer_params, "stride", 1); + setParams.setUpsample(scaleFactor); + } + else if (layer_type == "yolo") + { + int classes = getParam(layer_params, "classes", -1); + int num_of_anchors = getParam(layer_params, "num", -1); + + std::string anchors_values = getParam(layer_params, "anchors", std::string()); + CV_Assert(!anchors_values.empty()); + std::vector anchors_vec = getNumbers(anchors_values); + + std::string mask_values = getParam(layer_params, "mask", std::string()); + CV_Assert(!mask_values.empty()); + std::vector mask_vec = getNumbers(mask_values); + + CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size()); + + 
setParams.setPermute(false); + setParams.setYolo(classes, mask_vec, anchors_vec); + } else { CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type); } @@ -598,6 +716,10 @@ namespace cv { if(activation == "leaky") ++cv_layers_counter; } + if (layer_type == "region" || layer_type == "yolo") + { + ++cv_layers_counter; // For permute. + } current_channels = net->out_channels_vec[darknet_layers_counter]; } return true; diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index d82e7cd7ce..8de9c09ecb 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1527,12 +1527,11 @@ struct Net::Impl convLayer = downLayerData->layerInstance.dynamicCast(); // first input layer is convolution layer - if( !convLayer.empty() ) + if( !convLayer.empty() && eltwiseData->consumers.size() == 1 ) { // fuse eltwise + activation layer LayerData *firstConvLayerData = downLayerData; { - CV_Assert(eltwiseData->consumers.size() == 1); nextData = &layers[eltwiseData->consumers[0].lid]; lpNext = LayerPin(eltwiseData->consumers[0].lid, 0); Ptr nextActivLayer; diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp index 0e4635ecbe..bcf038ce9f 100644 --- a/modules/dnn/src/layers/region_layer.cpp +++ b/modules/dnn/src/layers/region_layer.cpp @@ -59,7 +59,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer public: int coords, classes, anchors, classfix; float thresh, nmsThreshold; - bool useSoftmaxTree, useSoftmax; + bool useSoftmax, useLogistic; RegionLayerImpl(const LayerParams& params) { @@ -71,15 +71,17 @@ public: classes = params.get("classes", 0); anchors = params.get("anchors", 5); classfix = params.get("classfix", 0); - useSoftmaxTree = params.get("softmax_tree", false); useSoftmax = params.get("softmax", false); + useLogistic = params.get("logistic", false); nmsThreshold = params.get("nms_threshold", 0.4); CV_Assert(nmsThreshold >= 0.); CV_Assert(coords == 4); CV_Assert(classes >= 1); CV_Assert(anchors >= 
1); - CV_Assert(useSoftmaxTree || useSoftmax); + CV_Assert(useLogistic || useSoftmax); + if (params.get("softmax_tree", false)) + CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented"); } bool getMemoryShapes(const std::vector &inputs, @@ -89,7 +91,7 @@ public: { CV_Assert(inputs.size() > 0); CV_Assert(inputs[0][3] == (1 + coords + classes)*anchors); - outputs = std::vector(inputs.size(), shape(inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors)); + outputs = std::vector(1, shape(inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors)); return false; } @@ -124,14 +126,13 @@ public: std::vector inputs; std::vector outputs; + // TODO: implement a logistic activation to classification scores. + if (useLogistic) + return false; + inps.getUMatVector(inputs); outs.getUMatVector(outputs); - if (useSoftmaxTree) { // Yolo 9000 - CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented"); - return false; - } - CV_Assert(inputs.size() >= 1); int const cell_size = classes + coords + 1; UMat blob_umat = blobs[0].getUMat(ACCESS_READ); @@ -203,6 +204,7 @@ public: CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_Assert(inputs.size() >= 1); + CV_Assert(outputs.size() == 1); int const cell_size = classes + coords + 1; const float* biasData = blobs[0].ptr(); @@ -214,6 +216,9 @@ public: int rows = inpBlob.size[1]; int cols = inpBlob.size[2]; + CV_Assert(inputs.size() < 2 || inputs[1]->dims == 4); + int hNorm = inputs.size() > 1 ? inputs[1]->size[2] : rows; + int wNorm = inputs.size() > 1 ? 
inputs[1]->size[3] : cols; const float *srcData = inpBlob.ptr(); float *dstData = outBlob.ptr(); @@ -225,49 +230,47 @@ public: dstData[index + 4] = logistic_activate(x); // logistic activation } - if (useSoftmaxTree) { // Yolo 9000 - CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented"); - } - else if (useSoftmax) { // Yolo v2 + if (useSoftmax) { // Yolo v2 // softmax activation for Probability, for each grid cell (X x Y x Anchor-index) for (int i = 0; i < rows*cols*anchors; ++i) { int index = cell_size*i; softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5); } - - for (int x = 0; x < cols; ++x) - for(int y = 0; y < rows; ++y) - for (int a = 0; a < anchors; ++a) { - int index = (y*cols + x)*anchors + a; // index for each grid-cell & anchor - int p_index = index * cell_size + 4; - float scale = dstData[p_index]; - if (classfix == -1 && scale < .5) scale = 0; // if(t0 < 0.5) t0 = 0; - int box_index = index * cell_size; - - dstData[box_index + 0] = (x + logistic_activate(srcData[box_index + 0])) / cols; - dstData[box_index + 1] = (y + logistic_activate(srcData[box_index + 1])) / rows; - dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / cols; - dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / rows; - - int class_index = index * cell_size + 5; - - if (useSoftmaxTree) { - CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented"); - } - else { - for (int j = 0; j < classes; ++j) { - float prob = scale*dstData[class_index + j]; // prob = IoU(box, object) = t0 * class-probability - dstData[class_index + j] = (prob > thresh) ? 
prob : 0; // if (IoU < threshold) IoU = 0; - } - } - } - } + else if (useLogistic) { // Yolo v3 + for (int i = 0; i < rows*cols*anchors; ++i) + { + int index = cell_size*i; + const float* input = srcData + index + 5; + float* output = dstData + index + 5; + for (int i = 0; i < classes; ++i) + output[i] = logistic_activate(input[i]); + } + } + for (int x = 0; x < cols; ++x) + for(int y = 0; y < rows; ++y) + for (int a = 0; a < anchors; ++a) { + int index = (y*cols + x)*anchors + a; // index for each grid-cell & anchor + int p_index = index * cell_size + 4; + float scale = dstData[p_index]; + if (classfix == -1 && scale < .5) scale = 0; // if(t0 < 0.5) t0 = 0; + int box_index = index * cell_size; + dstData[box_index + 0] = (x + logistic_activate(srcData[box_index + 0])) / cols; + dstData[box_index + 1] = (y + logistic_activate(srcData[box_index + 1])) / rows; + dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / hNorm; + dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / wNorm; + + int class_index = index * cell_size + 5; + + for (int j = 0; j < classes; ++j) { + float prob = scale*dstData[class_index + j]; // prob = IoU(box, object) = t0 * class-probability + dstData[class_index + j] = (prob > thresh) ? 
prob : 0; // if (IoU < threshold) IoU = 0; + } + } if (nmsThreshold > 0) { do_nms_sort(dstData, rows*cols*anchors, thresh, nmsThreshold); } - } } diff --git a/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp b/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp index 4b871bff67..e9a966296e 100644 --- a/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp +++ b/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp @@ -16,9 +16,11 @@ public: ResizeNearestNeighborLayerImpl(const LayerParams& params) { setParamsFrom(params); - CV_Assert(params.has("width"), params.has("height")); - outWidth = params.get("width"); - outHeight = params.get("height"); + CV_Assert(params.has("width") && params.has("height") || params.has("zoom_factor")); + CV_Assert(!params.has("width") && !params.has("height") || !params.has("zoom_factor")); + outWidth = params.get("width", 0); + outHeight = params.get("height", 0); + zoomFactor = params.get("zoom_factor", 1); alignCorners = params.get("align_corners", false); if (alignCorners) CV_Error(Error::StsNotImplemented, "Nearest neighborhood resize with align_corners=true is not implemented"); @@ -31,12 +33,21 @@ public: { CV_Assert(inputs.size() == 1, inputs[0].size() == 4); outputs.resize(1, inputs[0]); - outputs[0][2] = outHeight; - outputs[0][3] = outWidth; + outputs[0][2] = outHeight > 0 ? outHeight : (outputs[0][2] * zoomFactor); + outputs[0][3] = outWidth > 0 ? outWidth : (outputs[0][3] * zoomFactor); // We can work in-place (do nothing) if input shape == output shape. 
return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]); } + virtual void finalize(const std::vector& inputs, std::vector &outputs) CV_OVERRIDE + { + if (!outWidth && !outHeight) + { + outHeight = outputs[0].size[2]; + outWidth = outputs[0].size[3]; + } + } + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); @@ -65,7 +76,7 @@ public: } } private: - int outWidth, outHeight; + int outWidth, outHeight, zoomFactor; bool alignCorners; }; diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index e32b35a3a3..ebe8d93406 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -42,9 +42,8 @@ //M*/ #include "test_precomp.hpp" +#include "npy_blob.hpp" #include -#include -#include namespace opencv_test { namespace { @@ -66,238 +65,136 @@ TEST(Test_Darknet, read_yolo_voc) ASSERT_FALSE(net.empty()); } -OCL_TEST(Reproducibility_TinyYoloVoc, Accuracy) +// Test object detection network from Darknet framework. 
+static void testDarknetModel(const std::string& cfg, const std::string& weights, + const std::vector& outNames, + const std::vector& refClassIds, + const std::vector& refConfidences, + const std::vector& refBoxes, + int targetId, float confThreshold = 0.24) { - Net net; - { - const string cfg = findDataFile("dnn/tiny-yolo-voc.cfg", false); - const string model = findDataFile("dnn/tiny-yolo-voc.weights", false); - net = readNetFromDarknet(cfg, model); - ASSERT_FALSE(net.empty()); - } - - net.setPreferableBackend(DNN_BACKEND_DEFAULT); - net.setPreferableTarget(DNN_TARGET_OPENCL); - - // dog416.png is dog.jpg that resized to 416x416 in the lossless PNG format Mat sample = imread(_tf("dog416.png")); - ASSERT_TRUE(!sample.empty()); + Mat inp = blobFromImage(sample, 1.0/255, Size(416, 416), Scalar(), true, false); - Size inputSize(416, 416); + Net net = readNet(findDataFile("dnn/" + cfg, false), + findDataFile("dnn/" + weights, false)); + net.setPreferableTarget(targetId); + net.setInput(inp); + std::vector outs; + net.forward(outs, outNames); - if (sample.size() != inputSize) - resize(sample, sample, inputSize); - - net.setInput(blobFromImage(sample, 1 / 255.F), "data"); - Mat out = net.forward("detection_out"); - - Mat detection; - const float confidenceThreshold = 0.24; - - for (int i = 0; i < out.rows; i++) { - const int probability_index = 5; - const int probability_size = out.cols - probability_index; - float *prob_array_ptr = &out.at(i, probability_index); - size_t objectClass = std::max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr; - float confidence = out.at(i, (int)objectClass + probability_index); - - if (confidence > confidenceThreshold) - detection.push_back(out.row(i)); + std::vector classIds; + std::vector confidences; + std::vector boxes; + for (int i = 0; i < outs.size(); ++i) + { + Mat& out = outs[i]; + for (int j = 0; j < out.rows; ++j) + { + Mat scores = out.row(j).colRange(5, out.cols); + double confidence; + Point 
maxLoc; + minMaxLoc(scores, 0, &confidence, 0, &maxLoc); + if (confidence > confThreshold) + { + float* detection = out.ptr(j); + float centerX = detection[0]; + float centerY = detection[1]; + float width = detection[2]; + float height = detection[3]; + boxes.push_back(Rect2f(centerX - 0.5 * width, centerY - 0.5 * height, + width, height)); + confidences.push_back(confidence); + classIds.push_back(maxLoc.x); + } + } } - // obtained by: ./darknet detector test ./cfg/voc.data ./cfg/tiny-yolo-voc.cfg ./tiny-yolo-voc.weights -thresh 0.24 ./dog416.png - // There are 2 objects (6-car, 11-dog) with 25 values for each: - // { relative_center_x, relative_center_y, relative_width, relative_height, unused_t0, probability_for_each_class[20] } - float ref_array[] = { - 0.736762F, 0.239551F, 0.315440F, 0.160779F, 0.761977F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.761967F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - - 0.287486F, 0.653731F, 0.315579F, 0.534527F, 0.782737F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.780595F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F - }; - - const int number_of_objects = 2; - Mat ref(number_of_objects, sizeof(ref_array) / (number_of_objects * sizeof(float)), CV_32FC1, &ref_array); - - normAssert(ref, detection); + ASSERT_EQ(classIds.size(), refClassIds.size()); + ASSERT_EQ(confidences.size(), refConfidences.size()); + ASSERT_EQ(boxes.size(), refBoxes.size()); + for (int i = 0; i < boxes.size(); ++i) + { + ASSERT_EQ(classIds[i], refClassIds[i]); + ASSERT_LE(std::abs(confidences[i] - refConfidences[i]), 1e-4); + float iou = (boxes[i] & refBoxes[i]).area() / (boxes[i] | refBoxes[i]).area(); + ASSERT_LE(std::abs(iou - 1.0f), 1e-4); + } } -TEST(Reproducibility_TinyYoloVoc, Accuracy) +typedef 
testing::TestWithParam Test_Darknet_nets; + +TEST_P(Test_Darknet_nets, YoloVoc) { - Net net; - { - const string cfg = findDataFile("dnn/tiny-yolo-voc.cfg", false); - const string model = findDataFile("dnn/tiny-yolo-voc.weights", false); - net = readNetFromDarknet(cfg, model); - ASSERT_FALSE(net.empty()); - } + int targetId = GetParam(); + std::vector outNames(1, "detection_out"); - // dog416.png is dog.jpg that resized to 416x416 in the lossless PNG format - Mat sample = imread(_tf("dog416.png")); - ASSERT_TRUE(!sample.empty()); - - Size inputSize(416, 416); - - if (sample.size() != inputSize) - resize(sample, sample, inputSize); - - net.setInput(blobFromImage(sample, 1 / 255.F), "data"); - Mat out = net.forward("detection_out"); - - Mat detection; - const float confidenceThreshold = 0.24; - - for (int i = 0; i < out.rows; i++) { - const int probability_index = 5; - const int probability_size = out.cols - probability_index; - float *prob_array_ptr = &out.at(i, probability_index); - size_t objectClass = std::max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr; - float confidence = out.at(i, (int)objectClass + probability_index); - - if (confidence > confidenceThreshold) - detection.push_back(out.row(i)); - } - - // obtained by: ./darknet detector test ./cfg/voc.data ./cfg/tiny-yolo-voc.cfg ./tiny-yolo-voc.weights -thresh 0.24 ./dog416.png - // There are 2 objects (6-car, 11-dog) with 25 values for each: - // { relative_center_x, relative_center_y, relative_width, relative_height, unused_t0, probability_for_each_class[20] } - float ref_array[] = { - 0.736762F, 0.239551F, 0.315440F, 0.160779F, 0.761977F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.761967F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - - 0.287486F, 0.653731F, 0.315579F, 0.534527F, 0.782737F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 
0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.780595F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F - }; - - const int number_of_objects = 2; - Mat ref(number_of_objects, sizeof(ref_array) / (number_of_objects * sizeof(float)), CV_32FC1, &ref_array); - - normAssert(ref, detection); + std::vector classIds(3); + std::vector confidences(3); + std::vector boxes(3); + classIds[0] = 6; confidences[0] = 0.750469f; boxes[0] = Rect2f(0.577374, 0.127391, 0.325575, 0.173418); // a car + classIds[1] = 1; confidences[1] = 0.780879f; boxes[1] = Rect2f(0.270762, 0.264102, 0.461713, 0.48131); // a bicycle + classIds[2] = 11; confidences[2] = 0.901615f; boxes[2] = Rect2f(0.1386, 0.338509, 0.282737, 0.60028); // a dog + testDarknetModel("yolo-voc.cfg", "yolo-voc.weights", outNames, + classIds, confidences, boxes, targetId); } -OCL_TEST(Reproducibility_YoloVoc, Accuracy) +TEST_P(Test_Darknet_nets, TinyYoloVoc) { - Net net; - { - const string cfg = findDataFile("dnn/yolo-voc.cfg", false); - const string model = findDataFile("dnn/yolo-voc.weights", false); - net = readNetFromDarknet(cfg, model); - ASSERT_FALSE(net.empty()); - } - - net.setPreferableBackend(DNN_BACKEND_DEFAULT); - net.setPreferableTarget(DNN_TARGET_OPENCL); - - // dog416.png is dog.jpg that resized to 416x416 in the lossless PNG format - Mat sample = imread(_tf("dog416.png")); - ASSERT_TRUE(!sample.empty()); - - Size inputSize(416, 416); - - if (sample.size() != inputSize) - resize(sample, sample, inputSize); - - net.setInput(blobFromImage(sample, 1 / 255.F), "data"); - Mat out = net.forward("detection_out"); - - Mat detection; - const float confidenceThreshold = 0.24; - - for (int i = 0; i < out.rows; i++) { - const int probability_index = 5; - const int probability_size = out.cols - probability_index; - float *prob_array_ptr = &out.at(i, probability_index); - size_t objectClass = std::max_element(prob_array_ptr, prob_array_ptr + probability_size) - 
prob_array_ptr; - float confidence = out.at(i, (int)objectClass + probability_index); - - if (confidence > confidenceThreshold) - detection.push_back(out.row(i)); - } - - // obtained by: ./darknet detector test ./cfg/voc.data ./cfg/yolo-voc.cfg ./yolo-voc.weights -thresh 0.24 ./dog416.png - // There are 3 objects (6-car, 1-bicycle, 11-dog) with 25 values for each: - // { relative_center_x, relative_center_y, relative_width, relative_height, unused_t0, probability_for_each_class[20] } - float ref_array[] = { - 0.740161F, 0.214100F, 0.325575F, 0.173418F, 0.750769F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.750469F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - - 0.501618F, 0.504757F, 0.461713F, 0.481310F, 0.783550F, 0.000000F, 0.780879F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - - 0.279968F, 0.638651F, 0.282737F, 0.600284F, 0.901864F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.901615F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F - }; - - const int number_of_objects = 3; - Mat ref(number_of_objects, sizeof(ref_array) / (number_of_objects * sizeof(float)), CV_32FC1, &ref_array); - - normAssert(ref, detection); + int targetId = GetParam(); + std::vector outNames(1, "detection_out"); + std::vector classIds(2); + std::vector confidences(2); + std::vector boxes(2); + classIds[0] = 6; confidences[0] = 0.761967f; boxes[0] = Rect2f(0.579042, 0.159161, 0.31544, 0.160779); // a car + classIds[1] = 11; confidences[1] = 0.780595f; boxes[1] = Rect2f(0.129696, 0.386467, 0.315579, 0.534527); // a dog + testDarknetModel("tiny-yolo-voc.cfg", "tiny-yolo-voc.weights", 
outNames, + classIds, confidences, boxes, targetId); } -TEST(Reproducibility_YoloVoc, Accuracy) +TEST_P(Test_Darknet_nets, YOLOv3) { - Net net; - { - const string cfg = findDataFile("dnn/yolo-voc.cfg", false); - const string model = findDataFile("dnn/yolo-voc.weights", false); - net = readNetFromDarknet(cfg, model); - ASSERT_FALSE(net.empty()); - } + int targetId = GetParam(); + std::vector outNames(3); + outNames[0] = "yolo_82"; + outNames[1] = "yolo_94"; + outNames[2] = "yolo_106"; - // dog416.png is dog.jpg that resized to 416x416 in the lossless PNG format - Mat sample = imread(_tf("dog416.png")); - ASSERT_TRUE(!sample.empty()); + std::vector classIds(3); + std::vector confidences(3); + std::vector boxes(3); + classIds[0] = 7; confidences[0] = 0.952983f; boxes[0] = Rect2f(0.614622, 0.150257, 0.286747, 0.138994); // a truck + classIds[1] = 1; confidences[1] = 0.987908f; boxes[1] = Rect2f(0.150913, 0.221933, 0.591342, 0.524327); // a bicycle + classIds[2] = 16; confidences[2] = 0.998836f; boxes[2] = Rect2f(0.160024, 0.389964, 0.257861, 0.553752); // a dog (COCO) + testDarknetModel("yolov3.cfg", "yolov3.weights", outNames, + classIds, confidences, boxes, targetId); +} - Size inputSize(416, 416); +INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_nets, availableDnnTargets()); - if (sample.size() != inputSize) - resize(sample, sample, inputSize); +static void testDarknetLayer(const std::string& name, bool hasWeights = false) +{ + std::string cfg = findDataFile("dnn/darknet/" + name + ".cfg", false); + std::string model = ""; + if (hasWeights) + model = findDataFile("dnn/darknet/" + name + ".weights", false); + Mat inp = blobFromNPY(findDataFile("dnn/darknet/" + name + "_in.npy", false)); + Mat ref = blobFromNPY(findDataFile("dnn/darknet/" + name + "_out.npy", false)); - net.setInput(blobFromImage(sample, 1 / 255.F), "data"); - Mat out = net.forward("detection_out"); + Net net = readNet(cfg, model); + net.setInput(inp); + Mat out = net.forward(); + normAssert(out, ref); +} 
- Mat detection; - const float confidenceThreshold = 0.24; +TEST(Test_Darknet, shortcut) +{ + testDarknetLayer("shortcut"); +} - for (int i = 0; i < out.rows; i++) { - const int probability_index = 5; - const int probability_size = out.cols - probability_index; - float *prob_array_ptr = &out.at(i, probability_index); - size_t objectClass = std::max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr; - float confidence = out.at(i, (int)objectClass + probability_index); - - if (confidence > confidenceThreshold) - detection.push_back(out.row(i)); - } - - // obtained by: ./darknet detector test ./cfg/voc.data ./cfg/yolo-voc.cfg ./yolo-voc.weights -thresh 0.24 ./dog416.png - // There are 3 objects (6-car, 1-bicycle, 11-dog) with 25 values for each: - // { relative_center_x, relative_center_y, relative_width, relative_height, unused_t0, probability_for_each_class[20] } - float ref_array[] = { - 0.740161F, 0.214100F, 0.325575F, 0.173418F, 0.750769F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.750469F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - - 0.501618F, 0.504757F, 0.461713F, 0.481310F, 0.783550F, 0.000000F, 0.780879F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - - 0.279968F, 0.638651F, 0.282737F, 0.600284F, 0.901864F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.901615F, - 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F, 0.000000F - }; - - const int number_of_objects = 3; - Mat ref(number_of_objects, sizeof(ref_array) / (number_of_objects * sizeof(float)), CV_32FC1, &ref_array); - - normAssert(ref, detection); +TEST(Test_Darknet, upsample) +{ + 
testDarknetLayer("upsample"); } }} // namespace diff --git a/samples/dnn/object_detection.cpp b/samples/dnn/object_detection.cpp index 52b08fd009..16d82eb0e7 100644 --- a/samples/dnn/object_detection.cpp +++ b/samples/dnn/object_detection.cpp @@ -35,12 +35,14 @@ using namespace dnn; float confThreshold; std::vector classes; -void postprocess(Mat& frame, const Mat& out, Net& net); +void postprocess(Mat& frame, const std::vector& out, Net& net); void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame); void callback(int pos, void* userdata); +std::vector getOutputsNames(const Net& net); + int main(int argc, char** argv) { CommandLineParser parser(argc, argv, keys); @@ -115,9 +117,10 @@ int main(int argc, char** argv) Mat imInfo = (Mat_(1, 3) << inpSize.height, inpSize.width, 1.6f); net.setInput(imInfo, "im_info"); } - Mat out = net.forward(); + std::vector outs; + net.forward(outs, getOutputsNames(net)); - postprocess(frame, out, net); + postprocess(frame, outs, net); // Put efficiency information. 
std::vector layersTimes; @@ -131,18 +134,19 @@ int main(int argc, char** argv) return 0; } -void postprocess(Mat& frame, const Mat& out, Net& net) +void postprocess(Mat& frame, const std::vector& outs, Net& net) { static std::vector outLayers = net.getUnconnectedOutLayers(); static std::string outLayerType = net.getLayer(outLayers[0])->type; - float* data = (float*)out.data; if (net.getLayer(0)->outputNameToIndex("im_info") != -1) // Faster-RCNN or R-FCN { // Network produces output blob with a shape 1x1xNx7 where N is a number of // detections and an every detection is a vector of values // [batchId, classId, confidence, left, top, right, bottom] - for (size_t i = 0; i < out.total(); i += 7) + CV_Assert(outs.size() == 1); + float* data = (float*)outs[0].data; + for (size_t i = 0; i < outs[0].total(); i += 7) { float confidence = data[i + 2]; if (confidence > confThreshold) @@ -161,7 +165,9 @@ void postprocess(Mat& frame, const Mat& out, Net& net) // Network produces output blob with a shape 1x1xNx7 where N is a number of // detections and an every detection is a vector of values // [batchId, classId, confidence, left, top, right, bottom] - for (size_t i = 0; i < out.total(); i += 7) + CV_Assert(outs.size() == 1); + float* data = (float*)outs[0].data; + for (size_t i = 0; i < outs[0].total(); i += 7) { float confidence = data[i + 2]; if (confidence > confThreshold) @@ -177,27 +183,45 @@ void postprocess(Mat& frame, const Mat& out, Net& net) } else if (outLayerType == "Region") { - // Network produces output blob with a shape NxC where N is a number of - // detected objects and C is a number of classes + 4 where the first 4 - // numbers are [center_x, center_y, width, height] - for (int i = 0; i < out.rows; ++i, data += out.cols) + std::vector classIds; + std::vector confidences; + std::vector boxes; + for (size_t i = 0; i < outs.size(); ++i) { - Mat confidences = out.row(i).colRange(5, out.cols); - Point classIdPoint; - double confidence; - minMaxLoc(confidences, 
0, &confidence, 0, &classIdPoint); - if (confidence > confThreshold) + // Network produces output blob with a shape NxC where N is a number of + // detected objects and C is a number of classes + 4 where the first 4 + // numbers are [center_x, center_y, width, height] + float* data = (float*)outs[i].data; + for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols) { - int classId = classIdPoint.x; - int centerX = (int)(data[0] * frame.cols); - int centerY = (int)(data[1] * frame.rows); - int width = (int)(data[2] * frame.cols); - int height = (int)(data[3] * frame.rows); - int left = centerX - width / 2; - int top = centerY - height / 2; - drawPred(classId, (float)confidence, left, top, left + width, top + height, frame); + Mat scores = outs[i].row(j).colRange(5, outs[i].cols); + Point classIdPoint; + double confidence; + minMaxLoc(scores, 0, &confidence, 0, &classIdPoint); + if (confidence > confThreshold) + { + int centerX = (int)(data[0] * frame.cols); + int centerY = (int)(data[1] * frame.rows); + int width = (int)(data[2] * frame.cols); + int height = (int)(data[3] * frame.rows); + int left = centerX - width / 2; + int top = centerY - height / 2; + + classIds.push_back(classIdPoint.x); + confidences.push_back((float)confidence); + boxes.push_back(Rect(left, top, width, height)); + } } } + std::vector indices; + NMSBoxes(boxes, confidences, confThreshold, 0.4, indices); + for (size_t i = 0; i < indices.size(); ++i) + { + int idx = indices[i]; + Rect box = boxes[idx]; + drawPred(classIds[idx], confidences[idx], box.x, box.y, + box.x + box.width, box.y + box.height, frame); + } } else CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType); @@ -227,3 +251,17 @@ void callback(int pos, void*) { confThreshold = pos * 0.01f; } + +std::vector getOutputsNames(const Net& net) +{ + static std::vector names; + if (names.empty()) + { + std::vector outLayers = net.getUnconnectedOutLayers(); + std::vector layersNames = net.getLayerNames(); + 
names.resize(outLayers.size()); + for (size_t i = 0; i < outLayers.size(); ++i) + names[i] = layersNames[outLayers[i] - 1]; + } + return names; +} diff --git a/samples/dnn/object_detection.py b/samples/dnn/object_detection.py index c54f5d3ced..01386f2363 100644 --- a/samples/dnn/object_detection.py +++ b/samples/dnn/object_detection.py @@ -55,7 +55,11 @@ net.setPreferableTarget(args.target) confThreshold = args.thr -def postprocess(frame, out): +def getOutputsNames(net): + layersNames = net.getLayerNames() + return [layersNames[i[0] - 1] for i in net.getUnconnectedOutLayers()] + +def postprocess(frame, outs): frameHeight = frame.shape[0] frameWidth = frame.shape[1] @@ -63,7 +67,7 @@ def postprocess(frame, out): # Draw a bounding box. cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0)) - label = '%.2f' % confidence + label = '%.2f' % conf # Print a label of class. if classes: @@ -83,6 +87,8 @@ def postprocess(frame, out): # Network produces output blob with a shape 1x1xNx7 where N is a number of # detections and an every detection is a vector of values # [batchId, classId, confidence, left, top, right, bottom] + assert(len(outs) == 1) + out = outs[0] for detection in out[0, 0]: confidence = detection[2] if confidence > confThreshold: @@ -96,6 +102,8 @@ def postprocess(frame, out): # Network produces output blob with a shape 1x1xNx7 where N is a number of # detections and an every detection is a vector of values # [batchId, classId, confidence, left, top, right, bottom] + assert(len(outs) == 1) + out = outs[0] for detection in out[0, 0]: confidence = detection[2] if confidence > confThreshold: @@ -109,18 +117,33 @@ def postprocess(frame, out): # Network produces output blob with a shape NxC where N is a number of # detected objects and C is a number of classes + 4 where the first 4 # numbers are [center_x, center_y, width, height] - for detection in out: - confidences = detection[5:] - classId = np.argmax(confidences) - confidence = confidences[classId] - 
if confidence > confThreshold: - center_x = int(detection[0] * frameWidth) - center_y = int(detection[1] * frameHeight) - width = int(detection[2] * frameWidth) - height = int(detection[3] * frameHeight) - left = center_x - width / 2 - top = center_y - height / 2 - drawPred(classId, confidence, left, top, left + width, top + height) + classIds = [] + confidences = [] + boxes = [] + for out in outs: + for detection in out: + scores = detection[5:] + classId = np.argmax(scores) + confidence = scores[classId] + if confidence > confThreshold: + center_x = int(detection[0] * frameWidth) + center_y = int(detection[1] * frameHeight) + width = int(detection[2] * frameWidth) + height = int(detection[3] * frameHeight) + left = center_x - width / 2 + top = center_y - height / 2 + classIds.append(classId) + confidences.append(float(confidence)) + boxes.append([left, top, width, height]) + indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, 0.4) + for i in indices: + i = i[0] + box = boxes[i] + left = box[0] + top = box[1] + width = box[2] + height = box[3] + drawPred(classIds[i], confidences[i], left, top, left + width, top + height) # Process inputs winName = 'Deep learning object detection in OpenCV' @@ -152,9 +175,9 @@ while cv.waitKey(1) < 0: if net.getLayer(0).outputNameToIndex('im_info') != -1: # Faster-RCNN or R-FCN frame = cv.resize(frame, (inpWidth, inpHeight)) net.setInput(np.array([inpHeight, inpWidth, 1.6], dtype=np.float32), 'im_info'); - out = net.forward() + outs = net.forward(getOutputsNames(net)) - postprocess(frame, out) + postprocess(frame, outs) # Put efficiency information. t, _ = net.getPerfProfile()