diff --git a/3rdparty/protobuf/src/google/protobuf/text_format.cc b/3rdparty/protobuf/src/google/protobuf/text_format.cc index 78f1acd7fe..eec6dffb6e 100644 --- a/3rdparty/protobuf/src/google/protobuf/text_format.cc +++ b/3rdparty/protobuf/src/google/protobuf/text_format.cc @@ -225,7 +225,9 @@ class TextFormat::Parser::ParserImpl { bool allow_unknown_enum, bool allow_field_number, bool allow_relaxed_whitespace, - bool allow_partial) + bool allow_partial, + int recursion_limit // backported from 3.8.0 + ) : error_collector_(error_collector), finder_(finder), parse_info_tree_(parse_info_tree), @@ -238,7 +240,9 @@ class TextFormat::Parser::ParserImpl { allow_unknown_enum_(allow_unknown_enum), allow_field_number_(allow_field_number), allow_partial_(allow_partial), - had_errors_(false) { + had_errors_(false), + recursion_limit_(recursion_limit) // backported from 3.8.0 + { // For backwards-compatibility with proto1, we need to allow the 'f' suffix // for floats. tokenizer_.set_allow_f_after_float(true); @@ -490,9 +494,9 @@ class TextFormat::Parser::ParserImpl { if (TryConsume(":") && !LookingAt("{") && !LookingAt("<")) { UnknownFieldSet* unknown_field = unknown_fields->AddGroup(unknown_fields->field_count()); unknown_field->AddLengthDelimited(0, field_name); // Add a field's name. - return SkipFieldValue(unknown_field); + return SkipFieldValue(unknown_field, recursion_limit_); } else { - return SkipFieldMessage(unknown_fields); + return SkipFieldMessage(unknown_fields, recursion_limit_); } } @@ -575,7 +579,14 @@ label_skip_parsing: } // Skips the next field including the field's name and value. - bool SkipField(UnknownFieldSet* unknown_fields) { + bool SkipField(UnknownFieldSet* unknown_fields, int recursion_limit) { + + // OpenCV specific + if (--recursion_limit < 0) { + ReportError("Message is too deep (SkipField)"); + return false; + } + string field_name; if (TryConsume("[")) { // Extension name. @@ -594,9 +605,9 @@ label_skip_parsing: if (TryConsume(":") && !LookingAt("{") && !LookingAt("<")) { UnknownFieldSet* unknown_field = unknown_fields->AddGroup(unknown_fields->field_count()); unknown_field->AddLengthDelimited(0, field_name); // Add a field's name. - DO(SkipFieldValue(unknown_field)); + DO(SkipFieldValue(unknown_field, recursion_limit)); } else { - DO(SkipFieldMessage(unknown_fields)); + DO(SkipFieldMessage(unknown_fields, recursion_limit)); } // For historical reasons, fields may optionally be separated by commas or // semicolons. @@ -608,6 +619,12 @@ label_skip_parsing: const Reflection* reflection, const FieldDescriptor* field) { + // backported from 3.8.0 + if (--recursion_limit_ < 0) { + ReportError("Message is too deep"); + return false; + } + // If the parse information tree is not NULL, create a nested one // for the nested message. ParseInfoTree* parent = parse_info_tree_; @@ -624,6 +641,9 @@ label_skip_parsing: delimiter)); } + // backported from 3.8.0 + ++recursion_limit_; + // Reset the parse information tree. parse_info_tree_ = parent; return true; @@ -631,11 +651,17 @@ label_skip_parsing: // Skips the whole body of a message including the beginning delimiter and // the ending delimiter. - bool SkipFieldMessage(UnknownFieldSet* unknown_fields) { + bool SkipFieldMessage(UnknownFieldSet* unknown_fields, int recursion_limit) { + // OpenCV specific + if (--recursion_limit < 0) { + ReportError("Message is too deep (SkipFieldMessage)"); + return false; + } + string delimiter; DO(ConsumeMessageDelimiter(&delimiter)); while (!LookingAt(">") && !LookingAt("}")) { - DO(SkipField(unknown_fields)); + DO(SkipField(unknown_fields, recursion_limit)); } DO(Consume(delimiter)); return true; @@ -775,7 +801,14 @@ label_skip_parsing: return true; } - bool SkipFieldValue(UnknownFieldSet* unknown_field) { + bool SkipFieldValue(UnknownFieldSet* unknown_field, int recursion_limit) { + + // OpenCV specific + if (--recursion_limit < 0) { + ReportError("Message is too deep (SkipFieldValue)"); + return false; + } + if (LookingAtType(io::Tokenizer::TYPE_STRING)) { while (LookingAtType(io::Tokenizer::TYPE_STRING)) { tokenizer_.Next(); @@ -785,9 +818,9 @@ label_skip_parsing: if (TryConsume("[")) { while (true) { if (!LookingAt("{") && !LookingAt("<")) { - DO(SkipFieldValue(unknown_field)); + DO(SkipFieldValue(unknown_field, recursion_limit)); } else { - DO(SkipFieldMessage(unknown_field)); + DO(SkipFieldMessage(unknown_field, recursion_limit)); } if (TryConsume("]")) { break; @@ -1156,6 +1189,7 @@ label_skip_parsing: const bool allow_field_number_; const bool allow_partial_; bool had_errors_; + int recursion_limit_; // backported from 3.8.0 }; #undef DO @@ -1306,17 +1340,19 @@ class TextFormat::Printer::TextGenerator TextFormat::Finder::~Finder() { } -TextFormat::Parser::Parser(bool allow_unknown_field) +TextFormat::Parser::Parser() : error_collector_(NULL), finder_(NULL), parse_info_tree_(NULL), allow_partial_(false), allow_case_insensitive_field_(false), - allow_unknown_field_(allow_unknown_field), + allow_unknown_field_(false), allow_unknown_enum_(false), allow_field_number_(false), allow_relaxed_whitespace_(false), - allow_singular_overwrites_(false) { + allow_singular_overwrites_(false), + recursion_limit_(std::numeric_limits::max()) +{ } TextFormat::Parser::~Parser() {} @@ -1335,7 +1371,7 @@ bool TextFormat::Parser::Parse(io::ZeroCopyInputStream* input, overwrites_policy, allow_case_insensitive_field_, allow_unknown_field_, allow_unknown_enum_, allow_field_number_, - allow_relaxed_whitespace_, allow_partial_); + allow_relaxed_whitespace_, allow_partial_, recursion_limit_); return MergeUsingImpl(input, output, &parser); } @@ -1353,7 +1389,7 @@ bool TextFormat::Parser::Merge(io::ZeroCopyInputStream* input, ParserImpl::ALLOW_SINGULAR_OVERWRITES, allow_case_insensitive_field_, allow_unknown_field_, allow_unknown_enum_, allow_field_number_, - allow_relaxed_whitespace_, allow_partial_); + allow_relaxed_whitespace_, allow_partial_, recursion_limit_); return MergeUsingImpl(input, output, &parser); } @@ -1388,7 +1424,7 @@ bool TextFormat::Parser::ParseFieldValueFromString( ParserImpl::ALLOW_SINGULAR_OVERWRITES, allow_case_insensitive_field_, allow_unknown_field_, allow_unknown_enum_, allow_field_number_, - allow_relaxed_whitespace_, allow_partial_); + allow_relaxed_whitespace_, allow_partial_, recursion_limit_); return parser.ParseField(field, output); } diff --git a/3rdparty/protobuf/src/google/protobuf/text_format.h b/3rdparty/protobuf/src/google/protobuf/text_format.h index 74d89a5f3e..a20a68d42f 100644 --- a/3rdparty/protobuf/src/google/protobuf/text_format.h +++ b/3rdparty/protobuf/src/google/protobuf/text_format.h @@ -457,7 +457,7 @@ class LIBPROTOBUF_EXPORT TextFormat { // For more control over parsing, use this class. class LIBPROTOBUF_EXPORT Parser { public: - Parser(bool allow_unknown_field = false); + Parser(); ~Parser(); // Like TextFormat::Parse(). @@ -508,10 +508,24 @@ class LIBPROTOBUF_EXPORT TextFormat { Message* output); + // backported from 3.8.0 + // When an unknown field is met, parsing will fail if this option is set + // to false(the default). If true, unknown fields will be ignored and + // a warning message will be generated. + // Please aware that set this option true may hide some errors (e.g. + // spelling error on field name). Avoid to use this option if possible. + void AllowUnknownField(bool allow) { allow_unknown_field_ = allow; } + + void AllowFieldNumber(bool allow) { allow_field_number_ = allow; } + // backported from 3.8.0 + // Sets maximum recursion depth which parser can use. This is effectively + // the maximum allowed nesting of proto messages. + void SetRecursionLimit(int limit) { recursion_limit_ = limit; } + private: // Forward declaration of an internal class used to parse text // representations (see text_format.cc for implementation). @@ -533,6 +547,7 @@ class LIBPROTOBUF_EXPORT TextFormat { bool allow_field_number_; bool allow_relaxed_whitespace_; bool allow_singular_overwrites_; + int recursion_limit_; // backported from 3.8.0 }; diff --git a/cmake/OpenCVFindProtobuf.cmake b/cmake/OpenCVFindProtobuf.cmake index 98e1ce2406..8835347d1d 100644 --- a/cmake/OpenCVFindProtobuf.cmake +++ b/cmake/OpenCVFindProtobuf.cmake @@ -6,9 +6,15 @@ if(NOT WITH_PROTOBUF) return() endif() -ocv_option(BUILD_PROTOBUF "Force to build libprotobuf from sources" ON) +ocv_option(BUILD_PROTOBUF "Force to build libprotobuf runtime from sources" ON) ocv_option(PROTOBUF_UPDATE_FILES "Force rebuilding .proto files (protoc should be available)" OFF) +# BUILD_PROTOBUF=OFF: Custom manual protobuf configuration (see find_package(Protobuf) for details): +# - Protobuf_INCLUDE_DIR +# - Protobuf_LIBRARY +# - Protobuf_PROTOC_EXECUTABLE + + function(get_protobuf_version version include) file(STRINGS "${include}/google/protobuf/stubs/common.h" ver REGEX "#define GOOGLE_PROTOBUF_VERSION [0-9]+") string(REGEX MATCHALL "[0-9]+" ver ${ver}) @@ -19,7 +25,9 @@ function(get_protobuf_version version include) endfunction() if(BUILD_PROTOBUF) + ocv_assert(NOT PROTOBUF_UPDATE_FILES) add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/protobuf") + set(Protobuf_LIBRARIES "libprotobuf") set(HAVE_PROTOBUF TRUE) else() unset(Protobuf_VERSION CACHE) @@ -44,10 +52,7 @@ else() if(Protobuf_FOUND) if(TARGET protobuf::libprotobuf) - add_library(libprotobuf INTERFACE IMPORTED) - set_target_properties(libprotobuf PROPERTIES - INTERFACE_LINK_LIBRARIES protobuf::libprotobuf - ) + set(Protobuf_LIBRARIES "protobuf::libprotobuf") else() add_library(libprotobuf UNKNOWN IMPORTED) set_target_properties(libprotobuf PROPERTIES @@ -56,21 +61,31 @@ else() INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${Protobuf_INCLUDE_DIR}" ) get_protobuf_version(Protobuf_VERSION "${Protobuf_INCLUDE_DIR}") + set(Protobuf_LIBRARIES "libprotobuf") endif() set(HAVE_PROTOBUF TRUE) endif() endif() if(HAVE_PROTOBUF AND PROTOBUF_UPDATE_FILES AND NOT COMMAND PROTOBUF_GENERATE_CPP) - find_package(Protobuf QUIET) - if(NOT COMMAND PROTOBUF_GENERATE_CPP) - message(FATAL_ERROR "PROTOBUF_GENERATE_CPP command is not available") - endif() + message(FATAL_ERROR "Can't configure protobuf dependency (BUILD_PROTOBUF=${BUILD_PROTOBUF} PROTOBUF_UPDATE_FILES=${PROTOBUF_UPDATE_FILES})") endif() if(HAVE_PROTOBUF) list(APPEND CUSTOM_STATUS protobuf) + if(NOT BUILD_PROTOBUF) + if(TARGET "${Protobuf_LIBRARIES}") + get_target_property(__location "${Protobuf_LIBRARIES}" IMPORTED_LOCATION_RELEASE) + if(NOT __location) + get_target_property(__location "${Protobuf_LIBRARIES}" IMPORTED_LOCATION) + endif() + elseif(Protobuf_LIBRARY) + set(__location "${Protobuf_LIBRARY}") + else() + set(__location "${Protobuf_LIBRARIES}") + endif() + endif() list(APPEND CUSTOM_STATUS_protobuf " Protobuf:" BUILD_PROTOBUF THEN "build (${Protobuf_VERSION})" - ELSE "${Protobuf_LIBRARY} (${Protobuf_VERSION})") + ELSE "${__location} (${Protobuf_VERSION})") endif() diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index 08704ef9dc..67ace67ff5 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -72,6 +72,9 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Winvalid-offsetof # Apple Clang (attr_value.pb.cc) ) +set(include_dirs "") +set(libs "") + if(PROTOBUF_UPDATE_FILES) file(GLOB proto_files "${CMAKE_CURRENT_LIST_DIR}/src/tensorflow/*.proto" "${CMAKE_CURRENT_LIST_DIR}/src/caffe/opencv-caffe.proto" "${CMAKE_CURRENT_LIST_DIR}/src/onnx/opencv-onnx.proto") set(PROTOBUF_GENERATE_CPP_APPEND_PATH ON) # required for tensorflow @@ -82,11 +85,15 @@ else() set(fw_inc "${CMAKE_CURRENT_LIST_DIR}/misc/caffe" "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow" "${CMAKE_CURRENT_LIST_DIR}/misc/onnx") endif() -set(include_dirs ${fw_inc}) +list(APPEND include_dirs ${fw_inc}) +list(APPEND libs ${Protobuf_LIBRARIES}) +if(NOT BUILD_PROTOBUF) + list(APPEND include_dirs ${Protobuf_INCLUDE_DIRS}) +endif() + set(sources_options "") -set(libs libprotobuf ${LAPACK_LIBRARIES}) - +list(APPEND libs ${LAPACK_LIBRARIES}) if(OPENCV_DNN_OPENCL AND HAVE_OPENCL) list(APPEND include_dirs ${OPENCL_INCLUDE_DIRS}) else() diff --git a/modules/dnn/src/caffe/caffe_io.cpp b/modules/dnn/src/caffe/caffe_io.cpp index e3f1113242..2fc4d84f46 100644 --- a/modules/dnn/src/caffe/caffe_io.cpp +++ b/modules/dnn/src/caffe/caffe_io.cpp @@ -1120,11 +1120,12 @@ bool ReadProtoFromTextFile(const char* filename, Message* proto) { std::ifstream fs(filename, std::ifstream::in); CHECK(fs.is_open()) << "Can't open \"" << filename << "\""; IstreamInputStream input(&fs); + google::protobuf::TextFormat::Parser parser; #ifndef OPENCV_DNN_EXTERNAL_PROTOBUF - return google::protobuf::TextFormat::Parser(true).Parse(&input, proto); -#else - return google::protobuf::TextFormat::Parser().Parse(&input, proto); + parser.AllowUnknownField(true); + parser.SetRecursionLimit(1000); #endif + return parser.Parse(&input, proto); } bool ReadProtoFromBinaryFile(const char* filename, Message* proto) { @@ -1137,12 +1138,12 @@ bool ReadProtoFromBinaryFile(const char* filename, Message* proto) { bool ReadProtoFromTextBuffer(const char* data, size_t len, Message* proto) { ArrayInputStream input(data, len); + google::protobuf::TextFormat::Parser parser; #ifndef OPENCV_DNN_EXTERNAL_PROTOBUF - return google::protobuf::TextFormat::Parser(true).Parse(&input, proto); -#else - return google::protobuf::TextFormat::Parser().Parse(&input, proto); + parser.AllowUnknownField(true); + parser.SetRecursionLimit(1000); #endif - + return parser.Parse(&input, proto); } diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp index 85113a94c0..e4e1f7a003 100644 --- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp +++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp @@ -223,6 +223,26 @@ public: } }; +class FlattenProdSubgraph : public Subgraph +{ +public: + FlattenProdSubgraph() + { + int input = addNodeToMatch(""); + int shape = addNodeToMatch("Shape", input); + int stack = addNodeToMatch("Const"); + int stack_1 = addNodeToMatch("Const"); + int stack_2 = addNodeToMatch("Const"); + int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2); + int prod = addNodeToMatch("Prod", strided_slice, addNodeToMatch("Const")); + int shape_pack = addNodeToMatch("Const"); + int pack = addNodeToMatch("Pack", shape_pack, prod); + addNodeToMatch("Reshape", input, pack); + + setFusedNode("Flatten", input); + } +}; + // K.layers.Softmax class SoftMaxKerasSubgraph : public Subgraph { @@ -629,6 +649,36 @@ public: } }; +class PReLUSubgraph : public TFSubgraph +{ +public: + PReLUSubgraph(bool negativeScales_) : negativeScales(negativeScales_) + { + int input = addNodeToMatch(""); + int scales = addNodeToMatch("Const"); + int neg = addNodeToMatch("Neg", input); + int relu_neg = addNodeToMatch("Relu", neg); + int finalScales = negativeScales ? addNodeToMatch("Neg", scales) : scales; + int mul = addNodeToMatch("Mul", finalScales, relu_neg); + int relu_pos = addNodeToMatch("Relu", input); + addNodeToMatch("Add", relu_pos, mul); + setFusedNode("PReLU", input, scales); + } + + virtual void finalize(tensorflow::GraphDef&, tensorflow::NodeDef* fusedNode, + std::vector& inputNodes) CV_OVERRIDE + { + if (!negativeScales) + { + Mat scales = getTensorContent(inputNodes[1]->attr().at("value").tensor(), /*copy*/false); + scales *= -1; + } + } + +private: + bool negativeScales; +}; + void simplifySubgraphs(tensorflow::GraphDef& net) { std::vector > subgraphs; @@ -649,6 +699,16 @@ void simplifySubgraphs(tensorflow::GraphDef& net) subgraphs.push_back(Ptr(new SoftMaxSlimV2Subgraph())); subgraphs.push_back(Ptr(new ReshapeAsShapeSubgraph())); subgraphs.push_back(Ptr(new KerasMVNSubgraph())); + subgraphs.push_back(Ptr(new PReLUSubgraph(true))); + subgraphs.push_back(Ptr(new PReLUSubgraph(false))); + subgraphs.push_back(Ptr(new FlattenProdSubgraph())); + + for (int i = 0; i < net.node_size(); ++i) + { + tensorflow::NodeDef* layer = net.mutable_node(i); + if (layer->op() == "AddV2") + layer->set_op("Add"); + } simplifySubgraphs(Ptr(new TFGraphWrapper(net)), subgraphs); } diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index ca4b8dbe9d..c462fde2c2 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -1231,6 +1231,7 @@ void TFImporter::populateNet(Net dstNet) // Only NHWC <-> NCHW permutations are allowed. OpenCV is always // keep NCHW layout this way. int inpLayout = getDataLayout(layer.input(0), data_layouts); + std::string type = "Identity"; if (inpLayout == DATA_LAYOUT_NHWC) { if (permData[0] == 0 && permData[1] == 3 && permData[2] == 1 && permData[3] == 2) @@ -1245,6 +1246,15 @@ void TFImporter::populateNet(Net dstNet) // in OpenCV: NCHW->NCHW data_layouts[name] = DATA_LAYOUT_NHWC; } + else if (permData[0] == 0 && permData[1] == 3 && permData[2] == 2 && permData[3] == 1) + { + // in TensorFlow: NHWC->NCWH + // in OpenCV: NCHW->NCWH + int permData[] = {0, 1, 3, 2}; + layerParams.set("order", DictValue::arrayInt(permData, perm.total())); + data_layouts[name] = DATA_LAYOUT_NCHW; // we keep track NCHW because channels position only matters + type = "Permute"; + } else CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed."); } @@ -1265,7 +1275,7 @@ void TFImporter::populateNet(Net dstNet) else CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed."); } - int id = dstNet.addLayer(name, "Identity", layerParams); + int id = dstNet.addLayer(name, type, layerParams); layer_id[name] = id; connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); } diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 3b2e1b1657..860cf2cb61 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -1035,11 +1035,25 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear) runTensorFlowNet("resize_bilinear_factor"); } -TEST_P(Test_TensorFlow_layers, tf2_keras) +TEST_P(Test_TensorFlow_layers, tf2_dense) { runTensorFlowNet("tf2_dense"); } +TEST_P(Test_TensorFlow_layers, tf2_prelu) +{ + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + runTensorFlowNet("tf2_prelu"); +} + +TEST_P(Test_TensorFlow_layers, tf2_permute_nhwc_ncwh) +{ + runTensorFlowNet("tf2_permute_nhwc_ncwh"); +} + TEST_P(Test_TensorFlow_layers, squeeze) { #if defined(INF_ENGINE_RELEASE) diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp index 7b035fa849..5ae9126983 100644 --- a/modules/objdetect/src/hog.cpp +++ b/modules/objdetect/src/hog.cpp @@ -1218,15 +1218,6 @@ static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y, UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size) { ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc); - if(k.empty()) - return false; - bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; - cv::String opts; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple()); - k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); if(k.empty()) return false; @@ -1287,19 +1278,10 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_ size_t localThreads[3] = { 1, 1, 1 }; int idx = 0; - bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; - cv::String opts; ocl::Kernel k; if ( nbins == 9 ) { k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple()); - k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); if(k.empty()) return false; @@ -1311,14 +1293,7 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_ } else { - k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32"); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple()); - k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); if(k.empty()) return false; @@ -1736,7 +1711,6 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size) { int nthreads; - bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; cv::String opts; ocl::Kernel k; @@ -1745,14 +1719,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y { case 180: nthreads = 180; - k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32"); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple()); - k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); if(k.empty()) return false; idx = k.set(idx, descr_size.width); @@ -1761,14 +1728,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y case 252: nthreads = 256; - k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32"); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple()); - k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); if(k.empty()) return false; idx = k.set(idx, descr_size.width); @@ -1777,14 +1737,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y default: nthreads = 256; - k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32"); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple()); - k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); if(k.empty()) return false; idx = k.set(idx, descr_size.area()); diff --git a/modules/objdetect/src/opencl/objdetect_hog.cl b/modules/objdetect/src/opencl/objdetect_hog.cl index 4fae320d02..156c8eb6a0 100644 --- a/modules/objdetect/src/opencl/objdetect_hog.cl +++ b/modules/objdetect/src/opencl/objdetect_hog.cl @@ -134,9 +134,7 @@ __kernel void compute_hists_lut_kernel( barrier(CLK_LOCAL_MEM_FENCE); if (cell_thread_x < 3) hist_[0] += hist_[3]; -#ifdef CPU barrier(CLK_LOCAL_MEM_FENCE); -#endif if (cell_thread_x == 0) final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2]; @@ -218,7 +216,6 @@ inline float reduce_smem(volatile __local float* smem, int size) barrier(CLK_LOCAL_MEM_FENCE); } if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); } -#ifdef CPU if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32]; barrier(CLK_LOCAL_MEM_FENCE); } if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16]; @@ -231,21 +228,6 @@ inline float reduce_smem(volatile __local float* smem, int size) barrier(CLK_LOCAL_MEM_FENCE); } if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1]; barrier(CLK_LOCAL_MEM_FENCE); } -#else - if (tid < 32) - { - if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; -#if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { -#endif - if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; - if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; - if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; - if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; - if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; - } -#endif return sum; } @@ -284,6 +266,10 @@ __kernel void normalize_hists_kernel( hist[0] = elem * scale; } +#define reduce_with_sync(target, sharedMemory, localMemory, tid, offset) \ + if (tid < target) sharedMemory[tid] = localMemory = localMemory + sharedMemory[tid + offset]; \ + barrier(CLK_LOCAL_MEM_FENCE); + //--------------------------------------------------------------------- // Linear SVM based classification // 48x96 window, 9 bins and default parameters @@ -316,43 +302,16 @@ __kernel void classify_hists_180_kernel( barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 90) products[tid] = product = product + products[tid + 90]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 45) products[tid] = product = product + products[tid + 45]; - barrier(CLK_LOCAL_MEM_FENCE); - - volatile __local float* smem = products; -#ifdef CPU - if (tid < 13) smem[tid] = product = product + smem[tid + 32]; - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) smem[tid] = product = product + smem[tid + 16]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<8) smem[tid] = product = product + smem[tid + 8]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<4) smem[tid] = product = product + smem[tid + 4]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<2) smem[tid] = product = product + smem[tid + 2]; - barrier(CLK_LOCAL_MEM_FENCE); -#else - if (tid < 13) - { - smem[tid] = product = product + smem[tid + 32]; - } -#if WAVE_SIZE < 32 - barrier(CLK_LOCAL_MEM_FENCE); -#endif - if (tid < 16) - { - smem[tid] = product = product + smem[tid + 16]; - smem[tid] = product = product + smem[tid + 8]; - smem[tid] = product = product + smem[tid + 4]; - smem[tid] = product = product + smem[tid + 2]; - } -#endif + reduce_with_sync(90, products, product, tid, 90); + reduce_with_sync(45, products, product, tid, 45); + reduce_with_sync(13, products, product, tid, 32); // 13 is not typo + reduce_with_sync(16, products, product, tid, 16); + reduce_with_sync(8, products, product, tid, 8); + reduce_with_sync(4, products, product, tid, 4); + reduce_with_sync(2, products, product, tid, 2); if (tid == 0){ - product = product + smem[tid + 1]; + product = product + products[tid + 1]; labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); } } @@ -389,40 +348,16 @@ __kernel void classify_hists_252_kernel( barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 128) products[tid] = product = product + products[tid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); + reduce_with_sync(128, products, product, tid, 128); + reduce_with_sync(64, products, product, tid, 64); + reduce_with_sync(32, products, product, tid, 32); + reduce_with_sync(16, products, product, tid, 16); + reduce_with_sync(8, products, product, tid, 8); + reduce_with_sync(4, products, product, tid, 4); + reduce_with_sync(2, products, product, tid, 2); - if (tid < 64) products[tid] = product = product + products[tid + 64]; - barrier(CLK_LOCAL_MEM_FENCE); - - volatile __local float* smem = products; -#ifdef CPU - if(tid<32) smem[tid] = product = product + smem[tid + 32]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<16) smem[tid] = product = product + smem[tid + 16]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<8) smem[tid] = product = product + smem[tid + 8]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<4) smem[tid] = product = product + smem[tid + 4]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<2) smem[tid] = product = product + smem[tid + 2]; - barrier(CLK_LOCAL_MEM_FENCE); -#else - if (tid < 32) - { - smem[tid] = product = product + smem[tid + 32]; -#if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { -#endif - smem[tid] = product = product + smem[tid + 16]; - smem[tid] = product = product + smem[tid + 8]; - smem[tid] = product = product + smem[tid + 4]; - smem[tid] = product = product + smem[tid + 2]; - } -#endif if (tid == 0){ - product = product + smem[tid + 1]; + product = product + products[tid + 1]; labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); } } @@ -459,40 +394,16 @@ __kernel void classify_hists_kernel( barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 128) products[tid] = product = product + products[tid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); + reduce_with_sync(128, products, product, tid, 128); + reduce_with_sync(64, products, product, tid, 64); + reduce_with_sync(32, products, product, tid, 32); + reduce_with_sync(16, products, product, tid, 16); + reduce_with_sync(8, products, product, tid, 8); + reduce_with_sync(4, products, product, tid, 4); + reduce_with_sync(2, products, product, tid, 2); - if (tid < 64) products[tid] = product = product + products[tid + 64]; - barrier(CLK_LOCAL_MEM_FENCE); - - volatile __local float* smem = products; -#ifdef CPU - if(tid<32) smem[tid] = product = product + smem[tid + 32]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<16) smem[tid] = product = product + smem[tid + 16]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<8) smem[tid] = product = product + smem[tid + 8]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<4) smem[tid] = product = product + smem[tid + 4]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<2) smem[tid] = product = product + smem[tid + 2]; - barrier(CLK_LOCAL_MEM_FENCE); -#else - if (tid < 32) - { - smem[tid] = product = product + smem[tid + 32]; -#if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { -#endif - smem[tid] = product = product + smem[tid + 16]; - smem[tid] = product = product + smem[tid + 8]; - smem[tid] = product = product + smem[tid + 4]; - smem[tid] = product = product + smem[tid + 2]; - } -#endif if (tid == 0){ - smem[tid] = product = product + smem[tid + 1]; + products[tid] = product = product + products[tid + 1]; labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); } } diff --git a/samples/dnn/face_detector/train.prototxt b/samples/dnn/face_detector/train.prototxt index d2b8167ca9..0bba9cb32b 100644 --- a/samples/dnn/face_detector/train.prototxt +++ b/samples/dnn/face_detector/train.prototxt @@ -1020,7 +1020,7 @@ layer { } convolution_param { num_output: 128 - pad: 1 + pad: 0 kernel_size: 3 stride: 1 weight_filler { @@ -1600,7 +1600,7 @@ layer { } convolution_param { num_output: 16 - pad: 0 + pad: 1 kernel_size: 3 stride: 1 weight_filler {