diff --git a/3rdparty/protobuf/src/google/protobuf/text_format.cc b/3rdparty/protobuf/src/google/protobuf/text_format.cc
index 78f1acd7fe..eec6dffb6e 100644
--- a/3rdparty/protobuf/src/google/protobuf/text_format.cc
+++ b/3rdparty/protobuf/src/google/protobuf/text_format.cc
@@ -225,7 +225,9 @@ class TextFormat::Parser::ParserImpl {
              bool allow_unknown_enum,
              bool allow_field_number,
              bool allow_relaxed_whitespace,
-             bool allow_partial)
+             bool allow_partial,
+             int recursion_limit  // backported from 3.8.0
+    )
     : error_collector_(error_collector),
       finder_(finder),
       parse_info_tree_(parse_info_tree),
@@ -238,7 +240,9 @@ class TextFormat::Parser::ParserImpl {
       allow_unknown_enum_(allow_unknown_enum),
       allow_field_number_(allow_field_number),
       allow_partial_(allow_partial),
-      had_errors_(false) {
+      had_errors_(false),
+      recursion_limit_(recursion_limit)  // backported from 3.8.0
+    {
     // For backwards-compatibility with proto1, we need to allow the 'f' suffix
     // for floats.
     tokenizer_.set_allow_f_after_float(true);
@@ -490,9 +494,9 @@ class TextFormat::Parser::ParserImpl {
       if (TryConsume(":") && !LookingAt("{") && !LookingAt("<")) {
         UnknownFieldSet* unknown_field = unknown_fields->AddGroup(unknown_fields->field_count());
         unknown_field->AddLengthDelimited(0, field_name);  // Add a field's name.
-        return SkipFieldValue(unknown_field);
+        return SkipFieldValue(unknown_field, recursion_limit_);
       } else {
-        return SkipFieldMessage(unknown_fields);
+        return SkipFieldMessage(unknown_fields, recursion_limit_);
       }
     }
 
@@ -575,7 +579,14 @@ label_skip_parsing:
   }
 
   // Skips the next field including the field's name and value.
-  bool SkipField(UnknownFieldSet* unknown_fields) {
+  bool SkipField(UnknownFieldSet* unknown_fields, int recursion_limit) {
+
+    // OpenCV specific
+    if (--recursion_limit < 0) {
+      ReportError("Message is too deep (SkipField)");
+      return false;
+    }
+
     string field_name;
     if (TryConsume("[")) {
       // Extension name.
@@ -594,9 +605,9 @@ label_skip_parsing:
     if (TryConsume(":") && !LookingAt("{") && !LookingAt("<")) {
       UnknownFieldSet* unknown_field = unknown_fields->AddGroup(unknown_fields->field_count());
       unknown_field->AddLengthDelimited(0, field_name);  // Add a field's name.
-      DO(SkipFieldValue(unknown_field));
+      DO(SkipFieldValue(unknown_field, recursion_limit));
     } else {
-      DO(SkipFieldMessage(unknown_fields));
+      DO(SkipFieldMessage(unknown_fields, recursion_limit));
     }
     // For historical reasons, fields may optionally be separated by commas or
     // semicolons.
@@ -608,6 +619,12 @@ label_skip_parsing:
                            const Reflection* reflection,
                            const FieldDescriptor* field) {
 
+    // backported from 3.8.0
+    if (--recursion_limit_ < 0) {
+      ReportError("Message is too deep");
+      return false;
+    }
+
     // If the parse information tree is not NULL, create a nested one
     // for the nested message.
     ParseInfoTree* parent = parse_info_tree_;
@@ -624,6 +641,9 @@ label_skip_parsing:
                         delimiter));
     }
 
+    // backported from 3.8.0
+    ++recursion_limit_;
+
     // Reset the parse information tree.
     parse_info_tree_ = parent;
     return true;
@@ -631,11 +651,17 @@ label_skip_parsing:
 
   // Skips the whole body of a message including the beginning delimiter and
   // the ending delimiter.
-  bool SkipFieldMessage(UnknownFieldSet* unknown_fields) {
+  bool SkipFieldMessage(UnknownFieldSet* unknown_fields, int recursion_limit) {
+    // OpenCV specific
+    if (--recursion_limit < 0) {
+      ReportError("Message is too deep (SkipFieldMessage)");
+      return false;
+    }
+
     string delimiter;
     DO(ConsumeMessageDelimiter(&delimiter));
     while (!LookingAt(">") &&  !LookingAt("}")) {
-      DO(SkipField(unknown_fields));
+      DO(SkipField(unknown_fields, recursion_limit));
     }
     DO(Consume(delimiter));
     return true;
@@ -775,7 +801,14 @@ label_skip_parsing:
     return true;
   }
 
-  bool SkipFieldValue(UnknownFieldSet* unknown_field) {
+  bool SkipFieldValue(UnknownFieldSet* unknown_field, int recursion_limit) {
+
+    // OpenCV specific
+    if (--recursion_limit < 0) {
+      ReportError("Message is too deep (SkipFieldValue)");
+      return false;
+    }
+
     if (LookingAtType(io::Tokenizer::TYPE_STRING)) {
       while (LookingAtType(io::Tokenizer::TYPE_STRING)) {
         tokenizer_.Next();
@@ -785,9 +818,9 @@ label_skip_parsing:
     if (TryConsume("[")) {
       while (true) {
         if (!LookingAt("{") && !LookingAt("<")) {
-          DO(SkipFieldValue(unknown_field));
+          DO(SkipFieldValue(unknown_field, recursion_limit));
         } else {
-          DO(SkipFieldMessage(unknown_field));
+          DO(SkipFieldMessage(unknown_field, recursion_limit));
         }
         if (TryConsume("]")) {
           break;
@@ -1156,6 +1189,7 @@ label_skip_parsing:
   const bool allow_field_number_;
   const bool allow_partial_;
   bool had_errors_;
+  int recursion_limit_;  // backported from 3.8.0
 };
 
 #undef DO
@@ -1306,17 +1340,19 @@ class TextFormat::Printer::TextGenerator
 TextFormat::Finder::~Finder() {
 }
 
-TextFormat::Parser::Parser(bool allow_unknown_field)
+TextFormat::Parser::Parser()
   : error_collector_(NULL),
     finder_(NULL),
     parse_info_tree_(NULL),
     allow_partial_(false),
     allow_case_insensitive_field_(false),
-    allow_unknown_field_(allow_unknown_field),
+    allow_unknown_field_(false),
     allow_unknown_enum_(false),
     allow_field_number_(false),
     allow_relaxed_whitespace_(false),
-    allow_singular_overwrites_(false) {
+    allow_singular_overwrites_(false),
+    recursion_limit_(std::numeric_limits<int>::max())
+{
 }
 
 TextFormat::Parser::~Parser() {}
@@ -1335,7 +1371,7 @@ bool TextFormat::Parser::Parse(io::ZeroCopyInputStream* input,
                     overwrites_policy,
                     allow_case_insensitive_field_, allow_unknown_field_,
                     allow_unknown_enum_, allow_field_number_,
-                    allow_relaxed_whitespace_, allow_partial_);
+                    allow_relaxed_whitespace_, allow_partial_, recursion_limit_);
   return MergeUsingImpl(input, output, &parser);
 }
 
@@ -1353,7 +1389,7 @@ bool TextFormat::Parser::Merge(io::ZeroCopyInputStream* input,
                     ParserImpl::ALLOW_SINGULAR_OVERWRITES,
                     allow_case_insensitive_field_, allow_unknown_field_,
                     allow_unknown_enum_, allow_field_number_,
-                    allow_relaxed_whitespace_, allow_partial_);
+                    allow_relaxed_whitespace_, allow_partial_, recursion_limit_);
   return MergeUsingImpl(input, output, &parser);
 }
 
@@ -1388,7 +1424,7 @@ bool TextFormat::Parser::ParseFieldValueFromString(
                     ParserImpl::ALLOW_SINGULAR_OVERWRITES,
                     allow_case_insensitive_field_, allow_unknown_field_,
                     allow_unknown_enum_, allow_field_number_,
-                    allow_relaxed_whitespace_, allow_partial_);
+                    allow_relaxed_whitespace_, allow_partial_, recursion_limit_);
   return parser.ParseField(field, output);
 }
 
diff --git a/3rdparty/protobuf/src/google/protobuf/text_format.h b/3rdparty/protobuf/src/google/protobuf/text_format.h
index 74d89a5f3e..a20a68d42f 100644
--- a/3rdparty/protobuf/src/google/protobuf/text_format.h
+++ b/3rdparty/protobuf/src/google/protobuf/text_format.h
@@ -457,7 +457,7 @@ class LIBPROTOBUF_EXPORT TextFormat {
   // For more control over parsing, use this class.
   class LIBPROTOBUF_EXPORT Parser {
    public:
-    Parser(bool allow_unknown_field = false);
+    Parser();
     ~Parser();
 
     // Like TextFormat::Parse().
@@ -508,10 +508,24 @@ class LIBPROTOBUF_EXPORT TextFormat {
                                    Message* output);
 
 
+    // backported from 3.8.0
+    // When an unknown field is met, parsing will fail if this option is set
+    // to false (the default). If true, unknown fields will be ignored and
+    // a warning message will be generated.
+    // Please be aware that setting this option to true may hide some errors
+    // (e.g. a misspelled field name). Avoid using this option if possible.
+    void AllowUnknownField(bool allow) { allow_unknown_field_ = allow; }
+
+
     void AllowFieldNumber(bool allow) {
       allow_field_number_ = allow;
     }
 
+    // backported from 3.8.0
+    // Sets maximum recursion depth which parser can use. This is effectively
+    // the maximum allowed nesting of proto messages.
+    void SetRecursionLimit(int limit) { recursion_limit_ = limit; }
+
    private:
     // Forward declaration of an internal class used to parse text
     // representations (see text_format.cc for implementation).
@@ -533,6 +547,7 @@ class LIBPROTOBUF_EXPORT TextFormat {
     bool allow_field_number_;
     bool allow_relaxed_whitespace_;
     bool allow_singular_overwrites_;
+    int recursion_limit_;  // backported from 3.8.0
   };
 
 
diff --git a/cmake/OpenCVFindProtobuf.cmake b/cmake/OpenCVFindProtobuf.cmake
index 98e1ce2406..8835347d1d 100644
--- a/cmake/OpenCVFindProtobuf.cmake
+++ b/cmake/OpenCVFindProtobuf.cmake
@@ -6,9 +6,15 @@ if(NOT WITH_PROTOBUF)
   return()
 endif()
 
-ocv_option(BUILD_PROTOBUF "Force to build libprotobuf from sources" ON)
+ocv_option(BUILD_PROTOBUF "Force to build libprotobuf runtime from sources" ON)
 ocv_option(PROTOBUF_UPDATE_FILES "Force rebuilding .proto files (protoc should be available)" OFF)
 
+# BUILD_PROTOBUF=OFF: Custom manual protobuf configuration (see find_package(Protobuf) for details):
+# - Protobuf_INCLUDE_DIR
+# - Protobuf_LIBRARY
+# - Protobuf_PROTOC_EXECUTABLE
+
+
 function(get_protobuf_version version include)
   file(STRINGS "${include}/google/protobuf/stubs/common.h" ver REGEX "#define GOOGLE_PROTOBUF_VERSION [0-9]+")
   string(REGEX MATCHALL "[0-9]+" ver ${ver})
@@ -19,7 +25,9 @@ function(get_protobuf_version version include)
 endfunction()
 
 if(BUILD_PROTOBUF)
+  ocv_assert(NOT PROTOBUF_UPDATE_FILES)
   add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/protobuf")
+  set(Protobuf_LIBRARIES "libprotobuf")
   set(HAVE_PROTOBUF TRUE)
 else()
   unset(Protobuf_VERSION CACHE)
@@ -44,10 +52,7 @@ else()
 
   if(Protobuf_FOUND)
     if(TARGET protobuf::libprotobuf)
-      add_library(libprotobuf INTERFACE IMPORTED)
-      set_target_properties(libprotobuf PROPERTIES
-        INTERFACE_LINK_LIBRARIES protobuf::libprotobuf
-      )
+      set(Protobuf_LIBRARIES "protobuf::libprotobuf")
     else()
       add_library(libprotobuf UNKNOWN IMPORTED)
       set_target_properties(libprotobuf PROPERTIES
@@ -56,21 +61,31 @@ else()
         INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${Protobuf_INCLUDE_DIR}"
       )
       get_protobuf_version(Protobuf_VERSION "${Protobuf_INCLUDE_DIR}")
+      set(Protobuf_LIBRARIES "libprotobuf")
     endif()
     set(HAVE_PROTOBUF TRUE)
   endif()
 endif()
 
 if(HAVE_PROTOBUF AND PROTOBUF_UPDATE_FILES AND NOT COMMAND PROTOBUF_GENERATE_CPP)
-  find_package(Protobuf QUIET)
-  if(NOT COMMAND PROTOBUF_GENERATE_CPP)
-    message(FATAL_ERROR "PROTOBUF_GENERATE_CPP command is not available")
-  endif()
+  message(FATAL_ERROR "Can't configure protobuf dependency (BUILD_PROTOBUF=${BUILD_PROTOBUF} PROTOBUF_UPDATE_FILES=${PROTOBUF_UPDATE_FILES})")
 endif()
 
 if(HAVE_PROTOBUF)
   list(APPEND CUSTOM_STATUS protobuf)
+  if(NOT BUILD_PROTOBUF)
+    if(TARGET "${Protobuf_LIBRARIES}")
+      get_target_property(__location "${Protobuf_LIBRARIES}" IMPORTED_LOCATION_RELEASE)
+      if(NOT __location)
+        get_target_property(__location "${Protobuf_LIBRARIES}" IMPORTED_LOCATION)
+      endif()
+    elseif(Protobuf_LIBRARY)
+      set(__location "${Protobuf_LIBRARY}")
+    else()
+      set(__location "${Protobuf_LIBRARIES}")
+    endif()
+  endif()
   list(APPEND CUSTOM_STATUS_protobuf "    Protobuf:"
     BUILD_PROTOBUF THEN "build (${Protobuf_VERSION})"
-    ELSE "${Protobuf_LIBRARY} (${Protobuf_VERSION})")
+    ELSE "${__location} (${Protobuf_VERSION})")
 endif()
diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index 08704ef9dc..67ace67ff5 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -72,6 +72,9 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS
     -Winvalid-offsetof                 # Apple Clang (attr_value.pb.cc)
 )
 
+set(include_dirs "")
+set(libs "")
+
 if(PROTOBUF_UPDATE_FILES)
   file(GLOB proto_files "${CMAKE_CURRENT_LIST_DIR}/src/tensorflow/*.proto" "${CMAKE_CURRENT_LIST_DIR}/src/caffe/opencv-caffe.proto" "${CMAKE_CURRENT_LIST_DIR}/src/onnx/opencv-onnx.proto")
   set(PROTOBUF_GENERATE_CPP_APPEND_PATH ON) # required for tensorflow
@@ -82,11 +85,15 @@ else()
   set(fw_inc "${CMAKE_CURRENT_LIST_DIR}/misc/caffe" "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow" "${CMAKE_CURRENT_LIST_DIR}/misc/onnx")
 endif()
 
-set(include_dirs ${fw_inc})
+list(APPEND include_dirs ${fw_inc})
+list(APPEND libs ${Protobuf_LIBRARIES})
+if(NOT BUILD_PROTOBUF)
+  list(APPEND include_dirs ${Protobuf_INCLUDE_DIRS})
+endif()
+
 set(sources_options "")
 
-set(libs libprotobuf ${LAPACK_LIBRARIES})
-
+list(APPEND libs ${LAPACK_LIBRARIES})
 if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
   list(APPEND include_dirs ${OPENCL_INCLUDE_DIRS})
 else()
diff --git a/modules/dnn/src/caffe/caffe_io.cpp b/modules/dnn/src/caffe/caffe_io.cpp
index e3f1113242..2fc4d84f46 100644
--- a/modules/dnn/src/caffe/caffe_io.cpp
+++ b/modules/dnn/src/caffe/caffe_io.cpp
@@ -1120,11 +1120,12 @@ bool ReadProtoFromTextFile(const char* filename, Message* proto) {
     std::ifstream fs(filename, std::ifstream::in);
     CHECK(fs.is_open()) << "Can't open \"" << filename << "\"";
     IstreamInputStream input(&fs);
+    google::protobuf::TextFormat::Parser parser;
 #ifndef OPENCV_DNN_EXTERNAL_PROTOBUF
-    return google::protobuf::TextFormat::Parser(true).Parse(&input, proto);
-#else
-    return google::protobuf::TextFormat::Parser().Parse(&input, proto);
+    parser.AllowUnknownField(true);
+    parser.SetRecursionLimit(1000);
 #endif
+    return parser.Parse(&input, proto);
 }
 
 bool ReadProtoFromBinaryFile(const char* filename, Message* proto) {
@@ -1137,12 +1138,12 @@ bool ReadProtoFromBinaryFile(const char* filename, Message* proto) {
 
 bool ReadProtoFromTextBuffer(const char* data, size_t len, Message* proto) {
     ArrayInputStream input(data, len);
+    google::protobuf::TextFormat::Parser parser;
 #ifndef OPENCV_DNN_EXTERNAL_PROTOBUF
-    return google::protobuf::TextFormat::Parser(true).Parse(&input, proto);
-#else
-    return google::protobuf::TextFormat::Parser().Parse(&input, proto);
+    parser.AllowUnknownField(true);
+    parser.SetRecursionLimit(1000);
 #endif
-
+    return parser.Parse(&input, proto);
 }
 
 
diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
index 85113a94c0..e4e1f7a003 100644
--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
@@ -223,6 +223,26 @@ public:
     }
 };
 
+class FlattenProdSubgraph : public Subgraph  // pattern: Reshape(input, Pack(Const, Prod(StridedSlice(Shape(input), ...)))) fused into "Flatten"
+{
+public:
+    FlattenProdSubgraph()
+    {
+        int input = addNodeToMatch("");  // empty op name: presumably matches a node of any type -- confirm in Subgraph
+        int shape = addNodeToMatch("Shape", input);
+        int stack = addNodeToMatch("Const");
+        int stack_1 = addNodeToMatch("Const");
+        int stack_2 = addNodeToMatch("Const");
+        int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
+        int prod = addNodeToMatch("Prod", strided_slice, addNodeToMatch("Const"));
+        int shape_pack = addNodeToMatch("Const");
+        int pack = addNodeToMatch("Pack", shape_pack, prod);
+        addNodeToMatch("Reshape", input, pack);
+        // Only `input` is handed to the fused node; none of the shape-computing nodes above are passed on.
+        setFusedNode("Flatten", input);
+    }
+};
+
 // K.layers.Softmax
 class SoftMaxKerasSubgraph : public Subgraph
 {
@@ -629,6 +649,36 @@ public:
     }
 };
 
+class PReLUSubgraph : public TFSubgraph  // pattern: Add(Relu(x), Mul(s, Relu(Neg(x)))) with s = Const or Neg(Const), fused into "PReLU"
+{
+public:
+    PReLUSubgraph(bool negativeScales_) : negativeScales(negativeScales_)
+    {
+        int input = addNodeToMatch("");
+        int scales = addNodeToMatch("Const");
+        int neg = addNodeToMatch("Neg", input);
+        int relu_neg = addNodeToMatch("Relu", neg);
+        int finalScales = negativeScales ? addNodeToMatch("Neg", scales) : scales;  // Neg(scales) is part of the pattern only when negativeScales is set
+        int mul = addNodeToMatch("Mul", finalScales, relu_neg);
+        int relu_pos = addNodeToMatch("Relu", input);
+        addNodeToMatch("Add", relu_pos, mul);
+        setFusedNode("PReLU", input, scales);  // the fused node receives the Const itself, never the Neg node
+    }
+    // For the variant without Neg(scales), negate the Const values (presumably so both variants hand "PReLU" the same slopes -- confirm).
+    virtual void finalize(tensorflow::GraphDef&, tensorflow::NodeDef* fusedNode,
+                          std::vector<tensorflow::NodeDef*>& inputNodes) CV_OVERRIDE
+    {
+        if (!negativeScales)
+        {
+            Mat scales = getTensorContent(inputNodes[1]->attr().at("value").tensor(), /*copy*/false);
+            scales *= -1;  // copy=false: presumably a view, so this rewrites the tensor content in place -- confirm
+        }
+    }
+
+private:
+    bool negativeScales;  // true: the matched pattern contains Neg(scales); false: scales feed Mul directly
+};
+
 void simplifySubgraphs(tensorflow::GraphDef& net)
 {
     std::vector<Ptr<Subgraph> > subgraphs;
@@ -649,6 +699,16 @@ void simplifySubgraphs(tensorflow::GraphDef& net)
     subgraphs.push_back(Ptr<Subgraph>(new SoftMaxSlimV2Subgraph()));
     subgraphs.push_back(Ptr<Subgraph>(new ReshapeAsShapeSubgraph()));
     subgraphs.push_back(Ptr<Subgraph>(new KerasMVNSubgraph()));
+    subgraphs.push_back(Ptr<Subgraph>(new PReLUSubgraph(true)));
+    subgraphs.push_back(Ptr<Subgraph>(new PReLUSubgraph(false)));
+    subgraphs.push_back(Ptr<Subgraph>(new FlattenProdSubgraph()));
+
+    for (int i = 0; i < net.node_size(); ++i)
+    {
+        tensorflow::NodeDef* layer = net.mutable_node(i);
+        if (layer->op() == "AddV2")
+            layer->set_op("Add");
+    }
 
     simplifySubgraphs(Ptr<ImportGraphWrapper>(new TFGraphWrapper(net)), subgraphs);
 }
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index ca4b8dbe9d..c462fde2c2 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -1231,6 +1231,7 @@ void TFImporter::populateNet(Net dstNet)
                 // Only NHWC <-> NCHW permutations are allowed. OpenCV is always
                 // keep NCHW layout this way.
                 int inpLayout = getDataLayout(layer.input(0), data_layouts);
+                std::string type = "Identity";
                 if (inpLayout == DATA_LAYOUT_NHWC)
                 {
                     if (permData[0] == 0 && permData[1] == 3 && permData[2] == 1 && permData[3] == 2)
@@ -1245,6 +1246,15 @@ void TFImporter::populateNet(Net dstNet)
                         // in OpenCV: NCHW->NCHW
                         data_layouts[name] = DATA_LAYOUT_NHWC;
                     }
+                    else if (permData[0] == 0 && permData[1] == 3 && permData[2] == 2 && permData[3] == 1)
+                    {
+                        // in TensorFlow: NHWC->NCWH
+                        // in OpenCV: NCHW->NCWH
+                        int permData[] = {0, 1, 3, 2};
+                        layerParams.set("order", DictValue::arrayInt<int*>(permData, perm.total()));
+                        data_layouts[name] = DATA_LAYOUT_NCHW;  // we keep track NCHW because channels position only matters
+                        type = "Permute";
+                    }
                     else
                         CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed.");
                 }
@@ -1265,7 +1275,7 @@ void TFImporter::populateNet(Net dstNet)
                     else
                         CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed.");
                 }
-                int id = dstNet.addLayer(name, "Identity", layerParams);
+                int id = dstNet.addLayer(name, type, layerParams);
                 layer_id[name] = id;
                 connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
             }
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 3b2e1b1657..860cf2cb61 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -1035,11 +1035,25 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear)
     runTensorFlowNet("resize_bilinear_factor");
 }
 
-TEST_P(Test_TensorFlow_layers, tf2_keras)
+TEST_P(Test_TensorFlow_layers, tf2_dense)
 {
     runTensorFlowNet("tf2_dense");
 }
 
+TEST_P(Test_TensorFlow_layers, tf2_prelu)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+    runTensorFlowNet("tf2_prelu");
+}
+
+TEST_P(Test_TensorFlow_layers, tf2_permute_nhwc_ncwh)
+{
+    runTensorFlowNet("tf2_permute_nhwc_ncwh");
+}
+
 TEST_P(Test_TensorFlow_layers, squeeze)
 {
 #if defined(INF_ENGINE_RELEASE)
diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp
index 7b035fa849..5ae9126983 100644
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -1218,15 +1218,6 @@ static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y,
                               UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size)
 {
     ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc);
-    if(k.empty())
-        return false;
-    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
-    cv::String opts;
-    if(is_cpu)
-       opts = "-D CPU ";
-    else
-        opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
-    k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
     if(k.empty())
         return false;
 
@@ -1287,19 +1278,10 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_
     size_t localThreads[3] = { 1, 1, 1  };
 
     int idx = 0;
-    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
-    cv::String opts;
     ocl::Kernel k;
     if ( nbins == 9 )
     {
         k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
-        if(k.empty())
-            return false;
-        if(is_cpu)
-           opts = "-D CPU ";
-        else
-            opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
-        k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
         if(k.empty())
             return false;
 
@@ -1311,14 +1293,7 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_
     }
     else
     {
-        k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
-        if(k.empty())
-            return false;
-        if(is_cpu)
-           opts = "-D CPU ";
-        else
-            opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
-        k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
         if(k.empty())
             return false;
 
@@ -1736,7 +1711,6 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
                                float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size)
 {
     int nthreads;
-    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
     cv::String opts;
 
     ocl::Kernel k;
@@ -1745,14 +1719,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
     {
     case 180:
         nthreads = 180;
-        k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
-        if(k.empty())
-            return false;
-        if(is_cpu)
-           opts = "-D CPU ";
-        else
-            opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
-        k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
         if(k.empty())
             return false;
         idx = k.set(idx, descr_size.width);
@@ -1761,14 +1728,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
 
     case 252:
         nthreads = 256;
-        k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
-        if(k.empty())
-            return false;
-        if(is_cpu)
-           opts = "-D CPU ";
-        else
-            opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
-        k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
         if(k.empty())
             return false;
         idx = k.set(idx, descr_size.width);
@@ -1777,14 +1737,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
 
     default:
         nthreads = 256;
-        k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
-        if(k.empty())
-            return false;
-        if(is_cpu)
-           opts = "-D CPU ";
-        else
-            opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
-        k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
         if(k.empty())
             return false;
         idx = k.set(idx, descr_size.area());
diff --git a/modules/objdetect/src/opencl/objdetect_hog.cl b/modules/objdetect/src/opencl/objdetect_hog.cl
index 4fae320d02..156c8eb6a0 100644
--- a/modules/objdetect/src/opencl/objdetect_hog.cl
+++ b/modules/objdetect/src/opencl/objdetect_hog.cl
@@ -134,9 +134,7 @@ __kernel void compute_hists_lut_kernel(
         barrier(CLK_LOCAL_MEM_FENCE);
         if (cell_thread_x < 3)
             hist_[0] += hist_[3];
-#ifdef CPU
         barrier(CLK_LOCAL_MEM_FENCE);
-#endif
         if (cell_thread_x == 0)
             final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
                 hist_[0] + hist_[1] + hist_[2];
@@ -218,7 +216,6 @@ inline float reduce_smem(volatile __local float* smem, int size)
         barrier(CLK_LOCAL_MEM_FENCE); }
     if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
         barrier(CLK_LOCAL_MEM_FENCE); }
-#ifdef CPU
     if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
         barrier(CLK_LOCAL_MEM_FENCE); }
     if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
@@ -231,21 +228,6 @@ inline float reduce_smem(volatile __local float* smem, int size)
         barrier(CLK_LOCAL_MEM_FENCE); }
     if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
         barrier(CLK_LOCAL_MEM_FENCE); }
-#else
-    if (tid < 32)
-    {
-        if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
-#if WAVE_SIZE < 32
-    } barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16) {
-#endif
-        if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
-        if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
-        if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
-        if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
-        if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
-    }
-#endif
 
     return sum;
 }
@@ -284,6 +266,10 @@ __kernel void normalize_hists_kernel(
         hist[0] = elem * scale;
 }
 
+#define reduce_with_sync(target, sharedMemory, localMemory, tid, offset) do { /* one reduction step followed by a work-group barrier */ \
+    if ((tid) < (target)) (sharedMemory)[(tid)] = (localMemory) = (localMemory) + (sharedMemory)[(tid) + (offset)]; \
+    barrier(CLK_LOCAL_MEM_FENCE); /* kept outside the if so every work-item reaches it */ } while (0)
+
 //---------------------------------------------------------------------
 //  Linear SVM based classification
 //  48x96 window, 9 bins and default parameters
@@ -316,43 +302,16 @@ __kernel void classify_hists_180_kernel(
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 90) products[tid] = product = product + products[tid + 90];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 45) products[tid] = product = product + products[tid + 45];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    volatile __local float* smem = products;
-#ifdef CPU
-    if (tid < 13) smem[tid] = product = product + smem[tid + 32];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16) smem[tid] = product = product + smem[tid + 16];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<8) smem[tid] = product = product + smem[tid + 8];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<4) smem[tid] = product = product + smem[tid + 4];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<2) smem[tid] = product = product + smem[tid + 2];
-    barrier(CLK_LOCAL_MEM_FENCE);
-#else
-    if (tid < 13)
-    {
-        smem[tid] = product = product + smem[tid + 32];
-    }
-#if WAVE_SIZE < 32
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-    if (tid < 16)
-    {
-        smem[tid] = product = product + smem[tid + 16];
-        smem[tid] = product = product + smem[tid + 8];
-        smem[tid] = product = product + smem[tid + 4];
-        smem[tid] = product = product + smem[tid + 2];
-    }
-#endif
+    reduce_with_sync(90, products, product, tid, 90);
+    reduce_with_sync(45, products, product, tid, 45);
+    reduce_with_sync(13, products, product, tid, 32); // 13 is not typo
+    reduce_with_sync(16, products, product, tid, 16);
+    reduce_with_sync(8, products, product, tid, 8);
+    reduce_with_sync(4, products, product, tid, 4);
+    reduce_with_sync(2, products, product, tid, 2);
 
     if (tid == 0){
-        product = product + smem[tid + 1];
+        product = product + products[tid + 1];
         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
     }
 }
@@ -389,40 +348,16 @@ __kernel void classify_hists_252_kernel(
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 128) products[tid] = product = product + products[tid + 128];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    reduce_with_sync(128, products, product, tid, 128);
+    reduce_with_sync(64, products, product, tid, 64);
+    reduce_with_sync(32, products, product, tid, 32);
+    reduce_with_sync(16, products, product, tid, 16);
+    reduce_with_sync(8, products, product, tid, 8);
+    reduce_with_sync(4, products, product, tid, 4);
+    reduce_with_sync(2, products, product, tid, 2);
 
-    if (tid < 64) products[tid] = product = product + products[tid + 64];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    volatile __local float* smem = products;
-#ifdef CPU
-    if(tid<32) smem[tid] = product = product + smem[tid + 32];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<16) smem[tid] = product = product + smem[tid + 16];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<8) smem[tid] = product = product + smem[tid + 8];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<4) smem[tid] = product = product + smem[tid + 4];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<2) smem[tid] = product = product + smem[tid + 2];
-    barrier(CLK_LOCAL_MEM_FENCE);
-#else
-    if (tid < 32)
-    {
-        smem[tid] = product = product + smem[tid + 32];
-#if WAVE_SIZE < 32
-    } barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16) {
-#endif
-        smem[tid] = product = product + smem[tid + 16];
-        smem[tid] = product = product + smem[tid + 8];
-        smem[tid] = product = product + smem[tid + 4];
-        smem[tid] = product = product + smem[tid + 2];
-    }
-#endif
     if (tid == 0){
-        product = product + smem[tid + 1];
+        product = product + products[tid + 1];
         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
     }
 }
@@ -459,40 +394,16 @@ __kernel void classify_hists_kernel(
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 128) products[tid] = product = product + products[tid + 128];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    reduce_with_sync(128, products, product, tid, 128);
+    reduce_with_sync(64, products, product, tid, 64);
+    reduce_with_sync(32, products, product, tid, 32);
+    reduce_with_sync(16, products, product, tid, 16);
+    reduce_with_sync(8, products, product, tid, 8);
+    reduce_with_sync(4, products, product, tid, 4);
+    reduce_with_sync(2, products, product, tid, 2);
 
-    if (tid < 64) products[tid] = product = product + products[tid + 64];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    volatile __local float* smem = products;
-#ifdef CPU
-    if(tid<32) smem[tid] = product = product + smem[tid + 32];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<16) smem[tid] = product = product + smem[tid + 16];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<8) smem[tid] = product = product + smem[tid + 8];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<4) smem[tid] = product = product + smem[tid + 4];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(tid<2) smem[tid] = product = product + smem[tid + 2];
-    barrier(CLK_LOCAL_MEM_FENCE);
-#else
-    if (tid < 32)
-    {
-        smem[tid] = product = product + smem[tid + 32];
-#if WAVE_SIZE < 32
-    } barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16) {
-#endif
-        smem[tid] = product = product + smem[tid + 16];
-        smem[tid] = product = product + smem[tid + 8];
-        smem[tid] = product = product + smem[tid + 4];
-        smem[tid] = product = product + smem[tid + 2];
-    }
-#endif
     if (tid == 0){
-        smem[tid] = product = product + smem[tid + 1];
+        products[tid] = product = product + products[tid + 1];
         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
     }
 }
diff --git a/samples/dnn/face_detector/train.prototxt b/samples/dnn/face_detector/train.prototxt
index d2b8167ca9..0bba9cb32b 100644
--- a/samples/dnn/face_detector/train.prototxt
+++ b/samples/dnn/face_detector/train.prototxt
@@ -1020,7 +1020,7 @@ layer {
   }
   convolution_param {
     num_output: 128
-    pad: 1
+    pad: 0
     kernel_size: 3
     stride: 1
     weight_filler {
@@ -1600,7 +1600,7 @@ layer {
   }
   convolution_param {
     num_output: 16
-    pad: 0
+    pad: 1
     kernel_size: 3
     stride: 1
     weight_filler {