// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "precomp.hpp"
#include "net_impl.hpp"

#ifdef HAVE_CUDA
#include "cuda4dnn/primitives/eltwise.hpp"  // required by fuseLayers
#endif

namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN

void Net::Impl::enableFusion(bool fusion_)
{
    if (fusion != fusion_)
    {
        fusion = fusion_;
        clear();
    }
}

#if 0
#define printf_(args) printf args
#else
#define printf_(args)
#endif

void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
    CV_TRACE_FUNCTION();

    if(!fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
                   preferableBackend != DNN_BACKEND_CUDA &&
                   preferableBackend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH &&
                   preferableBackend != DNN_BACKEND_TIMVX))
        return;

#if 0  // FIXIT mode without fusion is broken due to unsupported layers and handling of "custom" nodes
    if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        return;
#endif

    // Scan through all the layers. If there is a convolution layer followed by an activation layer,
    // we try to embed this activation into the convolution and disable the separate execution of the activation.

    // FIXIT replace by layersToKeep to avoid hacks like "LayerPin(lid, 0)"
    std::set<LayerPin> pinsToKeep(blobsToKeep_.begin(), blobsToKeep_.end());
    for (MapIdToLayerData::const_iterator it = layers.begin(); it != layers.end(); it++)
    {
        int lid = it->first;
        LayerData& ld = layers[lid];
        if (ld.skip)
        {
            printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
            continue;
        }
        printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));

        // Optimization #1. Try to fuse batch norm, scaling and/or activation layers
        // with the current layer if they follow it. Normally, they are fused with the convolution layer,
        // but some of them (like activation) may be fused with fully-connected, elemwise (+) and
        // some other layers.
        Ptr<Layer>& currLayer = ld.layerInstance;
        if (ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0)
        {
            LayerData* nextData = &layers[ld.consumers[0].lid];
            LayerPin lpNext(ld.consumers[0].lid, 0);
            while (nextData)
            {
#ifdef HAVE_INF_ENGINE
                if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && pinsToKeep.count(lpNext) != 0)
                {
                    CV_LOG_DEBUG(NULL, "DNN/IE: skip fusing with 'output' node: " << nextData->name << "@" << nextData->type);
                    break;
                }
#endif
                /* we use the `tryFuse` member of the convolution layer to fuse eltwise later;
                 * it is not intended to be fused here, hence we stop when we encounter eltwise
                 */
                if (preferableBackend == DNN_BACKEND_CUDA && ld.type == "Convolution" && nextData->type == "Eltwise")
                    break;
                Ptr<Layer> nextLayer = nextData->layerInstance;
                if (currLayer->tryFuse(nextLayer))
                {
                    printf_(("\tfused with %s\n", nextLayer->name.c_str()));
                    nextData->skip = true;
                    ld.outputBlobs = layers[lpNext.lid].outputBlobs;
                    ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
                    if (nextData->consumers.size() == 1)
                    {
                        int nextLayerId = nextData->consumers[0].lid;
                        nextData = &layers[nextLayerId];
                        lpNext = LayerPin(nextLayerId, 0);
                    }
                    else
                    {
                        nextData = 0;
                        break;
                    }
                }
                else
                    break;
            }

            if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
                continue;  // Go to the next layer.

            // TODO: the OpenCL target could support more fusion styles.
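            // Illustrative note (a sketch, not executed here): for a Conv + BatchNorm pair,
            // tryFuse folds the normalization into the convolution weights. With BN
            // parameters (mean, var, eps, gamma, beta), each output channel c becomes
            //     w'[c] = w[c] * gamma[c] / sqrt(var[c] + eps)
            //     b'[c] = (b[c] - mean[c]) * gamma[c] / sqrt(var[c] + eps) + beta[c]
            // so the fused convolution produces the same result with one layer fewer.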
            if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
                 (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
                                            ld.layerInstance->type != "MVN" &&
                                            ld.layerInstance->type != "Pooling" &&
                                            ld.layerInstance->type != "Concat")) )
                continue;

            if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget)
                && ld.layerInstance->type != "Convolution"
                && ld.layerInstance->type != "Concat")
                continue;

            while (nextData)
            {
                // For now, the OpenCL target supports fusion with activations of ReLU/ChannelsPReLU/Power/TanH
                if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
                    nextData->type != "ReLU" &&
                    nextData->type != "ChannelsPReLU" &&
                    nextData->type != "ReLU6" &&
                    nextData->type != "TanH" &&
                    nextData->type != "Power")
                    break;

                Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
                if (nextActivLayer.empty())
                    break;

                if (currLayer->setActivation(nextActivLayer))
                {
                    printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
                    nextData->skip = true;

                    ld.outputBlobs = layers[lpNext.lid].outputBlobs;
                    ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;

                    if (nextData->consumers.size() == 1)
                    {
                        int nextLayerId = nextData->consumers[0].lid;
                        nextData = &layers[nextLayerId];
                        lpNext = LayerPin(nextLayerId, 0);
                    }
                    else
                    {
                        nextData = 0;
                        break;
                    }
                }
                else
                    break;
            }

            // CPU: fuse a Convolution 2D layer followed by Add + activation.
            while (nextData && IS_DNN_CPU_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution")
            {
                // Note that we can only deal with conv + Add + activ here.
                // To avoid an order like conv + activ + add, we break if the conv has already been fused with an activation.
                Ptr<ConvolutionLayer> convLayer = ld.layerInstance.dynamicCast<ConvolutionLayer>();

                // Only Conv2D without a fused activation supports this fusion; otherwise, we skip.
                if (!convLayer->isConv2D || convLayer->fusedActivation)
                    break;

                // For now, there are two layers in OpenCV that run the Add operator.
                Ptr<NaryEltwiseLayer> nextNaryEltwiseLayer = nextData->layerInstance.dynamicCast<NaryEltwiseLayer>();
                Ptr<EltwiseLayer> nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
                if (nextNaryEltwiseLayer.empty() && nextEltwiseLayer.empty())
                    break;

                if (nextData->inputBlobsId.size() != 2)
                    break;

                if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) != "add")
                {
                    CV_LOG_DEBUG(NULL, "DNN/CPU: fusion with NaryEltwise or Eltwise layer operation is not supported: "
                                       << nextData->params.get<String>("operation"));
                    break;
                }

                // This optimization is for cases like
                //
                //     some_layer       conv
                //         |              |
                //         +-- (nary)eltwise --+
                //                  |
                //                activ
                //
                // This way all the element-wise computations
                // (i.e. some_layer + conv) would be done at the [conv] layer.
                // So we need to replace [conv]'s output blob with [eltwise]'s one,
                // considering that [activ] is an in-place layer.
                // Also we need to move all the consumers' references.
                // To prevent memory collisions (i.e. when the input of
                // [conv] and the output of [eltwise or naryEltwise] is the same blob)
                // we allocate a new blob.
                {
                    LayerData *naryOrEltwiseData = nextData;

                    // The Eltwise or NaryEltwise layer has two inputs. We need to determine which
                    // is the base convolution layer and which could be used as its bias.
                    LayerData* biasLayerData = 0;
                    for (int i = 0; i < 2; ++i)
                    {
                        LayerData *downLayerData = &layers[naryOrEltwiseData->inputBlobsId[i].lid];
                        CV_Assert(downLayerData);

                        // If the current downLayerData is skipped, it means it has been fused into its parent node.
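                        // For example, if the chain is Conv -> ReLU (skipped after activation
                        // fusion) -> Add, the loop below walks up from the skipped ReLU to the
                        // Conv that actually produces the data for this input.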
                        while (downLayerData->skip)
                        {
                            if (downLayerData->inputBlobsId.size() == 1)
                                downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
                            else
                            {
                                downLayerData = 0;
                                break;
                            }
                        }

                        if (downLayerData && ld.id == downLayerData->id)
                        {
                            biasLayerData = &layers[naryOrEltwiseData->inputBlobsId[1 - i].lid];
                            break;
                        }
                    }

                    // Check that biasLayerData is the expected layer.
                    if (!biasLayerData)
                        break;

                    // Check that the bias output shape and the ld output shape are the same.
                    MatShape biasOutShape = shape(biasLayerData->outputBlobs[0]);
                    MatShape ldOutShape = shape(ld.outputBlobs[0]);
                    if (biasOutShape != ldOutShape)
                        break;

                    CV_Assert(biasLayerData);
                    {
                        // fuse the naryEltwise layer:
                        // the bias must already be computed to fuse => the bias layer must appear before the convolution
                        if (biasLayerData->id < ld.id)
                        {
                            // conv + naryEltwise.
                            CV_Assert_N(biasLayerData->outputBlobs.size() == 1, ld.inputBlobs.size() == 1);
                            CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);

                            printf_(("\tfused with %s\n", nextNaryEltwiseLayer->name.c_str()));
                            naryOrEltwiseData->skip = true;

                            CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);

                            // Note: here's a trick. We set the output of conv as the output of biasLayer.
                            ld.outputBlobs[0] = ld.outputBlobs[0].clone();
                            ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);

                            // Recursively modify the output data of biasLayerData and its parents.
                            std::vector<LayerData*> skipDataList;
                            skipDataList.push_back(biasLayerData);

                            while (!skipDataList.empty())
                            {
                                LayerData* skipData = skipDataList.back();
                                skipDataList.pop_back();

                                CV_Assert(skipData->outputBlobs.size() == 1);
                                skipData->outputBlobs[0] = ld.outputBlobs[0];
                                skipData->outputBlobsWrappers[0] = ld.outputBlobsWrappers[0];

                                if (skipData->skip)
                                {
                                    for (auto& inputLayerId : skipData->inputLayersId)
                                    {
                                        LayerData* inputld = &layers[inputLayerId];
                                        if (inputld && inputld->outputBlobs.size() == 1)
                                            skipDataList.push_back(inputld);
                                    }
                                }
                            }

                            naryOrEltwiseData->outputBlobs = ld.outputBlobs;
                            naryOrEltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;

                            // set the fusedAdd flag in [Conv];
                            convLayer->fusedAdd = true;

                            LayerData* finalData = naryOrEltwiseData;
                            /* After fusing Conv + naryEltwise or eltwise, we can fuse the activation if:
                             * => the activation layer that follows is the only consumer of the eltwise output
                             * => the activation layer does not process multiple inputs
                             * => we do not require to keep the output of eltwise
                             */
                            if (naryOrEltwiseData->consumers.size() == 1)
                            {
                                Ptr<ActivationLayer> nextFusabeleActivLayer;
                                LayerData* nextAct = &layers[naryOrEltwiseData->consumers[0].lid];
                                if (nextData->outputBlobs.size() == 1)
                                    nextFusabeleActivLayer = nextAct->layerInstance.dynamicCast<ActivationLayer>();

                                if (!nextFusabeleActivLayer.empty())
                                {
                                    convLayer->setActivation(nextFusabeleActivLayer);
                                    nextAct->skip = true;

                                    nextAct->outputBlobs = ld.outputBlobs;
                                    nextAct->outputBlobsWrappers = ld.outputBlobsWrappers;
                                }
                            }

                            // Move references of the finalData (eltwise or activation) layer consumers to the newly allocated blob.
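                            // For instance, a layer that consumed the [activ] output before
                            // fusion has its input pointer redirected below to [conv]'s cloned
                            // output blob, so it reads the fused result directly.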
                            for (int i = 0; i < finalData->consumers.size(); ++i)
                            {
                                LayerData& consumer = layers[finalData->consumers[i].lid];
                                for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
                                {
                                    if (consumer.inputBlobsId[j].lid == finalData->id)
                                    {
                                        consumer.inputBlobs[j] = &ld.outputBlobs[0];
                                        consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }
                break;
            }

            // OpenCL: fuse a convolution layer followed by eltwise + relu
            // CUDA: fuse a convolution layer followed by eltwise (and an optional activation)
            while (nextData &&
                   (IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
                   ld.layerInstance->type == "Convolution"
            )  // semantic of 'if'
            {
                Ptr<EltwiseLayer> nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
                if (nextEltwiseLayer.empty())
                    break;

#ifdef HAVE_CUDA
                // The CUDA backend supports fusion with an eltwise sum (without variable channels).
                if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
                {
                    // we create a temporary backend node for the eltwise layer to obtain the eltwise configuration
                    cuda4dnn::csl::CSLContext context;  // assume that initCUDA and EltwiseOp do not use the context during init

                    const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
                    auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();

                    // broadcasting is not supported in fused ops
                    auto required_shape = shape(nextData->outputBlobs[0]);
                    for (int i = 0; i < nextData->inputBlobs.size(); i++)
                    {
                        if (shape(*nextData->inputBlobs[i]) != required_shape)
                        {
                            eltwiseNode.reset();
                            break;
                        }
                    }

                    // The CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
                    // Hence, a successful cast to EltwiseOp implies that the number of channels is the same in all operand tensors.
                    if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
                        break;
                }
#endif

                if (IS_DNN_OPENCL_TARGET(preferableTarget) && pinsToKeep.count(lpNext) != 0)
                    break;

                if (nextData->inputBlobsId.size() != 2)
                    break;

                if (IS_DNN_OPENCL_TARGET(preferableTarget))
                {
                    if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) == "sum")
                    {
                        if (nextData->params.has("coeff"))
                        {
                            DictValue paramCoeff = nextData->params.get("coeff");
                            int n = paramCoeff.size();
                            bool isCoeffOneOne = (n == 2);
                            for (int i = 0; isCoeffOneOne && i < n; i++)
                            {
                                float c = paramCoeff.get<float>(i);
                                isCoeffOneOne &= (c == 1.0f);
                            }
                            if (!isCoeffOneOne)
                            {
                                CV_LOG_DEBUG(NULL, "DNN/OpenCL: only fusion of 'Sum' without coeffs (or with coeffs {1.0, 1.0}) is supported");
                                break;
                            }
                        }
                    }
                    else
                    {
                        CV_LOG_DEBUG(NULL, "DNN/OpenCL: fusion with eltwise operation is not supported: "
                                           << nextData->params.get<String>("operation"));
                        break;
                    }
                }

                {
                    LayerData *eltwiseData = nextData;

                    // The Eltwise layer has two inputs. We need to determine which
                    // is the base convolution layer and which could be used as its bias.
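                    // A typical instance of this pattern is a ResNet-style shortcut
                    // (shown for illustration):
                    //     branchA = some_layer(...)    // computed earlier ("bias" operand)
                    //     branchB = conv(...)          // the current layer ld
                    //     out     = eltwise(branchA + branchB)
                    // Here the conv is the base layer, and branchA's output is added
                    // per element inside the fused convolution kernel.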
                    LayerData* biasLayerData = 0;
                    for (int i = 0; i < 2; ++i)
                    {
                        LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[i].lid];
                        CV_Assert(downLayerData);
                        while (downLayerData->skip)
                        {
                            if (downLayerData->inputBlobsId.size() == 1)
                                downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
                            else
                            {
                                downLayerData = 0;
                                break;
                            }
                        }
                        if (downLayerData && ld.id == downLayerData->id)
                        {
                            biasLayerData = &layers[eltwiseData->inputBlobsId[1 - i].lid];
                            break;
                        }
                    }
                    CV_Assert(biasLayerData);
                    {
                        // fuse eltwise + activation layer:
                        // the bias must already be computed to fuse => the bias layer must appear before the convolution
                        if (biasLayerData->id < ld.id)
                        {
                            /* we can fuse the activation if:
                             * => the activation layer that follows is the only consumer of the eltwise output
                             * => the activation layer does not process multiple inputs
                             * => we do not require to keep the output of eltwise
                             */
                            Ptr<ActivationLayer> nextFusabeleActivLayer;
                            if (eltwiseData->consumers.size() == 1 && pinsToKeep.count(lpNext) == 0)
                            {
                                nextData = &layers[eltwiseData->consumers[0].lid];
                                lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
                                CV_Assert(nextData);
                                if (nextData->outputBlobs.size() == 1)
                                    nextFusabeleActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
                            }
                            else
                            {
                                // the OCL backend cannot fuse in this case, but the CUDA backend can continue with just eltwise
                                nextData = 0;
                            }

                            // the requirements of the OCV OpenCL backend and the CUDA backend are different;
                            // we need to check them separately, hence the fuse variables
                            bool fuse_eltwise = false, fuse_activation = false;

                            Ptr<PowerLayer> activ_power;
                            if (IS_DNN_OPENCL_TARGET(preferableTarget) && !nextFusabeleActivLayer.empty() && nextData &&
                                (!nextData->type.compare("ReLU") ||
                                 !nextData->type.compare("ChannelsPReLU") ||
                                 (!nextData->type.compare("Power") &&
                                  (activ_power = nextFusabeleActivLayer.dynamicCast<PowerLayer>()) &&
                                  activ_power->scale == 1.0f)
                                ) &&
                                currLayer->setActivation(nextFusabeleActivLayer))
                            {
                                fuse_eltwise = true;
                                fuse_activation = true;
                            }

                            if (IS_DNN_CUDA_TARGET(preferableTarget))
                            {
                                /* supported fusion options:
                                 * => convolution + eltwise
                                 * => activation(convolution) + eltwise
                                 *    > convolution + activation would have been fused already; we have to fuse eltwise
                                 * => activation(convolution + eltwise)
                                 *    > fuse eltwise and then activation
                                 */
                                auto layer = nextEltwiseLayer.staticCast<Layer>();
                                if (currLayer->tryFuse(layer))
                                {
                                    fuse_eltwise = true;  /* eltwise was successfully fused */
                                    if (!nextFusabeleActivLayer.empty() && nextData)
                                    {
                                        if ((!nextData->type.compare("ReLU") ||
                                             !nextData->type.compare("ReLU6") ||
                                             !nextData->type.compare("Power") ||
                                             !nextData->type.compare("TanH") ||
                                             !nextData->type.compare("Sigmoid") ||
                                             !nextData->type.compare("Swish") ||
                                             !nextData->type.compare("Mish")) &&
                                            currLayer->setActivation(nextFusabeleActivLayer))
                                        {
                                            // the activation was fused
                                            fuse_activation = true;
                                        }
                                    }
                                }
                            }

                            CV_Assert(!fuse_activation || fuse_eltwise);  /* cannot fuse the activation without eltwise */
                            if (fuse_eltwise && fuse_activation)
                            {
                                CV_Assert(nextData);
                                CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
                                ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
                                printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
                                printf_(("\tfused with %s\n", nextFusabeleActivLayer->name.c_str()));
                                eltwiseData->skip = true;
                                nextData->skip = true;

                                // This optimization is for cases like
                                //
                                //     some_layer   conv
                                //         |          |
                                //         +-- eltwise --+
                                //                |
                                //              activ
                                //
                                // This way all the element-wise computations
                                // (i.e. some_layer + conv or some_layer * conv)
                                // would be done at the [conv] layer.
                                // So we need to replace [conv]'s output blob with [eltwise]'s one,
                                // considering that [activ] is an in-place layer.
                                // Also we need to move all the consumers' references.
                                // To prevent memory collisions (i.e. when the input of
                                // [conv] and the output of [eltwise] is the same blob)
                                // we allocate a new blob.
                                CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
                                ld.outputBlobs[0] = ld.outputBlobs[0].clone();
                                ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);

                                eltwiseData->outputBlobs = ld.outputBlobs;
                                nextData->outputBlobs = ld.outputBlobs;
                                eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
                                nextData->outputBlobsWrappers = ld.outputBlobsWrappers;

                                // Move references of the [activ] layer consumers to the newly allocated blob.
                                for (int i = 0; i < nextData->consumers.size(); ++i)
                                {
                                    LayerData& consumer = layers[nextData->consumers[i].lid];
                                    for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
                                    {
                                        if (consumer.inputBlobsId[j].lid == lpNext.lid)
                                        {
                                            consumer.inputBlobs[j] = &ld.outputBlobs[0];
                                            consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
                                            break;
                                        }
                                    }
                                }
                            }
                            else if (fuse_eltwise)  // conv + eltwise (note: conv could have fused activations before the eltwise)
                            {
                                CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
                                CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
                                ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
                                printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
                                eltwiseData->skip = true;
                                // This optimization is for cases like
                                //
                                //     some_layer   conv (maybe fused with activ)
                                //         |          |
                                //         +-- eltwise --+
                                //
                                // This way all the element-wise computations
                                // (i.e. some_layer + conv or some_layer * conv)
                                // would be done at the [conv] layer. So we need to
                                // replace [conv]'s output blob with [eltwise]'s one.
                                // Also, we need to move all the consumers' references.
                                // To prevent memory collisions (i.e. when the input of
                                // [conv] and the output of [eltwise] is the same blob)
                                // we allocate a new blob.
                                CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
                                ld.outputBlobs[0] = ld.outputBlobs[0].clone();
                                ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);

                                eltwiseData->outputBlobs = ld.outputBlobs;
                                eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;

                                // Move references of the [eltwise] layer consumers to the newly allocated blob.
                                for (int i = 0; i < eltwiseData->consumers.size(); ++i)
                                {
                                    LayerData& consumer = layers[eltwiseData->consumers[i].lid];
                                    for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
                                    {
                                        if (consumer.inputBlobsId[j].lid == eltwiseData->id)
                                        {
                                            consumer.inputBlobs[j] = &ld.outputBlobs[0];
                                            consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
                                            break;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                break;
            }
        }

        if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
            continue;  // Go to the next layer.

        // Optimization #2. If there is a concat layer that concatenates channels
        // from the inputs together (i.e. axis == 1), then we make the inputs of
        // the concat layer write directly into the concatenation output buffer
        // (and so we eliminate the concatenation layer, because the channels
        // are concatenated implicitly).
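        // Worked example (illustrative): a Concat over axis == 1 with inputs of shapes
        // 1x64xHxW and 1x128xHxW produces a 1x192xHxW buffer. After this optimization,
        // the two producers write directly into the channel ranges [0, 64) and [64, 192)
        // of that buffer, and the Concat layer itself is skipped.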
        Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
        if( !concatLayer.empty() && !concatLayer->padding && ld.outputBlobs.size() == 1 )
        {
            Mat& output = ld.outputBlobs[0];
            UMat umat_output;
#ifdef HAVE_OPENCL
            if (!ld.outputBlobsWrappers.empty() &&
                (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
            {
                size_t i, ninputs = ld.inputBlobsId.size();
                bool conv_layer = true;
                for( i = 0; i < ninputs; i++ )
                {
                    LayerPin pin = ld.inputBlobsId[i];
                    LayerData* inp_i_data = &layers[pin.lid];
                    while(inp_i_data->skip &&
                          inp_i_data->inputBlobsId.size() == 1 &&
                          inp_i_data->consumers.size() == 1)
                    {
                        pin = inp_i_data->inputBlobsId[0];
                        inp_i_data = &layers[pin.lid];
                    }
                    conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
                }
                if (!conv_layer)
                    continue;
                std::vector<UMat> umat_outputBlobs;
                umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
                umat_output = umat_outputBlobs[0];
            }
#endif

            // TODO: in general, this optimization can always be done, but
            // many layers currently check that the input/output blobs are
            // continuous arrays. Unfortunately, this is not true when
            // the concatenation optimization is applied with batch_size > 1.
            // So, for now, we only apply this optimization in the most popular
            // case batch_size == 1.
            int axis = normalize_axis(concatLayer->axis, output.dims);
            if( output.total(0, axis) == 1 )
            {
                size_t i, ninputs = ld.inputBlobsId.size();
                std::vector<LayerPin> realinputs(ninputs);
                for( i = 0; i < ninputs; i++ )
                {
                    LayerPin pin = ld.inputBlobsId[i];
                    LayerData* inp_i_data = &layers[pin.lid];
                    while(inp_i_data->skip &&
                          inp_i_data->inputBlobsId.size() == 1 &&
                          inp_i_data->consumers.size() == 1)
                    {
                        pin = inp_i_data->inputBlobsId[0];
                        inp_i_data = &layers[pin.lid];
                    }
                    printf_(("\treal input for %s is %s\n",
                             layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
                             inp_i_data->getLayerInstance()->name.c_str()));

                    if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                        break;
#ifdef HAVE_CUDA
                    if (preferableBackend == DNN_BACKEND_CUDA &&
                        (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
                         (inp_i_data->layerInstance->type != "Convolution" &&
                          inp_i_data->layerInstance->type != "Pooling" &&
                          inp_i_data->layerInstance->type != "Resize"  &&
                          inp_i_data->layerInstance->type != "Flatten" &&
                          inp_i_data->layerInstance->type != "Permute" &&
                          inp_i_data->layerInstance->type != "Reorg" &&
                          inp_i_data->layerInstance->type != "Eltwise" &&
                          inp_i_data->layerInstance.dynamicCast<ActivationLayer>().empty())))
                    {
                        break;
                    }
#endif
                    realinputs[i] = pin;
                }

                if( i >= ninputs )
                {
                    // Allocate new memory to prevent collisions during memory
                    // reusing (see https://github.com/opencv/opencv/pull/10456).
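                    // Without this clone, the shared Concat buffer could alias one of the
                    // producers' blobs through the memory-reuse planner, and an in-place
                    // write from one branch could then corrupt another branch's data.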
                    output = output.clone();
#ifdef HAVE_OPENCL
                    if (preferableBackend == DNN_BACKEND_OPENCV &&
                        IS_DNN_OPENCL_TARGET(preferableTarget))
                    {
                        std::vector<UMat> umats(1);
                        umat_output = umat_output.clone();
                        umats[0] = umat_output;
                        OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
                    }
#endif

#ifdef HAVE_CUDA
                    if (preferableBackend == DNN_BACKEND_CUDA)
                        ld.outputBlobsWrappers[0] = wrap(output);
#endif
                    std::vector<Range> chrange(output.dims, Range::all());
                    int ofs = 0;
                    for( i = 0; i < ninputs; i++ )
                    {
                        LayerPin pin = realinputs[i];
                        LayerData* inp_i_data = &layers[pin.lid];
                        int channels_i = ld.inputBlobs[i]->size[axis];
                        chrange[axis] = Range(ofs, ofs + channels_i);
                        printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
                                 pin.oid, ofs, ofs + channels_i));
                        ofs += channels_i;
                        Mat output_slice = output(chrange);
                        Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
                        CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
                        Mat* oldPtr = &curr_output;
                        curr_output = output_slice;
#ifdef HAVE_OPENCL
                        if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
                        {
                            std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
                            umats[pin.oid] = umat_output(chrange);
                            OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
                        }
#endif
#ifdef HAVE_CUDA
                        if (preferableBackend == DNN_BACKEND_CUDA)
                        {
                            auto cuda_wrapper = wrap(output).dynamicCast<CUDABackendWrapper>();
                            // Element offset of this slice inside the shared buffer: the slice
                            // starts at channel chrange[axis].start, and each channel holds
                            // total(axis + 1, dims) elements.
                            auto offset = chrange[axis].start * output_slice.total(axis + 1, output.dims);
                            auto new_shape = shape(output_slice);
                            cuda_wrapper->update(new_shape, offset);
                            inp_i_data->outputBlobsWrappers[pin.oid] = cuda_wrapper.staticCast<BackendWrapper>();
                        }
#endif
                        // Layers that referred to the old input Mat will refer to the
                        // new data through the same Mat object.
                        CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
                    }

#ifdef HAVE_CUDA
                    if (preferableBackend == DNN_BACKEND_CUDA)
                    {
                        for (int i = 0; i < ld.consumers.size(); i++)
                        {
                            LayerData& consumer = layers[ld.consumers[i].lid];
                            for (int j = 0; j < consumer.inputBlobsId.size(); j++)
                            {
                                if (consumer.inputBlobsId[j].lid == ld.id)
                                {
                                    CV_Assert(consumer.inputBlobs[j]->data == ld.outputBlobs[0].data);
                                    consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
                                    break;
                                }
                            }
                        }
                    }
#endif
                    ld.skip = true;
                    printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
                }
            }
        }
    }
}

CV__DNN_INLINE_NS_END
}}  // namespace cv::dnn
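
// Usage sketch (illustrative; not part of this translation unit). Fusion is enabled
// by default; callers can opt out through the public API before the first forward
// pass, which calls clear() and re-initializes the net without fusion:
//
//     cv::dnn::Net net = cv::dnn::readNet("model.onnx");  // hypothetical model path
//     net.enableFusion(false);
//     net.setInput(blob);
//     cv::Mat out = net.forward();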