diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 8a2ae2337e..a30171d9cc 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -187,16 +187,26 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
          */
         virtual void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals) = 0;

+        /** @brief Given the @p input blobs, computes the output @p blobs.
+         *  @param[in]  inputs    the input blobs.
+         *  @param[out] outputs   allocated output blobs, which will store results of the computation.
+         *  @param[out] internals allocated internal blobs
+         */
+        virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) = 0;
+
+        /** @brief Given the @p input blobs, computes the output @p blobs.
+         *  @param[in]  inputs    the input blobs.
+         *  @param[out] outputs   allocated output blobs, which will store results of the computation.
+         *  @param[out] internals allocated internal blobs
+         */
+        void forward_fallback(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals);
+
         /** @brief @overload */
         CV_WRAP void finalize(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs);

         /** @brief @overload */
         CV_WRAP std::vector<Mat> finalize(const std::vector<Mat> &inputs);

-        /** @brief @overload */
-        CV_WRAP void forward(const std::vector<Mat> &inputs, CV_IN_OUT std::vector<Mat> &outputs,
-                             CV_IN_OUT std::vector<Mat> &internals);
-
         /** @brief Allocates layer and computes output. */
         CV_WRAP void run(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs,
                          CV_IN_OUT std::vector<Mat> &internals);
diff --git a/modules/dnn/include/opencv2/dnn/shape_utils.hpp b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
index 4a77473d96..fa4b497a80 100644
--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@@ -132,6 +132,11 @@ static inline MatShape shape(const Mat& mat)
     return shape(mat.size.p, mat.dims);
 }

+static inline MatShape shape(const UMat& mat)
+{
+    return shape(mat.size.p, mat.dims);
+}
+
 namespace {inline bool is_neg(int i) { return i < 0; }}

 static inline MatShape shape(int a0, int a1=-1, int a2=-1, int a3=-1)
@@ -151,7 +156,7 @@ static inline int total(const MatShape& shape, int start = -1, int end = -1)
         return 0;

     int elems = 1;
-    CV_Assert(start < (int)shape.size() && end <= (int)shape.size() &&
+    CV_Assert(start <= (int)shape.size() && end <= (int)shape.size() &&
               start <= end);
     for(int i = start; i < end; i++)
     {
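The two header hunks above define the contract the rest of this patch implements: every layer gains an `InputArrayOfArrays`-based `forward()`, and `Layer::forward_fallback()` bridges back to the legacy `Mat`-based overload. As a hedged sketch of how a downstream custom layer would plug into this API (the class name and identity body are hypothetical, not part of this patch):

```cpp
#include <opencv2/dnn.hpp>
using namespace cv;
using namespace cv::dnn;

class IdentityLayer : public Layer   // hypothetical example layer
{
public:
    // New-style entry point: receives Mat- or UMat-backed array proxies.
    void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs,
                 OutputArrayOfArrays internals)
    {
        // No accelerated path here, so convert to Mat vectors and reuse
        // the legacy implementation below.
        forward_fallback(inputs, outputs, internals);
    }

    // Legacy Mat-based implementation, still required by the base class.
    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs,
                 std::vector<Mat> &internals)
    {
        for (size_t i = 0; i < inputs.size(); i++)
            inputs[i]->copyTo(outputs[i]);   // identity pass-through
    }
};
```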
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 5084322e9d..24f29a06a9 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -233,6 +233,9 @@ struct LayerData
     std::vector<Mat> outputBlobs;
     std::vector<Mat*> inputBlobs;
     std::vector<Mat> internals;
+    std::vector<UMat> umat_outputBlobs;
+    std::vector<UMat> umat_inputBlobs;
+    std::vector<UMat> umat_internals;
     // Computation nodes of implemented backends (except DEFAULT).
     std::map<int, Ptr<BackendNode> > backendNodes;
     // Flag for skip layer computation for specific backend.
@@ -263,6 +266,7 @@ struct DataLayer : public Layer
 {
     void finalize(const std::vector<Mat*>&, std::vector<Mat>&) {}
     void forward(std::vector<Mat*>&, std::vector<Mat>&, std::vector<Mat> &) {}
+    void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) {}

     int outputNameToIndex(String tgtName)
     {
@@ -398,22 +402,77 @@ public:
         }
     }

+    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, UMat &umat_dst, bool force)
+    {
+        UMat bestBlob;
+        LayerPin bestBlobPin;
+
+        if( !force )
+        {
+            std::map<LayerPin, UMat>::iterator hostIt;
+            std::map<LayerPin, int>::iterator refIt;
+
+            const int targetTotal = total(shape);
+            int bestBlobTotal = INT_MAX;
+
+            for (hostIt = umat_memHosts.begin(); hostIt != umat_memHosts.end(); ++hostIt)
+            {
+                refIt = refCounter.find(hostIt->first);
+                // Use only blobs that had references before because if not,
+                // it might be used as output.
+                if (refIt != refCounter.end() && refIt->second == 0)
+                {
+                    UMat& unusedBlob = hostIt->second;
+                    if (unusedBlob.total() >= targetTotal &&
+                        unusedBlob.total() < bestBlobTotal)
+                    {
+                        bestBlobPin = hostIt->first;
+                        bestBlob = unusedBlob;
+                        bestBlobTotal = unusedBlob.total();
+                    }
+                }
+            }
+        }
+        if (!bestBlob.empty())
+        {
+            reuse(bestBlobPin, lp);
+            umat_dst.create(shape, CV_32F);
+        }
+        else
+        {
+            // if dst has already been allocated with total(shape) elements,
+            // it won't be recreated and the pointer dst.data remains the same.
+            umat_dst.create(shape, CV_32F);
+            addHost(lp, umat_dst);
+        }
+    }
+
     void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
                                std::vector<LayerPin>& pinsForInternalBlobs,
                                bool maximizeReuse)
     {
         CV_TRACE_FUNCTION();

+        bool use_umat = (preferableBackend == DNN_BACKEND_DEFAULT &&
+                         preferableTarget == DNN_TARGET_OPENCL);
         pinsForInternalBlobs.clear();

         std::vector<Mat>& outputBlobs = ld.outputBlobs,
                 &internalBlobs = ld.internals;
+        std::vector<UMat>& umat_outputBlobs = ld.umat_outputBlobs,
+                &umat_internalBlobs = ld.umat_internals;

         const ShapesVec& outShapes = layerShapes.out,
                 internalShapes = layerShapes.internal;

         outputBlobs.resize(std::max((size_t)1, outShapes.size())); //layer produce at least one output blob
         internalBlobs.resize(internalShapes.size());
+        if (use_umat)
+        {
+            umat_outputBlobs.resize(std::max((size_t)1, outShapes.size()));
+            umat_internalBlobs.resize(internalShapes.size());
+        }

         CV_Assert(ld.requiredOutputs.size() <= outShapes.size());
@@ -433,14 +492,19 @@ public:
         ShapesVec shapes(outShapes);
         shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end());
         std::vector<Mat*> blobs;
+        std::vector<UMat*> umat_blobs;
         for(int i = 0; i < outputBlobs.size(); i++)
         {
             blobs.push_back(&outputBlobs[i]);
+            if (use_umat)
+                umat_blobs.push_back(&umat_outputBlobs[i]);
         }

         for(int i = 0; i < internalBlobs.size(); i++)
         {
             blobs.push_back(&internalBlobs[i]);
+            if (use_umat)
+                umat_blobs.push_back(&umat_internalBlobs[i]);
             if (total(internalShapes[i]))
             {
                 pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i));
@@ -467,13 +531,26 @@ public:
                 LayerPin blobPin(ld.id, index);
                 if (index < outShapes.size() && inPlace && !force)
                 {
-                    CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
-                    ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
+                    if (use_umat)
+                    {
+                        CV_Assert(ld.umat_inputBlobs[0].total() == total(shapes[index]));
+                        ld.umat_outputBlobs[index] =
+                            ld.umat_inputBlobs[0].reshape(1, shapes[index].size(),
+                                                          &shapes[index][0]);
+                    }
+                    else
+                    {
+                        CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
+                        ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
+                    }
                     reuse(ld.inputBlobsId[0], blobPin);
                 }
                 else
                 {
-                    reuseOrCreate(shapes[index], blobPin, *blobs[index], force);
+                    if (use_umat)
+                        reuseOrCreate(shapes[index], blobPin, *umat_blobs[index], force);
+                    else
+                        reuseOrCreate(shapes[index], blobPin, *blobs[index], force);
                 }
             }
         }
@@ -488,6 +565,19 @@ public:
         refCounter.clear();
         reuseMap.clear();
         memHosts.clear();
+        umat_memHosts.clear();
+        preferableTarget = DNN_TARGET_CPU;
+        preferableBackend = DNN_BACKEND_DEFAULT;
+    }
+
+    void setPreferableTarget(int targetId)
+    {
+        preferableTarget = targetId;
+    }
+
+    void setPreferableBackend(int backendId)
+    {
+        preferableBackend = backendId;
     }

 private:
@@ -499,11 +589,21 @@ private:
         memHosts[lp] = mat;
     }

+    void addHost(const LayerPin& lp, const UMat& umat)
+    {
+        CV_Assert(umat_memHosts.find(lp) == umat_memHosts.end());
+        reuseMap[lp] = lp;
+        umat_memHosts[lp] = umat;
+    }
+
     std::map<LayerPin, int> refCounter;
     // Maps pin to origin blob (for whom memory was allocated firstly).
     // For origin blobs key == value.
     std::map<LayerPin, LayerPin> reuseMap;
     std::map<LayerPin, Mat> memHosts;
+    std::map<LayerPin, UMat> umat_memHosts;
+    int preferableTarget;
+    int preferableBackend;
 };

 static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, const cv::Mat& m)
@@ -654,6 +754,9 @@ struct Net::Impl
             it->second.inputBlobs.clear();
             it->second.outputBlobs.clear();
             it->second.internals.clear();
+            it->second.umat_inputBlobs.clear();
+            it->second.umat_outputBlobs.clear();
+            it->second.umat_internals.clear();
         }
         it->second.skipFlags.clear();
         //it->second.consumers.clear();
@@ -974,7 +1077,11 @@ struct Net::Impl
             allocateLayer(*i, layersShapes);

         //bind inputs
+        bool use_umat = (preferableBackend == DNN_BACKEND_DEFAULT &&
+                         preferableTarget == DNN_TARGET_OPENCL);
         ld.inputBlobs.resize(ninputs);
+        if (use_umat)
+            ld.umat_inputBlobs.resize(ninputs);
         ld.inputBlobsWrappers.resize(ninputs);
         for (size_t i = 0; i < ninputs; i++)
         {
@@ -982,6 +1089,8 @@ struct Net::Impl
             CV_Assert(from.valid());
             CV_DbgAssert(layers.count(from.lid) && (int)layers[from.lid].outputBlobs.size() > from.oid);
             ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid];
+            if (use_umat)
+                ld.umat_inputBlobs[i] = layers[from.lid].umat_outputBlobs[from.oid];
             ld.inputBlobsWrappers[i] = layers[from.lid].outputBlobsWrappers[from.oid];
         }

@@ -1000,7 +1109,26 @@ struct Net::Impl
         Ptr<Layer> layerPtr = ld.getLayerInstance();
         {
-            layerPtr->finalize(ld.inputBlobs, ld.outputBlobs);
+            if (use_umat)
+            {
+                std::vector<Mat*> inputs(ld.umat_inputBlobs.size());
+                std::vector<Mat> inputvec(ld.umat_inputBlobs.size()); // holders keep the mapped Mats alive
+                std::vector<Mat> outputs(ld.umat_outputBlobs.size());
+                for (int i = 0; i < inputs.size(); i++)
+                {
+                    inputvec[i] = ld.umat_inputBlobs[i].getMat(ACCESS_READ);
+                    inputs[i] = &inputvec[i];
+                }
+                for (int i = 0; i < outputs.size(); i++)
+                {
+                    outputs[i] = ld.umat_outputBlobs[i].getMat(ACCESS_READ);
+                }
+                layerPtr->finalize(inputs, outputs);
+            }
+            else
+            {
+                layerPtr->finalize(ld.inputBlobs, ld.outputBlobs);
+            }
             layerPtr->preferableTarget = preferableTarget;
 #if 0
             std::cout << "\toutputs:";
@@ -1234,6 +1362,8 @@ struct Net::Impl
         getLayersShapes(inputShapes, layersShapes);

         blobManager.reset();
+        blobManager.setPreferableTarget(preferableTarget);
+        blobManager.setPreferableBackend(preferableBackend);
         backendWrappers.clear();
         blobManager.addReference(LayerPin(0, 0));
         for (it = layers.begin(); it != layers.end(); ++it)
@@ -1276,7 +1406,10 @@ struct Net::Impl
                 if (!ld.inputBlobsWrappers[i].empty())
                     ld.inputBlobsWrappers[i]->copyToHost();
             }
-            layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);
+            if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL)
+                layer->forward(ld.umat_inputBlobs, ld.umat_outputBlobs, ld.umat_internals);
+            else
+                layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);

             for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
             {
                 if (!ld.outputBlobsWrappers[i].empty())
@@ -1421,6 +1554,10 @@ struct Net::Impl
         {
             CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL);
         }
+
+        if (ld.umat_outputBlobs.size() > 0 && !ld.umat_outputBlobs[pin.oid].empty())
+            ld.umat_outputBlobs[pin.oid].copyTo(ld.outputBlobs[pin.oid]);
+
         return ld.outputBlobs[pin.oid];
     }

@@ -1520,6 +1657,13 @@ void Net::forward(std::vector<Mat>& outputBlobs, const String& outputName)

     LayerPin pin = impl->getPinByAlias(layerName);
     LayerData &ld = impl->layers[pin.lid];
+
+    if (ld.umat_outputBlobs.size() > 0)
+    {
+        for (int i = 0; i < ld.umat_outputBlobs.size(); i++)
+            ld.umat_outputBlobs[i].copyTo(ld.outputBlobs[i]);
+    }
+
     outputBlobs = ld.outputBlobs;
 }

@@ -1584,6 +1728,7 @@ void Net::setPreferableBackend(int backendId)
     if( impl->preferableBackend != backendId )
     {
         impl->preferableBackend = backendId;
+        impl->blobManager.setPreferableBackend(backendId);
         impl->netWasAllocated = false;
         impl->clear();
     }
@@ -1597,6 +1742,7 @@ void Net::setPreferableTarget(int targetId)
     if( impl->preferableTarget != targetId )
     {
         impl->preferableTarget = targetId;
+        impl->blobManager.setPreferableTarget(targetId);
         impl->netWasAllocated = false;
         impl->clear();
     }
@@ -1623,13 +1769,25 @@ void Net::setInput(const Mat &blob_, const String& name)
     LayerData &ld = impl->layers[pin.lid];
     ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) );
+    bool use_umat = (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
+                     impl->preferableTarget == DNN_TARGET_OPENCL);
+    if (use_umat)
+        ld.umat_outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) );
     ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
     MatShape prevShape = shape(ld.outputBlobs[pin.oid]);
     bool oldShape = prevShape == shape(blob_);
     if (oldShape)
+    {
         blob_.copyTo(ld.outputBlobs[pin.oid]);
+        if (use_umat)
+            blob_.copyTo(ld.umat_outputBlobs[pin.oid]);
+    }
     else
+    {
         ld.outputBlobs[pin.oid] = blob_.clone();
+        if (use_umat)
+            blob_.copyTo(ld.umat_outputBlobs[pin.oid]);
+    }

     if (!ld.outputBlobsWrappers[pin.oid].empty())
     {
@@ -2132,13 +2290,24 @@ std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
     return outputs;
 }

-void Layer::forward(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
 {
     CV_TRACE_FUNCTION();
+    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

-    std::vector<Mat*> inputsp;
-    vecToPVec(inputs, inputsp);
-    this->forward(inputsp, outputs, internals);
+    std::vector<Mat> inpvec;
+    std::vector<Mat> outputs;
+    std::vector<Mat> internals;
+
+    inputs_arr.getMatVector(inpvec);
+    outputs_arr.getMatVector(outputs);
+    internals_arr.getMatVector(internals);
+
+    std::vector<Mat*> inputs(inpvec.size());
+    for (int i = 0; i < inpvec.size(); i++)
+        inputs[i] = &inpvec[i];
+
+    this->forward(inputs, outputs, internals);
 }

 void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
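On the user side nothing changes except the target selection: with the default backend plus the OpenCL target, `Net::Impl` keeps the `umat_*` blob vectors alive and dispatches to the UMat overload of `Layer::forward()`. A minimal usage sketch of the engine changes above (model file names are placeholders):

```cpp
#include <opencv2/dnn.hpp>

int main()
{
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);  // enables the UMat path

    int sz[] = {1, 3, 224, 224};
    cv::Mat blob(4, sz, CV_32F, cv::Scalar(0));           // dummy NCHW input
    net.setInput(blob);

    cv::Mat out = net.forward();  // layers with forward_ocl() run via OpenCL,
                                  // the rest go through forward_fallback()
    return 0;
}
```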
diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index bd6133eb08..dc4a4b38c3 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -102,6 +102,14 @@ public:
                backendId == DNN_BACKEND_HALIDE && haveHalide();
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp
index 4c18517400..5e6ca2283b 100644
--- a/modules/dnn/src/layers/blank_layer.cpp
+++ b/modules/dnn/src/layers/blank_layer.cpp
@@ -62,6 +62,25 @@ public:
         return true;
     }

+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals)
+    {
+        return true;
+    }
+#endif
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp
index 6833b0468b..e51e1f7824 100644
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -176,36 +176,38 @@ public:
     };

 #ifdef HAVE_OPENCL
-    bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
-        CV_TRACE_FUNCTION();
-        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;

-        int cAxis = clamp(axis, inputs[0]->dims);
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        int cAxis = clamp(axis, inputs[0].dims);
         if (!(cAxis == 1 && outputs[0].dims == 4 && !padding))
             return false;

         int bottom_concat_axis;
-        int concat_size = inputs[0]->size[2] * inputs[0]->size[3];
+        int concat_size = inputs[0].size[2] * inputs[0].size[3];
         int top_concat_axis = outputs[0].size[1];
         int offset_concat_axis = 0;
-        UMat inpMat, outMat;
-        outMat = outputs[0].getUMat(ACCESS_WRITE);
-
-        ocl::Kernel kernel;
-        String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0]->type()) + String(" ");
-        if (!kernel.create("concat", ocl::dnn::concat_oclsrc, buildopt))
-            return false;
+        UMat& outMat = outputs[0];
+        String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");

         for (size_t i = 0; i < inputs.size(); i++)
         {
-            inpMat = inputs[i]->getUMat(ACCESS_READ);
-            bottom_concat_axis = inputs[i]->size[1];
-            size_t nthreads = inputs[i]->total();
+            ocl::Kernel kernel("concat", ocl::dnn::concat_oclsrc, buildopt);
+            if (kernel.empty())
+                return false;
+
+            UMat& inpMat = inputs[i];
+            bottom_concat_axis = inputs[i].size[1];
+            size_t nthreads = inputs[i].total();

             kernel.set(0, (int)nthreads);
             kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat));
-            kernel.set(2, (int)inputs[i]->size[0]);
+            kernel.set(2, (int)inputs[i].size[0]);
             kernel.set(3, (int)concat_size);
             kernel.set(4, (int)top_concat_axis);
             kernel.set(5, (int)bottom_concat_axis);
@@ -222,14 +224,22 @@ public:
     }
 #endif

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());

         CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                    OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(inputs, outputs, internals))
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

         int cAxis = clamp(axis, inputs[0]->dims);
         Mat& outMat = outputs[0];
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 00254b2a87..b2f4ad9ad0 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -671,14 +671,20 @@ public:
     };

 #ifdef HAVE_OPENCL
-    bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
-        int group = inputs[0]->size[1] / umat_blobs[0].size[1];
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        int group = inputs[0].size[1] / umat_blobs[0].size[1];

         if (convolutionOp.empty())
         {
             OCL4DNNConvConfig config;
-            config.in_shape = shape(*inputs[0]);
+            config.in_shape = shape(inputs[0]);
             config.out_shape = shape(outputs[0]);
             config.kernel = kernel;
             config.pad = pad;
@@ -690,6 +696,112 @@ public:
             convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
         }

+        int k, outCn = umat_blobs[0].size[0];
+        if( weightsMat.empty() )
+        {
+            // prepare weightsMat where each row is aligned and has enough zero padding on the right to
+            // use vectorized (i.e. with intrinsics) loops without tail processing
+            Mat wm = blobs[0].reshape(1, outCn).clone();
+            if( wm.step1() % VEC_ALIGN != 0 )
+            {
+                int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
+                Mat wm_buffer = Mat(outCn, newcols, wm.type());
+                Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
+                wm_padding.setTo(Scalar::all(0.));
+                Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
+                wm.copyTo(wm_aligned);
+                wm = wm_aligned;
+            }
+            weightsMat = wm;
+
+            Mat biasMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
+            biasvec.resize(outCn+2);
+            if( biasMat.empty() )
+            {
+                for( k = 0; k < outCn; k++ )
+                    biasvec[k] = 0.f;
+            }
+            else
+            {
+                for( k = 0; k < outCn; k++ )
+                    biasvec[k] = biasMat.at<float>(k);
+            }
+
+            if( !bnorm.empty() || !scaleLayer.empty() )
+            {
+                Mat scale, shift, scale2, shift2;
+                const float *scaleptr = 0, *shiftptr = 0;
+                const float *scaleptr2 = 0, *shiftptr2 = 0;
+
+                if( !bnorm.empty() )
+                {
+                    bnorm->getScaleShift(scale, shift);
+                    CV_Assert( scale.isContinuous() && shift.isContinuous() &&
+                               scale.type() == CV_32F && shift.type() == CV_32F &&
+                               scale.total() == (size_t)outCn &&
+                               shift.total() == (size_t)outCn );
+                    scaleptr = scale.ptr<float>();
+                    shiftptr = shift.ptr<float>();
+                }
+                if( !scaleLayer.empty() )
+                {
+                    scale2 = scaleLayer->blobs[0];
+                    CV_Assert( scale2.isContinuous() && scale2.type() == CV_32F &&
+                               scale2.total() == (size_t)outCn );
+                    scaleptr2 = scale2.ptr<float>();
+                    if( scaleLayer->hasBias )
+                    {
+                        shift2 = scaleLayer->blobs[1];
+                        CV_Assert( shift2.isContinuous() && shift2.type() == CV_32F &&
+                                   shift2.total() == (size_t)outCn );
+                        shiftptr2 = shift2.ptr<float>();
+                    }
+                }
+
+                if (shiftptr || shiftptr2)
+                    fusedBias = true;
+
+                for( int i = 0; i < outCn; i++ )
+                {
+                    float s1 = scaleptr ? scaleptr[i] : 1.f;
+                    float delta1 = shiftptr ? shiftptr[i] : 0.f;
+                    float s2 = scaleptr2 ? scaleptr2[i] : 1.f;
+                    float delta2 = shiftptr2 ? shiftptr2[i] : 0.f;
+                    float* w_i = weightsMat.ptr<float>(i);
+                    int j, wcols = weightsMat.cols;
+
+                    for( j = 0; j < wcols; j++ )
+                        w_i[j] *= (s1*s2);
+
+                    biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
+                }
+            }
+            biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
+        }
+
+        reluslope.clear();
+        if( activ )
+        {
+            Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
+            if( !activ_relu.empty() )
+            {
+                reluslope.assign(outCn+2, activ_relu->negativeSlope);
+                activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
+            }
+
+            Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast<ChannelsPReLULayer>();
+            if( !activ_chprelu.empty() )
+            {
+                const Mat& m = activ_chprelu->blobs[0];
+                CV_Assert(m.isContinuous() && m.type() == CV_32F && (int)m.total() == outCn);
+                const float* mdata = m.ptr<float>();
+                reluslope.resize(outCn+2);
+                std::copy(mdata, mdata + outCn, reluslope.begin());
+                reluslope[outCn] = reluslope[outCn+1] = reluslope[outCn-1];
+                activType = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
+            }
+        }
+
         if ( newWeightAndBias )
         {
             weightsMat.copyTo(umat_blobs[0]);
@@ -723,9 +835,8 @@ public:
             newActiv = false;
         }

-        UMat inpMat, outMat;
-        inpMat = inputs[0]->getUMat(ACCESS_READ);
-        outMat = outputs[0].getUMat(ACCESS_WRITE);
+        UMat& inpMat = inputs[0];
+        UMat& outMat = outputs[0];

         int batch_size = inpMat.size[0];

         return convolutionOp->Forward(inpMat,
@@ -736,6 +847,18 @@ public:
     }
 #endif

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
@@ -811,11 +934,6 @@ public:
             }
         }

-#ifdef HAVE_OPENCL
-        if (shiftptr || shiftptr2)
-            fusedBias = true;
-#endif
-
         for( int i = 0; i < outCn; i++ )
         {
             float s1 = scaleptr ? scaleptr[i] : 1.f;
@@ -841,9 +959,6 @@ public:
             if( !activ_relu.empty() )
             {
                 reluslope.assign(outCn+2, activ_relu->negativeSlope);
-#ifdef HAVE_OPENCL
-                activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
-#endif
             }

             Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast<ChannelsPReLULayer>();
@@ -855,16 +970,9 @@ public:
                 reluslope.resize(outCn+2);
                 std::copy(mdata, mdata + outCn, reluslope.begin());
                 reluslope[outCn] = reluslope[outCn+1] = reluslope[outCn-1];
-#ifdef HAVE_OPENCL
-                activType = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
-#endif
             }
         }

-        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
-                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(inputs, outputs, internals))
-
         int nstripes = std::max(getNumThreads(), 1);

         ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope,
@@ -1173,6 +1281,14 @@ public:
         }
     };

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/crop_layer.cpp b/modules/dnn/src/layers/crop_layer.cpp
index 17014955d5..29b26fc2d1 100644
--- a/modules/dnn/src/layers/crop_layer.cpp
+++ b/modules/dnn/src/layers/crop_layer.cpp
@@ -133,6 +133,14 @@ public:
         }
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp
index ae4774a993..055115c37e 100644
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@@ -194,6 +194,95 @@ public:
         return false;
     }

+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        std::vector<Mat> inpvec;
+        std::vector<Mat> outputs;
+
+        inputs_arr.getMatVector(inpvec);
+        outputs_arr.getMatVector(outputs);
+
+        std::vector<Mat*> inputs(inpvec.size());
+        for (size_t i = 0; i < inpvec.size(); i++)
+            inputs[i] = &inpvec[i];
+
+        std::vector<LabelBBox> allDecodedBBoxes;
+        std::vector<std::map<int, std::vector<float> > > allConfidenceScores;
+
+        int num = inputs[0]->size[0];
+
+        // extract predictions from input layers
+        {
+            int numPriors = inputs[2]->size[2] / 4;
+
+            const float* locationData = inputs[0]->ptr<float>();
+            const float* confidenceData = inputs[1]->ptr<float>();
+            const float* priorData = inputs[2]->ptr<float>();
+
+            // Retrieve all location predictions
+            std::vector<LabelBBox> allLocationPredictions;
+            GetLocPredictions(locationData, num, numPriors, _numLocClasses,
+                              _shareLocation, _locPredTransposed, allLocationPredictions);
+
+            // Retrieve all confidences
+            GetConfidenceScores(confidenceData, num, numPriors, _numClasses, allConfidenceScores);
+
+            // Retrieve all prior bboxes
+            std::vector<util::NormalizedBBox> priorBBoxes;
+            std::vector<std::vector<float> > priorVariances;
+            GetPriorBBoxes(priorData, numPriors, priorBBoxes, priorVariances);
+
+            // Decode all loc predictions to bboxes
+            DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,
+                            _shareLocation, _numLocClasses, _backgroundLabelId,
+                            _codeType, _varianceEncodedInTarget, false, allDecodedBBoxes);
+        }
+
+        size_t numKept = 0;
+        std::vector<std::map<int, std::vector<int> > > allIndices;
+        for (int i = 0; i < num; ++i)
+        {
+            numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
+        }
+
+        if (numKept == 0)
+        {
+            // Set confidences to zeros.
+            Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
+            outputs[0](ranges).setTo(0);
+            return true;
+        }
+        int outputShape[] = {1, 1, (int)numKept, 7};
+        Mat mat(4, outputShape, CV_32F);
+        float* outputsData = mat.ptr<float>();
+
+        size_t count = 0;
+        for (int i = 0; i < num; ++i)
+        {
+            count += outputDetections_(i, &outputsData[count * 7],
+                                       allDecodedBBoxes[i], allConfidenceScores[i],
+                                       allIndices[i]);
+        }
+        UMat& output = outputs_arr.getUMatRef(0);
+        mat.copyTo(output); // copy instead of getUMat(): `mat` is a stack-local Mat
+        CV_Assert(count == numKept);
+        return true;
+    }
+#endif
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
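The functor rework below follows one pattern throughout: `applyOCL()` now takes array proxies and pulls `UMat` vectors out of them, so blobs already resident on the device are used in place instead of being copied into a fresh `UMat` on every call. Roughly, a simplified sketch of that access pattern (not code from this patch; `cv::threshold` stands in for the patch's hand-written ReLU kernel):

```cpp
bool applyOCL(cv::InputArrayOfArrays inps, cv::OutputArrayOfArrays outs,
              cv::OutputArrayOfArrays /*internals*/)
{
    std::vector<cv::UMat> inputs, outputs;
    inps.getUMatVector(inputs);   // no host<->device copy if already UMat-backed
    outs.getUMatVector(outputs);

    for (size_t i = 0; i < inputs.size(); i++)
        cv::threshold(inputs[i], outputs[i], 0.0, 0.0, cv::THRESH_TOZERO); // ReLU

    return true;  // returning false would trigger the CPU fallback
}
```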
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 1e3d2de3ec..d427e60e23 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -156,13 +156,20 @@ public:
         return true;
     }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();

         CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) &&
                    OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   func.applyOCL(inputs, outputs, internals))
+                   func.applyOCL(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    {
+        CV_TRACE_FUNCTION();

         for (size_t i = 0; i < inputs.size(); i++)
         {
@@ -258,25 +265,29 @@ struct ReLUFunctor
         return true;
     }

-    bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
         size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);

         for (size_t i = 0; i < inputs.size(); i++)
         {
-            UMat src, dst;
-            inputs[i]->copyTo(src);
-            dst = outputs[i].getUMat(ACCESS_WRITE);
+            UMat& src = inputs[i];
+            UMat& dst = outputs[i];
             CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset);

-            ocl::Kernel ker;
-            CV_Assert(initKernel(ker, src));
-            ker.set(0, (int)src.total());
-            ker.set(1, ocl::KernelArg::PtrReadOnly(src));
-            ker.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+            ocl::Kernel kernel;
+            CV_Assert(initKernel(kernel, src));
+            kernel.set(0, (int)src.total());
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
+            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));

             size_t gSize = src.total();
-            CV_Assert(ker.run(1, &gSize, &wgSize, false));
+            CV_Assert(kernel.run(1, &gSize, &wgSize, false));
         }

         return true;
@@ -347,7 +358,7 @@ struct ReLU6Functor
     }

 #ifdef HAVE_OPENCL
-    bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
         // TODO: implement OCL version
         return false;
@@ -382,7 +393,7 @@ struct TanHFunctor
     }

 #ifdef HAVE_OPENCL
-    bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
         // TODO: implement OCL version
         return false;
@@ -417,7 +428,7 @@ struct SigmoidFunctor
     }

 #ifdef HAVE_OPENCL
-    bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
         // TODO: implement OCL version
         return false;
@@ -454,7 +465,7 @@ struct ELUFunctor
     }

 #ifdef HAVE_OPENCL
-    bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
         // TODO: implement OCL version
         return false;
@@ -489,7 +500,7 @@ struct AbsValFunctor
     }

 #ifdef HAVE_OPENCL
-    bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
         // TODO: implement OCL version
         return false;
@@ -524,7 +535,7 @@ struct BNLLFunctor
     }

 #ifdef HAVE_OPENCL
-    bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
         // TODO: implement OCL version
         return false;
@@ -581,7 +592,7 @@ struct PowerFunctor
     }

 #ifdef HAVE_OPENCL
-    bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
         // TODO: implement OCL version
         return false;
@@ -656,7 +667,7 @@ struct ChannelsPReLUFunctor
     }

 #ifdef HAVE_OPENCL
-    bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
         // TODO: implement OCL version
         return false;
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 83f263da23..b98537b91d 100644
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -254,6 +254,14 @@ public:
         }
     };

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp
index 8f477e42d0..039720c038 100644
--- a/modules/dnn/src/layers/flatten_layer.cpp
+++ b/modules/dnn/src/layers/flatten_layer.cpp
@@ -104,6 +104,43 @@ public:
         return true;
     }

+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        std::vector<UMat> inpvec;
+        std::vector<UMat> outputs;
+
+        inputs_arr.getUMatVector(inpvec);
+        outputs_arr.getUMatVector(outputs);
+
+        std::vector<UMat*> inputs(inpvec.size());
+        for (int i = 0; i < inpvec.size(); i++)
+            inputs[i] = &inpvec[i];
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            MatShape outShape = shape(outputs[i]);
+            UMat& output = outputs_arr.getUMatRef(i);
+            output = inputs[i]->reshape(1, (int)outShape.size(), &outShape[0]);
+        }
+
+        return true;
+    }
+#endif
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   outputs_arr.isUMatVector() &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index 6fa9ed673c..184e2b824f 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -258,12 +258,18 @@ public:
     };

 #ifdef HAVE_OPENCL
-    bool forward_ocl(std::vector<Mat*> &input, std::vector<Mat> &output)
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
     {
-        int axisCan = clamp(axis, input[0]->dims);
-        int numOutput = blobs[0].size[0];
-        int innerSize = blobs[0].size[1];
-        int outerSize = input[0]->total(0, axisCan);
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        int axisCan = clamp(axis, inputs[0].dims);
+        int numOutput = umat_blobs[0].size[0];
+        int innerSize = umat_blobs[0].size[1];
+        int outerSize = total(shape(inputs[0]), 0, axisCan);

         bool ret = true;
         if (innerProductOp.empty())
@@ -278,11 +284,10 @@ public:
         }

         UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
-        for (size_t i = 0; i < input.size(); i++)
+        for (size_t i = 0; i < inputs.size(); i++)
         {
-            UMat srcMat, dstMat;
-            srcMat = input[i]->reshape(1, outerSize).getUMat(ACCESS_READ);
-            dstMat = output[i].reshape(1, outerSize).getUMat(ACCESS_WRITE);
+            UMat& srcMat = inputs[i];
+            UMat& dstMat = outputs[i];

             dstMat.setTo(0.0f);

             if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat))
@@ -301,11 +306,15 @@ public:
         if (ret) return true;

         UMat& weights = umat_blobs[0];
-        for (size_t i = 0; i < input.size(); i++)
+        for (size_t i = 0; i < inputs.size(); i++)
         {
+            MatShape inshape, outshape;
+            inshape = shape(outerSize, innerSize);
+            outshape = shape(outerSize, numOutput);
+
             UMat srcMat, dstMat;
-            srcMat = input[i]->reshape(1, outerSize).getUMat(ACCESS_READ);
-            dstMat = output[i].reshape(1, outerSize).getUMat(ACCESS_WRITE);
+            srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
+            dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);

             cv::gemm(srcMat, weights, 1, noArray(), 0, dstMat, GEMM_2_T);

@@ -320,14 +329,22 @@ public:
     }
 #endif

-    void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &)
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());

         CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                    OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(input, output))
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
+    void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

         int axisCan = clamp(axis, input[0]->dims);
         int outerSize = input[0]->total(0, axisCan);
diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp
index 62dde95e90..50c0ae1dfc 100644
--- a/modules/dnn/src/layers/lrn_layer.cpp
+++ b/modules/dnn/src/layers/lrn_layer.cpp
@@ -94,8 +94,14 @@ public:
     }

 #ifdef HAVE_OPENCL
-    bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
         if (lrnOp.empty())
         {
             OCL4DNNLRNConfig config;
@@ -108,28 +114,38 @@ public:
             config.alpha = alpha;
             config.beta = beta;
             config.k = bias;
-            CHECK_EQ(4, inputs[0]->dims) << "Input must have 4 axes, "
+            CHECK_EQ(4, inputs[0].dims) << "Input must have 4 axes, "
                                          << "corresponding to (num, channels, height, width)";
-            config.batch_size = inputs[0]->size[0];
-            config.channels = inputs[0]->size[1];
-            config.height = inputs[0]->size[2];
-            config.width = inputs[0]->size[3];
+            config.batch_size = inputs[0].size[0];
+            config.channels = inputs[0].size[1];
+            config.height = inputs[0].size[2];
+            config.width = inputs[0].size[3];
             config.norm_by_size = normBySize;

             lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
         }

-        UMat inpMat, outMat;
-        inpMat = inputs[0]->getUMat(ACCESS_READ);
-        outMat = outputs[0].getUMat(ACCESS_WRITE);
-
-        if (!lrnOp->Forward(inpMat, outMat))
+        if (!lrnOp->Forward(inputs[0], outputs[0]))
             return false;

         return true;
     }
 #endif

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        CV_Assert(inputs_arr.total() == outputs_arr.total());
+
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
@@ -137,10 +153,6 @@ public:

         CV_Assert(inputs.size() == outputs.size());

-        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
-                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(inputs, outputs, internals))
-
         for (int i = 0; i < inputs.size(); i++)
         {
             CV_Assert(inputs[i]->dims == 4);
diff --git a/modules/dnn/src/layers/max_unpooling_layer.cpp b/modules/dnn/src/layers/max_unpooling_layer.cpp
index 99507182a6..e0c105dd4c 100644
--- a/modules/dnn/src/layers/max_unpooling_layer.cpp
+++ b/modules/dnn/src/layers/max_unpooling_layer.cpp
@@ -55,6 +55,14 @@ public:
         return false;
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index 536a21f4a7..e7d2ff524e 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -60,6 +60,14 @@ public:
         eps = params.get<double>("eps", 1e-9);
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp
index 020d1d385a..9d2940d6da 100644
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@@ -69,6 +69,14 @@ public:
         return true;
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp
index ae62ee4560..52f7b6b9c0 100644
--- a/modules/dnn/src/layers/padding_layer.cpp
+++ b/modules/dnn/src/layers/padding_layer.cpp
@@ -91,6 +91,14 @@ public:
                backendId == DNN_BACKEND_HALIDE && haveHalide() && dstRanges.size() == 4;
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp
index fcebc9bb66..a21c5a6d1c 100644
--- a/modules/dnn/src/layers/permute_layer.cpp
+++ b/modules/dnn/src/layers/permute_layer.cpp
@@ -247,6 +247,14 @@ public:
         }
     };

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index c27315ba26..6f9977330a 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -113,18 +113,24 @@ public:
     }

 #ifdef HAVE_OPENCL
-    bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
     {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
         if (poolOp.empty())
         {
             OCL4DNNPoolConfig config;
-            config.in_shape = shape(*inputs[0]);
+            config.in_shape = shape(inputs[0]);
             config.out_shape = shape(outputs[0]);
             config.kernel = kernel;
             config.pad = pad;
             config.stride = stride;
-            config.channels = inputs[0]->size[1];
+            config.channels = inputs[0].size[1];
             config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
                                 (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
                                                LIBDNN_POOLING_METHOD_STO);
@@ -133,18 +139,10 @@ public:

         for (size_t ii = 0; ii < inputs.size(); ii++)
         {
-            UMat inpMat, outMat, maskMat;
-
-            inpMat = inputs[ii]->getUMat(ACCESS_READ);
-
-            if (type == MAX)
-            {
-                outMat = outputs[2 * ii].getUMat(ACCESS_WRITE);
-                maskMat = outputs[2 * ii + 1].getUMat(ACCESS_WRITE);
-            } else {
-                outMat = outputs[ii].getUMat(ACCESS_WRITE);
-                maskMat = UMat();
-            }
+            UMat& inpMat = inputs[ii];
+            int out_index = (type == MAX) ? 2 : 1;
+            UMat& outMat = outputs[out_index * ii];
+            UMat maskMat = (type == MAX) ? outputs[2 * ii + 1] : UMat();

             CV_Assert(inpMat.offset == 0 && outMat.offset == 0);

@@ -156,14 +154,22 @@ public:
     }
 #endif

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());

         CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                    OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(inputs, outputs, internals))
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

         for (size_t ii = 0; ii < inputs.size(); ii++)
         {
diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp
index 009789d5ca..c158bc62fe 100644
--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@@ -249,6 +249,14 @@ public:
         return false;
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index a40bcc622a..92e9e0e233 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -252,6 +252,14 @@ public:
         allocated = true;
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
@@ -465,6 +473,14 @@ public:
         }
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp
index 1e0f6b0555..bc12e8b1be 100644
--- a/modules/dnn/src/layers/region_layer.cpp
+++ b/modules/dnn/src/layers/region_layer.cpp
@@ -114,6 +114,14 @@ public:
         }
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp
index 720d25eab5..78c806fc40 100644
--- a/modules/dnn/src/layers/reorg_layer.cpp
+++ b/modules/dnn/src/layers/reorg_layer.cpp
@@ -85,6 +85,15 @@ public:
     {
         return backendId == DNN_BACKEND_DEFAULT;
     }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp
index a3d71cbebb..d4a2875356 100644
--- a/modules/dnn/src/layers/reshape_layer.cpp
+++ b/modules/dnn/src/layers/reshape_layer.cpp
@@ -182,6 +182,14 @@ public:
         return true;
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp b/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp
index f3025970ad..2ad5ba3b3a 100644
--- a/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp
+++ b/modules/dnn/src/layers/resize_nearest_neighbor_layer.cpp
@@ -37,6 +37,14 @@ public:
         return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]);
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp
index f3c4a0c6cc..266dec226a 100644
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@@ -44,6 +44,14 @@ public:
                backendId == DNN_BACKEND_HALIDE && haveHalide();
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/shift_layer.cpp b/modules/dnn/src/layers/shift_layer.cpp
index 0bd9515d3c..9e1004ab1c 100644
--- a/modules/dnn/src/layers/shift_layer.cpp
+++ b/modules/dnn/src/layers/shift_layer.cpp
@@ -36,6 +36,14 @@ public:
         return true;
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index 07a670bf31..18758b98bf 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -171,6 +171,14 @@ public:
         }
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp
index af552f0e9f..1ab9ed2b79 100644
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@@ -91,35 +91,42 @@ public:
     }

 #ifdef HAVE_OPENCL
-    bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays itns)
     {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+        std::vector<UMat> internals;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+        itns.getUMatVector(internals);
+
         if (softmaxOp.empty())
         {
             OCL4DNNSoftmaxConfig config;
-            config.in_shape = shape(*inputs[0]);
+            config.in_shape = shape(inputs[0]);
             config.axis = axisRaw;
-            config.channels = inputs[0]->size[axisRaw];
+            config.channels = inputs[0].size[axisRaw];
             config.logsoftmax = logSoftMax;

             softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
         }

-        UMat srcMat, dstMat;
-        srcMat = inputs[0]->getUMat(ACCESS_READ);
-        dstMat = outputs[0].getUMat(ACCESS_WRITE);
+        UMat& src = inputs[0];
+        UMat& dstMat = outputs[0];

-        if (softmaxOp->Forward(srcMat, dstMat))
+        if (softmaxOp->Forward(src, dstMat))
             return true;

-        const Mat &src = *inputs[0];
-        UMat bufMat = internals[0].getUMat(ACCESS_WRITE);
-        srcMat.copyTo(dstMat);
+        UMat& bufMat = internals[0];
+        src.copyTo(dstMat);

         int axis = clamp(axisRaw, src.dims);
-        size_t outerSize = src.total(0, axis);
+        MatShape s = shape(src);
+        size_t outerSize = total(s, 0, axis);
         size_t channels = src.size[axis];
-        size_t innerSize = src.total(axis + 1);
+        size_t innerSize = total(s, axis + 1);

         String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
         ocl::Kernel kmax, ksub, ksum, kdiv;
@@ -175,14 +182,22 @@ public:
     }
 #endif

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());

         CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                    OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(inputs, outputs, internals))
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

         const Mat &src = *inputs[0];
         Mat &dst = outputs[0];
diff --git a/modules/dnn/src/layers/split_layer.cpp b/modules/dnn/src/layers/split_layer.cpp
index 435d4bd8e4..3da4cb27e7 100644
--- a/modules/dnn/src/layers/split_layer.cpp
+++ b/modules/dnn/src/layers/split_layer.cpp
@@ -78,6 +78,14 @@ public:
         return false;
     }

+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
+    }
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
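Taken together, every layer touched by this patch ends up with the same three-part shape: an optional `forward_ocl()`, a dispatching `forward(InputArrayOfArrays, ...)`, and the legacy Mat-based `forward()`. As a recap, the dispatch boilerplate each layer repeats looks like this (a sketch assembled from the hunks above, not an additional file in the patch):

```cpp
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr,
             OutputArrayOfArrays internals_arr)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

    // Try the OpenCL branch first; CV_OCL_RUN returns from this function
    // when the condition holds and forward_ocl() reports success.
    CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
               OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
               forward_ocl(inputs_arr, outputs_arr, internals_arr))

    // Otherwise convert the arrays to Mats and run the legacy CPU code.
    Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}
```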