/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. // // // License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // // * Redistribution's of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // // This software is provided by the copyright holders and contributors "as is" and // any express or implied warranties, including, but not limited to, the implied // warranties of merchantability and fitness for a particular purpose are disclaimed. // In no event shall the Intel Corporation or contributors be liable for any direct, // indirect, incidental, special, exemplary, or consequential damages // (including, but not limited to, procurement of substitute goods or services; // loss of use, data, or profits; or business interruption) however caused // and on any theory of liability, whether in contract, strict liability, // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. // //M*/ #include "precomp.hpp" #include "op_halide.hpp" #include "halide_scheduler.hpp" #include #include #include #include #include #include #include #include namespace cv { namespace dnn { CV__DNN_EXPERIMENTAL_NS_BEGIN using std::vector; using std::map; using std::make_pair; using std::set; namespace { typedef std::vector ShapesVec; struct LayerShapes { ShapesVec in, out, internal; // No guarantees that layer which support in-place computations // will be computed in-place (input.data_ptr == output.data_ptr). // If layer said that it could work in-place and layers after it // no longer use input blob, we'll set output = input. bool supportInPlace; LayerShapes() {supportInPlace = false;} }; } template static String toString(const T &v) { std::ostringstream ss; ss << v; return ss.str(); } Mat blobFromImage(InputArray image, double scalefactor, const Size& size, const Scalar& mean, bool swapRB, bool crop) { CV_TRACE_FUNCTION(); std::vector images(1, image.getMat()); return blobFromImages(images, scalefactor, size, mean, swapRB, crop); } Mat blobFromImages(const std::vector& images_, double scalefactor, Size size, const Scalar& mean_, bool swapRB, bool crop) { CV_TRACE_FUNCTION(); std::vector images = images_; for (int i = 0; i < images.size(); i++) { Size imgSize = images[i].size(); if (size == Size()) size = imgSize; if (size != imgSize) { if(crop) { float resizeFactor = std::max(size.width / (float)imgSize.width, size.height / (float)imgSize.height); resize(images[i], images[i], Size(), resizeFactor, resizeFactor, INTER_LINEAR); Rect crop(Point(0.5 * (images[i].cols - size.width), 0.5 * (images[i].rows - size.height)), size); images[i] = images[i](crop); } else resize(images[i], images[i], size, 0, 0, INTER_LINEAR); } if(images[i].depth() == CV_8U) images[i].convertTo(images[i], CV_32F); Scalar mean = mean_; if (swapRB) std::swap(mean[0], mean[2]); images[i] -= mean; images[i] *= scalefactor; } size_t i, nimages = images.size(); if(nimages == 0) return Mat(); Mat image0 = images[0]; int nch = image0.channels(); CV_Assert(image0.dims == 2); Mat blob, image; if (nch == 3 || nch == 4) { int sz[] = { (int)nimages, nch, image0.rows, image0.cols }; blob = Mat(4, sz, CV_32F); Mat ch[4]; for( i = 0; i < nimages; i++ ) { image = images[i]; CV_Assert(image.depth() == CV_32F); nch = image.channels(); CV_Assert(image.dims == 2 && (nch == 3 || nch == 4)); CV_Assert(image.size() == image0.size()); for( int j = 0; j < nch; j++ ) ch[j] = Mat(image.rows, image.cols, CV_32F, blob.ptr((int)i, j)); if(swapRB) std::swap(ch[0], ch[2]); split(image, ch); } } else { CV_Assert(nch == 1); int sz[] = { (int)nimages, 1, image0.rows, image0.cols }; blob = Mat(4, sz, CV_32F); for( i = 0; i < nimages; i++ ) { Mat image = images[i]; CV_Assert(image.depth() == CV_32F); nch = image.channels(); CV_Assert(image.dims == 2 && (nch == 1)); CV_Assert(image.size() == image0.size()); image.copyTo(Mat(image.rows, image.cols, CV_32F, blob.ptr((int)i, 0))); } } return blob; } struct LayerPin { int lid; int oid; LayerPin(int layerId = -1, int outputId = -1) : lid(layerId), oid(outputId) {} bool valid() const { return (lid >= 0 && oid >= 0); } bool equal(const LayerPin &r) const { return (lid == r.lid && oid == r.oid); } bool operator<(const LayerPin &r) const { return lid < r.lid || lid == r.lid && oid < r.oid; } bool operator ==(const LayerPin &r) const { return lid == r.lid && oid == r.oid; } }; struct LayerData { LayerData() : id(-1), flag(0) {} LayerData(int _id, const String &_name, const String &_type, LayerParams &_params) : id(_id), name(_name), type(_type), params(_params), flag(0) { CV_TRACE_FUNCTION(); //add logging info params.name = name; params.type = type; } int id; String name; String type; LayerParams params; std::vector inputBlobsId; std::set inputLayersId; std::set requiredOutputs; std::vector consumers; std::vector > outputBlobsWrappers; std::vector > inputBlobsWrappers; Ptr layerInstance; std::vector outputBlobs; std::vector inputBlobs; std::vector internals; std::vector umat_outputBlobs; std::vector umat_inputBlobs; std::vector umat_internals; // Computation nodes of implemented backends (except DEFAULT). std::map > backendNodes; // Flag for skip layer computation for specific backend. std::map skipFlags; int flag; Ptr getLayerInstance() { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(type, "type", type.c_str()); if (layerInstance) return layerInstance; layerInstance = LayerFactory::createLayerInstance(type, params); if (!layerInstance) { CV_Error(Error::StsError, "Can't create layer \"" + name + "\" of type \"" + type + "\""); } return layerInstance; } }; //fake layer containing network input blobs struct DataLayer : public Layer { void finalize(const std::vector&, std::vector&) {} void forward(std::vector&, std::vector&, std::vector &) {} void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) {} int outputNameToIndex(String tgtName) { int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin()); return (idx < (int)outNames.size()) ? idx : -1; } void setNames(const std::vector &names) { outNames.assign(names.begin(), names.end()); } bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, std::vector &internals) const { CV_Assert(inputs.size() == requiredOutputs); outputs.assign(inputs.begin(), inputs.end()); return false; } private: std::vector outNames; }; struct BlobManager { public: // Increase references counter to layer output. void addReference(const LayerPin& lp) { std::map::iterator it = refCounter.find(lp); if (it == refCounter.end()) refCounter[lp] = 1; else it->second += 1; } void addReferences(const std::vector& pins) { for (int i = 0; i < pins.size(); i++) { addReference(pins[i]); } } // Returns number of references to allocated memory that used in specific // layer blob. int numReferences(const LayerPin& lp) { std::map::iterator mapIt = reuseMap.find(lp); CV_Assert(mapIt != reuseMap.end()); LayerPin memHost = mapIt->second; std::map::iterator refIt = refCounter.find(memHost); CV_Assert(refIt != refCounter.end()); return refIt->second; } // Reuse data allocated in inside the blob. void reuse(const LayerPin& host, const LayerPin& user) { CV_Assert(reuseMap.find(user) == reuseMap.end()); CV_Assert(reuseMap.find(host) != reuseMap.end()); LayerPin memHost = reuseMap[host]; reuseMap[user] = memHost; if (refCounter.find(memHost) != refCounter.end()) { std::map::iterator userRefIt = refCounter.find(user); if (userRefIt != refCounter.end()) { refCounter[memHost] += userRefIt->second; refCounter.erase(userRefIt); } else refCounter[memHost] += 1; } } // Decrease references counter to allocated memory inside specific blob. void releaseReference(const LayerPin& lp) { std::map::iterator mapIt = reuseMap.find(lp); CV_Assert(mapIt != reuseMap.end()); std::map::iterator refIt = refCounter.find(mapIt->second); CV_Assert(refIt != refCounter.end()); CV_Assert(refIt->second > 0); refIt->second -= 1; } void releaseReferences(const std::vector& pins) { for (int i = 0; i < pins.size(); i++) { releaseReference(pins[i]); } } void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool force) { Mat bestBlob; LayerPin bestBlobPin; if( !force ) { std::map::iterator hostIt; std::map::iterator refIt; const int targetTotal = total(shape); int bestBlobTotal = INT_MAX; for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt) { refIt = refCounter.find(hostIt->first); // Use only blobs that had references before because if not, // it might be used as output. if (refIt != refCounter.end() && refIt->second == 0) { Mat& unusedBlob = hostIt->second; if (unusedBlob.total() >= targetTotal && unusedBlob.total() < bestBlobTotal) { bestBlobPin = hostIt->first; bestBlob = unusedBlob; bestBlobTotal = unusedBlob.total(); } } } } if (!bestBlob.empty()) { reuse(bestBlobPin, lp); dst = Mat(shape, CV_32F, bestBlob.data); } else { // if dst already has been allocated with total(shape) elements, // it won't be recrreated and pointer of dst.data remains the same. dst.create(shape, CV_32F); addHost(lp, dst); } } void reuseOrCreate(const MatShape& shape, const LayerPin& lp, UMat &umat_dst, bool force) { UMat bestBlob; LayerPin bestBlobPin; if( !force ) { std::map::iterator hostIt; std::map::iterator refIt; const int targetTotal = total(shape); int bestBlobTotal = INT_MAX; for (hostIt = umat_memHosts.begin(); hostIt != umat_memHosts.end(); ++hostIt) { refIt = refCounter.find(hostIt->first); // Use only blobs that had references before because if not, // it might be used as output. if (refIt != refCounter.end() && refIt->second == 0) { UMat& unusedBlob = hostIt->second; if (unusedBlob.total() >= targetTotal && unusedBlob.total() < bestBlobTotal) { bestBlobPin = hostIt->first; bestBlob = unusedBlob; bestBlobTotal = unusedBlob.total(); } } } } if (!bestBlob.empty()) { reuse(bestBlobPin, lp); umat_dst.create(shape, CV_32F); } else { // if dst already has been allocated with total(shape) elements, // it won't be recrreated and pointer of dst.data remains the same. umat_dst.create(shape, CV_32F); addHost(lp, umat_dst); } } void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes, std::vector& pinsForInternalBlobs, bool maximizeReuse) { CV_TRACE_FUNCTION(); bool use_umat = (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL); pinsForInternalBlobs.clear(); std::vector& outputBlobs = ld.outputBlobs, &internalBlobs = ld.internals; std::vector& umat_outputBlobs = ld.umat_outputBlobs, &umat_internalBlobs = ld.umat_internals; const ShapesVec& outShapes = layerShapes.out, internalShapes = layerShapes.internal; outputBlobs.resize(std::max((size_t)1, outShapes.size())); //layer produce at least one output blob internalBlobs.resize(internalShapes.size()); if (use_umat) { umat_outputBlobs.resize(std::max((size_t)1, outShapes.size())); umat_internalBlobs.resize(internalShapes.size()); } CV_Assert(ld.requiredOutputs.size() <= outShapes.size()); // Check that layer could work in-place. bool inPlace = false; if (layerShapes.supportInPlace) { if (ld.inputBlobs.size() == 1) { // Get number of references to the input memory. int numRef = numReferences(ld.inputBlobsId[0]); // If current layer is one and only customer of this blob. inPlace = numRef == 1; } } ShapesVec shapes(outShapes); shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end()); std::vector blobs; std::vector umat_blobs; for(int i = 0; i < outputBlobs.size(); i++) { blobs.push_back(&outputBlobs[i]); if (use_umat) umat_blobs.push_back(&umat_outputBlobs[i]); } for(int i = 0; i < internalBlobs.size(); i++) { blobs.push_back(&internalBlobs[i]); if (use_umat) umat_blobs.push_back(&umat_internalBlobs[i]); if (total(internalShapes[i])) { pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i)); } } addReferences(pinsForInternalBlobs); std::map > idxSizes; for(int i = 0; i < shapes.size(); i++) { idxSizes[total(shapes[i])].push_back(i); } std::map >::reverse_iterator it; bool force = !maximizeReuse && ld.inputBlobsId.size() > 1; for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++) { for(int j = 0; j < it->second.size(); j++) { int index = it->second[j]; if (total(shapes[index])) { LayerPin blobPin(ld.id, index); if (index < outShapes.size() && inPlace && !force) { if (use_umat) { CV_Assert(ld.umat_inputBlobs[0].total() == total(shapes[index])); ld.umat_outputBlobs[index] = ld.umat_inputBlobs[0].reshape(1, shapes[index].size(), &shapes[index][0]); } else { CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index])); ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]); } reuse(ld.inputBlobsId[0], blobPin); } else { if (use_umat) reuseOrCreate(shapes[index], blobPin, *umat_blobs[index], force); else reuseOrCreate(shapes[index], blobPin, *blobs[index], force); } } } } } // Clear internal state. Calls before an every reallocation. void reset() { CV_TRACE_FUNCTION(); refCounter.clear(); reuseMap.clear(); memHosts.clear(); umat_memHosts.clear(); preferableTarget = DNN_TARGET_CPU; preferableBackend = DNN_BACKEND_DEFAULT; } void setPreferableTarget(int targetId) { preferableTarget = targetId; } void setPreferableBackend(int backendId) { preferableBackend = backendId; } private: // Register allocated memory. void addHost(const LayerPin& lp, const Mat& mat) { CV_Assert(memHosts.find(lp) == memHosts.end()); reuseMap[lp] = lp; memHosts[lp] = mat; } void addHost(const LayerPin& lp, const UMat& umat) { CV_Assert(umat_memHosts.find(lp) == umat_memHosts.end()); reuseMap[lp] = lp; umat_memHosts[lp] = umat; } std::map refCounter; // Maps pin to origin blob (for whom memory was allocated firstly). // For origin blobs key == value. std::map reuseMap; std::map memHosts; std::map umat_memHosts; int preferableTarget; int preferableBackend; }; static Ptr wrapMat(int backendId, int targetId, const cv::Mat& m) { if (backendId == DNN_BACKEND_DEFAULT) { return Ptr(); } else if (backendId == DNN_BACKEND_HALIDE) { CV_Assert(haveHalide()); #ifdef HAVE_HALIDE return Ptr(new HalideBackendWrapper(targetId, m)); #endif // HAVE_HALIDE } else CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); return Ptr(); } struct Net::Impl { typedef std::map LayersShapesMap; typedef std::map MapIdToLayerData; Impl() { //allocate fake net input layer netInputLayer = Ptr(new DataLayer()); LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second; inpl.id = 0; inpl.name = "_input"; inpl.type = "__NetInputLayer__"; inpl.layerInstance = netInputLayer; layerNameToId.insert(std::make_pair(inpl.name, inpl.id)); lastLayerId = 0; netWasAllocated = false; fusion = true; preferableBackend = DNN_BACKEND_DEFAULT; preferableTarget = DNN_TARGET_CPU; blobManager.setPreferableBackend(DNN_BACKEND_DEFAULT); blobManager.setPreferableTarget(DNN_TARGET_CPU); } Ptr netInputLayer; std::vector netOutputs; std::vector blobsToKeep; MapIdToLayerData layers; std::map layerNameToId; BlobManager blobManager; int preferableBackend; int preferableTarget; String halideConfigFile; // Map host data to backend specific wrapper. std::map > backendWrappers; int lastLayerId; bool netWasAllocated; bool fusion; std::vector layersTimings; Ptr wrap(const Mat& host) { if (preferableBackend == DNN_BACKEND_DEFAULT) return Ptr(); MatShape shape(host.dims); for (int i = 0; i < host.dims; ++i) shape[i] = host.size[i]; void* data = host.data; if (backendWrappers.find(data) != backendWrappers.end()) { Ptr baseBuffer = backendWrappers[data]; if (preferableBackend == DNN_BACKEND_HALIDE) { CV_Assert(haveHalide()); #ifdef HAVE_HALIDE return Ptr(new HalideBackendWrapper(baseBuffer, shape)); #endif // HAVE_HALIDE } else CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); } Ptr wrapper = wrapMat(preferableBackend, preferableTarget, host); backendWrappers[data] = wrapper; return wrapper; } #ifdef HAVE_HALIDE void compileHalide() { CV_TRACE_FUNCTION(); CV_Assert(preferableBackend == DNN_BACKEND_HALIDE); HalideScheduler scheduler(halideConfigFile); std::vector< std::reference_wrapper > compileList; compileList.reserve(64); for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it) { LayerData &ld = it->second; Ptr layer = ld.layerInstance; if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skipFlags[DNN_BACKEND_HALIDE]) { CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty()); bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]); if (!scheduled) { // Use automatic scheduling provided by layer. layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE], ld.inputBlobs, ld.outputBlobs, preferableTarget); } compileList.emplace_back(ld); } } std::atomic progress(0); auto fn = ([&] () -> void { for (;;) { int id = progress.fetch_add(1); if ((size_t)id >= compileList.size()) return; const LayerData& ld = compileList[id].get(); Ptr node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second; dnn::compileHalide(ld.outputBlobs, node, preferableTarget); } }); size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency()); num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads)); std::vector threads(num_threads - 1); for (auto& t: threads) t = std::thread(fn); fn(); // process own tasks for (auto& t: threads) t.join(); } #endif void clear() { CV_TRACE_FUNCTION(); MapIdToLayerData::iterator it; for (it = layers.begin(); it != layers.end(); it++) { if (it->second.id != 0) { it->second.inputBlobs.clear(); it->second.outputBlobs.clear(); it->second.internals.clear(); it->second.umat_inputBlobs.clear(); it->second.umat_outputBlobs.clear(); it->second.umat_internals.clear(); } it->second.skipFlags.clear(); //it->second.consumers.clear(); Ptr currLayer = it->second.layerInstance; if( currLayer.empty() ) continue; currLayer->unsetAttached(); Ptr poolingLayer = currLayer.dynamicCast(); if( !poolingLayer.empty() ) { poolingLayer->computeMaxIdx = true; } } it = layers.find(0); CV_Assert(it != layers.end()); it->second.skipFlags[DNN_BACKEND_DEFAULT] = true; layersTimings.clear(); } void setUpNet(const std::vector& blobsToKeep_ = std::vector()) { CV_TRACE_FUNCTION(); if (!netWasAllocated || this->blobsToKeep != blobsToKeep_) { clear(); allocateLayers(blobsToKeep_); computeNetOutputLayers(); initBackend(); if (!netWasAllocated ) { #ifdef HAVE_HALIDE if (preferableBackend == DNN_BACKEND_HALIDE) compileHalide(); #else CV_Assert(preferableBackend != DNN_BACKEND_HALIDE); #endif } netWasAllocated = true; this->blobsToKeep = blobsToKeep_; } } int getLayerId(const String &layerName) { std::map::iterator it = layerNameToId.find(layerName); return (it != layerNameToId.end()) ? it->second : -1; } int getLayerId(int id) { MapIdToLayerData::iterator it = layers.find(id); return (it != layers.end()) ? id : -1; } int getLayerId(DictValue &layerDesc) { if (layerDesc.isInt()) return getLayerId(layerDesc.get()); else if (layerDesc.isString()) return getLayerId(layerDesc.get()); CV_Assert(layerDesc.isInt() || layerDesc.isString()); return -1; } String getLayerName(int id) { MapIdToLayerData::iterator it = layers.find(id); return (it != layers.end()) ? it->second.name : "(unknown layer)"; } LayerData& getLayerData(int id) { MapIdToLayerData::iterator it = layers.find(id); if (it == layers.end()) CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id)); return it->second; } LayerData& getLayerData(const String &layerName) { int id = getLayerId(layerName); if (id < 0) CV_Error(Error::StsError, "Requsted layer \"" + layerName + "\" not found"); return getLayerData(id); } LayerData& getLayerData(const DictValue &layerDesc) { CV_Assert(layerDesc.isInt() || layerDesc.isString()); if (layerDesc.isInt()) return getLayerData(layerDesc.get()); else /*if (layerDesc.isString())*/ return getLayerData(layerDesc.get()); } static void addLayerInput(LayerData &ld, int inNum, LayerPin from) { if ((int)ld.inputBlobsId.size() <= inNum) { ld.inputBlobsId.resize(inNum + 1); } else { LayerPin storedFrom = ld.inputBlobsId[inNum]; if (storedFrom.valid() && !storedFrom.equal(from)) CV_Error(Error::StsError, "Input #" + toString(inNum) + "of layer \"" + ld.name + "\" already was connected"); } ld.inputBlobsId[inNum] = from; } static void splitPin(const String &pinAlias, String &layerName, String &outName) { size_t delimPos = pinAlias.find('.'); layerName = pinAlias.substr(0, delimPos); outName = (delimPos == String::npos) ? String() : pinAlias.substr(delimPos + 1); } int resolvePinOutputName(LayerData &ld, const String &outName) { if (outName.empty()) return 0; if (std::isdigit(outName[0])) { char *lastChar; long inum = std::strtol(outName.c_str(), &lastChar, 10); if (*lastChar == 0) { CV_Assert(inum == (int)inum); return (int)inum; } } return ld.getLayerInstance()->outputNameToIndex(outName); } LayerPin getPinByAlias(const String &pinAlias) { LayerPin pin; String layerName, outName; splitPin(pinAlias, layerName, outName); pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName); if (pin.lid >= 0) pin.oid = resolvePinOutputName(getLayerData(pin.lid), outName); return pin; } std::vector getLayerOutPins(const String &pinAlias) { String layerName, outName; splitPin(pinAlias, layerName, outName); int lid = (layerName.empty()) ? 0 : getLayerId(layerName); std::vector pins; for (int i = 0; i < layers[lid].outputBlobs.size(); i++) { pins.push_back(LayerPin(lid, i)); } return pins; } void connect(int outLayerId, int outNum, int inLayerId, int inNum) { CV_Assert(outLayerId < inLayerId); LayerData &ldOut = getLayerData(outLayerId); LayerData &ldInp = getLayerData(inLayerId); addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum)); ldOut.requiredOutputs.insert(outNum); ldOut.consumers.push_back(LayerPin(inLayerId, outNum)); } void computeNetOutputLayers() { CV_TRACE_FUNCTION(); netOutputs.clear(); MapIdToLayerData::iterator it; for (it = layers.begin(); it != layers.end(); it++) { int lid = it->first; LayerData &ld = it->second; if (ld.requiredOutputs.size() == 0) netOutputs.push_back(lid); } #ifndef NDEBUG std::cout << "\nNet Outputs(" << netOutputs.size() << "):\n"; for (size_t i = 0; i < netOutputs.size(); i++) std::cout << layers[netOutputs[i]].name << "\n"; #endif } void initBackend() { CV_TRACE_FUNCTION(); if (preferableBackend == DNN_BACKEND_DEFAULT) { CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); return; } // Iterator to current layer. MapIdToLayerData::iterator it = layers.begin(); // Iterator to base layer for fusion. In example, in case of conv+bn+relu // it'll be a conv layer. MapIdToLayerData::iterator baseIt = layers.begin(); for (; it != layers.end(); it++) { LayerData &ldTop = it->second; Ptr layerTop = ldTop.layerInstance; if (!layerTop->supportBackend(preferableBackend)) { // Move base iterator to layer that don't support preferable // backend to prevent fusion over layer of different backend. baseIt = it; continue; } // Try to do layers fusion. LayerData &ldBot = baseIt->second; Ptr layerBot = ldBot.layerInstance; // 1. Check that bottom and top from the same backends. if (it != layers.begin() && layerBot->supportBackend(preferableBackend)) { // 2. Check that current layer works in-place. bool inPlace = ldTop.inputBlobs.size() == 1 && ldBot.outputBlobs.size() == 1 && ldTop.inputBlobs[0]->data == ldBot.outputBlobs[0].data; if (inPlace) { // 3. Try to attach node. CV_Assert(!ldBot.backendNodes[preferableBackend].empty()); Ptr fusedNode = layerTop->tryAttach(ldBot.backendNodes[preferableBackend]); if (!fusedNode.empty()) { ldTop.skipFlags[preferableBackend] = true; ldBot.backendNodes[preferableBackend] = fusedNode; continue; } } } // No layers fusion. ldTop.skipFlags[preferableBackend] = false; if (preferableBackend == DNN_BACKEND_HALIDE) { ldTop.backendNodes[DNN_BACKEND_HALIDE] = layerTop->initHalide(ldTop.inputBlobsWrappers); baseIt = it; } else { CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); } } } void allocateLayer(int lid, const LayersShapesMap& layersShapes) { CV_TRACE_FUNCTION(); LayerData &ld = layers[lid]; //already allocated if (ld.flag) return; size_t ninputs = ld.inputBlobsId.size(); #if 0 printf("layer %s:", ld.name.c_str()); for (size_t i = 0; i < ninputs; i++) { int inp_lid = ld.inputBlobsId[i].lid; LayerData &inp_ld = layers[inp_lid]; int inp_outputs = (int)inp_ld.outputBlobs.size(); std::cout << " " << inp_ld.name << "(" << inp_outputs; for( int j = 0; j < inp_outputs; j++ ) { std::cout << (j == 0 ? ": " : ", ") << inp_ld.outputBlobs[j].size; } std::cout << ")"; } printf("\n"); #endif //determine parent layers for (size_t i = 0; i < ninputs; i++) ld.inputLayersId.insert(ld.inputBlobsId[i].lid); //allocate parents for (set::iterator i = ld.inputLayersId.begin(); i != ld.inputLayersId.end(); i++) allocateLayer(*i, layersShapes); //bind inputs bool use_umat = (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL); ld.inputBlobs.resize(ninputs); if (use_umat) ld.umat_inputBlobs.resize(ninputs); ld.inputBlobsWrappers.resize(ninputs); for (size_t i = 0; i < ninputs; i++) { LayerPin from = ld.inputBlobsId[i]; CV_Assert(from.valid()); CV_DbgAssert(layers.count(from.lid) && (int)layers[from.lid].outputBlobs.size() > from.oid); ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid]; if (use_umat) ld.umat_inputBlobs[i] = layers[from.lid].umat_outputBlobs[from.oid]; ld.inputBlobsWrappers[i] = layers[from.lid].outputBlobsWrappers[from.oid]; } LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid); CV_Assert(layerShapesIt != layersShapes.end()); std::vector pinsForInternalBlobs; bool maximizeReuse = preferableBackend == DNN_BACKEND_HALIDE; blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, maximizeReuse); ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); for (int i = 0; i < ld.outputBlobs.size(); ++i) { ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]); } Ptr layerPtr = ld.getLayerInstance(); { if (use_umat) { std::vector inputs(ld.umat_inputBlobs.size());; std::vector outputs(ld.umat_outputBlobs.size()); Mat mat; for (int i = 0; i < inputs.size(); i++) { mat = ld.umat_inputBlobs[i].getMat(ACCESS_READ); inputs[i] = &mat; } for (int i = 0; i < outputs.size(); i++) { outputs[i] = ld.umat_outputBlobs[i].getMat(ACCESS_READ); } layerPtr->finalize(inputs, outputs); } else { layerPtr->finalize(ld.inputBlobs, ld.outputBlobs); } layerPtr->preferableTarget = preferableTarget; #if 0 std::cout << "\toutputs:"; size_t noutputs = ld.outputBlobs.size(); for (size_t j = 0; j < noutputs; j++) { std::cout << (j == 0 ? " " : ", ") << ld.outputBlobs[j].size; } std::cout << "\n"; #endif } // After allocation of layer, we decrease counters to it's input blobs. blobManager.releaseReferences(ld.inputBlobsId); blobManager.releaseReferences(pinsForInternalBlobs); ld.flag = 1; } #if 0 #define printf_(args) printf args #else #define printf_(args) #endif void fuseLayers(const std::vector& blobsToKeep_) { if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT) return; CV_TRACE_FUNCTION(); // scan through all the layers. If there is convolution layer followed by the activation layer, // we try to embed this activation into the convolution and disable separate execution of the activation std::vector outnames; std::set pinsToKeep(blobsToKeep_.begin(), blobsToKeep_.end()); MapIdToLayerData::iterator it; for (it = layers.begin(); it != layers.end(); it++) { int lid = it->first; LayerData& ld = layers[lid]; if( ld.skipFlags[DNN_BACKEND_DEFAULT] ) { printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str())); continue; } printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str())); if( ld.consumers.size() == 0 ) outnames.push_back(ld.layerInstance->name); // the optimization #1. try to fuse batch norm, scaling and/or activation layers // with the current layer if they follow it. Normally, the are fused with the convolution layer, // but some of them (like activation) may be fused with fully-connected, elemwise (+) and // some other layers. // TODO: OpenCL target support more fusion styles. if ( preferableTarget == DNN_TARGET_OPENCL && (!cv::ocl::useOpenCL() || ld.layerInstance->type.compare("Convolution")) ) continue; Ptr& currLayer = ld.layerInstance; if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 ) { LayerData* nextData = &layers[ld.consumers[0].lid]; Ptr nextBNormLayer = nextData->layerInstance.dynamicCast(); LayerPin lpNext(ld.consumers[0].lid, 0); if( !nextBNormLayer.empty() && pinsToKeep.count(lpNext) == 0 ) { LayerData* bnormData = nextData; nextData = 0; if( currLayer->setBatchNorm(nextBNormLayer) ) { printf_(("\tfused with %s\n", nextBNormLayer->name.c_str())); bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true; if ( preferableTarget == DNN_TARGET_OPENCL ) ld.umat_outputBlobs = layers[lpNext.lid].umat_outputBlobs; else ld.outputBlobs = layers[lpNext.lid].outputBlobs; if( bnormData->consumers.size() == 1 ) { nextData = &layers[bnormData->consumers[0].lid]; lpNext = LayerPin(bnormData->consumers[0].lid, 0); } } } Ptr nextScaleLayer; if( nextData ) nextScaleLayer = nextData->layerInstance.dynamicCast(); if( !nextScaleLayer.empty() && pinsToKeep.count(lpNext) == 0 ) { LayerData* scaleData = nextData; nextData = 0; if( currLayer->setScale(nextScaleLayer) ) { printf_(("\tfused with %s\n", nextScaleLayer->name.c_str())); scaleData->skipFlags[DNN_BACKEND_DEFAULT] = true; if ( preferableTarget == DNN_TARGET_OPENCL ) ld.umat_outputBlobs = layers[lpNext.lid].umat_outputBlobs; else ld.outputBlobs = layers[lpNext.lid].outputBlobs; if( scaleData->consumers.size() == 1 ) { nextData = &layers[scaleData->consumers[0].lid]; lpNext = LayerPin(scaleData->consumers[0].lid, 0); } } } // For now, OpenCL target only support fusion with activation of ReLU/ChannelsPReLU/Power if ( preferableTarget != DNN_TARGET_OPENCL || (preferableTarget == DNN_TARGET_OPENCL && nextData && (!nextData->type.compare("ReLU") || !nextData->type.compare("ChannelsPReLU") || !nextData->type.compare("Power"))) ) { Ptr nextActivLayer; if( nextData ) nextActivLayer = nextData->layerInstance.dynamicCast(); if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 && currLayer->setActivation(nextActivLayer) ) { LayerData *activData = nextData; printf_(("\tfused with %s\n", nextActivLayer->name.c_str())); activData->skipFlags[DNN_BACKEND_DEFAULT] = true; if ( preferableTarget == DNN_TARGET_OPENCL ) ld.umat_outputBlobs = layers[lpNext.lid].umat_outputBlobs; else ld.outputBlobs = layers[lpNext.lid].outputBlobs; if ( preferableTarget == DNN_TARGET_OPENCL ) { nextData = &layers[activData->consumers[0].lid]; lpNext = LayerPin(activData->consumers[0].lid, 0); } } } // fuse convlution layer followed by eltwise + relu if ( preferableTarget == DNN_TARGET_OPENCL ) { Ptr nextEltwiseLayer; if( nextData ) nextEltwiseLayer = nextData->layerInstance.dynamicCast(); if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 ) { LayerData *eltwiseData = nextData; // go down from the second input and find the first non-skipped layer. LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[1].lid]; while (downLayerData->skipFlags[DNN_BACKEND_DEFAULT]) { downLayerData = &layers[downLayerData->inputBlobsId[0].lid]; } // second input layer is current layer. if ( ld.id == downLayerData->id ) { // go down from the first input and find the first non-skipped layer downLayerData = &layers[eltwiseData->inputBlobsId[0].lid]; while (downLayerData->skipFlags[DNN_BACKEND_DEFAULT]) { if ( !downLayerData->type.compare("Eltwise") ) downLayerData = &layers[downLayerData->inputBlobsId[1].lid]; else downLayerData = &layers[downLayerData->inputBlobsId[0].lid]; } Ptr convLayer; if( downLayerData ) convLayer = downLayerData->layerInstance.dynamicCast(); // first input layer is convolution layer if( !convLayer.empty() ) { // fuse eltwise + activation layer LayerData *firstConvLayerData = downLayerData; { nextData = &layers[eltwiseData->consumers[0].lid]; lpNext = LayerPin(eltwiseData->consumers[0].lid, 0); Ptr nextActivLayer; if( nextData ) nextActivLayer = nextData->layerInstance.dynamicCast(); if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 && (!nextData->type.compare("ReLU") || !nextData->type.compare("ChannelsPReLU") || !nextData->type.compare("Power")) && currLayer->setActivation(nextActivLayer) ) { CV_Assert(firstConvLayerData->umat_outputBlobs.size() == 1 && ld.umat_inputBlobs.size() == 1); ld.umat_inputBlobs.push_back(firstConvLayerData->umat_outputBlobs[0]); printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str())); printf_(("\tfused with %s\n", nextActivLayer->name.c_str())); eltwiseData->skipFlags[DNN_BACKEND_DEFAULT] = true; nextData->skipFlags[DNN_BACKEND_DEFAULT] = true; ld.umat_outputBlobs = layers[lpNext.lid].umat_outputBlobs; } } } } } } } // the optimization #2. if there is no layer that takes max pooling layer's computed // max indices (and only some semantical segmentation networks might need this; // many others only take the maximum values), then we switch the max pooling // layer to the faster operating mode. Ptr poolingLayer = ld.layerInstance.dynamicCast(); if( !poolingLayer.empty() && !ld.consumers.empty() ) { size_t i = 0, nconsumers = ld.consumers.size(); for( ; i < nconsumers; i++ ) if( ld.consumers[i].oid > 0 ) break; // if there is no layer that takes the second output pin of the pooling layer // on input then we don't need to compute the indices if( i >= nconsumers ) { poolingLayer->computeMaxIdx = false; printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str())); } } // the optimization #3. if there is concat layer that concatenates channels // from the inputs together (i.e. axis == 1) then we make the inputs of // the concat layer to write to the concatetion output buffer // (and so we eliminate the concatenation layer, because the channels // are concatenated implicitly). Ptr concatLayer = ld.layerInstance.dynamicCast(); if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding && ld.outputBlobs.size() == 1 ) { Mat& output = ld.outputBlobs[0]; // TODO: in general, this optimization can always be done, but // many layers currently check that the input/output blobs are // continuous arrays. Unfortunately, this is not true when // the concatenation optimization is applied with batch_size > 1. // so, for now, we only apply this optimization in the most popular // case batch_size == 1. if( output.dims == 4 && output.size[0] == 1 ) { size_t i, ninputs = ld.inputBlobsId.size(); std::vector realinputs(ninputs); for( i = 0; i < ninputs; i++ ) { LayerPin pin = ld.inputBlobsId[i]; LayerData* inp_i_data = &layers[pin.lid]; while(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT] && inp_i_data->inputBlobsId.size() == 1) { pin = inp_i_data->inputBlobsId[0]; inp_i_data = &layers[pin.lid]; } printf_(("\treal input for %s is %s\n", layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(), inp_i_data->getLayerInstance()->name.c_str())); if(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT] || inp_i_data->consumers.size() != 1) break; realinputs[i] = pin; } if( i >= ninputs ) { Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() }; int ofs = 0; for( i = 0; i < ninputs; i++ ) { LayerPin pin = realinputs[i]; LayerData* inp_i_data = &layers[pin.lid]; int channels_i = ld.inputBlobs[i]->size[1]; chrange[1] = Range(ofs, ofs + channels_i); printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(), pin.oid, ofs, ofs + channels_i)); ofs += channels_i; Mat output_slice = output(chrange); Mat& curr_output = inp_i_data->outputBlobs[pin.oid]; CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size); curr_output = output_slice; pin = ld.inputBlobsId[i]; inp_i_data = &layers[pin.lid]; for (int j = 0; j < inp_i_data->consumers.size(); ++j) { LayerPin consumer = inp_i_data->consumers[j]; layers[consumer.lid].inputBlobs[consumer.oid] = &curr_output; } } ld.skipFlags[DNN_BACKEND_DEFAULT] = true; printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str())); } } } } } void allocateLayers(const std::vector& blobsToKeep_) { CV_TRACE_FUNCTION(); MapIdToLayerData::iterator it; for (it = layers.begin(); it != layers.end(); it++) it->second.flag = 0; CV_Assert(!layers[0].outputBlobs.empty()); ShapesVec inputShapes; for(int i = 0; i < layers[0].outputBlobs.size(); i++) { CV_Assert(layers[0].outputBlobs[i].total()); inputShapes.push_back(shape(layers[0].outputBlobs[i])); } LayersShapesMap layersShapes; getLayersShapes(inputShapes, layersShapes); blobManager.reset(); blobManager.setPreferableTarget(preferableTarget); blobManager.setPreferableBackend(preferableBackend); backendWrappers.clear(); // Fake references to input blobs. for (int i = 0; i < layers[0].outputBlobs.size(); ++i) blobManager.addReference(LayerPin(0, i)); for (it = layers.begin(); it != layers.end(); ++it) { const LayerData& ld = it->second; blobManager.addReferences(ld.inputBlobsId); } for (int i = 0; i < blobsToKeep_.size(); i++) { blobManager.addReference(blobsToKeep_[i]); } for (it = layers.begin(); it != layers.end(); it++) { int lid = it->first; allocateLayer(lid, layersShapes); } layersTimings.resize(lastLayerId + 1, 0); fuseLayers(blobsToKeep_); } void forwardLayer(LayerData &ld) { CV_TRACE_FUNCTION(); Ptr layer = ld.layerInstance; TickMeter tm; tm.start(); if (preferableBackend == DNN_BACKEND_DEFAULT || !layer->supportBackend(preferableBackend)) { if( !ld.skipFlags[DNN_BACKEND_DEFAULT] ) { for (int i = 0, n = ld.inputBlobsWrappers.size(); i < n; ++i) { if (!ld.inputBlobsWrappers[i].empty()) ld.inputBlobsWrappers[i]->copyToHost(); } if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL) layer->forward(ld.umat_inputBlobs, ld.umat_outputBlobs, ld.umat_internals); else layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals); for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i) { if (!ld.outputBlobsWrappers[i].empty()) ld.outputBlobsWrappers[i]->setHostDirty(); } } else tm.reset(); } else if (!ld.skipFlags[preferableBackend]) { Ptr node = ld.backendNodes[preferableBackend]; if (preferableBackend == DNN_BACKEND_HALIDE) { forwardHalide(ld.outputBlobsWrappers, node); } else { CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); } } tm.stop(); layersTimings[ld.id] = tm.getTimeTicks(); ld.flag = 1; } void forwardToLayer(LayerData &ld, bool clearFlags = true) { CV_TRACE_FUNCTION(); if (clearFlags) { MapIdToLayerData::iterator it; for (it = layers.begin(); it != layers.end(); it++) it->second.flag = 0; } //already was forwarded if (ld.flag) return; //forward parents MapIdToLayerData::iterator it; for (it = layers.begin(); it != layers.end() && (it->second.id < ld.id); ++it) { LayerData &ld = it->second; if (ld.flag) continue; forwardLayer(ld); } //forward itself forwardLayer(ld); } void forwardAll() { CV_TRACE_FUNCTION(); MapIdToLayerData::reverse_iterator last_layer = layers.rbegin(); CV_Assert(last_layer != layers.rend()); forwardToLayer(last_layer->second, true); } void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes) { std::vector& inputLayerIds = layers[id].inputBlobsId; if (inOutShapes[id].in.empty()) { for(int i = 0; i < inputLayerIds.size(); i++) { int layerId = inputLayerIds[i].lid; LayersShapesMap::iterator it = inOutShapes.find(layerId); if(it == inOutShapes.end() || it->second.out.empty()) { getLayerShapesRecursively(layerId, inOutShapes); } const MatShape& shape = inOutShapes[layerId].out[inputLayerIds[i].oid]; inOutShapes[id].in.push_back(shape); } } const ShapesVec& is = inOutShapes[id].in; ShapesVec& os = inOutShapes[id].out; ShapesVec& ints = inOutShapes[id].internal; int requiredOutputs = layers[id].requiredOutputs.size(); inOutShapes[id].supportInPlace = layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints); } void getLayersShapes(const ShapesVec& netInputShapes, LayersShapesMap& inOutShapes) { inOutShapes.clear(); inOutShapes[0].in = netInputShapes; //insert shape for first input layer for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); it++) { getLayerShapesRecursively(it->first, inOutShapes); } } void getLayerShapes(const ShapesVec& netInputShapes, const int layerId, LayerShapes& shapes) { LayersShapesMap inOutShapes; inOutShapes[0].in = netInputShapes; //insert shape for first input layer getLayerShapesRecursively(layerId, inOutShapes); shapes = inOutShapes[layerId]; } LayerPin getLatestLayerPin(const std::vector& pins) { return *std::max_element(pins.begin(), pins.end()); } Mat getBlob(const LayerPin& pin) { CV_TRACE_FUNCTION(); if (!pin.valid()) CV_Error(Error::StsObjectNotFound, "Requested blob not found"); LayerData &ld = layers[pin.lid]; if ((size_t)pin.oid >= ld.outputBlobs.size()) { CV_Error(Error::StsOutOfRange, "Layer \"" + ld.name + "\" produce only " + toString(ld.outputBlobs.size()) + " outputs, the #" + toString(pin.oid) + " was requsted"); } if (preferableBackend != DNN_BACKEND_DEFAULT) { // Transfer data to CPU if it's require. ld.outputBlobsWrappers[pin.oid]->copyToHost(); } else { CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); } if (ld.umat_outputBlobs.size() > 0 && !ld.umat_outputBlobs[pin.oid].empty()) ld.umat_outputBlobs[pin.oid].copyTo(ld.outputBlobs[pin.oid]); return ld.outputBlobs[pin.oid]; } void getBlob(UMat& umat, const LayerPin& pin) { CV_TRACE_FUNCTION(); if (!pin.valid()) CV_Error(Error::StsObjectNotFound, "Requested blob not found"); LayerData &ld = layers[pin.lid]; if ((size_t)pin.oid >= ld.outputBlobs.size()) { CV_Error(Error::StsOutOfRange, "Layer \"" + ld.name + "\" produce only " + toString(ld.outputBlobs.size()) + " outputs, the #" + toString(pin.oid) + " was requsted"); } if (ld.umat_outputBlobs.size() > 0 && !ld.umat_outputBlobs[pin.oid].empty()) umat = ld.umat_outputBlobs[pin.oid]; else umat = UMat(); } Mat getBlob(String outputName) { return getBlob(getPinByAlias(outputName)); } void getBlob(UMat& umat, String outputName) { getBlob(umat, getPinByAlias(outputName)); } }; Net::Net() : impl(new Net::Impl) { } Net::~Net() { } int Net::addLayer(const String &name, const String &type, LayerParams ¶ms) { CV_TRACE_FUNCTION(); if (name.find('.') != String::npos) { CV_Error(Error::StsBadArg, "Added layer name \"" + name + "\" must not contain dot symbol"); return -1; } if (impl->getLayerId(name) >= 0) { CV_Error(Error::StsBadArg, "Layer \"" + name + "\" already into net"); return -1; } int id = ++impl->lastLayerId; impl->layerNameToId.insert(std::make_pair(name, id)); impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params))); return id; } int Net::addLayerToPrev(const String &name, const String &type, LayerParams ¶ms) { CV_TRACE_FUNCTION(); int prvLid = impl->lastLayerId; int newLid = this->addLayer(name, type, params); this->connect(prvLid, 0, newLid, 0); return newLid; } void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum) { CV_TRACE_FUNCTION(); impl->connect(outLayerId, outNum, inpLayerId, inpNum); } void Net::connect(String _outPin, String _inPin) { CV_TRACE_FUNCTION(); LayerPin outPin = impl->getPinByAlias(_outPin); LayerPin inpPin = impl->getPinByAlias(_inPin); CV_Assert(outPin.valid() && inpPin.valid()); impl->connect(outPin.lid, outPin.oid, inpPin.lid, inpPin.oid); } Mat Net::forward(const String& outputName) { CV_TRACE_FUNCTION(); String layerName = outputName; if (layerName.empty()) layerName = getLayerNames().back(); impl->setUpNet(); impl->forwardToLayer(impl->getLayerData(layerName)); return impl->getBlob(layerName); } void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName) { CV_TRACE_FUNCTION(); impl->setUpNet(); String layerName = outputName; if (layerName.empty()) layerName = getLayerNames().back(); impl->forwardToLayer(impl->getLayerData(layerName)); LayerPin pin = impl->getPinByAlias(layerName); LayerData &ld = impl->layers[pin.lid]; if (outputBlobs.isUMat()) { if (ld.umat_outputBlobs.size() > 0) { UMat umat; impl->getBlob(umat, layerName); outputBlobs.assign(umat); } } else if (outputBlobs.isMat()) { outputBlobs.assign(impl->getBlob(layerName)); } else if (outputBlobs.isMatVector()) { if (ld.umat_outputBlobs.size() > 0) { for (int i = 0; i < ld.umat_outputBlobs.size(); i++) ld.umat_outputBlobs[i].copyTo(ld.outputBlobs[i]); } std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); outputvec = ld.outputBlobs; } else if (outputBlobs.isUMatVector()) { if (ld.umat_outputBlobs.size() > 0) { std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); outputvec = ld.umat_outputBlobs; } } } void Net::forward(OutputArrayOfArrays outputBlobs, const std::vector& outBlobNames) { CV_TRACE_FUNCTION(); std::vector pins; for (int i = 0; i < outBlobNames.size(); i++) { pins.push_back(impl->getPinByAlias(outBlobNames[i])); } impl->setUpNet(pins); LayerPin out = impl->getLatestLayerPin(pins); impl->forwardToLayer(impl->getLayerData(out.lid)); std::vector matvec; for (int i = 0; i < pins.size(); i++) { matvec.push_back(impl->getBlob(pins[i])); } std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); outputvec = matvec; } void Net::forward(std::vector >& outputBlobs, const std::vector& outBlobNames) { CV_TRACE_FUNCTION(); std::vector pins; for (int i = 0; i < outBlobNames.size(); i++) { std::vector lp = impl->getLayerOutPins(outBlobNames[i]); pins.insert(pins.end(), lp.begin(), lp.end()); } impl->setUpNet(pins); LayerPin out = impl->getLatestLayerPin(pins); impl->forwardToLayer(impl->getLayerData(out.lid)); outputBlobs.resize(outBlobNames.size()); for (int i = 0; i < outBlobNames.size(); i++) { std::vector lp = impl->getLayerOutPins(outBlobNames[i]); for (int i = 0; i < lp.size(); i++) { outputBlobs[i].push_back(impl->getBlob(lp[i])); } } } void Net::setPreferableBackend(int backendId) { CV_TRACE_FUNCTION(); CV_TRACE_ARG(backendId); if( impl->preferableBackend != backendId ) { impl->preferableBackend = backendId; impl->blobManager.setPreferableBackend(backendId); impl->netWasAllocated = false; impl->clear(); } } void Net::setPreferableTarget(int targetId) { CV_TRACE_FUNCTION(); CV_TRACE_ARG(targetId); if( impl->preferableTarget != targetId ) { impl->preferableTarget = targetId; impl->blobManager.setPreferableTarget(targetId); impl->netWasAllocated = false; impl->clear(); } } void Net::setInputsNames(const std::vector &inputBlobNames) { CV_TRACE_FUNCTION(); impl->netInputLayer->setNames(inputBlobNames); } void Net::setInput(InputArray blob, const String& name) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); LayerPin pin; pin.lid = 0; pin.oid = impl->resolvePinOutputName(impl->getLayerData(pin.lid), name); if (!pin.valid()) CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found"); LayerData &ld = impl->layers[pin.lid]; ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) ); bool use_umat = (impl->preferableBackend == DNN_BACKEND_DEFAULT && impl->preferableTarget == DNN_TARGET_OPENCL); if (use_umat) ld.umat_outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) ); ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); MatShape prevShape = shape(ld.outputBlobs[pin.oid]); Mat blob_ = blob.getMat(); bool oldShape = prevShape == shape(blob_); if (oldShape) { blob_.copyTo(ld.outputBlobs[pin.oid]); if (use_umat) blob_.copyTo(ld.umat_outputBlobs[pin.oid]); } else { ld.outputBlobs[pin.oid] = blob_.clone(); if (use_umat) blob_.copyTo(ld.umat_outputBlobs[pin.oid]); } if (!ld.outputBlobsWrappers[pin.oid].empty()) { ld.outputBlobsWrappers[pin.oid]->setHostDirty(); } impl->netWasAllocated = impl->netWasAllocated && oldShape; } Mat Net::getParam(LayerId layer, int numParam) { LayerData &ld = impl->getLayerData(layer); std::vector &layerBlobs = ld.layerInstance->blobs; CV_Assert(numParam < (int)layerBlobs.size()); return layerBlobs[numParam]; } void Net::setParam(LayerId layer, int numParam, const Mat &blob) { LayerData &ld = impl->getLayerData(layer); std::vector &layerBlobs = ld.layerInstance->blobs; CV_Assert(numParam < (int)layerBlobs.size()); //we don't make strong checks, use this function carefully layerBlobs[numParam] = blob; } int Net::getLayerId(const String &layer) { return impl->getLayerId(layer); } void Net::deleteLayer(LayerId) { CV_Error(Error::StsNotImplemented, ""); } Ptr Net::getLayer(LayerId layerId) { LayerData &ld = impl->getLayerData(layerId); return ld.getLayerInstance(); } std::vector > Net::getLayerInputs(LayerId layerId) { LayerData &ld = impl->getLayerData(layerId); if (!ld.layerInstance) CV_Error(Error::StsNullPtr, format("Requested layer \"%s\" was not initialized", ld.name.c_str())); std::vector > inputLayers; inputLayers.reserve(ld.inputLayersId.size()); std::set::iterator it; for (it = ld.inputLayersId.begin(); it != ld.inputLayersId.end(); ++it) { inputLayers.push_back(getLayer(*it)); } return inputLayers; } std::vector Net::getLayerNames() const { std::vector res; res.reserve(impl->layers.size()); Impl::MapIdToLayerData::iterator it; for (it = impl->layers.begin(); it != impl->layers.end(); it++) { if (it->second.id) //skip Data layer res.push_back(it->second.name); } return res; } bool Net::empty() const { return impl->layers.size() <= 1; //first layer is default Data layer } std::vector Net::getUnconnectedOutLayers() const { std::vector layersIds; Impl::MapIdToLayerData::iterator it; for (it = impl->layers.begin(); it != impl->layers.end(); it++) { int lid = it->first; LayerData &ld = it->second; if (ld.requiredOutputs.size() == 0) layersIds.push_back(lid); } return layersIds; } void Net::getLayersShapes(const ShapesVec& netInputShapes, std::vector& layersIds, std::vector& inLayersShapes, std::vector& outLayersShapes) const { layersIds.clear(); inLayersShapes.clear(); outLayersShapes.clear(); Impl::LayersShapesMap inOutShapes; impl->getLayersShapes(netInputShapes, inOutShapes); for(Impl::LayersShapesMap::const_iterator it = inOutShapes.begin(); it != inOutShapes.end(); it++) { layersIds.push_back(it->first); inLayersShapes.push_back(it->second.in); outLayersShapes.push_back(it->second.out); } } void Net::getLayersShapes(const MatShape& netInputShape, std::vector& layerIds, std::vector& inLayersShapes, std::vector& outLayersShapes) const { getLayersShapes(ShapesVec(1, netInputShape), layerIds, inLayersShapes, outLayersShapes); } void Net::getLayerShapes(const MatShape& netInputShape, const int layerId, ShapesVec& inLayerShapes, ShapesVec& outLayerShapes) const { getLayerShapes(ShapesVec(1, netInputShape), layerId, inLayerShapes, outLayerShapes); } void Net::getLayerShapes(const ShapesVec& netInputShapes, const int layerId, ShapesVec& inLayerShapes, ShapesVec& outLayerShapes) const { LayerShapes shapes; impl->getLayerShapes(netInputShapes, layerId, shapes); inLayerShapes = shapes.in; outLayerShapes = shapes.out; } int64 Net::getFLOPS(const std::vector& netInputShapes) const { CV_TRACE_FUNCTION(); int64 flops = 0; std::vector ids; std::vector > inShapes, outShapes; getLayersShapes(netInputShapes, ids, inShapes, outShapes); CV_Assert(inShapes.size() == outShapes.size()); CV_Assert(inShapes.size() == ids.size()); for(int i = 0; i < ids.size(); i++) { flops += impl->layers[ids[i]].getLayerInstance()->getFLOPS(inShapes[i], outShapes[i]); } return flops; } int64 Net::getFLOPS(const MatShape& netInputShape) const { return getFLOPS(std::vector(1, netInputShape)); } int64 Net::getFLOPS(const int layerId, const std::vector& netInputShapes) const { Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId); CV_Assert(layer != impl->layers.end()); LayerShapes shapes; impl->getLayerShapes(netInputShapes, layerId, shapes); return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out); } int64 Net::getFLOPS(const int layerId, const MatShape& netInputShape) const { return getFLOPS(layerId, std::vector(1, netInputShape)); } void Net::getLayerTypes(std::vector& layersTypes) const { layersTypes.clear(); std::map layers; for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++) { if (layers.find(it->second.type) == layers.end()) layers[it->second.type] = 0; layers[it->second.type]++; } for (std::map::iterator it = layers.begin(); it != layers.end(); it++) { layersTypes.push_back(it->first); } } int Net::getLayersCount(const String& layerType) const { int count = 0; for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++) { if (it->second.type == layerType) count++; } return count; } void Net::getMemoryConsumption(const int layerId, const std::vector& netInputShapes, size_t& weights, size_t& blobs) const { CV_TRACE_FUNCTION(); Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId); CV_Assert(layer != impl->layers.end()); weights = blobs = 0; for(int i = 0; i < layer->second.params.blobs.size(); i++) { const Mat& weightsBlob = layer->second.params.blobs[i]; weights += weightsBlob.total()*weightsBlob.elemSize(); } ShapesVec inLayerShapes, outLayerShapes; getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes); for(int i = 0; i < outLayerShapes.size(); i++) { blobs += total(outLayerShapes[i]) * sizeof(float); } } void Net::getMemoryConsumption(const std::vector& netInputShapes, size_t& weights, size_t& blobs) const { CV_TRACE_FUNCTION(); std::vector layerIds; std::vector w, b; getMemoryConsumption(netInputShapes, layerIds, w, b); weights = blobs = 0; for(int i = 0; i < layerIds.size(); i++) { weights += w[i]; blobs += b[i]; } } void Net::getMemoryConsumption(const int layerId, const MatShape& netInputShape, size_t& weights, size_t& blobs) const { getMemoryConsumption(layerId, std::vector(1, netInputShape), weights, blobs); } void Net::getMemoryConsumption(const MatShape& netInputShape, size_t& weights, size_t& blobs) const { getMemoryConsumption(std::vector(1, netInputShape), weights, blobs); } void Net::getMemoryConsumption(const std::vector& netInputShapes, std::vector& layerIds, std::vector& weights, std::vector& blobs) const { CV_TRACE_FUNCTION(); layerIds.clear(); weights.clear(); blobs.clear(); std::vector > inLayerShapes, outLayerShapes; getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes); for(int i = 0; i < layerIds.size(); i++) { int w = 0, b = 0; Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]); CV_Assert(layer != impl->layers.end()); for(int j = 0; j < layer->second.params.blobs.size(); j++) { const Mat& weightsBlob = layer->second.params.blobs[j]; w += weightsBlob.total()*weightsBlob.elemSize(); } for(int j = 0; j < outLayerShapes[i].size(); j++) { b += total(outLayerShapes[i][j]) * sizeof(float); } weights.push_back(w); blobs.push_back(b); } } void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector& layerIds, std::vector& weights, std::vector& blobs) const { getMemoryConsumption(std::vector(1, netInputShape), layerIds, weights, blobs); } void Net::enableFusion(bool fusion) { if( impl->fusion != fusion ) { impl->fusion = fusion; impl->netWasAllocated = false; impl->clear(); } } void Net::setHalideScheduler(const String& scheduler) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(scheduler, "scheduler", scheduler.c_str()); impl->halideConfigFile = scheduler; } int64 Net::getPerfProfile(std::vector& timings) { timings = std::vector(impl->layersTimings.begin() + 1, impl->layersTimings.end()); int64 total = std::accumulate(timings.begin(), timings.end(), 0); return total; } ////////////////////////////////////////////////////////////////////////// Layer::Layer() { preferableTarget = DNN_TARGET_CPU; } Layer::Layer(const LayerParams ¶ms) : blobs(params.blobs), name(params.name), type(params.type) { preferableTarget = DNN_TARGET_CPU; } void Layer::setParamsFrom(const LayerParams ¶ms) { blobs = params.blobs; name = params.name; type = params.type; } int Layer::inputNameToIndex(String) { return -1; } int Layer::outputNameToIndex(String) { return -1; } bool Layer::supportBackend(int backendId) { return backendId == DNN_BACKEND_DEFAULT; } Ptr Layer::initHalide(const std::vector > &) { CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type + " layers is not defined."); return Ptr(); } void Layer::applyHalideScheduler(Ptr& node, const std::vector &inputs, const std::vector &outputs, int targetId) const { #ifdef HAVE_HALIDE CV_TRACE_FUNCTION(); Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile"); Halide::Func& top = node.dynamicCast()->funcs.back(); int outW, outH, outC, outN; getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); if (targetId == DNN_TARGET_CPU) { if (outW == 1 && outH == 1) { if (outC + outN == 1) return; if (outC > 8) top.split(c, co, ci, 8) .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile) .parallel(tile) .vectorize(ci, 8); else top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile) .parallel(tile); } else { if (outH > 2) { top.reorder(x, c, y) .split(y, yo, yi, 2) .fuse(yo, n, tile) .parallel(tile) .unroll(yi) .vectorize(x, outW >= 16 ? 16 : outW); } } } else if (targetId == DNN_TARGET_OPENCL) { int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC; if (outW == 1 && outH == 1) { top.split(c, co, ci, c_split) .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile) .gpu_blocks(tile) .gpu_threads(ci); } else { int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW; int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH; top.split(x, xo, xi, x_split).split(y, yo, yi, y_split) .split(c, co, ci, c_split) .gpu_blocks(xo, yo, co) .gpu_threads(xi, yi) .reorder(xi, yi, ci, xo, yo, co) .vectorize(ci); } } else CV_Error(Error::StsNotImplemented, "Unknown target identifier"); #endif // HAVE_HALIDE } Ptr Layer::tryAttach(const Ptr& node) { return Ptr(); } bool Layer::setActivation(const Ptr&) { return false; } bool Layer::setBatchNorm(const Ptr&) { return false; } bool Layer::setScale(const Ptr&) { return false; } void Layer::unsetAttached() { setActivation(Ptr()); setBatchNorm(Ptr()); setScale(Ptr()); } template static void vecToPVec(const std::vector &v, std::vector &pv) { pv.resize(v.size()); for (size_t i = 0; i < v.size(); i++) pv[i] = const_cast(&v[i]); } void Layer::finalize(const std::vector &inputs, std::vector &outputs) { CV_TRACE_FUNCTION(); std::vector inputsp; vecToPVec(inputs, inputsp); this->finalize(inputsp, outputs); } void Layer::finalize(const std::vector &input, std::vector &output) { (void)input;(void)output; } std::vector Layer::finalize(const std::vector &inputs) { CV_TRACE_FUNCTION(); std::vector outputs; this->finalize(inputs, outputs); return outputs; } void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); std::vector inpvec; std::vector outputs; std::vector internals; inputs_arr.getMatVector(inpvec); outputs_arr.getMatVector(outputs); internals_arr.getMatVector(internals); std::vector inputs(inpvec.size()); for (int i = 0; i < inpvec.size(); i++) inputs[i] = &inpvec[i]; this->forward(inputs, outputs, internals); // sync results back outputs_arr.assign(outputs); internals_arr.assign(internals); } void Layer::run(const std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); std::vector inputsp; vecToPVec(inputs, inputsp); this->finalize(inputsp, outputs); this->forward(inputsp, outputs, internals); } Layer::~Layer() {} bool Layer::getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, std::vector &internals) const { CV_Assert(inputs.size()); outputs.assign(std::max(requiredOutputs, (int)inputs.size()), inputs[0]); return false; } ////////////////////////////////////////////////////////////////////////// static Mutex& getLayerFactoryMutex() { static Mutex* volatile instance = NULL; if (instance == NULL) { cv::AutoLock lock(getInitializationMutex()); if (instance == NULL) instance = new Mutex(); } return *instance; } typedef std::map LayerFactory_Impl; static LayerFactory_Impl& getLayerFactoryImpl_() { static LayerFactory_Impl impl; return impl; } static LayerFactory_Impl& getLayerFactoryImpl() { static LayerFactory_Impl* volatile instance = NULL; if (instance == NULL) { cv::AutoLock lock(getLayerFactoryMutex()); if (instance == NULL) { instance = &getLayerFactoryImpl_(); initializeLayerFactory(); } } return *instance; } void LayerFactory::registerLayer(const String &type, Constuctor constructor) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(type, "type", type.c_str()); cv::AutoLock lock(getLayerFactoryMutex()); String type_ = type.toLowerCase(); LayerFactory_Impl::const_iterator it = getLayerFactoryImpl().find(type_); if (it != getLayerFactoryImpl().end() && it->second != constructor) { CV_Error(cv::Error::StsBadArg, "Layer \"" + type_ + "\" already was registered"); } getLayerFactoryImpl().insert(std::make_pair(type_, constructor)); } void LayerFactory::unregisterLayer(const String &type) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(type, "type", type.c_str()); cv::AutoLock lock(getLayerFactoryMutex()); String type_ = type.toLowerCase(); getLayerFactoryImpl().erase(type_); } Ptr LayerFactory::createLayerInstance(const String &type, LayerParams& params) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(type, "type", type.c_str()); cv::AutoLock lock(getLayerFactoryMutex()); String type_ = type.toLowerCase(); LayerFactory_Impl::const_iterator it = getLayerFactoryImpl().find(type_); if (it != getLayerFactoryImpl().end()) { return it->second(params); } else { return Ptr(); //NULL } } BackendNode::BackendNode(int backendId) : backendId(backendId) {} BackendNode::~BackendNode() {}; BackendWrapper::BackendWrapper(int backendId, int targetId) : backendId(backendId), targetId(targetId) {} BackendWrapper::BackendWrapper(int targetId, const cv::Mat& m) { CV_Error(Error::StsNotImplemented, "Constructor of backend wrapper must be implemented"); } BackendWrapper::BackendWrapper(const Ptr& base, const MatShape& shape) { CV_Error(Error::StsNotImplemented, "Constructor of backend wrapper must be implemented"); } BackendWrapper::~BackendWrapper() {} CV__DNN_EXPERIMENTAL_NS_END }} // namespace