Mirror of https://github.com/opencv/opencv.git (synced 2025-06-12 20:42:53 +08:00)

Merge pull request #9090 from vpisarev:dnn_optim_scale_concat

Commit 86e8a105a5
@@ -152,6 +152,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
 
     class CV_EXPORTS ActivationLayer;
     class CV_EXPORTS BatchNormLayer;
+    class CV_EXPORTS ScaleLayer;
 
     /** @brief This interface class allows to build new Layers - are building blocks of networks.
      *
@@ -269,6 +270,19 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
          */
         virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
 
+        /**
+         * @brief Tries to attach to the layer the subsequent scaling layer, i.e. do the layer fusion in a partial case.
+         * @param[in] layer The subsequent scaling layer.
+         *
+         * Returns true if the scaling layer has been attached successfully.
+         */
+        virtual bool setScale(const Ptr<ScaleLayer>& layer);
+
+        /**
+         * @brief "Deattaches" all the layers, attached to particular layer.
+         */
+        virtual void unsetAttached();
+
         virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
                                      const int requiredOutputs,
                                      std::vector<MatShape> &outputs,
@@ -495,9 +509,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
 
         /** @overload */
         CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
                                     const int layerId,
                                     std::vector<MatShape>* inLayerShapes,
                                     std::vector<MatShape>* outLayerShapes) const;
 
         /** @brief Computes FLOP for whole loaded model with specified input shapes.
          * @param netInputShapes vector of shapes for all net inputs.
          * @returns computed FLOP.
@@ -507,10 +522,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
         CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
         /** @overload */
         CV_WRAP int64 getFLOPS(const int layerId,
                                const std::vector<MatShape>& netInputShapes) const;
         /** @overload */
         CV_WRAP int64 getFLOPS(const int layerId,
                                const MatShape& netInputShape) const;
 
         /** @brief Returns list of types for layer used in model.
          * @param layersTypes output parameter for returning types.
@@ -557,8 +572,13 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
         CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
                                           CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
                                           CV_OUT std::vector<size_t>& blobs) const;
-    private:
 
+        /** @brief Enables or disables layer fusion in the network.
+         * @param fusion true to enable the fusion, false to disable. The fusion is enabled by default.
+         */
+        CV_WRAP void enableFusion(bool fusion);
+
+    private:
         struct Impl;
         Ptr<Impl> impl;
     };
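The header changes above expose the fusion machinery to users through Net::enableFusion and to layer implementers through Layer::setScale / Layer::unsetAttached. Below is a minimal usage sketch of the public side; it is illustrative only and the model file names are hypothetical, not taken from the patch.

#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <cstdio>
using namespace cv;
using namespace cv::dnn;

int main()
{
    // Hypothetical file names -- any Caffe classification model works the same way.
    Net net = readNetFromCaffe("squeezenet.prototxt", "squeezenet.caffemodel");

    // Layer fusion (conv + batch norm/scale/activation, concat elimination)
    // is enabled by default; turn it off to compare outputs or timings.
    net.enableFusion(false);

    int sz[] = {1, 3, 227, 227};
    Mat blob(4, sz, CV_32F, Scalar::all(0));   // dummy NCHW input blob
    net.setInput(blob);
    Mat out = net.forward();
    printf("output has %d dims\n", out.dims);
    return 0;
}

Fusion stays on by default, so enableFusion(false) is mainly useful for debugging per-layer outputs or for measuring how much the fusion helps.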
@@ -464,29 +464,34 @@ public:
         }
     }
 
-    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst)
-    {
-        std::map<LayerPin, Mat>::iterator hostIt;
-        std::map<LayerPin, int>::iterator refIt;
-
-        const int targetTotal = total(shape);
-        Mat bestBlob;
-        int bestBlobTotal = INT_MAX;
-        LayerPin bestBlobPin;
-        for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
-        {
-            refIt = refCounter.find(hostIt->first);
-            // Use only blobs that had references before because if not,
-            // it might be used as output.
-            if (refIt != refCounter.end() && refIt->second == 0)
-            {
-                Mat& unusedBlob = hostIt->second;
-                if (unusedBlob.total() >= targetTotal &&
-                    unusedBlob.total() < bestBlobTotal)
-                {
-                    bestBlobPin = hostIt->first;
-                    bestBlob = unusedBlob;
-                    bestBlobTotal = unusedBlob.total();
-                }
-            }
-        }
+    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool force)
+    {
+        Mat bestBlob;
+        LayerPin bestBlobPin;
+
+        if( !force )
+        {
+            std::map<LayerPin, Mat>::iterator hostIt;
+            std::map<LayerPin, int>::iterator refIt;
+
+            const int targetTotal = total(shape);
+            int bestBlobTotal = INT_MAX;
+
+            for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
+            {
+                refIt = refCounter.find(hostIt->first);
+                // Use only blobs that had references before because if not,
+                // it might be used as output.
+                if (refIt != refCounter.end() && refIt->second == 0)
+                {
+                    Mat& unusedBlob = hostIt->second;
+                    if (unusedBlob.total() >= targetTotal &&
+                        unusedBlob.total() < bestBlobTotal)
+                    {
+                        bestBlobPin = hostIt->first;
+                        bestBlob = unusedBlob;
+                        bestBlobTotal = unusedBlob.total();
+                    }
+                }
+            }
+        }
@@ -505,7 +510,8 @@ public:
     }
 
     void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
-                               std::vector<LayerPin>& pinsForInternalBlobs)
+                               std::vector<LayerPin>& pinsForInternalBlobs,
+                               bool maximizeReuse)
     {
         CV_TRACE_FUNCTION();
 
@@ -561,6 +567,7 @@ public:
         }
 
         std::map<int, std::vector<int> >::reverse_iterator it;
+        bool force = !maximizeReuse && ld.inputBlobsId.size() > 1;
         for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
         {
             for(int j = 0; j < it->second.size(); j++)
@@ -569,7 +576,7 @@ public:
                 if (total(shapes[index]))
                 {
                     LayerPin blobPin(ld.id, index);
-                    if (index < outShapes.size() && inPlace)
+                    if (index < outShapes.size() && inPlace && !force)
                     {
                         CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
                         ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
@@ -577,7 +584,7 @@ public:
                     }
                     else
                     {
-                        reuseOrCreate(shapes[index], blobPin, *blobs[index]);
+                        reuseOrCreate(shapes[index], blobPin, *blobs[index], force);
                     }
                 }
             }
@@ -628,6 +635,7 @@ struct Net::Impl
 
         lastLayerId = 1;
         netWasAllocated = false;
+        fusion = true;
         preferableBackend = DNN_BACKEND_DEFAULT;
         preferableTarget = DNN_TARGET_CPU;
     }
@@ -647,6 +655,7 @@ struct Net::Impl
     int lastLayerId;
 
     bool netWasAllocated;
+    bool fusion;
 
     void compileHalide()
     {
@@ -695,8 +704,7 @@ struct Net::Impl
             if( currLayer.empty() )
                 continue;
 
-            currLayer->setActivation(Ptr<ActivationLayer>());
-            currLayer->setBatchNorm(Ptr<BatchNormLayer>());
+            currLayer->unsetAttached();
 
             Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
             if( !poolingLayer.empty() )
@@ -704,9 +712,11 @@ struct Net::Impl
                 poolingLayer->computeMaxIdx = true;
            }
         }
+        it = layers.find(0);
+        CV_Assert(it != layers.end());
+        it->second.skipFlags[DNN_BACKEND_DEFAULT] = true;
     }
 
-
     void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
     {
         CV_TRACE_FUNCTION();
@@ -783,13 +793,11 @@ struct Net::Impl
 
     LayerData& getLayerData(const DictValue &layerDesc)
     {
+        CV_Assert(layerDesc.isInt() || layerDesc.isString());
         if (layerDesc.isInt())
             return getLayerData(layerDesc.get<int>());
-        else if (layerDesc.isString())
+        else /*if (layerDesc.isString())*/
             return getLayerData(layerDesc.get<String>());
-
-        CV_Assert(layerDesc.isInt() || layerDesc.isString());
-        return *((LayerData*)NULL);
     }
 
     static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
@@ -1021,7 +1029,8 @@ struct Net::Impl
         CV_Assert(layerShapesIt != layersShapes.end());
 
         std::vector<LayerPin> pinsForInternalBlobs;
-        blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
+        bool maximizeReuse = preferableBackend == DNN_BACKEND_HALIDE;
+        blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, maximizeReuse);
 
         Ptr<Layer> layerPtr = ld.getLayerInstance();
         {
@@ -1044,8 +1053,17 @@ struct Net::Impl
         ld.flag = 1;
     }
 
+#if 0
+#define printf_(args) printf args
+#else
+#define printf_(args)
+#endif
+
     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
     {
+        if( !fusion || preferableBackend == DNN_BACKEND_HALIDE )
+            return;
+
         CV_TRACE_FUNCTION();
 
         // scan through all the layers. If there is convolution layer followed by the activation layer,
@@ -1060,11 +1078,17 @@ struct Net::Impl
             LayerData& ld = layers[lid];
             if( ld.skipFlags[DNN_BACKEND_DEFAULT] )
             {
+                printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
                 continue;
             }
+            printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
             if( ld.consumers.size() == 0 )
                 outnames.push_back(ld.layerInstance->name);
 
+            // the optimization #1. try to fuse batch norm, scaling and/or activation layers
+            // with the current layer if they follow it. Normally, the are fused with the convolution layer,
+            // but some of them (like activation) may be fused with fully-connected, elemwise (+) and
+            // some other layers.
             Ptr<Layer>& currLayer = ld.layerInstance;
             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
             {
@@ -1078,10 +1102,29 @@ struct Net::Impl
                     nextData = 0;
                     if( currLayer->setBatchNorm(nextBNormLayer) )
                     {
+                        printf_(("\tfused with %s\n", nextBNormLayer->name.c_str()));
                         bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true;
                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
                         if( bnormData->consumers.size() == 1 )
                             nextData = &layers[bnormData->consumers[0].lid];
+                        lpNext = LayerPin(bnormData->consumers[0].lid, 0);
+                    }
+                }
+
+                Ptr<ScaleLayer> nextScaleLayer;
+                if( nextData )
+                    nextScaleLayer = nextData->layerInstance.dynamicCast<ScaleLayer>();
+                if( !nextScaleLayer.empty() && pinsToKeep.count(lpNext) == 0 )
+                {
+                    LayerData* scaleData = nextData;
+                    nextData = 0;
+                    if( currLayer->setScale(nextScaleLayer) )
+                    {
+                        printf_(("\tfused with %s\n", nextScaleLayer->name.c_str()));
+                        scaleData->skipFlags[DNN_BACKEND_DEFAULT] = true;
+                        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+                        if( scaleData->consumers.size() == 1 )
+                            nextData = &layers[scaleData->consumers[0].lid];
                     }
                 }
 
@@ -1091,11 +1134,16 @@ struct Net::Impl
 
                 if( !nextActivLayer.empty() && currLayer->setActivation(nextActivLayer) )
                 {
-                    //printf("successfully merged %s and %s\n", currLayer->name.c_str(), nextActivLayer->name.c_str());
+                    printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
                     nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
                     ld.outputBlobs = layers[lpNext.lid].outputBlobs;
                 }
             }
+
+            // the optimization #2. if there is no layer that takes max pooling layer's computed
+            // max indices (and only some semantical segmentation networks might need this;
+            // many others only take the maximum values), then we switch the max pooling
+            // layer to the faster operating mode.
             Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
             if( !poolingLayer.empty() && !ld.consumers.empty() )
             {
@@ -1108,7 +1156,71 @@ struct Net::Impl
                 if( i >= nconsumers )
                 {
                     poolingLayer->computeMaxIdx = false;
-                    //printf("simplified pooling layer %s\n", poolingLayer->name.c_str());
+                    printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str()));
+                }
+            }
+
+            // the optimization #3. if there is concat layer that concatenates channels
+            // from the inputs together (i.e. axis == 1) then we make the inputs of
+            // the concat layer to write to the concatetion output buffer
+            // (and so we eliminate the concatenation layer, because the channels
+            // are concatenated implicitly).
+            Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
+            if( !concatLayer.empty() && concatLayer->axis == 1 &&
+                ld.outputBlobs.size() == 1 )
+            {
+                Mat& output = ld.outputBlobs[0];
+
+                // TODO: in general, this optimization can always be done, but
+                // many layers currently check that the input/output blobs are
+                // continuous arrays. Unfortunately, this is not true when
+                // the concatenation optimization is applied with batch_size > 1.
+                // so, for now, we only apply this optimization in the most popular
+                // case batch_size == 1.
+                if( output.dims == 4 && output.size[0] == 1 )
+                {
+                    size_t i, ninputs = ld.inputBlobsId.size();
+                    std::vector<LayerPin> realinputs(ninputs);
+                    for( i = 0; i < ninputs; i++ )
+                    {
+                        LayerPin pin = ld.inputBlobsId[i];
+                        LayerData* inp_i_data = &layers[pin.lid];
+                        while(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT] &&
+                              inp_i_data->inputBlobsId.size() == 1)
+                        {
+                            pin = inp_i_data->inputBlobsId[0];
+                            inp_i_data = &layers[pin.lid];
+                        }
+                        printf_(("\treal input for %s is %s\n",
+                               layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
+                               inp_i_data->getLayerInstance()->name.c_str()));
+
+                        if(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT])
+                            break;
+                        realinputs[i] = pin;
+                    }
+
+                    if( i >= ninputs )
+                    {
+                        Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
+                        int ofs = 0;
+                        for( i = 0; i < ninputs; i++ )
+                        {
+                            LayerPin pin = realinputs[i];
+                            LayerData* inp_i_data = &layers[pin.lid];
+                            int channels_i = ld.inputBlobs[i]->size[1];
+                            chrange[1] = Range(ofs, ofs + channels_i);
+                            printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
+                                   pin.oid, ofs, ofs + channels_i));
+                            ofs += channels_i;
+                            Mat output_slice = output(chrange);
+                            Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
+                            CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
+                            curr_output = output_slice;
+                        }
+                        ld.skipFlags[DNN_BACKEND_DEFAULT] = true;
+                        printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
+                    }
                 }
             }
         }
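The concat elimination above works because, when the batch size is 1, a channel range of a 4-D output blob is a single continuous block of memory, so each producer layer can be redirected to write straight into its slice of the concat output. Below is a standalone sketch of that property; it is not taken from the patch and the shapes are arbitrary.

#include <opencv2/core.hpp>
#include <cstdio>
using namespace cv;

int main()
{
    // One image (batch == 1), 6 channels of 2x2 floats.
    int sz[] = {1, 6, 2, 2};
    Mat output(4, sz, CV_32F, Scalar::all(0));

    // View of channels [0, 3) -- the same kind of range chrange[1] selects above.
    Range chrange[] = { Range::all(), Range(0, 3), Range::all(), Range::all() };
    Mat slice = output(chrange);

    // With batch size 1 the slice is continuous, so a layer can treat it
    // as its own output buffer; with batch size > 1 it would not be.
    CV_Assert(slice.isContinuous());

    slice.setTo(1.0f);                            // the "producer" writes its result
    printf("%f\n", output.ptr<float>(0, 1)[0]);   // the parent blob sees the data: 1.0
    return 0;
}

With a batch size greater than 1 the slice would no longer be continuous, which is exactly why the optimization is restricted to output.size[0] == 1 above.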
@@ -1458,9 +1570,12 @@ void Net::setPreferableBackend(int backendId)
     CV_TRACE_FUNCTION();
     CV_TRACE_ARG(backendId);
 
-    impl->netWasAllocated = impl->netWasAllocated &&
-                            impl->preferableBackend == backendId;
-    impl->preferableBackend = backendId;
+    if( impl->preferableBackend != backendId )
+    {
+        impl->preferableBackend = backendId;
+        impl->netWasAllocated = false;
+        impl->clear();
+    }
 }
 
 void Net::setPreferableTarget(int targetId)
@@ -1468,9 +1583,12 @@ void Net::setPreferableTarget(int targetId)
     CV_TRACE_FUNCTION();
     CV_TRACE_ARG(targetId);
 
-    impl->netWasAllocated = impl->netWasAllocated &&
-                            impl->preferableTarget == targetId;
-    impl->preferableTarget = targetId;
+    if( impl->preferableTarget != targetId )
+    {
+        impl->preferableTarget = targetId;
+        impl->netWasAllocated = false;
+        impl->clear();
+    }
 }
 
 void Net::setInputsNames(const std::vector<String> &inputBlobNames)
@@ -1825,6 +1943,16 @@ void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>&
                          weights, blobs);
 }
 
+void Net::enableFusion(bool fusion)
+{
+    if( impl->fusion != fusion )
+    {
+        impl->fusion = fusion;
+        impl->netWasAllocated = false;
+        impl->clear();
+    }
+}
+
 void Net::setHalideScheduler(const String& scheduler)
 {
     CV_TRACE_FUNCTION();
@@ -1950,6 +2078,13 @@ Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
 
 bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
 bool Layer::setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }
+bool Layer::setScale(const Ptr<ScaleLayer>&) { return false; }
+void Layer::unsetAttached()
+{
+    setActivation(Ptr<ActivationLayer>());
+    setBatchNorm(Ptr<BatchNormLayer>());
+    setScale(Ptr<ScaleLayer>());
+}
 
 template <typename T>
 static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
@@ -94,6 +94,78 @@ public:
                backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1; // By channels
     }
 
+    class ChannelConcatInvoker : public ParallelLoopBody
+    {
+    public:
+        std::vector<Mat*>* inputs;
+        Mat* output;
+        int nstripes;
+        std::vector<const float*> chptrs;
+
+        static void run(std::vector<Mat*>& inputs, Mat& output, int nstripes)
+        {
+            ChannelConcatInvoker cc;
+            cc.inputs = &inputs;
+            cc.output = &output;
+            cc.nstripes = nstripes;
+
+            size_t i, ninputs = inputs.size();
+            int nchannels = 0, batchsz = output.size[0];
+            for( i = 0; i < ninputs; i++ )
+            {
+                Mat& inp = *inputs[i];
+                CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
+                           inp.dims == 4 && inp.size[0] == output.size[0] &&
+                           inp.size[2] == output.size[2] &&
+                           inp.size[3] == output.size[3] );
+                nchannels += inp.size[1];
+            }
+            CV_Assert( nchannels == output.size[1] );
+            CV_Assert( output.isContinuous() && output.type() == CV_32F );
+
+            cc.chptrs.resize(nchannels*batchsz);
+
+            int ofs = 0;
+            for( i = 0; i < ninputs; i++)
+            {
+                Mat& inp = *inputs[i];
+                for( int j = 0; j < batchsz; j++ )
+                    for( int k = 0; k < inp.size[1]; k++ )
+                    {
+                        const float* ptr = inp.ptr<float>(j, k);
+                        cc.chptrs[ofs + j*nchannels + k] = ptr;
+                    }
+                ofs += inp.size[1];
+            }
+
+            parallel_for_(Range(0, nstripes), cc, nstripes);
+        }
+
+        ChannelConcatInvoker() {}
+
+        void operator()(const Range& r) const
+        {
+            size_t planeSize = (size_t)output->size[2]*output->size[3];
+            size_t nch = chptrs.size();
+            size_t total = nch*planeSize;
+            size_t stripeSize = (total + nstripes - 1)/nstripes;
+            size_t stripeStart = r.start*stripeSize;
+            size_t stripeEnd = std::min(total, r.end*stripeSize);
+            const float** ptrs = (const float**)&chptrs[0];
+            float* outptr = output->ptr<float>();
+            size_t blockSize0 = 1 << 16;
+
+            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
+            {
+                size_t ch = ofs0/planeSize;
+                size_t ofs = ofs0 - ch*planeSize;
+                size_t blockSize = std::min(blockSize0, planeSize - ofs);
+                memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
+                ofs0 += blockSize;
+            }
+        }
+    };
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
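ChannelConcatInvoker follows the usual OpenCV ParallelLoopBody pattern: the Range handed to parallel_for_ counts stripes, and each invocation maps its stripe back to a span of elements. A reduced standalone sketch of the same pattern follows; it is illustrative only (it merely fills a buffer) and is not part of the patch.

#include <opencv2/core.hpp>
#include <opencv2/core/utility.hpp>
#include <algorithm>
#include <cstdio>
#include <vector>
using namespace cv;

// Fills a buffer in parallel; each call handles one stripe of the range.
class FillInvoker : public ParallelLoopBody
{
public:
    FillInvoker(std::vector<float>& buf, int nstripes) : buf_(&buf), nstripes_(nstripes) {}

    void operator()(const Range& r) const
    {
        size_t total = buf_->size();
        size_t stripeSize = (total + nstripes_ - 1) / nstripes_;
        size_t start = r.start * stripeSize;
        size_t end = std::min(total, (size_t)r.end * stripeSize);
        for (size_t i = start; i < end; i++)
            (*buf_)[i] = (float)i;
    }

private:
    std::vector<float>* buf_;
    int nstripes_;
};

int main()
{
    std::vector<float> buf(1 << 20);
    int nstripes = getNumThreads();
    parallel_for_(Range(0, nstripes), FillInvoker(buf, nstripes), nstripes);
    printf("%f %f\n", buf.front(), buf.back());
    return 0;
}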
@@ -101,14 +173,23 @@ public:
 
         int cAxis = clamp(axis, inputs[0]->dims);
         Mat& outMat = outputs[0];
-        std::vector<Range> ranges(outputs[0].dims, Range::all());
 
-        ranges[cAxis].start = 0;
-        for (size_t i = 0; i < inputs.size(); i++)
+        if( cAxis == 1 && outMat.dims == 4 )
         {
-            ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
-            inputs[i]->copyTo(outMat(&ranges[0]));
-            ranges[cAxis].start = ranges[cAxis].end;
+            int nstripes = getNumThreads();
+            ChannelConcatInvoker::run(inputs, outMat, nstripes);
+        }
+        else
+        {
+            std::vector<Range> ranges(outputs[0].dims, Range::all());
+
+            ranges[cAxis].start = 0;
+            for (size_t i = 0; i < inputs.size(); i++)
+            {
+                ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
+                inputs[i]->copyTo(outMat(&ranges[0]));
+                ranges[cAxis].start = ranges[cAxis].end;
+            }
         }
     }
 
@@ -148,6 +148,7 @@ public:
     std::vector<float> reluslope;
     Ptr<ActivationLayer> activ;
    Ptr<BatchNormLayer> bnorm;
+    Ptr<ScaleLayer> scaleLayer;
 
     MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
     {
@@ -202,6 +203,9 @@ public:
 
     bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
     {
+        // for now the scale layer followed by the batch norm cannot be fused, only vice versa.
+        if( !scaleLayer.empty() )
+            return false;
         bnorm = layer;
         // we will need to re-compute the weights with the batch
         // norm coefficients taken into account
@@ -209,6 +213,15 @@ public:
         return !bnorm.empty();
     }
 
+    bool setScale(const Ptr<ScaleLayer>& layer)
+    {
+        scaleLayer = layer;
+        // we will need to re-compute the weights with the scaling
+        // coefficients taken into account
+        weightsMat.release();
+        return !scaleLayer.empty();
+    }
+
     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
     {
 #ifdef HAVE_HALIDE
@@ -678,32 +691,56 @@ public:
                 biasvec[k] = biasMat.at<float>(k);
         }
 
-        if( !bnorm.empty() )
+        if( !bnorm.empty() || !scaleLayer.empty() )
         {
-            Mat scale, shift;
-            bnorm->getScaleShift(scale, shift);
+            Mat scale, shift, scale2, shift2;
+            const float *scaleptr = 0, *shiftptr = 0;
+            const float *scaleptr2 = 0, *shiftptr2 = 0;
 
-            CV_Assert( scale.isContinuous() && shift.isContinuous() &&
-                       scale.type() == CV_32F && shift.type() == CV_32F &&
-                       scale.total() == (size_t)outCn &&
-                       shift.total() == (size_t)outCn );
+            if( !bnorm.empty() )
+            {
+                bnorm->getScaleShift(scale, shift);
+                CV_Assert( scale.isContinuous() && shift.isContinuous() &&
+                           scale.type() == CV_32F && shift.type() == CV_32F &&
+                           scale.total() == (size_t)outCn &&
+                           shift.total() == (size_t)outCn );
+                scaleptr = scale.ptr<float>();
+                shiftptr = shift.ptr<float>();
+            }
+            if( !scaleLayer.empty() )
+            {
+                scale2 = scaleLayer->blobs[0];
+                CV_Assert( scale2.isContinuous() && scale2.type() == CV_32F &&
+                           scale2.total() == (size_t)outCn );
+                scaleptr2 = scale2.ptr<float>();
+                if( scaleLayer->hasBias )
+                {
+                    shift2 = scaleLayer->blobs[1];
+                    CV_Assert( shift2.isContinuous() && shift2.type() == CV_32F &&
+                               shift2.total() == (size_t)outCn );
+                    shiftptr2 = shift2.ptr<float>();
+                }
+            }
 
             for( int i = 0; i < outCn; i++ )
             {
-                float s = scale.at<float>(i);
-                float delta = shift.at<float>(i);
+                float s1 = scaleptr ? scaleptr[i] : 1.f;
+                float delta1 = shiftptr ? shiftptr[i] : 0.f;
+                float s2 = scaleptr2 ? scaleptr2[i] : 1.f;
+                float delta2 = shiftptr2 ? shiftptr2[i] : 0.f;
                 float* w_i = weightsMat.ptr<float>(i);
                 int j, wcols = weightsMat.cols;
 
                 for( j = 0; j < wcols; j++ )
-                    w_i[j] *= s;
+                    w_i[j] *= (s1*s2);
 
-                biasvec[i] = biasvec[i]*s + delta;
+                biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
             }
         }
         biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
     }
 
+    reluslope.clear();
     if( activ )
     {
         Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
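The per-channel loop above folds the batch-norm affine transform (s1, delta1) and the scale layer (s2, delta2) into the convolution itself: ((x*w + b)*s1 + delta1)*s2 + delta2 equals x*(w*s1*s2) + (b*s1*s2 + delta1*s2 + delta2), which is exactly the weight and bias update applied to weightsMat and biasvec. A scalar sanity check of that identity follows; it is a standalone sketch, not part of the patch.

#include <cassert>
#include <cmath>
#include <cstdio>

int main()
{
    // One output channel, one weight, one input value -- enough to check the algebra.
    float x = 0.7f, w = 1.3f, b = 0.25f;     // convolution weight and bias
    float s1 = 1.9f, delta1 = -0.4f;         // batch-norm scale/shift for this channel
    float s2 = 0.6f, delta2 = 0.1f;          // scale-layer scale/shift for this channel

    // Unfused: convolution, then batch norm, then scale.
    float unfused = ((x*w + b)*s1 + delta1)*s2 + delta2;

    // Fused, as in the convolution code above: w' = w*(s1*s2), b' = b*(s1*s2) + (delta1*s2 + delta2).
    float fused = x*(w*(s1*s2)) + (b*(s1*s2) + (delta1*s2 + delta2));

    assert(std::fabs(unfused - fused) < 1e-6f);
    printf("unfused=%f fused=%f\n", unfused, fused);
    return 0;
}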
@@ -517,7 +517,8 @@ TEST_P(Concat, Accuracy)
 
     Net net;
 
-    std::vector<int> convLayerIds(numChannels.channels);
+    std::vector<int> convLayerIds;
+    convLayerIds.reserve(numChannels.channels);
     for (int i = 0, n = numChannels.channels; i < n; ++i)
     {
         if (!numChannels[i])
@@ -537,8 +538,9 @@ TEST_P(Concat, Accuracy)
         convParam.name = ss.str();
         convParam.blobs.push_back(weights);
 
-        convLayerIds[i] = net.addLayer(convParam.name, convParam.type, convParam);
-        net.connect(0, 0, convLayerIds[i], 0);
+        int layerId = net.addLayer(convParam.name, convParam.type, convParam);
+        convLayerIds.push_back(layerId);
+        net.connect(0, 0, layerId, 0);
     }
 
     LayerParams concatParam;