Mirror of https://github.com/opencv/opencv.git (synced 2025-01-18 06:03:15 +08:00)
Merge pull request #17939 from YashasSamaga:cuda4dnn-fix-eltwise-fusion
* fix eltwise fusion segfault, more eltwise fusions, fix power fusion
* add assertion
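
A minimal sketch of what the fused POWER activation computes once the scale and shift reach the kernels (the standalone function below is purely illustrative, not the actual cuda4dnn kernel code). Previously the CUDA path launched the power kernels with scale and shift hard-coded to 1.0 and 0.0.

    #include <cmath>

    // OpenCV's PowerLayer computes f(x) = (shift + scale * x)^power.
    float fused_power_activation(float conv_output, float power_scale,
                                 float power_shift, float power_exp)
    {
        return std::pow(power_scale * conv_output + power_shift, power_exp);
    }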
parent e421233a1d
commit f53f491cd2
@@ -68,7 +68,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
             IDENTITY,
             RELU, /* uses value provided in `relu_negative_slope` */
             CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
-            POWER, /* scale and shift fused with weights and bias; only `power_exp` is handled here */
+            POWER,
             TANH,
             SIGMOID,
             SWISH,
@@ -76,7 +76,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         };

         ActivationType activation_type;
-        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+        float relu_negative_slope, crelu_floor, crelu_ceil;
+        float power_exp, power_scale, power_shift;
     };

     template <class T>
@@ -224,10 +225,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
            crelu_floor = config.crelu_floor;
            crelu_ceil = config.crelu_ceil;
            power_exp = config.power_exp;
-
-           /* the scale and shift parameters of POWER have already been fused with weights and bias */
-           if (activation == ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f)
-               activation = ConvolutionConfiguration::ActivationType::IDENTITY;
+           power_scale = config.power_scale;
+           power_shift = config.power_shift;

           /* we normally use cuDNN for convolution and perform bias, activation and eltwise ops ourselves
            * hence, the activation for cuDNN is IDENTITY by default
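
A hedged aside on why the IDENTITY shortcut above is dropped (the helper below is hypothetical, not part of the patch): the shortcut was only sound while scale and shift were assumed to be pre-fused into the convolution weights and bias.

    // With scale == 1 and shift == 0 already folded into the weights/bias,
    // POWER with power_exp == 1 computes (1 * x + 0)^1 == x, i.e. IDENTITY.
    // Once scale/shift are applied inside the fused kernels, the same shortcut
    // would silently drop them, since (scale * x + shift)^1 != x in general.
    inline bool power_reduces_to_identity(float power_exp, float power_scale, float power_shift)
    {
        return power_exp == 1.0f && power_scale == 1.0f && power_shift == 0.0f;
    }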
@@ -383,7 +382,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                kernels::biasN_eltwise_sum_2_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
                break;
            case ConvolutionConfiguration::ActivationType::POWER:
-               kernels::biasN_eltwise_sum_2_power_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+               kernels::biasN_eltwise_sum_2_power_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
                break;
            case ConvolutionConfiguration::ActivationType::TANH:
                kernels::biasN_eltwise_sum_2_tanh_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
@@ -414,7 +413,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                kernels::biasN_clipped_relu_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
                break;
            case ConvolutionConfiguration::ActivationType::POWER:
-               kernels::biasN_power_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+               kernels::biasN_power_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
                break;
            case ConvolutionConfiguration::ActivationType::TANH:
                kernels::biasN_tanh_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
@@ -450,7 +449,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
                break;
            case ConvolutionConfiguration::ActivationType::POWER:
-               kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, 1.0, 0.0);
+               kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, power_scale, power_shift);
                break;
            case ConvolutionConfiguration::ActivationType::TANH:
                kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
@@ -497,7 +496,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                kernels::eltwise_sum_2_clipped_relu<T>(stream, output, output, eltwise, crelu_floor, crelu_ceil);
                break;
            case ConvolutionConfiguration::ActivationType::POWER:
-               kernels::eltwise_sum_2_power<T>(stream, output, output, eltwise, power_exp, 1.0, 0.0);
+               kernels::eltwise_sum_2_power<T>(stream, output, output, eltwise, power_exp, power_scale, power_shift);
                break;
            case ConvolutionConfiguration::ActivationType::TANH:
                kernels::eltwise_sum_2_tanh<T>(stream, output, output, eltwise);
@@ -527,7 +526,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                kernels::clipped_relu_eltwise_sum_2_inplace<T>(stream, output, eltwise, crelu_floor, crelu_ceil);
                break;
            case ConvolutionConfiguration::ActivationType::POWER:
-               kernels::power_eltwise_sum_2_inplace<T>(stream, output, eltwise, power_exp, 1.0, 0.0);
+               kernels::power_eltwise_sum_2_inplace<T>(stream, output, eltwise, power_exp, power_scale, power_shift);
                break;
            case ConvolutionConfiguration::ActivationType::TANH:
                kernels::tanh_eltwise_sum_2_inplace<T>(stream, output, eltwise);
@@ -561,7 +560,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
                break;
            case ConvolutionConfiguration::ActivationType::POWER:
-               kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0);
+               kernels::power<T>(stream, output, output, power_exp, power_scale, power_shift);
                break;
            case ConvolutionConfiguration::ActivationType::TANH:
                kernels::tanh<T>(stream, output, output);
@@ -595,7 +594,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {

        ConvolutionConfiguration::FusionMode fusion_mode;
        ConvolutionConfiguration::ActivationType activation;
-       float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+       float relu_negative_slope, crelu_floor, crelu_ceil;
+       float power_exp, power_scale, power_shift;

        enum class InternalFusionLocation {
            CUDNN,
@@ -2656,32 +2656,21 @@ struct Net::Impl : public detail::NetImplBase
            Ptr<EltwiseLayer> nextEltwiseLayer;
            if( nextData )
                nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();

 #ifdef HAVE_CUDA
            // CUDA backend supports fusion with eltwise sum (without variable channels)
            // `nextEltwiseLayer` is reset if eltwise layer doesn't have a compatible configuration for fusion
            if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
            {
                // we create a temporary backend node for eltwise layer to obtain the eltwise configuration
-               cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp does not use the context during init
+               cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
                const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
                const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
-               if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
+               // CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
+               // Hence, a successful cast to EltwiseOp implies that the number of channels is same in all operand tensors.
+               if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
                    nextEltwiseLayer = Ptr<EltwiseLayer>();
-
-               // check for variable channels
-               auto& inputs = nextData->inputBlobs;
-               for (int i = 1; i < inputs.size(); ++i)
-               {
-                   if (inputs[i]->size[1] != inputs[0]->size[1])
-                   {
-                       nextEltwiseLayer = Ptr<EltwiseLayer>();
-                       break;
-                   }
-               }
            }
 #endif

            if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2)
            {
                LayerData *eltwiseData = nextData;
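
The `eltwiseNode.empty()` check above is the segfault fix named in the commit message: the dynamic cast can fail (the backend creates a ShortcutOp rather than an EltwiseOp when operand channel counts differ), and the old condition dereferenced the empty pointer. A minimal, self-contained sketch of the pattern with placeholder types (not the dnn internals):

    #include <opencv2/core.hpp>

    struct Base { virtual ~Base() {} };
    struct Derived : Base { int op = 0; };

    // cv::Ptr::dynamicCast() returns an empty Ptr when the runtime type does not
    // match, so the result must be tested before operator-> is used.
    void use_if_compatible(const cv::Ptr<Base>& node)
    {
        auto derived = node.dynamicCast<Derived>();
        if (derived.empty() || derived->op != 0)  // short-circuit: an empty Ptr is never dereferenced
            return;                               // incompatible node, skip fusion
        // ... safe to use `derived` here
    }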
@@ -2725,7 +2714,8 @@ struct Net::Impl : public detail::NetImplBase
                {
                    nextData = &layers[eltwiseData->consumers[0].lid];
                    lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
-                   if (pinsToKeep.count(lpNext) == 0 && nextData->outputBlobs.size() == 1)
+                   CV_Assert(nextData);
+                   if (nextData->outputBlobs.size() == 1)
                        nextFusabeleActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
                }
                else
@@ -250,7 +250,8 @@ public:
 #ifdef HAVE_CUDA
    cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode;
    cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
-   float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil, cuda_power_exp;
+   float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil;
+   float cuda_power_exp, cuda_power_scale, cuda_power_shift;
 #endif

    ConvolutionLayerImpl(const LayerParams &params) : BaseConvolutionLayerImpl(params)
@@ -457,13 +458,8 @@ public:
        Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
        if (!activ_power.empty())
        {
-           if (activ_power->scale != 1.f || activ_power->shift != 0.f)
-           {
-               const int outCh = blobs[0].size[0];
-               fuseWeights(Mat(1, outCh, CV_32F, Scalar(activ_power->scale)),
-                           Mat(1, outCh, CV_32F, Scalar(activ_power->shift)));
-           }
-
+           cuda_power_scale = activ_power->scale;
+           cuda_power_shift = activ_power->shift;
            cuda_power_exp = activ_power->power;
            cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::POWER;
        }
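
The removed fuseWeights() branch above folded PowerLayer's scale and shift into the convolution itself; the CUDA path now forwards them to the fused kernels instead. A rough sketch of why that folding was algebraically valid (hypothetical flat weight/bias buffers, not the actual layer code):

    #include <vector>

    // PowerLayer computes (shift + scale * y)^power on the convolution output
    // y = W * x + b. Its affine part can be absorbed into the convolution:
    //   scale * (W * x + b) + shift == (scale * W) * x + (scale * b + shift)
    // leaving only the exponent for the fused activation.
    void fold_power_affine(float scale, float shift,
                           std::vector<float>& weights, std::vector<float>& bias)
    {
        for (float& w : weights) w *= scale;
        for (float& b : bias)    b = b * scale + shift;
    }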
@@ -1591,6 +1587,8 @@ public:
            config.crelu_floor = cuda_crelu_floor;
            config.crelu_ceil = cuda_crelu_ceil;
            config.power_exp = cuda_power_exp;
+           config.power_scale = cuda_power_scale;
+           config.power_shift = cuda_power_shift;

            Mat filtersMat = fusedWeights ? weightsMat : blobs[0];
            Mat biasMat = (hasBias() || fusedBias) ? Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat();