Merge pull request #17939 from YashasSamaga:cuda4dnn-fix-eltwise-fusion

* fix eltwise fusion segfault, more eltwise fusions, fix power fusion

* add assertion
Authored by Yashas Samaga B L on 2020-08-01 17:33:07 +05:30; committed by GitHub
parent e421233a1d
commit f53f491cd2
3 changed files with 25 additions and 37 deletions
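
For context on the "fix power fusion" item: OpenCV's PowerLayer applies pow(scale*x + shift, power). Before this commit, the CUDA convolution path folded scale and shift into the fused weights and bias and passed hard-coded 1.0 and 0.0 to the kernels; the hunks below instead forward power_scale and power_shift to every fused kernel. A minimal CPU-side sketch of the per-element operation the kernels now receive (power_ref is an illustrative helper, not OpenCV API):

```cpp
#include <cmath>

// Reference form of the POWER activation with all three parameters passed
// through, instead of scale/shift being pre-folded into weights and bias.
static inline float power_ref(float x, float power_exp, float power_scale, float power_shift)
{
    return std::pow(power_scale * x + power_shift, power_exp);
}
```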

@@ -68,7 +68,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
IDENTITY,
RELU, /* uses value provided in `relu_negative_slope` */
CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
-POWER, /* scale and shift fused with weights and bias; only `power_exp` is handled here */
+POWER,
TANH,
SIGMOID,
SWISH,
@@ -76,7 +76,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
};
ActivationType activation_type;
-float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+float relu_negative_slope, crelu_floor, crelu_ceil;
+float power_exp, power_scale, power_shift;
};
template <class T>
@@ -224,10 +225,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
crelu_floor = config.crelu_floor;
crelu_ceil = config.crelu_ceil;
power_exp = config.power_exp;
-/* the scale and shift parameters of POWER have already been fused with weights and bias */
-if (activation == ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f)
-activation = ConvolutionConfiguration::ActivationType::IDENTITY;
+power_scale = config.power_scale;
+power_shift = config.power_shift;
/* we normally use cuDNN for convolution and perform bias, activation and eltwise ops ourselves
* hence, the activation for cuDNN is IDENTITY by default
@@ -383,7 +382,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::biasN_eltwise_sum_2_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::biasN_eltwise_sum_2_power_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+kernels::biasN_eltwise_sum_2_power_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_eltwise_sum_2_tanh_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
@@ -414,7 +413,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::biasN_clipped_relu_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::biasN_power_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+kernels::biasN_power_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_tanh_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
@@ -450,7 +449,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, 1.0, 0.0);
+kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
@@ -497,7 +496,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::eltwise_sum_2_clipped_relu<T>(stream, output, output, eltwise, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::eltwise_sum_2_power<T>(stream, output, output, eltwise, power_exp, 1.0, 0.0);
+kernels::eltwise_sum_2_power<T>(stream, output, output, eltwise, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::eltwise_sum_2_tanh<T>(stream, output, output, eltwise);
@@ -527,7 +526,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::clipped_relu_eltwise_sum_2_inplace<T>(stream, output, eltwise, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::power_eltwise_sum_2_inplace<T>(stream, output, eltwise, power_exp, 1.0, 0.0);
+kernels::power_eltwise_sum_2_inplace<T>(stream, output, eltwise, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::tanh_eltwise_sum_2_inplace<T>(stream, output, eltwise);
@@ -561,7 +560,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0);
+kernels::power<T>(stream, output, output, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::tanh<T>(stream, output, output);
@@ -595,7 +594,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
ConvolutionConfiguration::FusionMode fusion_mode;
ConvolutionConfiguration::ActivationType activation;
-float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+float relu_negative_slope, crelu_floor, crelu_ceil;
+float power_exp, power_scale, power_shift;
enum class InternalFusionLocation {
CUDNN,
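
The hunks above all make the same substitution: each fused bias/activation/eltwise kernel that previously received hard-coded 1.0 and 0.0 now receives power_scale and power_shift. A hedged CPU reference for one variant, assuming the kernel name encodes the operation order (per-channel bias add, then POWER, then element-wise sum of two operands); this is an illustration, not the actual CUDA kernel:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative reference for biasN_power_eltwise_sum_2_inplace: bias add,
// POWER activation with scale/shift/exp, then an element-wise sum with a
// second operand `eltwise`, all performed in place on `output`.
void biasN_power_eltwise_sum_2_inplace_ref(std::vector<float>& output,
                                           std::size_t inner_size,
                                           const std::vector<float>& bias,
                                           const std::vector<float>& eltwise,
                                           float power_exp, float power_scale, float power_shift)
{
    for (std::size_t i = 0; i < output.size(); i++)
    {
        const std::size_t c = (i / inner_size) % bias.size();  // channel of element i
        const float activated = std::pow(power_scale * (output[i] + bias[c]) + power_shift, power_exp);
        output[i] = activated + eltwise[i];
    }
}
```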

@@ -2656,32 +2656,21 @@ struct Net::Impl : public detail::NetImplBase
Ptr<EltwiseLayer> nextEltwiseLayer;
if( nextData )
nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
#ifdef HAVE_CUDA
// CUDA backend supports fusion with eltwise sum (without variable channels)
// `nextEltwiseLayer` is reset if eltwise layer doesn't have a compatible configuration for fusion
if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
{
// we create a temporary backend node for eltwise layer to obtain the eltwise configuration
-cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp does not use the context during init
+cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
-if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
+// CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
+// Hence, a successful cast to EltwiseOp implies that the number of channels is same in all operand tensors.
+if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
nextEltwiseLayer = Ptr<EltwiseLayer>();
-// check for variable channels
-auto& inputs = nextData->inputBlobs;
-for (int i = 1; i < inputs.size(); ++i)
-{
-if (inputs[i]->size[1] != inputs[0]->size[1])
-{
-nextEltwiseLayer = Ptr<EltwiseLayer>();
-break;
-}
-}
}
#endif
if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2)
{
LayerData *eltwiseData = nextData;
@@ -2725,7 +2714,8 @@ struct Net::Impl : public detail::NetImplBase
{
nextData = &layers[eltwiseData->consumers[0].lid];
lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
-if (pinsToKeep.count(lpNext) == 0 && nextData->outputBlobs.size() == 1)
+CV_Assert(nextData);
+if (nextData->outputBlobs.size() == 1)
nextFusabeleActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
}
else
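
The segfault fixed above came from dereferencing the result of dynamicCast without checking it: when the CUDA backend builds a ShortcutOp instead of an EltwiseOp (operands with differing channel counts), the cast yields an empty Ptr. A standalone sketch of that failure mode using dummy node types (BackendNodeStub, EltwiseStub, ShortcutStub are hypothetical; cv::Ptr, cv::makePtr, dynamicCast and empty are real OpenCV core API):

```cpp
#include <opencv2/core.hpp>
#include <iostream>

struct BackendNodeStub { virtual ~BackendNodeStub() = default; };
struct EltwiseStub : BackendNodeStub { int op = 0; };
struct ShortcutStub : BackendNodeStub { };  // stands in for a non-eltwise node

int main()
{
    // The backend produced something that is not an eltwise node.
    cv::Ptr<BackendNodeStub> node = cv::makePtr<ShortcutStub>();

    // dynamicCast returns an empty Ptr when the runtime type does not match;
    // dereferencing it without the empty() check is what used to crash.
    auto eltwise = node.dynamicCast<EltwiseStub>();
    if (eltwise.empty())
        std::cout << "not an eltwise node, skip fusion\n";
    else
        std::cout << "op = " << eltwise->op << "\n";
}
```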

@@ -250,7 +250,8 @@ public:
#ifdef HAVE_CUDA
cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode;
cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
-float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil, cuda_power_exp;
+float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil;
+float cuda_power_exp, cuda_power_scale, cuda_power_shift;
#endif
ConvolutionLayerImpl(const LayerParams &params) : BaseConvolutionLayerImpl(params)
@@ -457,13 +458,8 @@ public:
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty())
{
-if (activ_power->scale != 1.f || activ_power->shift != 0.f)
-{
-const int outCh = blobs[0].size[0];
-fuseWeights(Mat(1, outCh, CV_32F, Scalar(activ_power->scale)),
-Mat(1, outCh, CV_32F, Scalar(activ_power->shift)));
-}
+cuda_power_scale = activ_power->scale;
+cuda_power_shift = activ_power->shift;
cuda_power_exp = activ_power->power;
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::POWER;
}
@@ -1591,6 +1587,8 @@ public:
config.crelu_floor = cuda_crelu_floor;
config.crelu_ceil = cuda_crelu_ceil;
config.power_exp = cuda_power_exp;
+config.power_scale = cuda_power_scale;
+config.power_shift = cuda_power_shift;
Mat filtersMat = fusedWeights ? weightsMat : blobs[0];
Mat biasMat = (hasBias() || fusedBias) ? Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat();
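
One plausible reading of the "fix power fusion" item, based on the hunks above: folding PowerLayer's scale and shift into the convolution weights and bias is only equivalent when POWER is applied directly to the convolution output; in the fused variants where the eltwise operand is summed in before POWER (the eltwise_sum_2_power paths), the folded form no longer matches. A small hedged numerical check of that algebra (values are arbitrary):

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    const float conv = 2.0f, eltwise = 3.0f;            // conv output, eltwise operand
    const float scale = 0.5f, shift = 1.0f, p = 2.0f;   // PowerLayer parameters

    // POWER applied after the eltwise sum, with scale/shift passed to the kernel:
    const float direct = std::pow(scale * (conv + eltwise) + shift, p);   // 12.25

    // scale/shift pre-folded into the convolution output, eltwise added afterwards:
    const float folded = std::pow((scale * conv + shift) + eltwise, p);   // 25.00

    std::printf("direct=%.2f folded=%.2f\n", direct, folded);
}
```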