Merge pull request #17939 from YashasSamaga:cuda4dnn-fix-eltwise-fusion

* fix eltwise fusion segfault, more eltwise fusions, fix power fusion

* add assertion
Authored by Yashas Samaga B L on 2020-08-01 17:33:07 +05:30; committed by GitHub
parent e421233a1d
commit f53f491cd2
3 changed files with 25 additions and 37 deletions
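
For context on the "fix power fusion" item: OpenCV's PowerLayer applies pow(scale*x + shift, power). Before this commit, the CUDA convolution path folded scale and shift into the fused weights and bias and passed hard-coded 1.0 and 0.0 to the kernels; the hunks below instead forward power_scale and power_shift to every fused kernel. A minimal CPU-side sketch of the per-element operation the kernels now receive (power_ref is an illustrative helper, not OpenCV API):

```cpp
#include <cmath>

// Reference form of the POWER activation with all three parameters passed
// through, instead of scale/shift being pre-folded into weights and bias.
static inline float power_ref(float x, float power_exp, float power_scale, float power_shift)
{
    return std::pow(power_scale * x + power_shift, power_exp);
}
```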

@@ -68,7 +68,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
IDENTITY,
RELU, /* uses value provided in `relu_negative_slope` */
CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
-POWER, /* scale and shift fused with weights and bias; only `power_exp` is handled here */
+POWER,
TANH,
SIGMOID,
SWISH,
@@ -76,7 +76,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
};
ActivationType activation_type;
-float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+float relu_negative_slope, crelu_floor, crelu_ceil;
+float power_exp, power_scale, power_shift;
};
template <class T>
@@ -224,10 +225,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
crelu_floor = config.crelu_floor;
crelu_ceil = config.crelu_ceil;
power_exp = config.power_exp;
-/* the scale and shift parameters of POWER have already been fused with weights and bias */
-if (activation == ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f)
-activation = ConvolutionConfiguration::ActivationType::IDENTITY;
+power_scale = config.power_scale;
+power_shift = config.power_shift;
/* we normally use cuDNN for convolution and perform bias, activation and eltwise ops ourselves
* hence, the activation for cuDNN is IDENTITY by default
@@ -383,7 +382,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::biasN_eltwise_sum_2_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::biasN_eltwise_sum_2_power_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+kernels::biasN_eltwise_sum_2_power_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_eltwise_sum_2_tanh_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
@@ -414,7 +413,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::biasN_clipped_relu_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::biasN_power_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+kernels::biasN_power_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_tanh_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
@@ -450,7 +449,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, 1.0, 0.0);
+kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
@@ -497,7 +496,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::eltwise_sum_2_clipped_relu<T>(stream, output, output, eltwise, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::eltwise_sum_2_power<T>(stream, output, output, eltwise, power_exp, 1.0, 0.0);
+kernels::eltwise_sum_2_power<T>(stream, output, output, eltwise, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::eltwise_sum_2_tanh<T>(stream, output, output, eltwise);
@@ -527,7 +526,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::clipped_relu_eltwise_sum_2_inplace<T>(stream, output, eltwise, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::power_eltwise_sum_2_inplace<T>(stream, output, eltwise, power_exp, 1.0, 0.0);
+kernels::power_eltwise_sum_2_inplace<T>(stream, output, eltwise, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::tanh_eltwise_sum_2_inplace<T>(stream, output, eltwise);
@@ -561,7 +560,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0);
+kernels::power<T>(stream, output, output, power_exp, power_scale, power_shift);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::tanh<T>(stream, output, output);
@@ -595,7 +594,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
ConvolutionConfiguration::FusionMode fusion_mode;
ConvolutionConfiguration::ActivationType activation;
-float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+float relu_negative_slope, crelu_floor, crelu_ceil;
+float power_exp, power_scale, power_shift;
enum class InternalFusionLocation {
CUDNN,
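
The hunks above all make the same substitution: each fused bias/activation/eltwise kernel that previously received hard-coded 1.0 and 0.0 now receives power_scale and power_shift. A hedged CPU reference for one variant, assuming the kernel name encodes the operation order (per-channel bias add, then POWER, then element-wise sum of two operands); this is an illustration, not the actual CUDA kernel:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative reference for biasN_power_eltwise_sum_2_inplace: bias add,
// POWER activation with scale/shift/exp, then an element-wise sum with a
// second operand `eltwise`, all performed in place on `output`.
void biasN_power_eltwise_sum_2_inplace_ref(std::vector<float>& output,
                                           std::size_t inner_size,
                                           const std::vector<float>& bias,
                                           const std::vector<float>& eltwise,
                                           float power_exp, float power_scale, float power_shift)
{
    for (std::size_t i = 0; i < output.size(); i++)
    {
        const std::size_t c = (i / inner_size) % bias.size();  // channel of element i
        const float activated = std::pow(power_scale * (output[i] + bias[c]) + power_shift, power_exp);
        output[i] = activated + eltwise[i];
    }
}
```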

@@ -2656,32 +2656,21 @@ struct Net::Impl : public detail::NetImplBase
Ptr<EltwiseLayer> nextEltwiseLayer;
if( nextData )
nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
#ifdef HAVE_CUDA
// CUDA backend supports fusion with eltwise sum (without variable channels)
// `nextEltwiseLayer` is reset if eltwise layer doesn't have a compatible configuration for fusion
if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
{
// we create a temporary backend node for eltwise layer to obtain the eltwise configuration
-cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp does not use the context during init
+cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
-if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
+// CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
+// Hence, a successful cast to EltwiseOp implies that the number of channels is same in all operand tensors.
+if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
nextEltwiseLayer = Ptr<EltwiseLayer>();
-// check for variable channels
-auto& inputs = nextData->inputBlobs;
-for (int i = 1; i < inputs.size(); ++i)
-{
-if (inputs[i]->size[1] != inputs[0]->size[1])
-{
-nextEltwiseLayer = Ptr<EltwiseLayer>();
-break;
-}
-}
}
#endif
if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2)
{
LayerData *eltwiseData = nextData;
@@ -2725,7 +2714,8 @@ struct Net::Impl : public detail::NetImplBase
{
nextData = &layers[eltwiseData->consumers[0].lid];
lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
-if (pinsToKeep.count(lpNext) == 0 && nextData->outputBlobs.size() == 1)
+CV_Assert(nextData);
+if (nextData->outputBlobs.size() == 1)
nextFusabeleActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
}
else
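
The segfault fixed above came from dereferencing the result of dynamicCast without checking it: when the CUDA backend builds a ShortcutOp instead of an EltwiseOp (operands with differing channel counts), the cast yields an empty Ptr. A standalone sketch of that failure mode using dummy node types (BackendNodeStub, EltwiseStub, ShortcutStub are hypothetical; cv::Ptr, cv::makePtr, dynamicCast and empty are real OpenCV core API):

```cpp
#include <opencv2/core.hpp>
#include <iostream>

struct BackendNodeStub { virtual ~BackendNodeStub() = default; };
struct EltwiseStub : BackendNodeStub { int op = 0; };
struct ShortcutStub : BackendNodeStub { };  // stands in for a non-eltwise node

int main()
{
    // The backend produced something that is not an eltwise node.
    cv::Ptr<BackendNodeStub> node = cv::makePtr<ShortcutStub>();

    // dynamicCast returns an empty Ptr when the runtime type does not match;
    // dereferencing it without the empty() check is what used to crash.
    auto eltwise = node.dynamicCast<EltwiseStub>();
    if (eltwise.empty())
        std::cout << "not an eltwise node, skip fusion\n";
    else
        std::cout << "op = " << eltwise->op << "\n";
}
```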

@@ -250,7 +250,8 @@ public:
#ifdef HAVE_CUDA
cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode;
cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
-float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil, cuda_power_exp;
+float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil;
+float cuda_power_exp, cuda_power_scale, cuda_power_shift;
#endif
ConvolutionLayerImpl(const LayerParams &params) : BaseConvolutionLayerImpl(params)
@@ -457,13 +458,8 @@ public:
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty())
{
-if (activ_power->scale != 1.f || activ_power->shift != 0.f)
-{
-const int outCh = blobs[0].size[0];
-fuseWeights(Mat(1, outCh, CV_32F, Scalar(activ_power->scale)),
-Mat(1, outCh, CV_32F, Scalar(activ_power->shift)));
-}
+cuda_power_scale = activ_power->scale;
+cuda_power_shift = activ_power->shift;
cuda_power_exp = activ_power->power;
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::POWER;
}
@@ -1591,6 +1587,8 @@ public:
config.crelu_floor = cuda_crelu_floor;
config.crelu_ceil = cuda_crelu_ceil;
config.power_exp = cuda_power_exp;
+config.power_scale = cuda_power_scale;
+config.power_shift = cuda_power_shift;
Mat filtersMat = fusedWeights ? weightsMat : blobs[0];
Mat biasMat = (hasBias() || fusedBias) ? Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat();
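
One plausible reading of the "fix power fusion" item, based on the hunks above: folding PowerLayer's scale and shift into the convolution weights and bias is only equivalent when POWER is applied directly to the convolution output; in the fused variants where the eltwise operand is summed in before POWER (the eltwise_sum_2_power paths), the folded form no longer matches. A small hedged numerical check of that algebra (values are arbitrary):

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    const float conv = 2.0f, eltwise = 3.0f;            // conv output, eltwise operand
    const float scale = 0.5f, shift = 1.0f, p = 2.0f;   // PowerLayer parameters

    // POWER applied after the eltwise sum, with scale/shift passed to the kernel:
    const float direct = std::pow(scale * (conv + eltwise) + shift, p);   // 12.25

    // scale/shift pre-folded into the convolution output, eltwise added afterwards:
    const float folded = std::pow((scale * conv + shift) + eltwise, p);   // 25.00

    std::printf("direct=%.2f folded=%.2f\n", direct, folded);
}
```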