Merge pull request #24834 from fengyuentau:cuda_naryeltwise_broadcast

dnn (cuda): support broadcasting if a.rank() != b.rank() #24834

Inspired by https://github.com/opencv/opencv/pull/24786. This PR keeps the fusion of `NaryEltwise` and `Concat` while addressing the missing-data problem by supporting broadcasting when `a.rank() != b.rank()`.
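
To illustrate the idea, here is a minimal standalone sketch (a hypothetical helper, not the code in this patch): the lower-rank shape is left-padded with 1s so both operands have the same rank, and the usual per-axis broadcast rule is then applied.

```cpp
#include <cstddef>
#include <vector>

// Illustration only: align ranks by prepending 1s to the shorter shape,
// then apply the per-axis broadcast rule (dims must match or be 1).
static bool broadcastable(std::vector<int> a, std::vector<int> b) {
    while (a.size() < b.size()) a.insert(a.begin(), 1);
    while (b.size() < a.size()) b.insert(b.begin(), 1);
    for (std::size_t i = 0; i < a.size(); i++) {
        if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
            return false;
    }
    return true;
}

// e.g. broadcastable({1, 4, 84}, {1, 84}) == true, since {1, 84} is treated as {1, 1, 84}.
```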

Resolves https://github.com/opencv/opencv/issues/23977
Resolves https://github.com/opencv/opencv/issues/24606
Resolves https://github.com/opencv/opencv/issues/24635
Resolves https://github.com/opencv/opencv/issues/24721 

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Yuantao Feng 2024-01-11 01:04:46 -06:00 committed by GitHub
parent be1373f01a
commit e7ccff9805
6 changed files with 123 additions and 28 deletions


@@ -132,8 +132,23 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
     }
     else
     {
-        CV_Assert(is_shape_compatible(output, x));
-        CV_Assert(is_shape_compatible(output, y));
+        auto inShape1 = x.shape_as_vector();
+        auto inShape2 = y.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        std::size_t x_ndims = inShape1.size(), y_ndims = inShape2.size();
+        if (x_ndims >= y_ndims) {
+            for (std::size_t i = 0; i < (x_ndims - y_ndims); i++) {
+                inShape2.insert(inShape2.begin(), 1);
+            }
+        } else {
+            for (std::size_t i = 0; i < (y_ndims - x_ndims); i++) {
+                inShape1.insert(inShape1.begin(), 1);
+            }
+        }
+
+        CV_Assert(is_shape_compatible1(outShape, inShape1));
+        CV_Assert(is_shape_compatible1(outShape, inShape2));
 
         /* matching singleton axes in both input tensors can be eliminated
          *
@@ -148,20 +163,21 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
          * x: [1, 256, 32, 32] -> [256, 32, 32]
          * y: [1, 256, 1, 1] -> [256, 1, 1]
          */
-        for (int r = 0; r < output.rank(); r++)
-        {
-            while (x.rank() > r && y.rank() > r && x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
-                CV_Assert(output.get_axis_size(r) == 1);
-
-                x.squeeze(r);
-                y.squeeze(r);
-                output.squeeze(r);
-            }
-        }
-
-        auto inShape1 = x.shape_as_vector();
-        auto inShape2 = y.shape_as_vector();
-        auto outShape = output.shape_as_vector();
+        int eliminate_times = 0;
+        for (std::size_t i = 0; i < outShape.size(); i++) {
+            if (inShape1[i] == 1 && inShape2[i] == 1 && outShape[i] == 1 && i != (outShape.size() - 1)) {
+                eliminate_times++;
+            } else {
+                break;
+            }
+        }
+        if (eliminate_times > 0) {
+            for (int i = 0; i < eliminate_times; i++) {
+                inShape1.erase(inShape1.begin());
+                inShape2.erase(inShape2.begin());
+                outShape.erase(outShape.begin());
+            }
+        }
 
         /* contiguous axes that do not broadcast can be merged into one axis
          *

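As a standalone illustration of the leading-singleton elimination described in the comment above (a sketch with assumed shapes, not the kernel code itself, and assuming the three shapes are already rank-aligned):

```cpp
#include <cstddef>
#include <vector>

// Drop leading axes where input1, input2 and output are all 1; the last
// axis is always kept so the kernel has at least one dimension left.
static void eliminate_leading_ones(std::vector<int>& x, std::vector<int>& y,
                                   std::vector<int>& out) {
    std::size_t n = 0;
    while (n + 1 < out.size() && x[n] == 1 && y[n] == 1 && out[n] == 1)
        n++;
    x.erase(x.begin(), x.begin() + n);
    y.erase(y.begin(), y.begin() + n);
    out.erase(out.begin(), out.begin() + n);
}

// Example: x = {1, 256, 32, 32}, y = {1, 256, 1, 1}, out = {1, 256, 32, 32}
// become {256, 32, 32}, {256, 1, 1} and {256, 32, 32} respectively.
```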

@@ -1187,6 +1187,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
         return true;
     }
 
+    template <typename ShapeType>
+    bool is_shape_compatible1(const ShapeType &x_shape, const ShapeType &y_shape) noexcept {
+        const auto x_ndims = x_shape.size(), y_ndims = y_shape.size();
+
+        if (x_ndims != y_ndims) {
+            return false;
+        }
+
+        for (int i = 0; i < x_ndims; i++) {
+            if (x_shape[i] != y_shape[i] && x_shape[i] != 1 && y_shape[i] != 1) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
     /** returns the rank to which the given tensor can be squeezed to */
     template <class TensorType>
     std::size_t get_effective_rank(const TensorType& x) noexcept {


@@ -818,19 +818,6 @@
     {
         auto context = reinterpret_cast<csl::CSLContext*>(context_);
 
-        auto input_0_shape = inputs[0].dynamicCast<CUDABackendWrapper>()->getShape();
-        for (int i = 1; i < inputs.size(); i++)
-        {
-            auto input_i_shape = inputs[i].dynamicCast<CUDABackendWrapper>()->getShape();
-            if (input_0_shape.size() != input_i_shape.size())
-                return Ptr<BackendNode>();
-
-            // check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode
-            for (int j = 0; j < input_0_shape.size(); j++)
-                if (input_0_shape[j] != input_i_shape[j] &&
-                    input_0_shape[j] != 1 && input_i_shape[j] != 1)
-                    return Ptr<BackendNode>();
-        }
 
         cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM;
         switch (op) {
         case OPERATION::MAX:


@@ -728,6 +728,10 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                     if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                         break;
 #ifdef HAVE_CUDA
+                    /* Risk: Not every operation in "NaryEltwise" is supported in the CUDA backend. There is a chance
+                       that Concat's output is filled with data in both host and device, leading to data missing.
+                       See https://github.com/opencv/opencv/issues/24721 for more details.
+                    */
                     if (preferableBackend == DNN_BACKEND_CUDA &&
                         (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
                         (inp_i_data->layerInstance->type != "Convolution" &&


@@ -102,6 +102,12 @@ public:
     Net net;
 };
 
+TEST_P(DNNTestNetwork, DISABLED_YOLOv8n) {
+    processNet("dnn/onnx/models/yolov8n.onnx", "", Size(640, 640), "output0");
+    expectNoFallbacksFromIE(net);
+    expectNoFallbacksFromCUDA(net);
+}
+
 TEST_P(DNNTestNetwork, AlexNet)
 {
     applyTestTag(CV_TEST_TAG_MEMORY_1GB);
@@ -1518,6 +1524,71 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Eltwise, testing::Combine(
     dnnBackendsAndTargets()
 ));
 
+////////////////////////////////////////////////////////////////////////////////
+// Element-wise layers
+////////////////////////////////////////////////////////////////////////////////
+
+using NaryEltwiseConcat = TestWithParam<tuple<std::vector<int>, tuple<Backend, Target>>>;
+TEST_P(NaryEltwiseConcat, Accuracy) {
+    auto param = GetParam();
+    std::vector<int> input_shape = get<0>(param);
+    auto backend_id = get<0>(get<1>(param));
+    auto target_id = get<1>(get<1>(param));
+
+    /* Build the following net:
+                      <1x4x84>
+                     /
+        [Input] -+-> Mul(B<1x84>) -> Concat(axis=1) -> [Output]
+                 |                       |
+                 +-> Sigmoid ------------+
+    */
+    Net net;
+
+    std::vector<int> mul_B_shape(input_shape.size() - 1, 1);
+    mul_B_shape.back() = input_shape.back();
+    Mat mul_B(mul_B_shape, CV_32FC1);
+    randn(mul_B, 0.f, 1.f);
+
+    LayerParams mul_B_lp;
+    mul_B_lp.name = "mul_B";
+    mul_B_lp.type = "Const";
+    mul_B_lp.blobs.push_back(mul_B);
+    int id_mul_B = net.addLayer(mul_B_lp.name, mul_B_lp.type, mul_B_lp);
+
+    LayerParams mul_lp;
+    mul_lp.name = "mul";
+    mul_lp.type = "NaryEltwise";
+    mul_lp.set("operation", "mul");
+    int id_mul = net.addLayer(mul_lp.name, mul_lp.type, mul_lp);
+    net.connect(0, 0, id_mul, 0);
+    net.connect(id_mul_B, 0, id_mul, 1);
+
+    LayerParams sigmoid_lp;
+    sigmoid_lp.name = "sigmoid";
+    sigmoid_lp.type = "Sigmoid";
+    int id_sigmoid = net.addLayer(sigmoid_lp.name, sigmoid_lp.type, sigmoid_lp);
+    net.connect(0, 0, id_sigmoid, 0);
+
+    LayerParams concat_lp;
+    concat_lp.name = "concat";
+    concat_lp.type = "Concat";
+    concat_lp.set("axis", 1);
+    int id_concat = net.addLayer(concat_lp.name, concat_lp.type, concat_lp);
+    net.connect(id_mul, 0, id_concat, 0);
+    net.connect(id_sigmoid, 0, id_concat, 1);
+
+    // Run test
+    Mat input(input_shape, CV_32FC1);
+    testLayer(input, net, backend_id, target_id, false);
+}
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NaryEltwiseConcat, testing::Combine(
+    testing::Values(std::vector<int>{1, 4, 84}),
+    dnnBackendsAndTargets())
+);
+
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_layers_backends, dnnBackendsAndTargets());
 
 }} // namespace


@@ -2050,7 +2050,7 @@ private:
         net.setPreferableTarget(target);
 
         Mat re;
-        ASSERT_NO_THROW(re = net.forward()); // runtime error
+        re = net.forward();
         auto ptr_re = (float *) re.data;
         for (int i = 0; i < re.total(); i++)
             if (op == "sum"){