mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 09:25:45 +08:00
Merge pull request #24834 from fengyuentau:cuda_naryeltwise_broadcast
dnn (cuda): support broadcasting if a.rank() != b.rank() #24834 Inspired by https://github.com/opencv/opencv/pull/24786. This PR keeps the fusion of `NaryEltwise` and `Concat` while addressing the data-missing problem by supporting broadcasting if a.rank() != b.rank(). Resolves https://github.com/opencv/opencv/issues/23977 Resolves https://github.com/opencv/opencv/issues/24606 Resolves https://github.com/opencv/opencv/issues/24635 Resolves https://github.com/opencv/opencv/issues/24721 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
be1373f01a
commit
e7ccff9805
@ -132,8 +132,23 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(is_shape_compatible(output, x));
|
||||
CV_Assert(is_shape_compatible(output, y));
|
||||
auto inShape1 = x.shape_as_vector();
|
||||
auto inShape2 = y.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
std::size_t x_ndims = inShape1.size(), y_ndims = inShape2.size();
|
||||
if (x_ndims >= y_ndims) {
|
||||
for (std::size_t i = 0; i < (x_ndims - y_ndims); i++) {
|
||||
inShape2.insert(inShape2.begin(), 1);
|
||||
}
|
||||
} else {
|
||||
for (std::size_t i = 0; i < (y_ndims - x_ndims); i++) {
|
||||
inShape1.insert(inShape1.begin(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
CV_Assert(is_shape_compatible1(outShape, inShape1));
|
||||
CV_Assert(is_shape_compatible1(outShape, inShape2));
|
||||
|
||||
/* matching singleton axes in both input tensors can be eliminated
|
||||
*
|
||||
@ -148,20 +163,21 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
|
||||
* x: [1, 256, 32, 32] -> [256, 32, 32]
|
||||
* y: [1, 256, 1, 1] -> [256, 1, 1]
|
||||
*/
|
||||
for (int r = 0; r < output.rank(); r++)
|
||||
{
|
||||
while (x.rank() > r && y.rank() > r && x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
|
||||
CV_Assert(output.get_axis_size(r) == 1);
|
||||
|
||||
x.squeeze(r);
|
||||
y.squeeze(r);
|
||||
output.squeeze(r);
|
||||
int eliminate_times = 0;
|
||||
for (std::size_t i = 0; i < outShape.size(); i++) {
|
||||
if (inShape1[i] == 1 && inShape2[i] == 1 && outShape[i] == 1 && i != (outShape.size() - 1)) {
|
||||
eliminate_times++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (eliminate_times > 0) {
|
||||
for (int i = 0; i < eliminate_times; i++) {
|
||||
inShape1.erase(inShape1.begin());
|
||||
inShape2.erase(inShape2.begin());
|
||||
outShape.erase(outShape.begin());
|
||||
}
|
||||
}
|
||||
|
||||
auto inShape1 = x.shape_as_vector();
|
||||
auto inShape2 = y.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* contiguous axes that do not broadcast can be merged into one axis
|
||||
*
|
||||
|
@ -1187,6 +1187,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
||||
return true;
|
||||
}
|
||||
|
||||
/** @brief Checks whether two equal-rank shapes are broadcast-compatible.
 *
 * Two shapes are compatible when, for every axis, the sizes match or at
 * least one of them is 1 (numpy-style broadcasting). Shapes of different
 * ranks are rejected outright; the caller is expected to pad the shorter
 * shape with leading 1s beforehand (as done in eltwise_op).
 *
 * @param x_shape first shape; any container exposing size() and operator[]
 * @param y_shape second shape of the same container type
 * @return true if the two shapes can be broadcast against each other
 */
template <typename ShapeType>
bool is_shape_compatible1(const ShapeType &x_shape, const ShapeType &y_shape) noexcept {
    const auto x_ndims = x_shape.size(), y_ndims = y_shape.size();

    /* ranks must already match; cross-rank broadcasting is the caller's job */
    if (x_ndims != y_ndims) {
        return false;
    }

    /* std::size_t index avoids the signed/unsigned comparison against size() */
    for (std::size_t i = 0; i < x_ndims; i++) {
        if (x_shape[i] != y_shape[i] && x_shape[i] != 1 && y_shape[i] != 1) {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/** returns the rank to which the given tensor can be squeezed to */
|
||||
template <class TensorType>
|
||||
std::size_t get_effective_rank(const TensorType& x) noexcept {
|
||||
|
@ -818,19 +818,6 @@ public:
|
||||
{
|
||||
auto context = reinterpret_cast<csl::CSLContext*>(context_);
|
||||
|
||||
auto input_0_shape = inputs[0].dynamicCast<CUDABackendWrapper>()->getShape();
|
||||
for (int i = 1; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_i_shape = inputs[i].dynamicCast<CUDABackendWrapper>()->getShape();
|
||||
if (input_0_shape.size() != input_i_shape.size())
|
||||
return Ptr<BackendNode>();
|
||||
// check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode
|
||||
for (int j = 0; j < input_0_shape.size(); j++)
|
||||
if (input_0_shape[j] != input_i_shape[j] &&
|
||||
input_0_shape[j] != 1 && input_i_shape[j] != 1)
|
||||
return Ptr<BackendNode>();
|
||||
}
|
||||
|
||||
cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM;
|
||||
switch (op) {
|
||||
case OPERATION::MAX:
|
||||
|
@ -728,6 +728,10 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
|
||||
if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
|
||||
break;
|
||||
#ifdef HAVE_CUDA
|
||||
/* Risk: Not every operation in "NaryEltwise" is supported in the CUDA backend. There is a chance
|
||||
that Concat's output is filled with data in both host and device, leading to data missing.
|
||||
See https://github.com/opencv/opencv/issues/24721 for more details.
|
||||
*/
|
||||
if (preferableBackend == DNN_BACKEND_CUDA &&
|
||||
(inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
|
||||
(inp_i_data->layerInstance->type != "Convolution" &&
|
||||
|
@ -102,6 +102,12 @@ public:
|
||||
Net net;
|
||||
};
|
||||
|
||||
TEST_P(DNNTestNetwork, DISABLED_YOLOv8n) {
|
||||
processNet("dnn/onnx/models/yolov8n.onnx", "", Size(640, 640), "output0");
|
||||
expectNoFallbacksFromIE(net);
|
||||
expectNoFallbacksFromCUDA(net);
|
||||
}
|
||||
|
||||
TEST_P(DNNTestNetwork, AlexNet)
|
||||
{
|
||||
applyTestTag(CV_TEST_TAG_MEMORY_1GB);
|
||||
@ -1518,6 +1524,71 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Eltwise, testing::Combine(
|
||||
dnnBackendsAndTargets()
|
||||
));
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Element-wise layers
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
using NaryEltwiseConcat = TestWithParam<tuple<std::vector<int>, tuple<Backend, Target>>>;
|
||||
TEST_P(NaryEltwiseConcat, Accuracy) {
|
||||
auto param = GetParam();
|
||||
std::vector<int> input_shape = get<0>(param);
|
||||
auto backend_id = get<0>(get<1>(param));
|
||||
auto target_id = get<1>(get<1>(param));
|
||||
|
||||
/* Build the following net:
|
||||
|
||||
<1x4x84>
|
||||
/
|
||||
[Input] -+-> Mul(B<1x84>) -> Concat(axis=1) -> [Output]
|
||||
| |
|
||||
+-> Sigmoid ----------+
|
||||
|
||||
*/
|
||||
Net net;
|
||||
|
||||
std::vector<int> mul_B_shape(input_shape.size() - 1, 1);
|
||||
mul_B_shape.back() = input_shape.back();
|
||||
Mat mul_B(mul_B_shape, CV_32FC1);
|
||||
randn(mul_B, 0.f, 1.f);
|
||||
LayerParams mul_B_lp;
|
||||
mul_B_lp.name = "mul_B";
|
||||
mul_B_lp.type = "Const";
|
||||
mul_B_lp.blobs.push_back(mul_B);
|
||||
int id_mul_B = net.addLayer(mul_B_lp.name, mul_B_lp.type, mul_B_lp);
|
||||
|
||||
LayerParams mul_lp;
|
||||
mul_lp.name = "mul";
|
||||
mul_lp.type = "NaryEltwise";
|
||||
mul_lp.set("operation", "mul");
|
||||
int id_mul = net.addLayer(mul_lp.name, mul_lp.type, mul_lp);
|
||||
net.connect(0, 0, id_mul, 0);
|
||||
net.connect(id_mul_B, 0, id_mul, 1);
|
||||
|
||||
LayerParams sigmoid_lp;
|
||||
sigmoid_lp.name = "sigmoid";
|
||||
sigmoid_lp.type = "Sigmoid";
|
||||
int id_sigmoid = net.addLayer(sigmoid_lp.name, sigmoid_lp.type, sigmoid_lp);
|
||||
net.connect(0, 0, id_sigmoid, 0);
|
||||
|
||||
LayerParams concat_lp;
|
||||
concat_lp.name = "concat";
|
||||
concat_lp.type = "Concat";
|
||||
concat_lp.set("axis", 1);
|
||||
int id_concat = net.addLayer(concat_lp.name, concat_lp.type, concat_lp);
|
||||
net.connect(id_mul, 0, id_concat, 0);
|
||||
net.connect(id_sigmoid, 0, id_concat, 1);
|
||||
|
||||
// Run test
|
||||
Mat input(input_shape, CV_32FC1);
|
||||
testLayer(input, net, backend_id, target_id, false);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NaryEltwiseConcat, testing::Combine(
|
||||
testing::Values(std::vector<int>{1, 4, 84}),
|
||||
dnnBackendsAndTargets())
|
||||
);
|
||||
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_layers_backends, dnnBackendsAndTargets());
|
||||
|
||||
}} // namespace
|
||||
|
@ -2050,7 +2050,7 @@ private:
|
||||
net.setPreferableTarget(target);
|
||||
|
||||
Mat re;
|
||||
ASSERT_NO_THROW(re = net.forward()); // runtime error
|
||||
re = net.forward();
|
||||
auto ptr_re = (float *) re.data;
|
||||
for (int i = 0; i < re.total(); i++)
|
||||
if (op == "sum"){
|
||||
|
Loading…
Reference in New Issue
Block a user