mirror of https://github.com/opencv/opencv.git
core: add broadcast (#23965)
* add broadcast_to with tests
* change name
* fix test
* fix implicit type conversion
* replace type of shape with InputArray
* add perf test
* add perf tests which takes care of axis
* v2 from ficus expand
* rename to broadcast
* use randu in place of declare
* doc improvement; smaller scale in perf
* capture get_index by reference
commit a308dfca98 (parent 8a1b998b2b)
@@ -1118,6 +1118,13 @@ CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode);
 */
CV_EXPORTS_W void flipND(InputArray src, OutputArray dst, int axis);

/** @brief Broadcast the given Mat to the given shape.
 * @param src input array
 * @param shape target shape. Should be a list of CV_32S numbers. Note that negative values are not supported.
 * @param dst output array that has the given shape
 */
CV_EXPORTS_W void broadcast(InputArray src, InputArray shape, OutputArray dst);
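For context, a minimal usage sketch of the new function (illustrative only, not part of this diff; the shapes mirror the test added below):

```cpp
#include <opencv2/core.hpp>
#include <iostream>
#include <vector>

int main() {
    // 2x1 single-channel, continuous input
    int sizes[] = {2, 1};
    float data[] = {1.f, 2.f};
    cv::Mat src(2, sizes, CV_32FC1, data);

    // target shape is passed as any InputArray of CV_32S values
    std::vector<int> shape{4, 2, 3};

    cv::Mat dst;
    cv::broadcast(src, shape, dst);

    std::cout << "dst.dims = " << dst.dims << std::endl;                 // 3
    std::cout << "dst.at<float>(3, 1, 2) = "
              << dst.at<float>(3, 1, 2) << std::endl;                    // 2
    return 0;
}
```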

enum RotateFlags {
    ROTATE_90_CLOCKWISE = 0, //!<Rotate 90 degrees clockwise
    ROTATE_180 = 1, //!<Rotate 180 degrees clockwise
@@ -5,8 +5,35 @@ namespace opencv_test
{
using namespace perf;

using BroadcastTest = perf::TestBaseWithParam<std::tuple<std::vector<int>, perf::MatType, std::vector<int>>>;
typedef Size_MatType BinaryOpTest;

PERF_TEST_P_(BroadcastTest, basic)
{
    std::vector<int> shape_src = get<0>(GetParam());
    int dt_type = get<1>(GetParam());
    std::vector<int> shape_dst = get<2>(GetParam());

    cv::Mat src(static_cast<int>(shape_src.size()), shape_src.data(), dt_type);
    cv::Mat dst(static_cast<int>(shape_dst.size()), shape_dst.data(), dt_type);

    cv::randu(src, -1.f, 1.f);

    TEST_CYCLE() cv::broadcast(src, shape_dst, dst);

    SANITY_CHECK_NOTHING();
}

INSTANTIATE_TEST_CASE_P(/*nothing*/ , BroadcastTest,
    testing::Combine(
        testing::Values(std::vector<int>{1, 100, 800},
                        std::vector<int>{10, 1, 800},
                        std::vector<int>{10, 100, 1}),
        testing::Values(CV_32FC1),
        testing::Values(std::vector<int>{10, 100, 800})
    )
);

PERF_TEST_P_(BinaryOpTest, min)
{
    Size sz = get<0>(GetParam());
@@ -7,6 +7,7 @@
#include "opencv2/core/detail/dispatch_helper.impl.hpp"

#include <algorithm> // std::swap_ranges
#include <numeric> // std::accumulate

namespace cv {

@@ -857,6 +858,223 @@ void flipND(InputArray _src, OutputArray _dst, int _axis)
    flipNDImpl(dst.ptr(), dst.size.p, dst.step.p, axis);
}

/*
This function first prepends 1s to each tensor shape so that all shapes have a common number of dimensions (max_ndims), then flattens (merges) the non-broadcast dimensions.
*/
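// Illustrative worked example (not part of the original commit): broadcasting a
// {1, 3} source to a {2, 4, 3} destination, with max_ndims = 3.
//   step 1 pads both shapes to 3 dims and derives dense, per-element steps:
//       src: shape {1, 1, 3}, step {3, 3, 1}
//       dst: shape {2, 4, 3}, step {12, 3, 1}
//   step 2 merges the two leading dims (they are contiguous and consistent):
//       src: shape {1, 1, 3}        dst: shape {1, 8, 3}
//   step 3 zeroes the steps of the size-1 (broadcast) dims:
//       src: step {0, 0, 1}         dst: step {0, 3, 1}
// The copy loop below can then treat the job as 8 destination rows of 3
// elements, all read from the same source row.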
static bool _flatten_for_broadcast(int narrays, int max_ndims, const int* ndims, const int** orig_shape,
                                   int** flatten_shape, size_t** flatten_step) {
    int i, j, k;

    // step 1.
    // * make all inputs and the output max_ndims-dimensional.
    // * compute proper step's
    for (i = max_ndims - 1; i >= 0; i-- ) {
        for (k = 0; k < narrays; k++) {
            j = ndims[k] - (max_ndims - i);
            int sz_i = j >= 0 ? orig_shape[k][j] : 1;
            size_t st_i = i == max_ndims - 1 ? 1 : flatten_step[k][i+1] * flatten_shape[k][i+1];
            flatten_shape[k][i] = sz_i;
            flatten_step[k][i] = st_i;
            if (flatten_shape[k][i] == 0)
                return false;
        }
    }

    // step 2. Let's do the flattening first,
    // since we'd need proper values of steps to check continuity.
    // this loop is probably the most tricky part
    // in the whole implementation of broadcasting.
    j = max_ndims-1;
    for (i = j - 1; i >= 0; i--) {
        bool all_contiguous = true, all_scalars = true, all_consistent = true;
        for(k = 0; k < narrays; k++) {
            size_t st = flatten_step[k][j] * flatten_shape[k][j];
            bool prev_scalar = flatten_shape[k][j] == 1;
            bool scalar = flatten_shape[k][i] == 1;
            all_contiguous = all_contiguous && (st == flatten_step[k][i]);
            all_scalars = all_scalars && scalar;
            all_consistent = all_consistent && (scalar == prev_scalar);
        }
        if (all_contiguous && (all_consistent || all_scalars)) {
            for(k = 0; k < narrays; k++)
                flatten_shape[k][j] *= flatten_shape[k][i];
        } else {
            j--;
            if (i < j) {
                for(k = 0; k < narrays; k++) {
                    flatten_shape[k][j] = flatten_shape[k][i];
                    flatten_step[k][j] = flatten_step[k][i];
                }
            }
        }
    }

    // step 3. Set some step's to 0's.
    for (i = max_ndims-1; i >= j; i--) {
        for (k = 0; k < narrays; k++)
            flatten_step[k][i] = flatten_shape[k][i] == 1 ? 0 : flatten_step[k][i];
    }
    for (; i >= 0; i--) {
        for (k = 0; k < narrays; k++) {
            flatten_step[k][i] = 0;
            flatten_shape[k][i] = 1;
        }
    }
    return true;
}

void broadcast(InputArray _src, InputArray _shape, OutputArray _dst) {
    CV_INSTRUMENT_REGION();

    Mat src = _src.getMat();
    CV_CheckTrue(src.isContinuous(), "broadcast: input array must be contiguous");
    CV_CheckChannelsEQ(src.channels(), 1, "broadcast: input array must be single channel");

    Mat shape = _shape.getMat();
    CV_CheckTypeEQ(shape.type(), CV_32S, "broadcast: target shape must be of type int32");
    const auto dims_shape = static_cast<int>(shape.total());
    const auto *ptr_shape = shape.ptr<int>();

    // check valid shape, 1D/0D Mat would fail in the following checks
    const auto dims_src = src.dims;
    CV_CheckLE(dims_src, dims_shape,
               "broadcast: dimension of input array must be less than or equal to dimension of target shape");
    std::vector<int> shape_src{src.size.p, src.size.p + dims_src};
    if (shape_src.size() < static_cast<size_t>(dims_shape)) {
        shape_src.insert(shape_src.begin(), dims_shape - shape_src.size(), 1);
    }
    for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
        const auto *shape_target = ptr_shape;
        if (shape_src[i] != 1) {
            CV_CheckEQ(shape_src[i], shape_target[i], "target shape must be equal to input shape or 1");
        }
    }

    // impl
    _dst.create(dims_shape, shape.ptr<int>(), src.type());
    Mat dst = _dst.getMat();
    std::vector<int> is_same_shape(dims_shape, 0);
    for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
        if (shape_src[i] == ptr_shape[i]) {
            is_same_shape[i] = 1;
        }
    }
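    // Note (added for clarity, not in the original commit): the product of the
    // flags below is non-zero only when every axis already matches, i.e. no
    // actual broadcasting is needed and a single memcpy suffices.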
    // copy if same shape
    if (std::accumulate(is_same_shape.begin(), is_same_shape.end(), 1, std::multiplies<int>()) != 0) {
        const auto *p_src = src.ptr<const char>();
        auto *p_dst = dst.ptr<char>();
        std::memcpy(p_dst, p_src, dst.total() * dst.elemSize());
        return;
    }
    // other cases
    int max_ndims = std::max(dims_src, dims_shape);
    const int all_ndims[2] = {src.dims, dst.dims};
    const int* orig_shapes[2] = {src.size.p, dst.size.p};
    cv::AutoBuffer<size_t> buff(max_ndims * 4);
    int* flatten_shapes[2] = {(int*)buff.data(), (int*)(buff.data() + max_ndims)};
    size_t* flatten_steps[2] = {(size_t*)(buff.data() + 2 * max_ndims), (size_t*)(buff.data() + 3 * max_ndims)};
    if (_flatten_for_broadcast(2, max_ndims, all_ndims, orig_shapes, flatten_shapes, flatten_steps)) {
        size_t src_dp = flatten_steps[0][max_ndims - 1];
        size_t dst_dp = flatten_steps[1][max_ndims - 1];
        CV_Assert(dst_dp == 1);
        CV_Assert(max_ndims >= 2); // >= 3?
        size_t rowstep_src = flatten_steps[0][max_ndims - 2];
        size_t rowstep_dst = flatten_steps[1][max_ndims - 2];
        const char* ptr_src = src.ptr<const char>();
        char* ptr_dst = dst.ptr<char>();
        size_t esz = src.elemSize();
        int nrows = flatten_shapes[1][max_ndims - 2];
        int ncols = flatten_shapes[1][max_ndims - 1];
        int nplanes = 1;
        CV_Check(esz, esz == 1 || esz == 2 || esz == 4 || esz == 8, "broadcast: not supported data type");

        for (int k = 0; k < max_ndims - 2; k++) {
            nplanes *= flatten_shapes[1][k];
        }
        for (int plane_idx = 0; plane_idx < nplanes; plane_idx++) {
            size_t offset_src = 0, offset_dst = 0;
            size_t idx = (size_t)plane_idx;
            for (int k = max_ndims - 3; k >= 0; k--) {
                size_t prev_idx = idx / flatten_shapes[1][k];
                size_t i_k = (int)(idx - prev_idx * flatten_shapes[1][k]);
                offset_src += i_k * flatten_steps[0][k];
                offset_dst += i_k * flatten_steps[1][k];
                idx = prev_idx;
            }

#define OPENCV_CORE_BROADCAST_LOOP(_Tp) \
            for (int i = 0; i < nrows; i++) { \
                const _Tp *ptr_src_ = (const _Tp*)ptr_src + offset_src + rowstep_src * i; \
                _Tp *ptr_dst_ = (_Tp*)ptr_dst + offset_dst + rowstep_dst * i; \
                if (src_dp == 1) { \
                    for (int j = 0; j < ncols; j++) { \
                        ptr_dst_[j] = ptr_src_[j]; \
                    } \
                } else { \
                    _Tp x = *ptr_src_; \
                    for (int j = 0; j < ncols; j++) { \
                        ptr_dst_[j] = x; \
                    } \
                } \
            }

            if (esz == 1) {
                OPENCV_CORE_BROADCAST_LOOP(int8_t);
            } else if (esz == 2) {
                OPENCV_CORE_BROADCAST_LOOP(int16_t);
            } else if (esz == 4) {
                OPENCV_CORE_BROADCAST_LOOP(int32_t);
            } else if (esz == 8) {
                OPENCV_CORE_BROADCAST_LOOP(int64_t);
            } else {
                CV_Error(cv::Error::StsNotImplemented, "");
            }
#undef OPENCV_CORE_BROADCAST_LOOP
        }
    } else {
        // initial copy (src to dst)
        std::vector<size_t> step_src{src.step.p, src.step.p + dims_src};
        if (step_src.size() < static_cast<size_t>(dims_shape)) {
            step_src.insert(step_src.begin(), dims_shape - step_src.size(), step_src[0]);
        }
        for (size_t i = 0; i < src.total(); ++i) {
            size_t t = i;
            size_t src_offset = 0, dst_offset = 0;
            for (int j = static_cast<int>(shape_src.size() - 1); j >= 0; --j) {
                size_t idx = t / shape_src[j];
                size_t offset = static_cast<size_t>(t - idx * shape_src[j]);
                src_offset += offset * step_src[j];
                dst_offset += offset * dst.step[j];
                t = idx;
            }
            const auto *p_src = src.ptr<const char>();
            auto *p_dst = dst.ptr<char>();
            std::memcpy(p_dst + dst_offset, p_src + src_offset, dst.elemSize());
        }
        // broadcast copy (dst inplace)
        std::vector<int> cumulative_shape(dims_shape, 1);
        int total = static_cast<int>(dst.total());
        for (int i = dims_shape - 1; i >= 0; --i) {
            cumulative_shape[i] = static_cast<int>(total / ptr_shape[i]);
            total = cumulative_shape[i];
        }
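        // Note (added for clarity, not in the original commit): for a target
        // shape of {2, 4, 3}, cumulative_shape becomes {1, 2, 8}, i.e. the
        // number of contiguous blocks to replicate along each axis.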
        for (int i = dims_shape - 1; i >= 0; --i) {
            if (is_same_shape[i] == 1) {
                continue;
            }
            auto step = dst.step[i];
            auto *p_dst = dst.ptr<char>();
            for (int j = 0; j < cumulative_shape[i]; j++) {
                for (int k = 0; k < ptr_shape[i] - 1; k++) {
                    std::memcpy(p_dst + step, p_dst, step);
                    p_dst += step;
                }
                p_dst += step;
            }
        }
    }
}
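As an aside, and not part of this commit, here is a standalone sketch of the shape-compatibility rule that the checks above enforce; the helper name canBroadcast is made up for illustration:

```cpp
#include <vector>

// A source shape, right-aligned against the target and padded with leading 1s,
// is broadcastable when every axis either equals the target axis or equals 1.
static bool canBroadcast(const std::vector<int>& src, const std::vector<int>& dst) {
    if (src.size() > dst.size())
        return false;
    size_t offset = dst.size() - src.size();
    for (size_t i = 0; i < src.size(); ++i) {
        if (src[i] != 1 && src[i] != dst[i + offset])
            return false;
    }
    return true;
}

// canBroadcast({2, 1}, {4, 2, 3}) -> true  (src pads to {1, 2, 1})
// canBroadcast({2, 3}, {4, 2, 4}) -> false (3 vs 4 on the last axis)
```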

void rotate(InputArray _src, OutputArray _dst, int rotateMode)
{
    CV_Assert(_src.dims() <= 2);
@@ -2268,6 +2268,139 @@ INSTANTIATE_TEST_CASE_P(Arithm, FlipND, testing::Combine(
    testing::Values(perf::MatType(CV_8UC1), CV_32FC1)
));

TEST(BroadcastTo, basic) {
    std::vector<int> shape_src{2, 1};
    std::vector<int> data_src{1, 2};
    Mat src(static_cast<int>(shape_src.size()), shape_src.data(), CV_32SC1, data_src.data());

    auto get_index = [](const std::vector<int>& shape, size_t cnt) {
        std::vector<int> index(shape.size());
        size_t t = cnt;
        for (int i = static_cast<int>(shape.size() - 1); i >= 0; --i) {
            size_t idx = t / shape[i];
            index[i] = static_cast<int>(t - idx * shape[i]);
            t = idx;
        }
        return index;
    };
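    // Note (added for clarity, not in the original commit): get_index unravels
    // a flat row-major index, e.g. get_index({4, 2, 3}, 7) returns {1, 0, 1}.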

    auto fn_verify = [&get_index](const Mat& ref, const Mat& res) {
        // check type
        EXPECT_EQ(ref.type(), res.type());
        // check shape
        EXPECT_EQ(ref.dims, res.dims);
        for (int i = 0; i < ref.dims; ++i) {
            EXPECT_EQ(ref.size[i], res.size[i]);
        }
        // check value
        std::vector<int> shape{ref.size.p, ref.size.p + ref.dims};
        for (size_t i = 0; i < ref.total(); ++i) {
            auto index = get_index(shape, i);
            switch (ref.type()) {
                case CV_32SC1: {
                    ASSERT_EQ(ref.at<int>(index.data()), res.at<int>(index.data()));
                } break;
                case CV_8UC1: {
                    ASSERT_EQ(ref.at<uint8_t>(index.data()), res.at<uint8_t>(index.data()));
                } break;
                case CV_32FC1: {
                    ASSERT_EQ(ref.at<float>(index.data()), res.at<float>(index.data()));
                } break;
                default: FAIL() << "Unsupported type: " << ref.type();
            }
        }
    };

    {
        std::vector<int> shape{4, 2, 3};
        std::vector<int> data_ref{
            1, 1, 1, // [0, 0, :]
            2, 2, 2, // [0, 1, :]
            1, 1, 1, // [1, 0, :]
            2, 2, 2, // [1, 1, :]
            1, 1, 1, // [2, 0, :]
            2, 2, 2, // [2, 1, :]
            1, 1, 1, // [3, 0, :]
            2, 2, 2  // [3, 1, :]
        };
        Mat ref(static_cast<int>(shape.size()), shape.data(), src.type(), data_ref.data());
        Mat dst;
        broadcast(src, shape, dst);
        fn_verify(ref, dst);
    }

    {
        Mat _src;
        src.convertTo(_src, CV_8U);
        std::vector<int> shape{4, 2, 3};
        std::vector<uint8_t> data_ref{
            1, 1, 1, // [0, 0, :]
            2, 2, 2, // [0, 1, :]
            1, 1, 1, // [1, 0, :]
            2, 2, 2, // [1, 1, :]
            1, 1, 1, // [2, 0, :]
            2, 2, 2, // [2, 1, :]
            1, 1, 1, // [3, 0, :]
            2, 2, 2  // [3, 1, :]
        };
        Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
        Mat dst;
        broadcast(_src, shape, dst);
        fn_verify(ref, dst);
    }

    {
        Mat _src;
        src.convertTo(_src, CV_32F);
        std::vector<int> shape{1, 1, 2, 1}; // {2, 1}
        std::vector<float> data_ref{
            1.f, // [0, 0, 0, 0]
            2.f, // [0, 0, 1, 0]
        };
        Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
        Mat dst;
        broadcast(_src, shape, dst);
        fn_verify(ref, dst);
    }

    {
        std::vector<int> _shape_src{2, 3, 4};
        std::vector<float> _data_src{
            1.f, 2.f, 3.f, 4.f, // [0, 0, :]
            2.f, 3.f, 4.f, 5.f, // [0, 1, :]
            3.f, 4.f, 5.f, 6.f, // [0, 2, :]

            4.f, 5.f, 6.f, 7.f, // [1, 0, :]
            5.f, 6.f, 7.f, 8.f, // [1, 1, :]
            6.f, 7.f, 8.f, 9.f, // [1, 2, :]
        };
        Mat _src(static_cast<int>(_shape_src.size()), _shape_src.data(), CV_32FC1, _data_src.data());

        std::vector<int> shape{2, 1, 2, 3, 4};
        std::vector<float> data_ref{
            1.f, 2.f, 3.f, 4.f, // [0, 0, 0, 0, :]
            2.f, 3.f, 4.f, 5.f, // [0, 0, 0, 1, :]
            3.f, 4.f, 5.f, 6.f, // [0, 0, 0, 2, :]

            4.f, 5.f, 6.f, 7.f, // [0, 0, 1, 0, :]
            5.f, 6.f, 7.f, 8.f, // [0, 0, 1, 1, :]
            6.f, 7.f, 8.f, 9.f, // [0, 0, 1, 2, :]

            1.f, 2.f, 3.f, 4.f, // [1, 0, 0, 0, :]
            2.f, 3.f, 4.f, 5.f, // [1, 0, 0, 1, :]
            3.f, 4.f, 5.f, 6.f, // [1, 0, 0, 2, :]

            4.f, 5.f, 6.f, 7.f, // [1, 0, 1, 0, :]
            5.f, 6.f, 7.f, 8.f, // [1, 0, 1, 1, :]
            6.f, 7.f, 8.f, 9.f, // [1, 0, 1, 2, :]
        };
        Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
        Mat dst;
        broadcast(_src, shape, dst);
        fn_verify(ref, dst);
    }
}

TEST(Core_minMaxIdx, regression_9207_2)
{
    const int rows = 13;