multi-threaded scatterND and refactor perf

2025-08-05 14:06:35 +08:00 · 2024-01-05 18:15:59 +08:00 · 2024-01-05 18:15:59 +08:00 · 2ed97b9ef3
commit 2ed97b9ef3
parent 2997b4c5fe
2 changed files with 123 additions and 123 deletions
--- a/modules/dnn/perf/perf_layer.cpp
+++ b/modules/dnn/perf/perf_layer.cpp
@ -324,12 +324,12 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, Combine(
                          /* withCann= */            false) // only test on CPU
 ));

-struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
-{
-    void test_layer(const std::vector<int>& shape, const String reduction = "none")
-    {
-        int backendId = get<0>(GetParam());
-        int targetId = get<1>(GetParam());
+using Layer_ScatterND = TestBaseWithParam<tuple<std::vector<int>, std::string, tuple<Backend, Target>>>;
+PERF_TEST_P_(Layer_ScatterND, scatterND) {
+    std::vector<int> shape = get<0>(GetParam());
+    std::string reduction = get<1>(GetParam());
+    int backend_id = get<0>(get<2>(GetParam()));
+    int target_id = get<1>(get<2>(GetParam()));

    std::vector<int> indices_shape(shape);
    indices_shape.push_back(int(shape.size()));
@ -337,10 +337,8 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
    Mat indices(indices_shape, CV_32FC1);
    Mat updates(shape, CV_32FC1);

-        Scalar mean = 0.f;
-        Scalar std = 1.f;
-        randn(data, mean, std);
-        randn(updates, mean, std);
+    randn(data, 0.f, 1.f);
+    randn(updates, 0.f, 1.f);

    // initialize the indices with index tuples like [0...N, 0...C, 0...H, 0...W]
    std::vector<int> current_index_tuple(shape.size());
@ -384,17 +382,14 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >

    // warmup
    {
-            std::vector<String> inpNames(3);
-            inpNames[0] = "data";
-            inpNames[1] = "indices";
-            inpNames[2] = "updates";
-            net.setInputsNames(inpNames);
-            net.setInput(data, inpNames[0]);
-            net.setInput(indices, inpNames[1]);
-            net.setInput(updates, inpNames[2]);
+        std::vector<String> input_names{"data", "indices", "updates"};
+        net.setInputsNames(input_names);
+        net.setInput(data, input_names[0]);
+        net.setInput(indices, input_names[1]);
+        net.setInput(updates, input_names[2]);

-            net.setPreferableBackend(backendId);
-            net.setPreferableTarget(targetId);
+        net.setPreferableBackend(backend_id);
+        net.setPreferableTarget(target_id);
        Mat out = net.forward();
    }

@ -406,21 +401,18 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
    SANITY_CHECK_NOTHING();
 }

-    int N = 8;
-    int C = 256;
-    int H = 128;
-    int W = 100;
-};
-
-PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND)
-{
-    test_layer({N, C, H ,W});
-}
-
-PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND_add)
-{
-    test_layer({N, C, H , W}, "add");
-}
+INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, Combine(
+    Values(std::vector<int>{2, 128, 64, 50}), // test shape (get<0>): used for updates; indices append one trailing dim
+    Values(std::string("none"), std::string("add")), // reduction modes under test (get<1>)
+    dnnBackendsAndTargets(/* withInferenceEngine= */ false,
+                          /* withHalide= */          false,
+                          /* withCpuOCV= */          true,
+                          /* withVkCom= */           false,
+                          /* withCUDA= */            false,
+                          /* withNgraph= */          false,
+                          /* withWebnn= */           false,
+                          /* withCann= */            false) // only test on CPU
+));

 struct Layer_LayerNorm : public TestBaseWithParam<tuple<Backend, Target> >
 {
@ -795,8 +787,6 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple
 #ifdef HAVE_CUDA
 INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA)));
 #endif
-// INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
-INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
 INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
 INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
 INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
--- a/modules/dnn/src/layers/scatterND_layer.cpp
+++ b/modules/dnn/src/layers/scatterND_layer.cpp
@ -89,49 +89,59 @@ public:
    // NOTE: This impl does not check whether indices have duplicate entries.
    //       The last duplicate entry will overwrite the previous.
    template<typename T, typename Functor>
-    void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out)
-    {
-        data.copyTo(out);
+    void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat& output_mat) {
+        input_mat.copyTo(output_mat);

-        const int* shape = data.size.p;
-        const size_t* step = data.step.p;
+        const auto &input_mat_shape = shape(input_mat);
+        std::vector<size_t> input_mat_step(input_mat_shape.size());
+        for (int i = 0; i < input_mat.dims; i++) {
+            input_mat_step[i] = static_cast<size_t>(input_mat.step.p[i] / sizeof(T));
+        }

-        const int ind_ndims = indices.dims;
-        const int* ind_shape = indices.size.p;
-        const T* p_indices = indices.ptr<const T>();
+        const int indices_mat_ndims = indices_mat.dims;
+        const auto &indices_mat_shape = shape(indices_mat);

-        const int upd_ndims = updates.dims;
-        const int* upd_shape = updates.size.p;
-        const T* p_updates = updates.ptr<const T>();
+        const int updates_mat_ndims = updates_mat.dims;
+        const auto &updates_mat_shape = shape(updates_mat);

-        T* p_out = out.ptr<T>();
-
-        int k = ind_shape[ind_ndims - 1]; // last dim of indices
-        size_t total = (size_t)(indices.total() / k);
+        int indices_last_dim = indices_mat_shape[indices_mat_ndims - 1]; // last dim of indices

        size_t updates_size = 1;
-        for (int i = ind_ndims - 1; i < upd_ndims; i++)
-            updates_size *= upd_shape[i];
+        for (int i = indices_mat_ndims - 1; i < updates_mat_ndims; i++)
+            updates_size *= updates_mat_shape[i];

-        size_t inp_start_offset = 0;
-        size_t ind_start_offset = 0;
-        size_t upd_start_offset = 0;
-        for (size_t i = 0; i < total; i++, ind_start_offset += k, upd_start_offset += updates_size)
-        {
-            const T* tmp_p_indices = p_indices + ind_start_offset;
-            inp_start_offset = 0;
-            for (int j = 0; j < k; j++)
-            {
-                CV_Assert(tmp_p_indices[j] < shape[j] && tmp_p_indices[j] > -shape[j]);
-                inp_start_offset += (((int)tmp_p_indices[j] + shape[j]) % shape[j]) * step[j];
-            }
-            inp_start_offset /= sizeof(T);
+        auto fn = [&](const Range &r) { // scatters index tuples [r.start, r.end); passed to parallel_for_ below
+            size_t input_offset = 0,
+                   indices_offset = static_cast<size_t>(r.start) * indices_last_dim, // widen first: int * int could overflow
+                   updates_offset = static_cast<size_t>(r.start) * updates_size;
+            for (int i = r.start; i < r.end; i++) {
+                const T* indices = indices_mat.ptr<const T>();
+                const T* updates = updates_mat.ptr<const T>();
+                T* output = output_mat.ptr<T>();

-            const T* tmp_p_updates = p_updates + upd_start_offset;
-            T* tmp_p_out = p_out + inp_start_offset;
-            for (int j = 0; j < updates_size; j++)
-                tmp_p_out[j] = rd(tmp_p_out[j], tmp_p_updates[j]);
+                input_offset = 0;
+                indices += indices_offset;
+                for (int j = 0; j < indices_last_dim; j++) {
+                    const int dim = input_mat_shape[j];
+                    const int index = static_cast<int>(*(indices + j));
+                    CV_Assert(index >= -dim && index < dim); // validate the raw index: wrapping first would hide index >= dim
+                    input_offset += ((index + dim) % dim) * input_mat_step[j]; // negative index counts from the end of the axis
                }
+
+                updates += updates_offset;
+                output += input_offset;
+                for (size_t j = 0; j < updates_size; j++) {
+                    output[j] = reduce_operation(output[j], updates[j]);
+                }
+
+                indices_offset += indices_last_dim;
+                updates_offset += updates_size;
+            }
+        };
+
+        size_t total = (size_t)(indices_mat.total() / indices_last_dim);
+        double nstripes = (size_t)total * (indices_last_dim + updates_size) * (1 / 1024.0);
+        parallel_for_(Range(0, total), fn, nstripes);
    }

    template<typename... Args>