mirror of https://github.com/opencv/opencv.git
Merge pull request #25147 from fengyuentau:dnn/elementwise_layers/speedup
* added v_erf and implemented gelu acceleration via vectorization
* remove anonymous v_erf and use v_erf from intrin_math
* enable perf for ov and cuda backend
This commit is contained in:
parent 31b308f882
commit e3858cc5a3
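For context: the operation being accelerated is the erf-based GELU activation, GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))). A minimal scalar reference (an illustration, not part of the commit, assuming only <cmath>):

    #include <cmath>

    // Scalar reference for the erf-based GELU that the diff below vectorizes.
    // 0.7071067811865476f is 1/sqrt(2), i.e. M_SQRT1_2 written out as a literal.
    inline float gelu_ref(float x) {
        return 0.5f * x * (1.0f + std::erf(x * 0.7071067811865476f));
    }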
@@ -975,4 +975,49 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Softmax, Combine(
                           /* withCann= */ false) // only test on CPU
 ));
 
+using Layer_Elementwise = TestBaseWithParam<tuple<std::vector<int>, std::string, tuple<Backend, Target>>>;
+PERF_TEST_P_(Layer_Elementwise, elementwise) {
+    std::vector<int> input_shape = get<0>(GetParam());
+    std::string op = get<1>(GetParam());
+    int backend_id = get<0>(get<2>(GetParam()));
+    int target_id = get<1>(get<2>(GetParam()));
+
+    Mat input(input_shape, CV_32F);
+    randn(input, 0.f, 1.f);
+
+    LayerParams lp;
+    lp.type = op;
+    lp.name = "TestLayer";
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+
+    // Warmup
+    {
+        net.setInput(input);
+        net.setPreferableBackend(backend_id);
+        net.setPreferableTarget(target_id);
+        Mat out = net.forward();
+    }
+
+    TEST_CYCLE() {
+        net.forward();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Elementwise, testing::Combine(
+    testing::Values(std::vector<int>{1, 50, 3072}),
+    testing::Values(std::string{"Gelu"}),
+    dnnBackendsAndTargets(/* withInferenceEngine= */ true,
+                          /* withHalide= */ false,
+                          /* withCpuOCV= */ true,
+                          /* withVkCom= */ false,
+                          /* withCUDA= */ true,
+                          /* withNgraph= */ true,
+                          /* withWebnn= */ false,
+                          /* withCann= */ false)
+));
 
 } // namespace
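The benchmark above drives a single standalone layer through Net::addLayerToPrev. The same setup can be reproduced outside the perf harness; a minimal sketch (an illustration, not part of the commit) timing the default backend with cv::TickMeter:

    #include <opencv2/dnn.hpp>
    #include <opencv2/core/utility.hpp>
    #include <iostream>

    int main() {
        cv::dnn::LayerParams lp;
        lp.type = "Gelu";
        lp.name = "TestLayer";

        cv::dnn::Net net;
        net.addLayerToPrev(lp.name, lp.type, lp);

        // Same shape as the perf test: 1 x 50 x 3072, standard-normal values.
        cv::Mat input(std::vector<int>{1, 50, 3072}, CV_32F);
        cv::randn(input, cv::Scalar(0.f), cv::Scalar(1.f));

        net.setInput(input);
        net.forward(); // warmup: triggers allocation and backend initialization

        cv::TickMeter tm;
        tm.start();
        for (int i = 0; i < 100; i++)
            net.forward();
        tm.stop();
        std::cout << tm.getTimeMilli() / 100.0 << " ms per forward" << std::endl;
        return 0;
    }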
@@ -813,20 +813,82 @@ private:
     static const char* const ocl_kernel_name;
 };
 
-struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
-{
-    typedef GeluLayer Layer;
-
-    explicit GeluFunctor() {}
+namespace {
+    // Refer to v_erf in modules/core/include/opencv2/core/hal/intrin_math.hpp
+    constexpr float c_erf_coef0 = 0.3275911f;
+    constexpr float c_erf_coef1 = 1.061405429f;
+    constexpr float c_erf_coef2 = -1.453152027f;
+    constexpr float c_erf_coef3 = 1.421413741f;
+    constexpr float c_erf_coef4 = -0.284496736f;
+    constexpr float c_erf_coef5 = 0.254829592f;
 
-    bool supportBackend(int backendId, int)
-    {
-        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+    inline float erf_approx(float v) {
+        float t = 1.f / fmaf(fabsf(v), c_erf_coef0, 1.f);
+        float r = fmaf(c_erf_coef1, t, c_erf_coef2);
+        r = fmaf(r, t, c_erf_coef3);
+        r = fmaf(r, t, c_erf_coef4);
+        r = fmaf(r, t, c_erf_coef5);
+        r = 1.f - r * t * expf(-v * v);
+        return std::copysignf(r, v);
     }
+}
 
-    inline float calculate(float x) const
-    {
-        return 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
+struct GeluFunctor : public BaseFunctor {
+    using Layer = GeluLayer;
+    int vlanes;
+
+    explicit GeluFunctor() {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        vlanes = VTraits<v_float32>::vlanes();
+#else
+        vlanes = 1;
+#endif
+    }
+
+    bool supportBackend(int backendId, int) {
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
+    }
+
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
+        CV_UNUSED(stripeStart);
+        for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
+            int i = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            // 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
+            v_float32 half = vx_setall_f32(0.5f),
+                      one = vx_setall_f32(1.0f),
+                      reciprocal_sqrt2 = vx_setall_f32(M_SQRT1_2);
+            for (; i <= len - vlanes; i += vlanes) {
+                if (i + vlanes > len) {
+                    if (i == 0 || i == len) {
+                        break;
+                    }
+                    i = len - vlanes;
+                }
+                v_float32 x0 = vx_load(srcptr + i);
+
+                // t = x * M_SQRT1_2
+                v_float32 t0 = v_mul(reciprocal_sqrt2, x0);
+
+                // t = 1.0f + erf(t)
+                t0 = v_add(one, v_erf(t0));
+
+                // x = 0.5 * x
+                x0 = v_mul(half, x0);
+
+                // x = x * t
+                x0 = v_mul(x0, t0);
+
+                vx_store(dstptr + i, x0);
+            }
+#endif
+            // 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
+            for (; i < len; i++) {
+                float x = srcptr[i];
+                dstptr[i] = 0.5f * x * (1.0f + erf_approx(x * M_SQRT1_2));
+            }
+        }
     }
 
 #ifdef HAVE_CUDA
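The scalar erf_approx above mirrors v_erf from intrin_math.hpp: both evaluate the classic Abramowitz-Stegun rational approximation 7.1.26,

    erf(x) ~= 1 - (a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5) * exp(-x^2),  t = 1 / (1 + p*|x|),

with p = 0.3275911 and a1..a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429; the maximum absolute error is about 1.5e-7, and the sign is restored with copysignf. The coefficients are stored highest-order first (c_erf_coef1 = a5 through c_erf_coef5 = a1), so the fmaf chain is a Horner evaluation of the polynomial. Two observations on the SIMD loop: the tail-handling branch (i + vlanes > len) appears unreachable under the loop condition i <= len - vlanes, so leftover elements fall through to the scalar loop instead; and trading libm's erf for these polynomial approximations is presumably what drives the relaxed gelu tolerances (roughly 1e-05 to 1e-04) in the conformance hunks below.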
@@ -836,12 +898,55 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
     }
 #endif
 
+#ifdef HAVE_OPENCL
+    bool initKernel(ocl::Kernel &ker, const UMat &src) const
+    {
+        String buildopt = oclGetTMacro(src);
+
+        if (!ker.create("GeluForward", ocl::dnn::activations_oclsrc, buildopt))
+            return false;
+
+        return true;
+    }
+
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            UMat& src = inputs[i];
+            UMat& dst = outputs[i];
+            CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset);
+
+            ocl::Kernel kernel;
+            CV_Assert(initKernel(kernel, src));
+            kernel.set(0, (int)src.total());
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
+            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+
+            size_t gSize = src.total();
+            CV_Assert(kernel.run(1, &gSize, NULL, false));
+        }
+
+        return true;
+    }
+#endif
+
+#ifdef HAVE_DNN_NGRAPH
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
+    {
+        return std::make_shared<ov::op::v0::Gelu>(node);
+    }
+#endif // HAVE_DNN_NGRAPH
+
     int64 getFLOPSPerElement() const { return 100; }
 };
 
-template<>
-const char* const BaseDefaultFunctor<GeluFunctor>::ocl_kernel_name = "GeluForward";
-
 namespace GeluApproximationConstants
 {
     static constexpr float sqrt_2_pi = 0.7978845834732056f;
@@ -48,6 +48,10 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 
+#if !defined(M_SQRT1_2)
+#define M_SQRT1_2 0.707106781186547524400844362104849039 /* 1/sqrt(2) */
+#endif
+
 __kernel void ReLUForward(const int count, __global const T* in, __global T* out
 #ifndef RELU_NO_SLOPE
                           , KERNEL_ARG_DTYPE negative_slope
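The hunk above only adds the M_SQRT1_2 constant; the GeluForward kernel body that initKernel loads from activations_oclsrc lies outside the visible context. Based on the three arguments bound in applyOCL (element count, input pointer, output pointer), a plausible sketch in the style of the surrounding file (an assumption, not the committed kernel):

    __kernel void GeluForward(const int count, __global const T* in, __global T* out)
    {
        int index = get_global_id(0);
        if (index < count)
        {
            T x = in[index];
            // 0.5 * x * (1 + erf(x / sqrt(2))); erf is a built-in OpenCL function
            out[index] = (T)0.5f * x * ((T)1.f + erf(x * (T)M_SQRT1_2));
        }
    }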
@@ -282,6 +282,14 @@ static const TestCase testConformanceConfig[] = {
     {"test_gathernd_example_float32", 2, 1},
     {"test_gathernd_example_int32", 2, 1},
     {"test_gathernd_example_int32_batch_dim1", 2, 1},
+    {"test_gelu_default_1", 1, 1},
+    {"test_gelu_default_1_expanded", 1, 1},
+    {"test_gelu_default_2", 1, 1},
+    {"test_gelu_default_2_expanded", 1, 1},
+    {"test_gelu_tanh_1", 1, 1},
+    {"test_gelu_tanh_1_expanded", 1, 1},
+    {"test_gelu_tanh_2", 1, 1},
+    {"test_gelu_tanh_2_expanded", 1, 1},
     {"test_gemm_all_attributes", 3, 1},
     {"test_gemm_alpha", 3, 1},
     {"test_gemm_beta", 3, 1},
@@ -1123,6 +1131,19 @@ TEST_P(Test_ONNX_conformance, Layer_Test)
         if (name == "test_pow") {
             default_lInf = 0.00013; // Expected: (normInf) <= (lInf), actual: 0.00012207 vs 0.0001
         }
+        if (name == "test_gelu_tanh_1") {
+            default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
+            default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
+        }
+        if (name == "test_gelu_tanh_2") {
+            if (target == DNN_TARGET_OPENCL_FP16) {
+                default_l1 = 0.00016; // Expected: (normL1) <= (l1), actual: 0.000157223 vs 9e-05
+                default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00153041 vs 0.0005
+            } else {
+                default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05
+                default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
+            }
+        }
     }
 #ifdef HAVE_HALIDE
     else if (backend == DNN_BACKEND_HALIDE)
@@ -1146,6 +1167,15 @@ TEST_P(Test_ONNX_conformance, Layer_Test)
         {
             applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
         }
+
+        if (name == "test_gelu_tanh_1") {
+            default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
+            default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
+        }
+        if (name == "test_gelu_tanh_2") {
+            default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05
+            default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
+        }
     }
 #endif
 #ifdef HAVE_CUDA
@@ -1159,6 +1189,20 @@ TEST_P(Test_ONNX_conformance, Layer_Test)
         {
             applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
         }
+
+        if (name == "test_gelu_tanh_1") {
+            default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101815 vs 1e-05
+            default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152737 vs 0.0001
+        }
+        if (name == "test_gelu_tanh_2") {
+            if (target == DNN_TARGET_CUDA_FP16) {
+                default_l1 = 0.00023; // Expected: (normL1) <= (l1), actual: 0.000220591 vs 9e-05
+                default_lInf = 0.0023; // Expected: (normInf) <= (lInf), actual: 0.00220466 vs 0.0005
+            } else {
+                default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80127e-05 vs 1e-05
+                default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455445 vs 0.0001
+            }
+        }
     }
 #endif
     else
@@ -688,6 +688,36 @@ CASE(test_gathernd_example_int32)
     // no filter
 CASE(test_gathernd_example_int32_batch_dim1)
     // no filter
+CASE(test_gelu_default_1)
+    // no filter
+CASE(test_gelu_default_1_expanded)
+    // no filter
+CASE(test_gelu_default_2)
+    // no filter
+CASE(test_gelu_default_2_expanded)
+    // no filter
+CASE(test_gelu_tanh_1)
+    if (target == DNN_TARGET_CPU) {
+        default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
+        default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
+    }
+    if (target == DNN_TARGET_OPENCL) {
+        default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101815 vs 1e-05
+        default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152737 vs 0.0001
+    }
+CASE(test_gelu_tanh_1_expanded)
+    // no filter
+CASE(test_gelu_tanh_2)
+    if (target == DNN_TARGET_CPU) {
+        default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80057e-05 vs 1e-05
+        default_lInf = 0.00046; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
+    }
+    if (target == DNN_TARGET_OPENCL) {
+        default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80144e-05 vs 1e-05
+        default_lInf = 0.00046; // Expected: (normInf) <= (lInf), actual: 0.000455445 vs 0.0001
+    }
+CASE(test_gelu_tanh_2_expanded)
+    // no filter
 CASE(test_gemm_all_attributes)
     // no filter
 CASE(test_gemm_alpha)
@@ -117,6 +117,10 @@
     "test_gathernd_example_float32",
     "test_gathernd_example_int32",
     "test_gathernd_example_int32_batch_dim1",
+    "test_gelu_default_1_expanded", // parser: no corresponding layer for CastLike
+    "test_gelu_default_2_expanded", // parser: no corresponding layer for CastLike
+    "test_gelu_tanh_1_expanded", // parser: no corresponding layer for CastLike
+    "test_gelu_tanh_2_expanded", // parser: no corresponding layer for CastLike
     "test_gemm_all_attributes",
     "test_gemm_alpha",
     "test_gemm_beta",
@@ -3149,6 +3149,15 @@ TEST_P(Test_ONNX_nets, ViT_B_32) {
         l1 = 0.008;
         lInf = 0.04;
     }
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        if (target == DNN_TARGET_CPU) {
+            l1 = 4.4e-5; // Expected: (normL1) <= (l1), actual: 4.31208e-05 vs 1e-05
+            lInf = 0.0002; // Expected: (normInf) <= (lInf), actual: 0.000194907 vs 0.0001
+        } else if (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16) {
+            l1 = 0.0092; // Expected: (normL1) <= (l1), actual: 0.00918349 vs 4.4e-05
+            lInf = 0.056; // Expected: (normInf) <= (lInf), actual: 0.0556431 vs 0.0002
+        }
+    }
 
     normAssert(ref, out, "ViTB_32", l1, lInf);
 }