Merge pull request #24080 from dkurt:dnn_cuda_layers

Resolve uncovered CUDA dnn layer #24080

### Pull Request Readiness Checklist

* Gelu activation layer on CUDA
* Relax Gemm import from ONNX (set `is_matmul` only for weights with more than two dimensions)

resolves https://github.com/opencv/opencv/issues/24064
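A minimal sketch of how the new CUDA path can be exercised from the DNN API (the model file name and input shape are placeholders, and an OpenCV build with CUDA support is assumed):

```cpp
#include <opencv2/dnn.hpp>

int main()
{
    // Placeholder: any ONNX model containing a Gelu node (and/or a Gemm with 2-D weights).
    cv::dnn::Net net = cv::dnn::readNetFromONNX("model_with_gelu.onnx");

    // With this patch, Gelu no longer forces a fallback from the CUDA backend.
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);

    cv::Mat input = cv::Mat::zeros(1, 128, CV_32F);  // placeholder input
    net.setInput(input);
    cv::Mat out = net.forward();
    return 0;
}
```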

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Dmitry Kurtaev 2023-08-03 09:13:42 +03:00 committed by GitHub
parent 0245c0cd10
commit 96f23e3da1
9 changed files with 123 additions and 5 deletions


@@ -643,4 +643,69 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(D
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
typedef TestBaseWithParam<tuple<Vec4i, int, bool, tuple<Backend, Target> > > Layer_FullyConnected;
PERF_TEST_P_(Layer_FullyConnected, fc)
{
std::vector<int> inpShape;
inpShape.reserve(4);
for (int i = 0; i < 4; ++i) {
int dim = get<0>(GetParam())[i];
if (dim == 0)
break;
inpShape.push_back(dim);
}
Mat input(inpShape, CV_32F);
randn(input, 0, 1);
int axis = input.dims - 1;
int outDims = get<1>(GetParam());
bool isMatMul = get<2>(GetParam());
int backendId = get<0>(get<3>(GetParam()));
int targetId = get<1>(get<3>(GetParam()));
std::vector<int> weightShape;
if (isMatMul) {
weightShape = inpShape;
weightShape[weightShape.size() - 2] = outDims;
} else {
weightShape = {outDims, (int)input.total(axis, input.dims)};
}
Mat weights(weightShape, CV_32F);
randn(weights, 0, 1);
LayerParams lp;
lp.set("axis", input.dims - 1);
lp.set("is_matmul", weights.dims > 2);
lp.set("bias_term", false);
lp.set("transB", true);
lp.set("num_output", (int)weights.total(0, weights.dims - 1));
lp.blobs.resize(1, weights);
Net net;
net.addLayerToPrev("matmul", "InnerProduct", lp);
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
// warmup
Mat output = net.forward();
TEST_CYCLE()
{
net.forward();
}
SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_FullyConnected, Combine(
Values( // input size
Vec4i(5, 512, 384),
Vec4i(5, 16, 512, 128)
),
Values(256, 512, 1024), // output dimension
testing::Bool(), // is_matmul
dnnBackendsAndTargets()
));
} // namespace
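As a worked example of the shapes exercised by the `isMatMul` branch above (taking `Vec4i(5, 16, 512, 128)` with `outDims = 256`), here is a small sketch of the weight-shape rule; the helper name is only for illustration:

```cpp
#include <vector>

// Mirrors the isMatMul branch of the perf test: keep the leading (batch) dims
// of the input and replace the second-to-last dimension with outDims.
static std::vector<int> matmulWeightShape(std::vector<int> inpShape, int outDims)
{
    inpShape[inpShape.size() - 2] = outDims;
    return inpShape;
}

// matmulWeightShape({5, 16, 512, 128}, 256) -> {5, 16, 256, 128}; with
// transB = true the layer computes
//   (5 x 16 x 512 x 128) * (5 x 16 x 256 x 128)^T  ->  5 x 16 x 512 x 256.
```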


@@ -248,6 +248,11 @@ void selu(const Stream& stream, Span<T> output, View<T> input, T alpha, T gamma)
generic_op<T, SeluFunctor<T>>(stream, output, input, {alpha, gamma});
}
template <class T>
void gelu(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, GeluFunctor<T>>(stream, output, input);
}
template <class T>
void sign(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, SignFunctor<T>>(stream, output, input);
@@ -324,6 +329,7 @@ template void tan<__half>(const Stream&, Span<__half>, View<__half>);
template void celu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void hardsigmoid<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void selu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void gelu<__half>(const Stream&, Span<__half>, View<__half>);
template void thresholdedrelu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
@@ -366,6 +372,7 @@ template void tan<float>(const Stream&, Span<float>, View<float>);
template void celu<float>(const Stream&, Span<float>, View<float>, float);
template void hardsigmoid<float>(const Stream&, Span<float>, View<float>, float, float);
template void selu<float>(const Stream&, Span<float>, View<float>, float, float);
template void gelu<float>(const Stream&, Span<float>, View<float>);
template void thresholdedrelu<float>(const Stream&, Span<float>, View<float>, float);
template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
template void exp<float>(const Stream&, Span<float>, View<float>, float, float);


@@ -588,6 +588,21 @@ struct SeluFunctor {
T alpha, gamma;
};
template <class T>
struct GeluFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE GeluFunctor() { }
CUDA4DNN_DEVICE GeluFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::erf;
return static_cast<T>(0.5f) * value * (static_cast<T>(1.f) + erf(value * static_cast<T>(M_SQRT1_2)));
}
};
template <class T>
struct ThresholdedReluFunctor {
struct Params {

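For reference, the functor above computes the exact (erf-based) form of GELU, the same expression used by the existing CPU implementation further down:

```latex
\operatorname{GELU}(x) = x\,\Phi(x) = \frac{x}{2}\left(1 + \operatorname{erf}\!\left(\frac{x}{\sqrt{2}}\right)\right)
```

where \Phi is the standard normal CDF and 1/\sqrt{2} corresponds to the `M_SQRT1_2` constant in the code.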

@@ -114,6 +114,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void selu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T gamma);
template <class T>
void gelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
template <class T>
void thresholdedrelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha);


@@ -537,6 +537,20 @@ namespace cv { namespace dnn { namespace cuda4dnn {
const T alpha, gamma;
};
template <class T>
class GeluOp final : public BaseOp<GeluOp, T> {
public:
GeluOp(csl::Stream stream_) : stream(std::move(stream_)) { }
void calculate(csl::TensorSpan<T> output, csl::TensorView<T> input) const
{
kernels::gelu<T>(stream, output, input);
}
private:
csl::Stream stream;
};
template <class T>
class ThresholdedReluOp final : public BaseOp<ThresholdedReluOp, T> {
public:


@@ -821,7 +821,7 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
bool supportBackend(int backendId, int)
{
- return backendId == DNN_BACKEND_OPENCV;
+ return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
}
inline float calculate(float x) const
@@ -829,6 +829,13 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
return 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
{
return make_cuda_node<cuda4dnn::GeluOp>(target, stream);
}
#endif
int64 getFLOPSPerElement() const { return 100; }
};
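A hedged sketch of a CPU-vs-CUDA consistency check for this new path; it assumes the activation is registered under the layer type name `Gelu` and that OpenCV is built with CUDA:

```cpp
#include <opencv2/dnn.hpp>

void checkGeluCudaMatchesCpu()
{
    cv::dnn::LayerParams lp;
    cv::dnn::Net net;
    net.addLayerToPrev("gelu", "Gelu", lp);  // assumed layer type name

    cv::Mat input(1, 256, CV_32F);
    cv::randn(input, 0, 1);
    net.setInput(input);

    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    cv::Mat ref = net.forward().clone();

    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
    cv::Mat out = net.forward().clone();

    CV_Assert(cv::norm(ref, out, cv::NORM_INF) < 1e-4);
}
```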


@@ -630,8 +630,10 @@ public:
if(input_wrapper->getRank() == inp2Dim)
return make_cuda_node<cuda4dnn::MatMulOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), oriMat, biasMat_, transA, transB);
- else
+ else {
+ CV_LOG_INFO(NULL, "DNN/CUDA: no implementation for MatMul with rank " << input_wrapper->getRank());
return Ptr<BackendNode>();
+ }
}
auto flatten_start_axis = normalize_axis(axis, input_wrapper->getRank());

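Minor note on the new `CV_LOG_INFO` diagnostic above: it is only printed when the active OpenCV log level includes INFO messages. A quick, hedged way to surface it when investigating an unexpected CPU fallback (call before `net.forward()`):

```cpp
#include <opencv2/core/utils/logger.hpp>

// Raise the global log level so DNN/CUDA INFO diagnostics become visible.
cv::utils::logging::setLogLevel(cv::utils::logging::LOG_LEVEL_INFO);
```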

@@ -1965,9 +1965,11 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
}
int transB = layerParams.get<int>("transB", 0);
int secondInpDims;
if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
{
Mat weights = getBlob(node_proto, 1);
secondInpDims = weights.dims;
if (transA == 0) // optimized branch; for now, we can only optimize the Gemm when transA == 0.
{
@@ -1993,7 +1995,10 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
}
}
else
{
layerParams.set("transB", transB == 1);
secondInpDims = outShapes[node_proto.input(1)].size();
}
if (node_proto.input_size() == 3)
{
@@ -2002,7 +2007,7 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
}
layerParams.set("bias_term", node_proto.input_size() == 3);
layerParams.set("is_matmul", true);
layerParams.set("is_matmul", secondInpDims > 2);
addLayer(layerParams, node_proto);
}
@@ -2045,7 +2050,7 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
layerParams.blobs.push_back(transBlob);
int numOutput = layerParams.blobs[0].total(0, secondInpDims - 1);
layerParams.set("num_output", numOutput);
layerParams.set("is_matmul", true);
layerParams.set("is_matmul", secondInpDims > 2);
} else
secondInpDims = outShapes[node_proto.input(1)].size();

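Net effect of the importer changes above: `is_matmul` is now set only when the second Gemm/MatMul input has more than two dimensions, so a Gemm with a plain 2-D weight matrix is imported as an ordinary fully-connected layer. A hedged sketch of the equivalent layer configuration, mirroring the perf test at the top of this commit (shapes and names are placeholders):

```cpp
#include <opencv2/dnn.hpp>

void buildGemmAsInnerProduct()
{
    // Placeholder 2-D weight matrix playing the role of the constant Gemm input B.
    cv::Mat B(256, 128, CV_32F);
    cv::randn(B, 0, 1);

    cv::dnn::LayerParams lp;
    lp.set("bias_term", false);
    lp.set("transB", true);          // B is stored as (num_output x K)
    lp.set("num_output", B.size[0]);
    lp.set("is_matmul", false);      // 2-D weights: ordinary fully connected
    lp.blobs.push_back(B);

    cv::dnn::Net net;
    net.addLayerToPrev("gemm", "InnerProduct", lp);
}
```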

@@ -102,7 +102,7 @@ public:
netSoftmax.setInput(ref);
ref = netSoftmax.forward();
}
normAssert(ref, out, "", l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
normAssert(ref, out, basename.c_str(), l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
if (checkNoFallbacks)
expectNoFallbacksFromIE(net);
}