mirror of
https://github.com/opencv/opencv.git
Merge pull request #15808 from thebhatman:Mish_swish
* Added Swish and Mish activations
* Fixed whitespace errors
* Kernel implementation done
* Added function for launching kernel
* Changed type of 1.0
* Attempt to add test for Swish and Mish
* Resolving type mismatch for log
* exp from device
* Use log1pexp instead of adding 1
* Added openCL kernels
This commit is contained in:
parent 01a28db949, commit 78c5e41c23
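For reference, the two activations added here compute swish(x) = x * sigmoid(x) = x / (1 + exp(-x)) and mish(x) = x * tanh(softplus(x)), with softplus(x) = log(1 + exp(x)). The scalar sketch below only illustrates that math; the helper names swish_ref/mish_ref are not part of the patch, and the actual CPU, CUDA and OpenCL implementations follow in the diff.

#include <cmath>

// Plain scalar reference forms of the activations introduced by this PR.
// swish(x) = x * sigmoid(x)
inline float swish_ref(float x) { return x / (1.0f + std::exp(-x)); }

// mish(x) = x * tanh(softplus(x)); naive form shown here — the CUDA kernels
// below use a stable log1pexp instead of log(1 + exp(x)) directly.
inline float mish_ref(float x) { return x * std::tanh(std::log1p(std::exp(x))); }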
@ -462,6 +462,18 @@ CV__DNN_INLINE_NS_BEGIN
    static Ptr<TanHLayer> create(const LayerParams &params);
};

class CV_EXPORTS SwishLayer : public ActivationLayer
{
public:
    static Ptr<SwishLayer> create(const LayerParams &params);
};

class CV_EXPORTS MishLayer : public ActivationLayer
{
public:
    static Ptr<MishLayer> create(const LayerParams &params);
};

class CV_EXPORTS SigmoidLayer : public ActivationLayer
{
public:
@ -62,6 +62,43 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
    }
}

template <class T, std::size_t N>
__global__ void swish_vec(Span<T> output, View<T> input) {
    using vector_type = get_vector_type_t<T, N>;

    auto output_vPtr = vector_type::get_pointer(output.data());
    auto input_vPtr = vector_type::get_pointer(input.data());

    for (auto i : grid_stride_range(output.size() / vector_type::size())) {
        vector_type vec;
        v_load(vec, input_vPtr[i]);
        for (int j = 0; j < vector_type::size(); j++) {
            using device::sigmoid;
            vec.data[j] = vec.data[j] * sigmoid(vec.data[j]);
        }
        v_store(output_vPtr[i], vec);
    }
}

template <class T, std::size_t N>
__global__ void mish_vec(Span<T> output, View<T> input) {
    using vector_type = get_vector_type_t<T, N>;

    auto output_vPtr = vector_type::get_pointer(output.data());
    auto input_vPtr = vector_type::get_pointer(input.data());

    for (auto i : grid_stride_range(output.size() / vector_type::size())) {
        vector_type vec;
        v_load(vec, input_vPtr[i]);
        for (int j = 0; j < vector_type::size(); j++) {
            using device::tanh;
            using device::log1pexp;
            vec.data[j] = vec.data[j] * tanh(log1pexp(vec.data[j]));
        }
        v_store(output_vPtr[i], vec);
    }
}

template <class T, std::size_t N>
__global__ void sigmoid_vec(Span<T> output, View<T> input) {
    using vector_type = get_vector_type_t<T, N>;
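The mish_vec kernel above calls device::log1pexp instead of evaluating log(1.0 + exp(x)) directly, which is what the commit message's "Use log1pexp instead of adding 1" refers to: for large x, exp(x) overflows even though log(1 + exp(x)) is simply close to x. A typical numerically stable formulation is sketched below; this is an assumption for illustration, not necessarily the exact definition of device::log1pexp in the CUDA backend.

#include <cmath>

// Stable softplus: log(1 + exp(x)) without overflowing exp() for large x.
inline float log1pexp(float x) {
    // x > 0:  log(1 + exp(x)) = x + log1p(exp(-x)), and exp(-x) cannot overflow
    // x <= 0: exp(x) <= 1, so log1p(exp(x)) is safe and accurate
    return x > 0.0f ? x + std::log1p(std::exp(-x)) : std::log1p(std::exp(x));
}
// mish(x) is then x * std::tanh(log1pexp(x)).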
@ -240,6 +277,58 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
template void tanh<float>(const Stream&, Span<float>, View<float>);

template <class T, std::size_t N>
void launch_vectorized_swish(const Stream& stream, Span<T> output, View<T> input) {
    CV_Assert(is_fully_aligned<T>(output, N));
    CV_Assert(is_fully_aligned<T>(input, N));

    auto kernel = raw::swish_vec<T, N>;
    auto policy = make_policy(kernel, output.size() / N, 0, stream);
    launch_kernel(kernel, policy, output, input);
}

template <class T>
void swish(const Stream& stream, Span<T> output, View<T> input) {
    CV_Assert(input.size() == output.size());

    if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
        launch_vectorized_swish<T, 4>(stream, output, input);
    } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
        launch_vectorized_swish<T, 2>(stream, output, input);
    } else {
        launch_vectorized_swish<T, 1>(stream, output, input);
    }
}

template void swish<__half>(const Stream&, Span<__half>, View<__half>);
template void swish<float>(const Stream&, Span<float>, View<float>);

template <class T, std::size_t N>
void launch_vectorized_mish(const Stream& stream, Span<T> output, View<T> input) {
    CV_Assert(is_fully_aligned<T>(output, N));
    CV_Assert(is_fully_aligned<T>(input, N));

    auto kernel = raw::mish_vec<T, N>;
    auto policy = make_policy(kernel, output.size() / N, 0, stream);
    launch_kernel(kernel, policy, output, input);
}

template <class T>
void mish(const Stream& stream, Span<T> output, View<T> input) {
    CV_Assert(input.size() == output.size());

    if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
        launch_vectorized_mish<T, 4>(stream, output, input);
    } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
        launch_vectorized_mish<T, 2>(stream, output, input);
    } else {
        launch_vectorized_mish<T, 1>(stream, output, input);
    }
}

template void mish<__half>(const Stream&, Span<__half>, View<__half>);
template void mish<float>(const Stream&, Span<float>, View<float>);

template <class T, std::size_t N>
void launch_vectorized_sigmoid(const Stream& stream, Span<T> output, View<T> input) {
    CV_Assert(is_fully_aligned<T>(output, N));
@ -18,6 +18,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void tanh(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void swish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void mish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void sigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
@ -143,6 +143,62 @@ namespace cv { namespace dnn { namespace cuda4dnn {
    csl::Stream stream;
};

template <class T>
class SwishOp final : public CUDABackendNode {
public:
    using wrapper_type = GetCUDABackendWrapperType<T>;

    SwishOp(csl::Stream stream_) : stream(std::move(stream_)) { }

    void forward(
        const std::vector<cv::Ptr<BackendWrapper>>& inputs,
        const std::vector<cv::Ptr<BackendWrapper>>& outputs,
        csl::Workspace& workspace) override
    {
        for (int i = 0; i < inputs.size(); i++)
        {
            auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            kernels::swish<T>(stream, output, input);
        }
    }

private:
    csl::Stream stream;
};

template <class T>
class MishOp final : public CUDABackendNode {
public:
    using wrapper_type = GetCUDABackendWrapperType<T>;

    MishOp(csl::Stream stream_) : stream(std::move(stream_)) { }

    void forward(
        const std::vector<cv::Ptr<BackendWrapper>>& inputs,
        const std::vector<cv::Ptr<BackendWrapper>>& outputs,
        csl::Workspace& workspace) override
    {
        for (int i = 0; i < inputs.size(); i++)
        {
            auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            kernels::mish<T>(stream, output, input);
        }
    }

private:
    csl::Stream stream;
};

template <class T>
class SigmoidOp final : public CUDABackendNode {
public:
@ -103,6 +103,8 @@ void initializeLayerFactory()
    CV_DNN_REGISTER_LAYER_CLASS(PReLU, ChannelsPReLULayer);
    CV_DNN_REGISTER_LAYER_CLASS(Sigmoid, SigmoidLayer);
    CV_DNN_REGISTER_LAYER_CLASS(TanH, TanHLayer);
    CV_DNN_REGISTER_LAYER_CLASS(Swish, SwishLayer);
    CV_DNN_REGISTER_LAYER_CLASS(Mish, MishLayer);
    CV_DNN_REGISTER_LAYER_CLASS(ELU, ELULayer);
    CV_DNN_REGISTER_LAYER_CLASS(BNLL, BNLLLayer);
    CV_DNN_REGISTER_LAYER_CLASS(AbsVal, AbsLayer);
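With the two CV_DNN_REGISTER_LAYER_CLASS entries above, "Swish" and "Mish" become resolvable type strings in the layer factory, which is how model importers instantiate them. A minimal sketch of that lookup (assuming the standard cv::dnn::LayerFactory API; the variable names are illustrative only):

#include <opencv2/dnn.hpp>

int main() {
    cv::dnn::LayerParams params;
    params.type = "Mish";
    params.name = "mish1";
    // Resolve the registered type string to a concrete layer instance,
    // the same lookup importers perform when they meet a "Mish" node.
    cv::Ptr<cv::dnn::Layer> layer =
        cv::dnn::LayerFactory::createLayerInstance(params.type, params);
    return layer.empty() ? 1 : 0;
}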
@ -613,6 +613,184 @@ struct TanHFunctor
    int64 getFLOPSPerElement() const { return 1; }
};

struct SwishFunctor
{
    typedef SwishLayer Layer;

    bool supportBackend(int backendId, int)
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               backendId == DNN_BACKEND_HALIDE;
    }

    void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
    {
        for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
        {
            for( int i = 0; i < len; i++ )
            {
                float x = srcptr[i];
                dstptr[i] = x / (1.0f + exp(-x));
            }
        }
    }

#ifdef HAVE_OPENCL
    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);
        String buildopt = oclGetTMacro(inputs[0]);

        for (size_t i = 0; i < inputs.size(); i++)
        {
            UMat& src = inputs[i];
            UMat& dst = outputs[i];

            ocl::Kernel kernel("SwishForward", ocl::dnn::activations_oclsrc, buildopt);
            kernel.set(0, (int)src.total());
            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));

            size_t gSize = src.total();
            CV_Assert(kernel.run(1, &gSize, NULL, false));
        }

        return true;
    }
#endif

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
    {
        return make_cuda_node<cuda4dnn::SwishOp>(target, stream);
    }
#endif

#ifdef HAVE_HALIDE
    void attachHalide(const Halide::Expr& input, Halide::Func& top)
    {
        Halide::Var x("x"), y("y"), c("c"), n("n");
        top(x, y, c, n) = input / (1.0f + exp(-input));
    }
#endif // HAVE_HALIDE

#ifdef HAVE_INF_ENGINE
    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
    {
        CV_Error(Error::StsNotImplemented, "");
    }
#endif // HAVE_INF_ENGINE

#ifdef HAVE_VULKAN
    std::shared_ptr<vkcom::OpBase> initVkCom()
    {
        // TODO: add vkcom implementation
        return std::shared_ptr<vkcom::OpBase>();
    }
#endif // HAVE_VULKAN

    bool tryFuse(Ptr<dnn::Layer>&) { return false; }

    void getScaleShift(Mat&, Mat&) const {}

    int64 getFLOPSPerElement() const { return 3; }
};

struct MishFunctor
{
    typedef MishLayer Layer;

    bool supportBackend(int backendId, int)
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               backendId == DNN_BACKEND_HALIDE;
    }

    void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
    {
        for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
        {
            for( int i = 0; i < len; i++ )
            {
                float x = srcptr[i];
                dstptr[i] = x * tanh(log(1.0f + exp(x)));
            }
        }
    }

#ifdef HAVE_OPENCL
    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);
        String buildopt = oclGetTMacro(inputs[0]);

        for (size_t i = 0; i < inputs.size(); i++)
        {
            UMat& src = inputs[i];
            UMat& dst = outputs[i];

            ocl::Kernel kernel("MishForward", ocl::dnn::activations_oclsrc, buildopt);
            kernel.set(0, (int)src.total());
            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));

            size_t gSize = src.total();
            CV_Assert(kernel.run(1, &gSize, NULL, false));
        }

        return true;
    }
#endif

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
    {
        return make_cuda_node<cuda4dnn::MishOp>(target, stream);
    }
#endif

#ifdef HAVE_HALIDE
    void attachHalide(const Halide::Expr& input, Halide::Func& top)
    {
        Halide::Var x("x"), y("y"), c("c"), n("n");
        top(x, y, c, n) = input * tanh(log(1.0f + exp(input)));
    }
#endif // HAVE_HALIDE

#ifdef HAVE_INF_ENGINE
    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
    {
        CV_Error(Error::StsNotImplemented, "");
    }
#endif // HAVE_INF_ENGINE

#ifdef HAVE_VULKAN
    std::shared_ptr<vkcom::OpBase> initVkCom()
    {
        // TODO: add vkcom implementation
        return std::shared_ptr<vkcom::OpBase>();
    }
#endif // HAVE_VULKAN

    bool tryFuse(Ptr<dnn::Layer>&) { return false; }

    void getScaleShift(Mat&, Mat&) const {}

    int64 getFLOPSPerElement() const { return 3; }
};

struct SigmoidFunctor
{
    typedef SigmoidLayer Layer;
@ -1292,6 +1470,22 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
    return l;
}

Ptr<SwishLayer> SwishLayer::create(const LayerParams& params)
{
    Ptr<SwishLayer> l(new ElementWiseLayer<SwishFunctor>());
    l->setParamsFrom(params);

    return l;
}

Ptr<MishLayer> MishLayer::create(const LayerParams& params)
{
    Ptr<MishLayer> l(new ElementWiseLayer<MishFunctor>());
    l->setParamsFrom(params);

    return l;
}

Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
{
    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
@ -95,6 +95,18 @@ __kernel void SigmoidForward(const int count, __global const T* in, __global T*
    out[index] = 1.0f / (1.0f + exp(-in[index]));
}

__kernel void SwishForward(const int count, __global const T* in, __global T* out) {
    int index = get_global_id(0);
    if(index < count)
        out[index] = in[index] / (1.0f + exp(-in[index]));
}

__kernel void MishForward(const int count, __global const T* in, __global T* out) {
    int index = get_global_id(0);
    if(index < count)
        out[index] = in[index] * tanh(log(1.0f + exp(in[index])));
}

__kernel void BNLLForward(const int n, __global const T* in, __global T* out) {
    int index = get_global_id(0);
    if (index < n) {
@ -583,7 +583,7 @@ TEST_P(NoParamActivation, Accuracy)
    testInPlaceActivation(lp, backendId, targetId);
}
INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, NoParamActivation, Combine(
-    /*type*/ Values("TanH", "Sigmoid", "AbsVal", "BNLL"),
+    /*type*/ Values("TanH", "Sigmoid", "AbsVal", "BNLL", "Swish", "Mish"),
    dnnBackendsAndTargetsWithHalide()
));
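Besides the parameterized Halide activation test above, the new layers can be exercised end to end by building a one-layer network by hand. A rough sketch, assuming the usual cv::dnn::Net API (the blob shape and fill value are arbitrary, chosen only to make the expected output easy to check):

#include <opencv2/dnn.hpp>
#include <iostream>
#include <vector>

int main() {
    cv::dnn::Net net;
    cv::dnn::LayerParams lp;
    lp.type = "Swish";   // or "Mish"
    lp.name = "swish1";
    net.addLayerToPrev(lp.name, lp.type, lp);

    // 1x3x4x4 blob filled with ones; swish(1) = 1 / (1 + e^-1) ~ 0.731
    cv::Mat input(std::vector<int>{1, 3, 4, 4}, CV_32F, cv::Scalar(1.0f));
    net.setInput(input);
    cv::Mat out = net.forward();
    std::cout << out.ptr<float>()[0] << std::endl;
    return 0;
}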