diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index f5af7242b0..b63c966b40 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -462,6 +462,18 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<TanHLayer> create(const LayerParams &params);
     };
 
+    class CV_EXPORTS SwishLayer : public ActivationLayer
+    {
+    public:
+        static Ptr<SwishLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS MishLayer : public ActivationLayer
+    {
+    public:
+        static Ptr<MishLayer> create(const LayerParams &params);
+    };
+
     class CV_EXPORTS SigmoidLayer : public ActivationLayer
     {
     public:
diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu
index 344ef79977..dfba54e933 100644
--- a/modules/dnn/src/cuda/activations.cu
+++ b/modules/dnn/src/cuda/activations.cu
@@ -62,6 +62,43 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
            }
        }
 
+        template <class T, std::size_t N>
+        __global__ void swish_vec(Span<T> output, View<T> input) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            auto output_vPtr = vector_type::get_pointer(output.data());
+            auto input_vPtr = vector_type::get_pointer(input.data());
+
+            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+                vector_type vec;
+                v_load(vec, input_vPtr[i]);
+                for (int j = 0; j < vector_type::size(); j++) {
+                    using device::sigmoid;
+                    vec.data[j] = vec.data[j] * sigmoid(vec.data[j]);
+                }
+                v_store(output_vPtr[i], vec);
+            }
+        }
+
+        template <class T, std::size_t N>
+        __global__ void mish_vec(Span<T> output, View<T> input) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            auto output_vPtr = vector_type::get_pointer(output.data());
+            auto input_vPtr = vector_type::get_pointer(input.data());
+
+            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+                vector_type vec;
+                v_load(vec, input_vPtr[i]);
+                for (int j = 0; j < vector_type::size(); j++) {
+                    using device::tanh;
+                    using device::log1pexp;
+                    vec.data[j] = vec.data[j] * tanh(log1pexp(vec.data[j]));
+                }
+                v_store(output_vPtr[i], vec);
+            }
+        }
+
         template <class T, std::size_t N>
         __global__ void sigmoid_vec(Span<T> output, View<T> input) {
             using vector_type = get_vector_type_t<T, N>;
@@ -240,6 +277,58 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
     template void tanh<float>(const Stream&, Span<float>, View<float>);
 
+    template <class T, std::size_t N>
+    void launch_vectorized_swish(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto kernel = raw::swish_vec<T, N>;
+        auto policy = make_policy(kernel, output.size() / N, 0, stream);
+        launch_kernel(kernel, policy, output, input);
+    }
+
+    template <class T>
+    void swish(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_swish<T, 4>(stream, output, input);
+        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_swish<T, 2>(stream, output, input);
+        } else {
+            launch_vectorized_swish<T, 1>(stream, output, input);
+        }
+    }
+
+    template void swish<__half>(const Stream&, Span<__half>, View<__half>);
+    template void swish<float>(const Stream&, Span<float>, View<float>);
+
+    template <class T, std::size_t N>
+    void launch_vectorized_mish(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto kernel = raw::mish_vec<T, N>;
+        auto policy = make_policy(kernel, output.size() / N, 0, stream);
+        launch_kernel(kernel, policy, output, input);
+    }
+
+    template <class T>
+    void mish(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_mish<T, 4>(stream, output, input);
+        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_mish<T, 2>(stream, output, input);
+        } else {
+            launch_vectorized_mish<T, 1>(stream, output, input);
+        }
+    }
+
+    template void mish<__half>(const Stream&, Span<__half>, View<__half>);
+    template void mish<float>(const Stream&, Span<float>, View<float>);
+
     template <class T, std::size_t N>
     void launch_vectorized_sigmoid(const Stream& stream, Span<T> output, View<T> input) {
         CV_Assert(is_fully_aligned<T>(output, N));
diff --git a/modules/dnn/src/cuda4dnn/kernels/activations.hpp b/modules/dnn/src/cuda4dnn/kernels/activations.hpp
index 05f8f48e02..8a7ebb26f5 100644
--- a/modules/dnn/src/cuda4dnn/kernels/activations.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/activations.hpp
@@ -18,6 +18,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T>
     void tanh(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
 
+    template <class T>
+    void swish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+    template <class T>
+    void mish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
     template <class T>
     void sigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
 
diff --git a/modules/dnn/src/cuda4dnn/primitives/activation.hpp b/modules/dnn/src/cuda4dnn/primitives/activation.hpp
index d90aef397b..fce996a89e 100644
--- a/modules/dnn/src/cuda4dnn/primitives/activation.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/activation.hpp
@@ -143,6 +143,62 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         csl::Stream stream;
     };
 
+    template <class T>
+    class SwishOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        SwishOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            for (int i = 0; i < inputs.size(); i++)
+            {
+                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+                auto input = input_wrapper->getView();
+
+                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+                auto output = output_wrapper->getSpan();
+
+                kernels::swish<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+    template <class T>
+    class MishOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        MishOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            for (int i = 0; i < inputs.size(); i++)
+            {
+                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+                auto input = input_wrapper->getView();
+
+                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+                auto output = output_wrapper->getSpan();
+
+                kernels::mish<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
     template <class T>
     class SigmoidOp final : public CUDABackendNode {
     public:
diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp
index 4ebded1436..b083e2c586 100644
--- a/modules/dnn/src/init.cpp
+++ b/modules/dnn/src/init.cpp
@@ -103,6 +103,8 @@ void initializeLayerFactory()
     CV_DNN_REGISTER_LAYER_CLASS(PReLU, ChannelsPReLULayer);
     CV_DNN_REGISTER_LAYER_CLASS(Sigmoid, SigmoidLayer);
     CV_DNN_REGISTER_LAYER_CLASS(TanH, TanHLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(Swish, SwishLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(Mish, MishLayer);
     CV_DNN_REGISTER_LAYER_CLASS(ELU, ELULayer);
     CV_DNN_REGISTER_LAYER_CLASS(BNLL, BNLLLayer);
     CV_DNN_REGISTER_LAYER_CLASS(AbsVal, AbsLayer);
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 632cac8b57..a5f93231b9 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -613,6 +613,184 @@ struct TanHFunctor
     int64 getFLOPSPerElement() const { return 1; }
 };
 
+struct SwishFunctor
+{
+    typedef SwishLayer Layer;
+
+    bool supportBackend(int backendId, int)
+    {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_CUDA ||
+               backendId == DNN_BACKEND_HALIDE;
+    }
+
+    void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
+    {
+        for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
+        {
+            for( int i = 0; i < len; i++ )
+            {
+                float x = srcptr[i];
+                dstptr[i] = x / (1.0f + exp(-x));
+            }
+        }
+    }
+
+#ifdef HAVE_OPENCL
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+        String buildopt = oclGetTMacro(inputs[0]);
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            UMat& src = inputs[i];
+            UMat& dst = outputs[i];
+
+            ocl::Kernel kernel("SwishForward", ocl::dnn::activations_oclsrc, buildopt);
+            kernel.set(0, (int)src.total());
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
+            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+
+            size_t gSize = src.total();
+            CV_Assert(kernel.run(1, &gSize, NULL, false));
+        }
+
+        return true;
+    }
+#endif
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+    {
+        return make_cuda_node<cuda4dnn::SwishOp>(target, stream);
+    }
+#endif
+
+#ifdef HAVE_HALIDE
+    void attachHalide(const Halide::Expr& input, Halide::Func& top)
+    {
+        Halide::Var x("x"), y("y"), c("c"), n("n");
+        top(x, y, c, n) = input / (1.0f + exp(-input));
+    }
+#endif // HAVE_HALIDE
+
+#ifdef HAVE_INF_ENGINE
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        CV_Error(Error::StsNotImplemented, "");
+    }
+#endif // HAVE_INF_ENGINE
+
+#ifdef HAVE_VULKAN
+    std::shared_ptr<vkcom::OpBase> initVkCom()
+    {
+        // TODO: add vkcom implementation
+        return std::shared_ptr<vkcom::OpBase>();
+    }
+#endif // HAVE_VULKAN
+
+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
+    int64 getFLOPSPerElement() const { return 3; }
+
+};
+
+struct MishFunctor
+{
+    typedef MishLayer Layer;
+
+    bool supportBackend(int backendId, int)
+    {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_CUDA ||
+               backendId == DNN_BACKEND_HALIDE;
+    }
+
+    void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
+    {
+        for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
+        {
+            for( int i = 0; i < len; i++ )
+            {
+                float x = srcptr[i];
+                dstptr[i] = x * tanh(log(1.0f + exp(x)));
+            }
+        }
+    }
+
+#ifdef HAVE_OPENCL
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+        String buildopt = oclGetTMacro(inputs[0]);
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            UMat& src = inputs[i];
+            UMat& dst = outputs[i];
+
+            ocl::Kernel kernel("MishForward", ocl::dnn::activations_oclsrc, buildopt);
+            kernel.set(0, (int)src.total());
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
+            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+
+            size_t gSize = src.total();
+            CV_Assert(kernel.run(1, &gSize, NULL, false));
+        }
+
+        return true;
+    }
+#endif
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+    {
+        return make_cuda_node<cuda4dnn::MishOp>(target, stream);
+    }
+#endif
+
+#ifdef HAVE_HALIDE
+    void attachHalide(const Halide::Expr& input, Halide::Func& top)
+    {
+        Halide::Var x("x"), y("y"), c("c"), n("n");
+        top(x, y, c, n) = input * tanh(log(1.0f + exp(input)));
+    }
+#endif // HAVE_HALIDE
+
+#ifdef HAVE_INF_ENGINE
+    InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
+    {
+        CV_Error(Error::StsNotImplemented, "");
+    }
+#endif // HAVE_INF_ENGINE
+
+#ifdef HAVE_VULKAN
+    std::shared_ptr<vkcom::OpBase> initVkCom()
+    {
+        // TODO: add vkcom implementation
+        return std::shared_ptr<vkcom::OpBase>();
+    }
+#endif // HAVE_VULKAN
+
+    bool tryFuse(Ptr<dnn::Layer>&) { return false; }
+
+    void getScaleShift(Mat&, Mat&) const {}
+
+    int64 getFLOPSPerElement() const { return 3; }
+
+};
+
 struct SigmoidFunctor
 {
     typedef SigmoidLayer Layer;
@@ -1292,6 +1470,22 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
 
     return l;
 }
 
+Ptr<SwishLayer> SwishLayer::create(const LayerParams& params)
+{
+    Ptr<SwishLayer> l(new ElementWiseLayer<SwishFunctor>());
+    l->setParamsFrom(params);
+
+    return l;
+}
+
+Ptr<MishLayer> MishLayer::create(const LayerParams& params)
+{
+    Ptr<MishLayer> l(new ElementWiseLayer<MishFunctor>());
+    l->setParamsFrom(params);
+
+    return l;
+}
+
 Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
 {
     Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
diff --git a/modules/dnn/src/opencl/activations.cl b/modules/dnn/src/opencl/activations.cl
index ff9d2401d3..b900e6add6 100644
--- a/modules/dnn/src/opencl/activations.cl
+++ b/modules/dnn/src/opencl/activations.cl
@@ -95,6 +95,18 @@ __kernel void SigmoidForward(const int count, __global const T* in, __global T*
     out[index] = 1.0f / (1.0f + exp(-in[index]));
 }
 
+__kernel void SwishForward(const int count, __global const T* in, __global T* out) {
+  int index = get_global_id(0);
+  if(index < count)
+    out[index] = in[index] / (1.0f + exp(-in[index]));
+}
+
+__kernel void MishForward(const int count, __global const T* in, __global T* out) {
+  int index = get_global_id(0);
+  if(index < count)
+    out[index] = in[index] * tanh(log(1.0f + exp(in[index])));
+}
+
 __kernel void BNLLForward(const int n, __global const T* in, __global T* out) {
   int index = get_global_id(0);
   if (index < n) {
diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp
index 0cee157b15..a9eff9faa2 100644
--- a/modules/dnn/test/test_halide_layers.cpp
+++ b/modules/dnn/test/test_halide_layers.cpp
@@ -583,7 +583,7 @@ TEST_P(NoParamActivation, Accuracy)
     testInPlaceActivation(lp, backendId, targetId);
 }
 INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, NoParamActivation, Combine(
-/*type*/ Values("TanH", "Sigmoid", "AbsVal", "BNLL"),
+/*type*/ Values("TanH", "Sigmoid", "AbsVal", "BNLL", "Swish", "Mish"),
          dnnBackendsAndTargetsWithHalide()
 ));
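
Note (not part of the patch): for reference, the element-wise math the new layers implement, written as a minimal standalone C++ sketch. log1pexp_ref below is a hypothetical stand-in for the device::log1pexp helper the CUDA kernel pulls in; the CPU and OpenCL branches above compute softplus as log(1.0f + exp(x)) directly instead.

// Reference sketch of Swish and Mish as plain scalar C++ (illustration only).
#include <cmath>
#include <cstdio>

static float log1pexp_ref(float x)   // softplus: log(1 + exp(x)), kept overflow-free
{
    return x > 0.0f ? x + std::log1p(std::exp(-x)) : std::log1p(std::exp(x));
}

static float swish_ref(float x)      // Swish: x * sigmoid(x)
{
    return x / (1.0f + std::exp(-x));
}

static float mish_ref(float x)       // Mish: x * tanh(softplus(x))
{
    return x * std::tanh(log1pexp_ref(x));
}

int main()
{
    for (float x : { -4.0f, -1.0f, 0.0f, 1.0f, 4.0f })
        std::printf("x=%5.1f  swish=%8.5f  mish=%8.5f\n", x, swish_ref(x), mish_ref(x));
    return 0;
}

The direct log(1.0f + exp(x)) form used by the CPU/OpenCL paths lets the intermediate exp(x) saturate to infinity for large positive inputs; since tanh(inf) is 1 the final result still tends to x, but the log1p-based form keeps the intermediate finite, which is presumably why the CUDA path uses device::log1pexp.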