ReLU6 layer ocl support

Include relu6 ocl kernel and layer fusion support.

Signed-off-by: Li Peng <peng.li@intel.com>
commit 2863f950d6 (parent 0e4eed0ba1)
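
For reference, the element-wise math this commit brings to the OpenCL path, as a minimal host-side C++ sketch (the defaults of 0 and 6 follow ReLU6Layer::create below; the helper name is ours, not code from the commit):

    #include <algorithm>

    // Scalar reference for the new kernel: clamp x into [minValue, maxValue].
    // Standard ReLU6 is the special case minValue = 0, maxValue = 6.
    static inline float relu6_ref(float x, float minValue = 0.f, float maxValue = 6.f)
    {
        return std::min(std::max(x, minValue), maxValue);
    }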
@@ -406,6 +406,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
     class CV_EXPORTS ReLU6Layer : public ActivationLayer
     {
     public:
+        float minValue, maxValue;
+
         static Ptr<ReLU6Layer> create(const LayerParams &params);
     };
 
@@ -1439,6 +1439,7 @@ struct Net::Impl
                  nextData &&
                  ((nextData->type == "ReLU") ||
                   (nextData->type == "ChannelsPReLU") ||
+                  (nextData->type == "ReLU6") ||
                   (nextData->type == "TanH") ||
                   (nextData->type == "Power"))) )
             {
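
The check above is what gates convolution+activation fusion; condensed into a standalone predicate (a sketch with a name of our own, not code from this commit):

    #include <string>

    // A convolution may absorb its consumer only when the consumer is one of
    // the activation types the fused OCL4DNN kernels know how to apply.
    static bool isFusableActivation(const std::string& type)
    {
        return type == "ReLU" || type == "ChannelsPReLU" || type == "ReLU6" ||
               type == "TanH" || type == "Power";
    }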
@@ -860,6 +860,15 @@ public:
                 activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
             }
 
+            Ptr<ReLU6Layer> activ_relu6 = activ.dynamicCast<ReLU6Layer>();
+            if( !activ_relu6.empty() )
+            {
+                reluslope.resize(2);
+                reluslope[0] = activ_relu6->minValue;
+                reluslope[1] = activ_relu6->maxValue;
+                activType = OCL4DNN_CONV_FUSED_ACTIV_RELU6;
+            }
+
             Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast<ChannelsPReLULayer>();
             if( !activ_chprelu.empty() )
             {
@@ -906,12 +915,17 @@ public:
             {
                 convolutionOp->setActivTanh(true);
             }
+            else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_RELU6)
+            {
+                convolutionOp->setActivReLU6(true, reluslope[0], reluslope[1]);
+            }
             else
             {
                 convolutionOp->setActivReLU(false, 0);
                 convolutionOp->setActivPReLU(false, reluslope);
                 convolutionOp->setActivPower(false, 1.f);
                 convolutionOp->setActivTanh(false);
+                convolutionOp->setActivReLU6(false, 0, 0);
             }
             newActiv = false;
         }
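
Note how reluslope doubles as a generic parameter carrier here; an illustrative summary (our annotation, not code from the commit):

    #include <vector>

    // What reluslope holds per fused activation (illustrative):
    //   ReLU          -> a single negative slope
    //   ChannelsPReLU -> one slope per channel
    //   ReLU6         -> { minValue, maxValue }, unpacked into
    //                    setActivReLU6(true, reluslope[0], reluslope[1]) above
    std::vector<float> reluslope = { 0.f, 6.f };  // ReLU6 with default bounds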
@@ -381,8 +381,30 @@ struct ReLU6Functor
 #ifdef HAVE_OPENCL
     bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
-        // TODO: implement OCL version
-        return false;
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+        String buildopt = oclGetTMacro(inputs[0]);
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            UMat& src = inputs[i];
+            UMat& dst = outputs[i];
+
+            ocl::Kernel kernel("ReLU6Forward", ocl::dnn::activations_oclsrc, buildopt);
+            kernel.set(0, (int)src.total());
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
+            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+            kernel.set(3, (float)minValue);
+            kernel.set(4, (float)maxValue);
+
+            size_t gSize = src.total();
+            CV_Assert(kernel.run(1, &gSize, NULL, false));
+        }
+
+        return true;
     }
 #endif
 
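
The five kernel.set calls line up positionally with the ReLU6Forward signature added to activations.cl further down (count, in, out, minValue, maxValue). A CPU reference for what one launch computes may help when validating this path; a hypothetical helper, not part of the commit:

    #include <opencv2/core.hpp>
    #include <algorithm>

    // Mirrors ReLU6Forward: one "work-item" per element, clamped into
    // [minValue, maxValue]. The kernel's `index < count` guard plays the
    // same role as this loop bound.
    static void relu6ForwardRef(const cv::Mat& src, cv::Mat& dst,
                                float minValue, float maxValue)
    {
        CV_Assert(src.type() == CV_32F && src.isContinuous());
        dst.create(src.dims, src.size.p, CV_32F);
        const float* in = src.ptr<float>();
        float* out = dst.ptr<float>();
        for (size_t i = 0; i < src.total(); i++)
            out[i] = std::min(std::max(in[i], minValue), maxValue);
    }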
@@ -867,6 +889,9 @@ Ptr<ReLU6Layer> ReLU6Layer::create(const LayerParams& params)
     float maxValue = params.get<float>("max_value", 6.0f);
     Ptr<ReLU6Layer> l(new ElementWiseLayer<ReLU6Functor>(ReLU6Functor(minValue, maxValue)));
     l->setParamsFrom(params);
+    l->minValue = minValue;
+    l->maxValue = maxValue;
+
     return l;
 }
 
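
Hypothetical usage of the factory, showing how the min_value/max_value params reach the layer (the min_value default of 0 is an assumption mirroring max_value; only the max_value default is visible in this hunk):

    #include <opencv2/dnn.hpp>

    void relu6CreateExample()
    {
        cv::dnn::LayerParams lp;
        lp.name = "relu6";
        lp.type = "ReLU6";
        lp.set("min_value", 0.0f);   // assumed default
        lp.set("max_value", 6.0f);
        cv::Ptr<cv::dnn::ReLU6Layer> layer = cv::dnn::ReLU6Layer::create(lp);
        // After this commit the bounds are public members, which is what the
        // convolution fusion code reads: layer->minValue, layer->maxValue.
    }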
@@ -78,7 +78,8 @@ typedef enum {
     OCL4DNN_CONV_FUSED_ACTIV_RELU = 1,
     OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,
     OCL4DNN_CONV_FUSED_ACTIV_POWER = 3,
-    OCL4DNN_CONV_FUSED_ACTIV_TANH = 4
+    OCL4DNN_CONV_FUSED_ACTIV_TANH = 4,
+    OCL4DNN_CONV_FUSED_ACTIV_RELU6 = 5
 } ocl4dnnFusedActiv_t;
 
 template<typename Dtype>
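
As this commit does for RELU6, a fused activation has to be registered in four places; a hypothetical summary of the pattern, not code from the repo:

    // The four registration points for a fused activation:
    enum Ocl4dnnFusedActivStep
    {
        STEP_ENUM_VALUE,    // ocl4dnnFusedActiv_t (this hunk)
        STEP_BUILD_DEFINE,  // setFusionDefine -> addDef("FUSED_CONV_*", 1)
        STEP_RUNTIME_ARGS,  // setFusionArg   -> kernel.set(argIdx++, ...)
        STEP_KERNEL_MACRO   // conv_layer_spatial.cl #elif branch (below)
    };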
@@ -96,6 +97,7 @@ class OCL4DNNConvSpatial
         void setActivPReLU(bool fuse_activ, std::vector<float> &slope);
         void setActivPower(bool fuse_activ, float power);
         void setActivTanh(bool fuse_activ);
+        void setActivReLU6(bool fuse_activ, float min, float max);
         void setBias(bool bias_term);
 
     private:
@@ -319,6 +321,8 @@ class OCL4DNNConvSpatial
         cv::ocl::ProgramSource src_;
         int32_t prev_kernel_type_;
         float negative_slope_;
+        float min_value_;
+        float max_value_;
         UMat negative_slope_umat_;
         ocl4dnnFusedActiv_t fused_activ_;
         float power_;
@@ -82,6 +82,8 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
     fused_eltwise_ = false;
     power_ = 1.f;
     negative_slope_ = 0;
+    min_value_ = 0;
+    max_value_ = 0;
     prev_kernel_type_ = -1;
     tuned_ = false;
 
@@ -162,6 +164,9 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
         case OCL4DNN_CONV_FUSED_ACTIV_TANH:
             addDef("FUSED_CONV_TANH", 1);
             break;
+        case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
+            addDef("FUSED_CONV_RELU6", 1);
+            break;
         default:
             ;
     }
@@ -184,6 +189,10 @@ void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bo
         case OCL4DNN_CONV_FUSED_ACTIV_POWER:
             kernel.set(argIdx++, (float)power_);
             break;
+        case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
+            kernel.set(argIdx++, (float)min_value_);
+            kernel.set(argIdx++, (float)max_value_);
+            break;
         default:
             ;
     }
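
setFusionDefine selects the activation branch at kernel build time; setFusionArg pushes the matching scalars at launch time, and the push order must agree with the FUSED_ARG parameter order declared in conv_layer_spatial.cl (min before max), or the kernel reads garbage. A toy sanity check of that contract, with invented names:

    #include <cassert>
    #include <vector>

    // Stand-in for ocl::Kernel::set(int, float): records arguments by index.
    struct MockKernel
    {
        std::vector<float> args;
        void set(int idx, float v)
        {
            if (idx >= (int)args.size()) args.resize(idx + 1);
            args[idx] = v;
        }
    };

    int main()
    {
        MockKernel kernel;
        int argIdx = 0;
        float min_value_ = 0.f, max_value_ = 6.f;
        kernel.set(argIdx++, min_value_);  // must precede max, as in FUSED_ARG
        kernel.set(argIdx++, max_value_);
        assert(kernel.args[0] == 0.f && kernel.args[1] == 6.f);
        return 0;
    }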
@@ -393,6 +402,19 @@ void OCL4DNNConvSpatial<Dtype>::setActivReLU(bool fuse_activ, float slope)
         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
 }
 
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setActivReLU6(bool fuse_activ, float min, float max)
+{
+    if ( fuse_activ )
+    {
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU6;
+        min_value_ = min;
+        max_value_ = max;
+    }
+    else
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+}
+
 template<typename Dtype>
 void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float> &slope)
 {
@@ -54,6 +54,17 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out
 #endif
 }
 
+__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
+                           const T minValue, const T maxValue)
+{
+    int index = get_global_id(0);
+    if(index < count)
+    {
+        T x = in[index];
+        out[index] = clamp(x, minValue, maxValue);
+    }
+}
+
 __kernel void PReLUForward(const int count, const int channels, const int plane_size,
                            __global const T* in, __global T* out, __global const T* slope_data)
 {
@@ -48,19 +48,22 @@
 
 #if defined(FUSED_CONV_RELU)
 #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope)))
-#define NEGATIVE_SLOPE_ARG Dtype negative_slope,
+#define FUSED_ARG Dtype negative_slope,
 #elif defined(FUSED_CONV_PRELU)
 #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c])))
-#define NEGATIVE_SLOPE_ARG __global const Dtype *negative_slope,
+#define FUSED_ARG __global const Dtype *negative_slope,
 #elif defined(FUSED_CONV_POWER)
 #define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power)
-#define NEGATIVE_SLOPE_ARG Dtype power,
+#define FUSED_ARG Dtype power,
 #elif defined(FUSED_CONV_TANH)
 #define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
-#define NEGATIVE_SLOPE_ARG
+#define FUSED_ARG
+#elif defined(FUSED_CONV_RELU6)
+#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), min_value, max_value))
+#define FUSED_ARG Dtype min_value, Dtype max_value,
 #else
 #define ACTIVATION_RELU_FUNCTION(x, c) (x)
-#define NEGATIVE_SLOPE_ARG
+#define FUSED_ARG
 #endif
 
 #ifdef FUSED_CONV_ELTWISE
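
Because every fused variant now funnels its extra parameters through the single FUSED_ARG macro, adding the two ReLU6 scalars only required renaming NEGATIVE_SLOPE_ARG at each kernel signature, as the four hunks below show. A host-compilable mock of the scheme, with std::min/std::max standing in for OpenCL's clamp (an illustration, not the actual device code):

    #include <algorithm>
    #include <cstdio>

    typedef float Dtype;
    #define FUSED_CONV_RELU6   // pretend this branch was selected at build time

    #if defined(FUSED_CONV_RELU6)
    #define ACTIVATION_RELU_FUNCTION(x, c) (std::min(std::max((Dtype)(x), min_value), max_value))
    #define FUSED_ARG Dtype min_value, Dtype max_value,
    #endif

    // FUSED_ARG splices the activation's parameters into the front of a kernel
    // signature; with zero parameters (e.g. TanH) it expands to nothing.
    Dtype fused_output(FUSED_ARG Dtype conv_result)
    {
        return ACTIVATION_RELU_FUNCTION(conv_result, 0);
    }

    int main()
    {
        std::printf("%f\n", fused_output(0.f, 6.f, 7.5f));  // prints 6.000000
        return 0;
    }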
@@ -108,7 +111,7 @@
 
 __kernel void ConvolveBasic(
     ELTWISE_DATA_ARG
-    NEGATIVE_SLOPE_ARG
+    FUSED_ARG
     __global Dtype* image_data,
     int image_offset,
     __global Dtype* kernel_data,
@@ -197,7 +200,7 @@ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
 __kernel void
 convolve_simd(
     ELTWISE_DATA_ARG
-    NEGATIVE_SLOPE_ARG
+    FUSED_ARG
     __global Dtype* inputs_base,
     filter_qualifier Dtype* weights_base,
     BIAS_KERNEL_ARG
@@ -417,7 +420,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ
 
 #define GEMM_LIKE_KERNEL_ARGS \
     ELTWISE_DATA_ARG \
-    NEGATIVE_SLOPE_ARG \
+    FUSED_ARG \
     const __global Dtype *src0, \
     const __global Dtype *src1, \
     BIAS_KERNEL_ARG \
@@ -1731,7 +1734,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 
 __kernel void DWCONV(
     ELTWISE_DATA_ARG
-    NEGATIVE_SLOPE_ARG
+    FUSED_ARG
     __global Dtype* image_data,
     __global Dtype* kernel_data,
     BIAS_KERNEL_ARG