/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. // // // License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Copyright (C) 2017, Intel Corporation, all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // // * Redistribution's of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // // This software is provided by the copyright holders and contributors "as is" and // any express or implied warranties, including, but not limited to, the implied // warranties of merchantability and fitness for a particular purpose are disclaimed. // In no event shall the Intel Corporation or contributors be liable for any direct, // indirect, incidental, special, exemplary, or consequential damages // (including, but not limited to, procurement of substitute goods or services; // loss of use, data, or profits; or business interruption) however caused // and on any theory of liability, whether in contract, strict liability, // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. // //M*/ #include "../precomp.hpp" #include "layers_common.hpp" #include "op_halide.hpp" #include "opencv2/imgproc.hpp" #include #include "opencl_kernels_dnn.hpp" #include namespace cv { namespace dnn { using std::abs; using std::exp; using std::tanh; using std::pow; template class ElementWiseLayer : public Func::Layer { public: class PBody : public cv::ParallelLoopBody { public: const Func* func_; const Mat* src_; Mat* dst_; int nstripes_; PBody(const Func &func, const Mat &src, Mat& dst, int nstripes) { func_ = &func; src_ = &src; dst_ = &dst; nstripes_ = nstripes; } void operator()(const Range &r) const { int nstripes = nstripes_, nsamples, outCn; size_t planeSize; if( src_->dims == 4 ) { nsamples = src_->size[0]; outCn = src_->size[1]; planeSize = (size_t)src_->size[2]*src_->size[3]; } else { nsamples = outCn = 1; planeSize = (size_t)src_->total(); } size_t stripeSize = (planeSize + nstripes - 1)/nstripes; size_t stripeStart = r.start*stripeSize; size_t stripeEnd = std::min(r.end*stripeSize, planeSize); for( int i = 0; i < nsamples; i++ ) { const float* srcptr = src_->ptr(i) + stripeStart; float* dstptr = dst_->ptr(i) + stripeStart; func_->apply(srcptr, dstptr, (int)(stripeEnd - stripeStart), planeSize, 0, outCn); } } }; ElementWiseLayer(const Func &f=Func()) : run_parallel(false) { func = f; } virtual bool supportBackend(int backendId) { return backendId == DNN_BACKEND_DEFAULT || backendId == DNN_BACKEND_HALIDE && haveHalide(); } virtual Ptr tryAttach(const Ptr& node) { switch (node->backendId) { case DNN_BACKEND_HALIDE: { #ifdef HAVE_HALIDE auto base = node.dynamicCast(); Halide::Func& input = base->funcs.back(); Halide::Var x("x"), y("y"), c("c"), n("n"); Halide::Func top = (this->name.empty() ? Halide::Func() : Halide::Func(this->name)); func.attachHalide(input(x, y, c, n), top); return Ptr(new HalideBackendNode(base, top)); #endif // HAVE_HALIDE break; } } return Ptr(); } virtual Ptr initHalide(const std::vector > &inputs) { #ifdef HAVE_HALIDE Halide::Buffer input = halideBuffer(inputs[0]); Halide::Var x("x"), y("y"), c("c"), n("n"); Halide::Func top = (this->name.empty() ? Halide::Func() : Halide::Func(this->name)); func.attachHalide(input(x, y, c, n), top); return Ptr(new HalideBackendNode(top)); #endif // HAVE_HALIDE return Ptr(); } bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, std::vector &internals) const { Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); return true; } void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), func.applyOCL(inputs, outputs, internals)) for (size_t i = 0; i < inputs.size(); i++) { const Mat &src = *inputs[i]; Mat &dst = outputs[i]; CV_Assert(src.size == dst.size && src.type() == dst.type() && src.isContinuous() && dst.isContinuous() && src.type() == CV_32F); const int nstripes = getNumThreads(); PBody body(func, src, dst, nstripes); parallel_for_(Range(0, nstripes), body, nstripes); } } void forwardSlice(const float* src, float* dst, int len, size_t planeSize, int cn0, int cn1) const { func.apply(src, dst, len, planeSize, cn0, cn1); } virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const { long flops = 0; for (int i = 0; i < outputs.size(); i++) { flops += total(outputs[i]) * func.getFLOPSPerElement(); } return flops; } Func func; bool run_parallel; }; #ifdef HAVE_OPENCL static String oclGetTMacro(const UMat &m) { return String("-DT=") + ocl::typeToStr(m.type()) + String(" "); } #endif struct ReLUFunctor { typedef ReLULayer Layer; float slope; explicit ReLUFunctor(float slope_=1.f) : slope(slope_) {} void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const { float s = slope; for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { int i = 0; #if CV_SIMD128 v_float32x4 s4 = v_setall_f32(s), z = v_setzero_f32(); for( ; i <= len - 16; i += 16 ) { v_float32x4 x0 = v_load(srcptr + i); v_float32x4 x1 = v_load(srcptr + i + 4); v_float32x4 x2 = v_load(srcptr + i + 8); v_float32x4 x3 = v_load(srcptr + i + 12); x0 = v_select(x0 >= z, x0, x0*s4); x1 = v_select(x1 >= z, x1, x1*s4); x2 = v_select(x2 >= z, x2, x2*s4); x3 = v_select(x3 >= z, x3, x3*s4); v_store(dstptr + i, x0); v_store(dstptr + i + 4, x1); v_store(dstptr + i + 8, x2); v_store(dstptr + i + 12, x3); } #endif for( ; i < len; i++ ) { float x = srcptr[i]; dstptr[i] = x >= 0.f ? x : s*x; } } } #ifdef HAVE_OPENCL bool initKernel(ocl::Kernel &ker, const UMat &src) const { const char *buildoptSlope = (slope == 0) ? "-DRELU_NO_SLOPE" : ""; String buildopt = oclGetTMacro(src) + buildoptSlope; if (!ker.create("ReLUForward", ocl::dnn::activations_oclsrc, buildopt)) return false; if (slope != 0) ker.set(3, (float)slope); return true; } bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) { size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize(); for (size_t i = 0; i < inputs.size(); i++) { UMat src, dst; inputs[i]->copyTo(src); dst = outputs[i].getUMat(ACCESS_WRITE); CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset); ocl::Kernel ker; CV_Assert(initKernel(ker, src)); ker.set(0, (int)src.total()); ker.set(1, ocl::KernelArg::PtrReadOnly(src)); ker.set(2, ocl::KernelArg::PtrWriteOnly(dst)); size_t gSize = src.total(); CV_Assert(ker.run(1, &gSize, &wgSize, false)); } return true; } #endif #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { Halide::Var x("x"), y("y"), c("c"), n("n"); if (slope) { top(x, y, c, n) = select(input >= 0.0f, input, slope * input); } else { top(x, y, c, n) = max(input, 0.0f); } } #endif // HAVE_HALIDE int64 getFLOPSPerElement() const { return 1; } }; struct ReLU6Functor { typedef ReLU6Layer Layer; float minValue, maxValue; ReLU6Functor(float minValue_ = 0.0f, float maxValue_ = 6.0f) : minValue(minValue_), maxValue(maxValue_) { CV_Assert(minValue <= maxValue); } void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const { for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { int i = 0; #if CV_SIMD128 v_float32x4 minV = v_setall_f32(minValue), maxV = v_setall_f32(maxValue); for( ; i <= len - 16; i += 16 ) { v_float32x4 x0 = v_load(srcptr + i); v_float32x4 x1 = v_load(srcptr + i + 4); v_float32x4 x2 = v_load(srcptr + i + 8); v_float32x4 x3 = v_load(srcptr + i + 12); x0 = v_min(v_max(minV, x0), maxV); x1 = v_min(v_max(minV, x1), maxV); x2 = v_min(v_max(minV, x2), maxV); x3 = v_min(v_max(minV, x3), maxV); v_store(dstptr + i, x0); v_store(dstptr + i + 4, x1); v_store(dstptr + i + 8, x2); v_store(dstptr + i + 12, x3); } #endif for( ; i < len; i++ ) { float x = srcptr[i]; if (x >= minValue) dstptr[i] = x <= maxValue ? x : maxValue; else dstptr[i] = minValue; } } } #ifdef HAVE_OPENCL bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) { // TODO: implement OCL version return false; } #endif #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { Halide::Var x("x"), y("y"), c("c"), n("n"); top(x, y, c, n) = clamp(input, minValue, maxValue); } #endif // HAVE_HALIDE int64 getFLOPSPerElement() const { return 2; } }; struct TanHFunctor { typedef TanHLayer Layer; void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const { for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { for( int i = 0; i < len; i++ ) { float x = srcptr[i]; dstptr[i] = tanh(x); } } } #ifdef HAVE_OPENCL bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) { // TODO: implement OCL version return false; } #endif #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { Halide::Var x("x"), y("y"), c("c"), n("n"); top(x, y, c, n) = tanh(input); } #endif // HAVE_HALIDE int64 getFLOPSPerElement() const { return 1; } }; struct SigmoidFunctor { typedef SigmoidLayer Layer; void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const { for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { for( int i = 0; i < len; i++ ) { float x = srcptr[i]; dstptr[i] = 1.f/(1.f + exp(-x)); } } } #ifdef HAVE_OPENCL bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) { // TODO: implement OCL version return false; } #endif #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { Halide::Var x("x"), y("y"), c("c"), n("n"); top(x, y, c, n) = 1.0f / (1.0f + exp(-input)); } #endif // HAVE_HALIDE int64 getFLOPSPerElement() const { return 3; } }; struct ELUFunctor { typedef ELULayer Layer; explicit ELUFunctor() {} void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const { for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { for(int i = 0; i < len; i++ ) { float x = srcptr[i]; dstptr[i] = x >= 0.f ? x : exp(x) - 1; } } } #ifdef HAVE_OPENCL bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) { // TODO: implement OCL version return false; } #endif #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { Halide::Var x("x"), y("y"), c("c"), n("n"); top(x, y, c, n) = select(input >= 0.0f, input, exp(input) - 1); } #endif // HAVE_HALIDE int64 getFLOPSPerElement() const { return 2; } }; struct AbsValFunctor { typedef AbsLayer Layer; void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const { for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { for( int i = 0; i < len; i++ ) { float x = srcptr[i]; dstptr[i] = abs(x); } } } #ifdef HAVE_OPENCL bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) { // TODO: implement OCL version return false; } #endif #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { Halide::Var x("x"), y("y"), c("c"), n("n"); top(x, y, c, n) = abs(input); } #endif // HAVE_HALIDE int64 getFLOPSPerElement() const { return 1; } }; struct BNLLFunctor { typedef BNLLLayer Layer; void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const { for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { for( int i = 0; i < len; i++ ) { float x = srcptr[i]; dstptr[i] = log(1.f + exp(-abs(x))); } } } #ifdef HAVE_OPENCL bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) { // TODO: implement OCL version return false; } #endif #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { Halide::Var x("x"), y("y"), c("c"), n("n"); top(x, y, c, n) = log(1.0f + exp(-abs(input))); } #endif // HAVE_HALIDE int64 getFLOPSPerElement() const { return 5; } }; struct PowerFunctor { typedef PowerLayer Layer; float power; float scale; float shift; explicit PowerFunctor(float power_ = 1.f, float scale_ = 1.f, float shift_ = 0.f) : power(power_), scale(scale_), shift(shift_) {} void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const { float a = scale, b = shift, p = power; if( p == 1.f ) { for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { for( int i = 0; i < len; i++ ) { float x = srcptr[i]; dstptr[i] = a*x + b; } } } else { for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { for( int i = 0; i < len; i++ ) { float x = srcptr[i]; dstptr[i] = pow(a*x + b, p); } } } } #ifdef HAVE_OPENCL bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) { // TODO: implement OCL version return false; } #endif #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { Halide::Var x("x"), y("y"), c("c"), n("n"); Halide::Expr topExpr = (scale == 1.0f ? input : input * scale); if (shift) { topExpr += shift; } if (power != 1.0f) { topExpr = pow(topExpr, power); } top(x, y, c, n) = topExpr; } #endif // HAVE_HALIDE int64 getFLOPSPerElement() const { return power == 1 ? 2 : 10; } }; struct ChannelsPReLUFunctor { typedef ChannelsPReLULayer Layer; Mat scale; explicit ChannelsPReLUFunctor(const Mat& scale_=Mat()) : scale(scale_) { } void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const { CV_Assert(scale.isContinuous() && scale.type() == CV_32F); const float* scaleptr = scale.ptr(); CV_Assert( 0 <= cn0 && cn0 < cn1 && cn1 <= (int)scale.total() ); for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize ) { float s = scaleptr[cn]; int i = 0; #if CV_SIMD128 v_float32x4 s4 = v_setall_f32(s), z = v_setzero_f32(); for( ; i <= len - 16; i += 16 ) { v_float32x4 x0 = v_load(srcptr + i); v_float32x4 x1 = v_load(srcptr + i + 4); v_float32x4 x2 = v_load(srcptr + i + 8); v_float32x4 x3 = v_load(srcptr + i + 12); x0 = v_select(x0 >= z, x0, x0*s4); x1 = v_select(x1 >= z, x1, x1*s4); x2 = v_select(x2 >= z, x2, x2*s4); x3 = v_select(x3 >= z, x3, x3*s4); v_store(dstptr + i, x0); v_store(dstptr + i + 4, x1); v_store(dstptr + i + 8, x2); v_store(dstptr + i + 12, x3); } #endif for( ; i < len; i++ ) { float x = srcptr[i]; dstptr[i] = x >= 0.f ? x : s*x; } } } #ifdef HAVE_OPENCL bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) { // TODO: implement OCL version return false; } #endif #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { Halide::Var x("x"), y("y"), c("c"), n("n"); auto weights = wrapToHalideBuffer(scale, {(int)scale.total()}); top(x, y, c, n) = select(input >= 0.0f, input, weights(c) * input); } #endif // HAVE_HALIDE int64 getFLOPSPerElement() const { return 1; } }; #define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) \ Ptr<_Layer> _Layer::create() { \ return return Ptr<_Layer>( new ElementWiseLayer<_Functor>(_Functor()) ); } Ptr ReLULayer::create(const LayerParams& params) { float negativeSlope = params.get("negative_slope", 0.f); Ptr l(new ElementWiseLayer(ReLUFunctor(negativeSlope))); l->setParamsFrom(params); l->negativeSlope = negativeSlope; return l; } Ptr ReLU6Layer::create(const LayerParams& params) { float minValue = params.get("min_value", 0.0f); float maxValue = params.get("max_value", 6.0f); Ptr l(new ElementWiseLayer(ReLU6Functor(minValue, maxValue))); l->setParamsFrom(params); return l; } Ptr TanHLayer::create(const LayerParams& params) { Ptr l(new ElementWiseLayer()); l->setParamsFrom(params); return l; } Ptr SigmoidLayer::create(const LayerParams& params) { Ptr l(new ElementWiseLayer()); l->setParamsFrom(params); return l; } Ptr ELULayer::create(const LayerParams& params) { Ptr l(new ElementWiseLayer(ELUFunctor())); l->setParamsFrom(params); return l; } Ptr AbsLayer::create(const LayerParams& params) { Ptr l(new ElementWiseLayer()); l->setParamsFrom(params); return l; } Ptr BNLLLayer::create(const LayerParams& params) { Ptr l(new ElementWiseLayer()); l->setParamsFrom(params); return l; } Ptr PowerLayer::create(const LayerParams& params) { float power = params.get("power", 1.0f); float scale = params.get("scale", 1.0f); float shift = params.get("shift", 0.0f); Ptr l(new ElementWiseLayer(PowerFunctor(power, scale, shift))); l->setParamsFrom(params); l->power = power; l->scale = scale; l->shift = shift; return l; } Ptr ChannelsPReLULayer::create(const LayerParams& params) { Ptr l(new ElementWiseLayer(ChannelsPReLUFunctor(params.blobs[0]))); l->setParamsFrom(params); return l; } } }