/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
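// Pooling layer implementation: MAX, AVE and ROI pooling over NCHW blobs, with an
// OCL4DNN-based OpenCL path, a Halide backend and a SIMD-accelerated CPU fallback
// (PoolingInvoker).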
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp"
#include "opencl_kernels_dnn.hpp"
#include <float.h>
#include <algorithm>
using std::max;
using std::min;

#ifdef HAVE_OPENCL
using namespace cv::dnn::ocl4dnn;
#endif

namespace cv
{
namespace dnn
{

class PoolingLayerImpl : public PoolingLayer
{
public:
    PoolingLayerImpl(const LayerParams& params)
    {
        type = PoolingLayer::MAX;
        computeMaxIdx = true;
        globalPooling = false;

        if (params.has("pool"))
        {
            String pool = params.get<String>("pool").toLowerCase();
            if (pool == "max")
                type = PoolingLayer::MAX;
            else if (pool == "ave")
                type = PoolingLayer::AVE;
            else if (pool == "stochastic")
                type = PoolingLayer::STOCHASTIC;
            else
                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
            getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
                                   pad.height, pad.width, stride.height, stride.width, padMode);
        }
        else if (params.has("pooled_w") || params.has("pooled_h") || params.has("spatial_scale"))
        {
            type = PoolingLayer::ROI;
        }
        setParamsFrom(params);
        ceilMode = params.get<bool>("ceil_mode", true);
        pooledSize.width = params.get<uint32_t>("pooled_w", 1);
        pooledSize.height = params.get<uint32_t>("pooled_h", 1);
        spatialScale = params.get<float>("spatial_scale", 1);
    }

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNPool<float> > poolOp;
#endif

    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(!inputs.empty());

        cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
                 out(outputs[0].size[3], outputs[0].size[2]);

        if (globalPooling)
        {
            kernel = inp;
        }

        getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
    }

    virtual bool supportBackend(int backendId)
    {
        return backendId == DNN_BACKEND_DEFAULT ||
               backendId == DNN_BACKEND_HALIDE && haveHalide() &&
               (type == PoolingLayer::MAX ||
                type == PoolingLayer::AVE && !pad.width && !pad.height);
    }
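    // OpenCL path: pooling is delegated to OCL4DNNPool. For MAX pooling the layer
    // produces two outputs per input (pooled values and the max-element mask), so
    // outputs are indexed with a stride of 2; AVE/STOCHASTIC produce a single output.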
#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        if (poolOp.empty())
        {
            OCL4DNNPoolConfig config;

            config.in_shape = shape(inputs[0]);
            config.out_shape = shape(outputs[0]);
            config.kernel = kernel;
            config.pad = pad;
            config.stride = stride;
            config.channels = inputs[0].size[1];
            config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
                                (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
                                               LIBDNN_POOLING_METHOD_STO);
            poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
        }

        for (size_t ii = 0; ii < inputs.size(); ii++)
        {
            UMat& inpMat = inputs[ii];
            int out_index = (type == MAX) ? 2 : 1;
            UMat& outMat = outputs[out_index * ii];
            UMat maskMat = (type == MAX) ? outputs[2 * ii + 1] : UMat();

            CV_Assert(inpMat.offset == 0 && outMat.offset == 0);

            if (!poolOp->Forward(inpMat, outMat, maskMat))
                return false;
        }

        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }

    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        switch (type)
        {
            case MAX:
                CV_Assert(inputs.size() == 1, outputs.size() == 2);
                maxPooling(*inputs[0], outputs[0], outputs[1]);
                break;
            case AVE:
                CV_Assert(inputs.size() == 1, outputs.size() == 1);
                avePooling(*inputs[0], outputs[0]);
                break;
            case ROI:
                CV_Assert(inputs.size() == 2, outputs.size() == 1);
                roiPooling(*inputs[0], *inputs[1], outputs[0]);
                break;
            default:
                CV_Error(Error::StsNotImplemented, "Not implemented");
                break;
        }
    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
        if (type == PoolingLayer::MAX)
            return initMaxPoolingHalide(inputs);
        else if (type == PoolingLayer::AVE)
            return initAvePoolingHalide(inputs);
        else
            return Ptr<BackendNode>();
    }

    class PoolingInvoker : public ParallelLoopBody
    {
    public:
        const Mat* src, *rois;
        Mat *dst, *mask;
        Size kernel, stride, pad;
        int nstripes;
        bool computeMaxIdx;
        std::vector<int> ofsbuf;
        int poolingType;
        float spatialScale;

        PoolingInvoker() : src(0), rois(0), dst(0), mask(0), nstripes(0),
                           computeMaxIdx(0), poolingType(PoolingLayer::MAX), spatialScale(0) {}

        static void run(const Mat& src, const Mat& rois, Mat& dst, Mat& mask, Size kernel,
                        Size stride, Size pad, int poolingType, float spatialScale,
                        bool computeMaxIdx, int nstripes)
        {
            CV_Assert(src.isContinuous() && dst.isContinuous() &&
                      src.type() == CV_32F && src.type() == dst.type() &&
                      src.dims == 4 && dst.dims == 4 &&
                      (poolingType == ROI && dst.size[0] == rois.size[0] || src.size[0] == dst.size[0]) &&
                      src.size[1] == dst.size[1] &&
                      (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));

            PoolingInvoker p;

            p.src = &src;
            p.rois = &rois;
            p.dst = &dst;
            p.mask = &mask;
            p.kernel = kernel;
            p.stride = stride;
            p.pad = pad;
            p.nstripes = nstripes;
            p.computeMaxIdx = computeMaxIdx;
            p.poolingType = poolingType;
            p.spatialScale = spatialScale;

            if( !computeMaxIdx )
            {
                p.ofsbuf.resize(kernel.width*kernel.height);
                for( int i = 0; i < kernel.height; i++ )
                    for( int j = 0; j < kernel.width; j++ )
                        p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
            }

            parallel_for_(Range(0, nstripes), p, nstripes);
        }
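        // The flattened output (N*C*H*W elements) is split into nstripes contiguous
        // stripes and each stripe is handled by one parallel_for_ worker. The flat
        // offset is decoded back into (n, c, y, x) and output pixels are pooled one
        // output row at a time.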
        void operator()(const Range& r) const
        {
            int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
            int inp_width = src->size[3], inp_height = src->size[2];
            size_t total = dst->total();
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            int kernel_w = kernel.width, kernel_h = kernel.height;
            int pad_w = pad.width, pad_h = pad.height;
            int stride_w = stride.width, stride_h = stride.height;
            bool compMaxIdx = computeMaxIdx;
#if CV_SIMD128
            const int* ofsptr = &ofsbuf[0];
            v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
            v_float32x4 ones = v_setall_f32(1.f);
            v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
#endif

            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
            {
                size_t ofs = ofs0;
                int x0 = (int)(ofs % width);
                ofs /= width;
                int y0 = (int)(ofs % height);
                ofs /= height;
                int c = (int)(ofs % channels);
                int n = (int)(ofs / channels);
                int ystart, yend;
                const float *srcData;

                int xstartROI = 0;
                float roiRatio = 0;
                if (poolingType == ROI)
                {
                    const float *roisData = rois->ptr<float>(n);
                    int ystartROI = round(roisData[2] * spatialScale);
                    int yendROI = round(roisData[4] * spatialScale);
                    int roiHeight = std::max(yendROI - ystartROI + 1, 1);
                    roiRatio = (float)roiHeight / height;

                    ystart = ystartROI + y0 * roiRatio;
                    yend = ystartROI + std::ceil((y0 + 1) * roiRatio);

                    xstartROI = round(roisData[1] * spatialScale);
                    int xendROI = round(roisData[3] * spatialScale);
                    int roiWidth = std::max(xendROI - xstartROI + 1, 1);
                    roiRatio = (float)roiWidth / width;

                    CV_Assert(roisData[0] < src->size[0]);
                    srcData = src->ptr<float>(roisData[0], c);
                }
                else
                {
                    ystart = y0 * stride_h - pad_h;
                    yend = min(ystart + kernel_h, inp_height + pad_h);
                    srcData = src->ptr<float>(n, c);
                }
                int ydelta = yend - ystart;
                ystart = max(ystart, 0);
                yend = min(yend, inp_height);
                float *dstData = dst->ptr<float>(n, c, y0);
                float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;

                int delta = std::min((int)(stripeEnd - ofs0), width - x0);
                ofs0 += delta;
                int x1 = x0 + delta;

                if( poolingType == MAX || poolingType == ROI)
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart, xend;
                        if (poolingType == ROI)
                        {
                            xstart = xstartROI + x0 * roiRatio;
                            xend = xstartROI + std::ceil((x0 + 1) * roiRatio);
                        }
                        else
                        {
                            xstart = x0 * stride_w - pad_w;
                            xend = xstart + kernel_w;
                        }
                        xstart = max(xstart, 0);
                        xend = min(xend, inp_width);
                        if (xstart >= xend || ystart >= yend)
                        {
                            dstData[x0] = 0;
                            if (compMaxIdx && dstMaskData)
                                dstMaskData[x0] = -1;
                            continue;
                        }
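                        // Vectorized fast path: when 8 consecutive output columns fit
                        // entirely inside the input row, compute them at once with two
                        // 4-lane registers (lane i reads srcData[index + stride_w*i]);
                        // with compMaxIdx the argmax positions are tracked in parallel
                        // as float lane values.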
#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            if( compMaxIdx )
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;
                                v_float32x4 max_idx0 = v_setall_f32(-1.f);
                                v_float32x4 max_idx1 = max_idx0;
                                int index0 = ystart * inp_width + xstart;
                                v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
                                v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));

                                for (int y = ystart; y < yend; ++y)
                                {
                                    for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
                                    {
                                        const int index = y * inp_width + x;
                                        v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                       srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                        v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                       srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                        max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
                                        max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
                                        max_val0 = v_max(max_val0, v0);
                                        max_val1 = v_max(max_val1, v1);
                                    }
                                    idx0 += idx_delta;
                                    idx1 += idx_delta;
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                if (dstMaskData)
                                {
                                    v_store(dstMaskData + x0, max_idx0);
                                    v_store(dstMaskData + x0 + 4, max_idx1);
                                }
                                x0 += 7;
                            }
                            else
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;
                                if( yend - ystart == kernel_h )
                                {
                                    const float* srcData1 = srcData + ystart*inp_width + xstart;
                                    if( stride_w == 1 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0 = v_load(srcData1 + index);
                                            v_float32x4 v1 = v_load(srcData1 + index + 4);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#if CV_SSE2
                                    else if( stride_w == 2 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
                                            v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
                                            v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#endif
                                    else
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
                                                           srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
                                            v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
                                                           srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                }
                                else
                                {
                                    for (int y = ystart; y < yend; ++y)
                                    {
                                        for (int x = xstart; x < xend; ++x)
                                        {
                                            const int index = y * inp_width + x;
                                            v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                           srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                            v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                           srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                    }
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                x0 += 7;
                            }
                        }
                        else
#endif
                        {
                            float max_val = -FLT_MAX;
                            if( compMaxIdx )
                            {
                                int max_index = -1;
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        if (val > max_val)
                                        {
                                            max_val = val;
                                            max_index = index;
                                        }
                                    }

                                dstData[x0] = max_val;
                                if (dstMaskData)
                                    dstMaskData[x0] = max_index;
                            }
                            else
                            {
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        max_val = std::max(max_val, val);
                                    }

                                dstData[x0] = max_val;
                            }
                        }
                    }
                else
                {
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width + pad_w);
                        int xdelta = xend - xstart;
                        xstart = max(xstart, 0);
                        xend = min(xend, inp_width);
                        float inv_kernel_area = 1.f/(ydelta*xdelta);

#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
                            v_float32x4 ikarea = v_setall_f32(inv_kernel_area);

                            for (int y = ystart; y < yend; ++y)
                            {
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                   srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                    v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                   srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                    sum_val0 += v0;
                                    sum_val1 += v1;
                                }
                            }
                            v_store(dstData + x0, sum_val0*ikarea);
                            v_store(dstData + x0 + 4, sum_val1*ikarea);
                            x0 += 7;
                        }
                        else
#endif
                        {
                            float sum_val = 0.f;
                            for (int y = ystart; y < yend; ++y)
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    float val = srcData[index];
                                    sum_val += val;
                                }

                            dstData[x0] = sum_val*inv_kernel_area;
                        }
                    }
                }
            }
        }
    };

    void maxPooling(Mat &src, Mat &dst, Mat &mask)
    {
        const int nstripes = getNumThreads();
        Mat rois;
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
    }

    void avePooling(Mat &src, Mat &dst)
    {
        const int nstripes = getNumThreads();
        Mat rois, mask;
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
    }

    void roiPooling(const Mat &src, const Mat &rois, Mat &dst)
    {
        const int nstripes = getNumThreads();
        Mat mask;
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
    }
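    // Halide backend: max pooling is expressed as an argmax reduction over the kernel
    // window (the second output channel stores the flat input index of the maximum),
    // and average pooling as a sum over the window scaled by 1/kernel.area().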
    virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        const int inWidth = inputBuffer.width();
        const int inHeight = inputBuffer.height();

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        Halide::Expr kx, ky;
        if (pad.width || pad.height)
        {
            kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
            ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
        }
        else
        {
            kx = min(x * stride.width + r.x, inWidth - 1);
            ky = min(y * stride.height + r.y, inHeight - 1);
        }

        // Halide::argmax returns tuple (r.x, r.y, max).
        Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));

        // Compute offset from argmax in range [0, kernel_size).
        Halide::Expr max_index;
        if (pad.width || pad.height)
        {
            max_index = clamp(y * stride.height + res[1] - pad.height,
                              0, inHeight - 1) * inWidth +
                        clamp(x * stride.width + res[0] - pad.width,
                              0, inWidth - 1);
        }
        else
        {
            max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
                        min(x * stride.width + res[0], inWidth - 1);
        }
        top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

    virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);

        const int inW = inputBuffer.width(), inH = inputBuffer.height();
        if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
        {
            CV_Error(cv::Error::StsNotImplemented,
                     "Halide backend for average pooling with partial "
                     "kernels is not implemented");
        }

        const float norm = 1.0f / (kernel.width * kernel.height);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        top(x, y, c, n) = sum(
            inputBuffer(x * stride.width + r.x,
                        y * stride.height + r.y, c, n)) * norm;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
                                      const std::vector<Mat> &outputs,
                                      int targetId) const
    {
#ifdef HAVE_HALIDE
        if (targetId != DNN_TARGET_CPU)
        {
            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
            return;
        }
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
                    xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

        int outW, outH, outC, outN;
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

        if (outW < 8 || outH < 8)
        {
            if (outC > 8)
                top.split(c, co, ci, 8)
                   .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(ci);
            else
            {
                top.fuse(y, c, tile).fuse(n, tile, tile)
                   .parallel(tile);
                if (outW > 1)
                    top.vectorize(x);
            }
        }
        else
        {
            if (outC > 8)
                top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
                   .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
            else
                top.split(x, xo, xi, 8).split(y, yo, yi, 8)
                   .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
        }
#endif  // HAVE_HALIDE
    }
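    // Output spatial size when padMode is empty: out = 1 + (in + 2*pad - kernel) / stride,
    // rounded up if ceil_mode is set and down otherwise. For example, in = 8, kernel = 3,
    // stride = 2, pad = 0 gives 1 + 2.5 -> 4 with ceil_mode and 3 without it.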
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const
    {
        CV_Assert(inputs.size() != 0);
        Size in(inputs[0][3], inputs[0][2]), out;

        if (globalPooling)
        {
            out.height = 1;
            out.width = 1;
        }
        else if (type == PoolingLayer::ROI)
        {
            out.height = pooledSize.height;
            out.width = pooledSize.width;
        }
        else if (padMode.empty())
        {
            float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
            float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
            out.height = 1 + (ceilMode ? ceil(height) : floor(height));
            out.width = 1 + (ceilMode ? ceil(width) : floor(width));

            if (pad.height || pad.width)
            {
                // If we have padding, ensure that the last pooling starts strictly
                // inside the image (instead of at the padding); otherwise clip the last.
                if ((out.height - 1) * stride.height >= in.height + pad.height)
                    --out.height;
                if ((out.width - 1) * stride.width >= in.width + pad.width)
                    --out.width;
                CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
                CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
            }
        }
        else
        {
            getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
        }

        int dims[] = {inputs[0][0], inputs[0][1], out.height, out.width};
        if (type == ROI)
        {
            CV_Assert(inputs.size() == 2);
            dims[0] = inputs[1][0];  // Number of proposals
        }
        outputs.assign(type == MAX ? 2 : 1, shape(dims));
        return false;
    }

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const
    {
        (void)inputs; // suppress unused variable warning
        long flops = 0;

        for(int i = 0; i < outputs.size(); i++)
        {
            if (type == MAX)
            {
                if (i%2 == 0)
                    flops += total(outputs[i])*kernel.area();
            }
            else
            {
                flops += total(outputs[i])*(kernel.area() + 1);
            }
        }

        return flops;
    }
};

Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
{
    return Ptr<PoolingLayer>(new PoolingLayerImpl(params));
}

}
}