// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <numeric>

namespace cv { namespace dnn {

class ReduceLayerImpl CV_FINAL : public ReduceLayer
{
public:
    ReduceLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);

        // set reduce type
        CV_Assert(params.has("reduce"));
        String op_type = toLowerCase(params.get<String>("reduce"));
        if (op_type == "max")
            reduce_type = ReduceType::MAX;
        else if (op_type == "min")
            reduce_type = ReduceType::MIN;
        else if (op_type == "mean")
            reduce_type = ReduceType::MEAN;
        else if (op_type == "sum")
            reduce_type = ReduceType::SUM;
        else if (op_type == "sum_square")
            reduce_type = ReduceType::SUM_SQUARE;
        else if (op_type == "l1")
            reduce_type = ReduceType::L1;
        else if (op_type == "l2")
            reduce_type = ReduceType::L2;
        else if (op_type == "log_sum")
            reduce_type = ReduceType::LOG_SUM;
        else if (op_type == "log_sum_exp")
            reduce_type = ReduceType::LOG_SUM_EXP;
        else if (op_type == "prod")
            reduce_type = ReduceType::PROD;
        else
            CV_Error(Error::StsBadArg, "Unknown reduce type \"" + op_type + "\"");

        keepdims = params.get<bool>("keepdims", true);
        noop_with_empty_axes = params.get<bool>("noop_with_empty_axes", false);

        // get axes if they are given; otherwise reduce over all axes
        if (params.has("axes"))
        {
            auto param_axes = params.get("axes");
            int num_axes = param_axes.size();
            axes.resize(num_axes);
            for (int i = 0; i < num_axes; ++i)
                axes[i] = param_axes.get<int>(i);
        }
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV;
    }

    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        if (axes.empty())
            return;

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        auto shape_input = shape(inputs[0]);
        for (size_t i = 0; i < axes.size(); ++i)
        {
            auto norm_axis = normalize_axis(axes[i], shape_input);
            axes[i] = norm_axis;
        }

        bool do_nothing = true;
        for (auto axis : axes)
        {
            if (shape_input[axis] != 1)
                do_nothing = false;
        }
        if (do_nothing)
        {
            axes.clear();
            noop_with_empty_axes = true;
        }
    }

    bool getMemoryShapes(const std::vector<MatShape>& inputs,
                         const int requiredOutputs,
                         std::vector<MatShape>& outputs,
                         std::vector<MatShape>& internals) const CV_OVERRIDE
    {
        // empty axes
        if (axes.empty())
        {
            if (noop_with_empty_axes)
            {
                // do nothing
                outputs.assign(1, inputs[0]);
            }
            else
            {
                // reduce all axes
                MatShape shape_output;
                if (keepdims)
                {
                    shape_output = inputs[0];
                    for (size_t i = 0; i < shape_output.size(); ++i)
                        shape_output[i] = 1;
                }
                else
                {
                    shape_output.push_back(1);
                }
                outputs.assign(1, shape_output);
            }
        }
        else
        {
            auto shape_output_ = inputs[0];
            for (size_t i = 0; i < axes.size(); ++i)
            {
                auto norm_axis = normalize_axis(axes[i], inputs[0]);
                shape_output_[norm_axis] = -1;
            }
            MatShape shape_output;
            for (size_t i = 0; i < shape_output_.size(); ++i)
            {
                if (shape_output_[i] == -1)
                {
                    if (keepdims)
                        shape_output.push_back(1);
                    else
                        continue;
                }
                else
                    shape_output.push_back(shape_output_[i]);
            }
            if (shape_output.empty())
                shape_output.push_back(1);

            outputs.assign(1, shape_output);
        }

        return false;
    }
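
    // Each reduction below is implemented as a small accumulator functor:
    // it is constructed with the number of elements to reduce (n) and an
    // initial value, consumes every element through update(), and yields
    // the result from get_value(). For example, ReduceMean over {1, 2, 3}
    // accumulates the sum 6 in update() and divides by n = 3 in
    // get_value(), producing 2.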
    template <typename T>
    class ReduceBase
    {
    public:
        using dtype_input = T;

        ReduceBase(size_t n, const T& init) : n_(n), accumulator_(init) {}
        virtual void update(const T& a) = 0;
        virtual T get_value() { return accumulator_; }
        virtual ~ReduceBase() = default;

    protected:
        size_t n_;
        T accumulator_;
    };

    template <typename T>
    class ReduceMin : public ReduceBase<T>
    {
    public:
        ReduceMin(size_t n, const T& init) : ReduceBase<T>(n, init) {}
        void update(const T& a) override
        {
            this->accumulator_ = a > this->accumulator_ ? this->accumulator_ : a;
        }
    };

    template <typename T>
    class ReduceMax : public ReduceBase<T>
    {
    public:
        ReduceMax(size_t n, const T& init) : ReduceBase<T>(n, init) {}
        void update(const T& a) override
        {
            this->accumulator_ = a > this->accumulator_ ? a : this->accumulator_;
        }
    };

    template <typename T>
    class ReduceSum : public ReduceBase<T>
    {
    public:
        ReduceSum(size_t n, const T& init) : ReduceBase<T>(n, 0) {}
        void update(const T& a) override
        {
            this->accumulator_ += a;
        }
    };

    template <typename T>
    class ReduceMean : public ReduceSum<T>
    {
    public:
        ReduceMean(size_t n, const T& init) : ReduceSum<T>(n, init) {}
        T get_value() override
        {
            return this->accumulator_ / static_cast<T>(this->n_);
        }
    };

    template <typename T>
    class ReduceSumSquare : public ReduceBase<T>
    {
    public:
        ReduceSumSquare(size_t n, const T& init) : ReduceBase<T>(n, 0) {}
        void update(const T& a) override
        {
            this->accumulator_ += a * a;
        }
    };

    template <typename T>
    class ReduceL1 : public ReduceBase<T>
    {
    public:
        ReduceL1(size_t n, const T& init) : ReduceBase<T>(n, 0) {}
        void update(const T& a) override
        {
            this->accumulator_ += a > 0 ? a : -a;
        }
    };

    template <typename T>
    class ReduceL2 : public ReduceBase<T>
    {
    public:
        ReduceL2(size_t n, const T& init) : ReduceBase<T>(n, 0) {}
        void update(const T& a) override
        {
            this->accumulator_ += a * a;
        }
        T get_value() override
        {
            return std::sqrt(this->accumulator_);
        }
    };

    template <typename T>
    class ReduceProd : public ReduceBase<T>
    {
    public:
        ReduceProd(size_t n, const T& init) : ReduceBase<T>(n, 1) {}
        void update(const T& a) override
        {
            this->accumulator_ *= a;
        }
    };

    template <typename T>
    class ReduceLogSum : public ReduceBase<T>
    {
    public:
        ReduceLogSum(size_t n, const T& init) : ReduceBase<T>(n, 0) {}
        void update(const T& a) override
        {
            this->accumulator_ += a;
        }
        T get_value() override
        {
            return static_cast<T>(std::log(this->accumulator_));
        }
    };

    // FIXME: overflow caution
    template <typename T>
    class ReduceLogSumExp : public ReduceBase<T>
    {
    public:
        ReduceLogSumExp(size_t n, const T& init) : ReduceBase<T>(n, 0) {}
        void update(const T& a) override
        {
            this->accumulator_ += static_cast<T>(std::exp(a));
        }
        T get_value() override
        {
            return static_cast<T>(std::log(this->accumulator_));
        }
    };

    template <typename Op>
    class ReduceAllInvoker : public ParallelLoopBody
    {
    public:
        using dtype = typename Op::dtype_input;

        const Mat& src;
        Mat& dst;

        int n_reduce;
        int loop_size;

        int total;
        int cost_per_thread;

        ReduceAllInvoker(const Mat& src_, Mat& dst_) : src(src_), dst(dst_)
        {
            auto shape_src = shape(src);

            n_reduce = std::accumulate(shape_src.begin(), shape_src.end(), 1, std::multiplies<int>());
            loop_size = n_reduce;

            total = 1;
            cost_per_thread = 1;
        }

        void operator()(const Range& r) const CV_OVERRIDE
        {
            int start = r.start;
            int end = r.end;

            const dtype* p_src = src.ptr<dtype>();
            dtype* p_dst = dst.ptr<dtype>();

            for (int i = start; i < end; ++i)
            {
                Op accumulator(n_reduce, *p_src);
                for (int l = 0; l < loop_size; ++l)
                {
                    accumulator.update(p_src[l]);
                }
                p_dst[i] = accumulator.get_value();
            }
        }
    };
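
    // ReduceInvoker splits the (contiguous) source into two index sets:
    // the reduced axes and the remaining (kept) axes. For each output
    // element it starts from an "origin" offset enumerated through
    // unprojected_steps (combinations of the kept axes, except the last
    // one, which is walked linearly via last_unreduced_step), and
    // accumulates over projected_steps (combinations of the reduced axes,
    // except the last one, which is walked via last_reduced_step).
    // For example, for src shape [2, 3, 4] and axes = {1}: the strides
    // are [12, 4, 1], projected_steps = {0}, last_reduced_dim = 3 with
    // step 4, unprojected_steps = {0, 12}, and last_unreduced_dim = 4
    // with step 1.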
    template <typename Op>
    class ReduceInvoker : public ParallelLoopBody
    {
    public:
        using dtype = typename Op::dtype_input;

        const Mat& src;
        Mat& dst;

        std::vector<int> reduced_axes; // assumed to be in ascending order

        int n_reduce;
        int loop_size;

        int last_reduced_dim;
        int last_reduced_step;
        std::vector<int> projected_steps;

        int last_unreduced_dim;
        int last_unreduced_step;
        std::vector<int> unprojected_steps;

        int total;
        int cost_per_thread;

        ReduceInvoker(const Mat& src_, Mat& dst_, std::vector<int> axes_) : src(src_), dst(dst_), reduced_axes(axes_)
        {
            auto shape_src = shape(src);

            auto steps_src = shape_src;
            steps_src[steps_src.size() - 1] = 1;
            for (int i = static_cast<int>(steps_src.size()) - 2; i >= 0; --i)
                steps_src[i] = steps_src[i + 1] * shape_src[i + 1];

            size_t projection_size = 1;
            for (auto axis : reduced_axes)
            {
                projection_size *= shape_src[axis];
            }
            n_reduce = static_cast<int>(projection_size);

            last_reduced_dim = shape_src[reduced_axes.back()];
            last_reduced_step = steps_src[reduced_axes.back()];
            loop_size = last_reduced_dim * last_reduced_step;
            projection_size /= last_reduced_dim;

            // calculate projected_steps
            int last_reduced_axis = static_cast<int>(reduced_axes.size()) - 1;
            if (last_reduced_axis == 0)
            {
                projected_steps.resize(1, 0);
            }
            else
            {
                projected_steps.resize(projection_size);
                std::vector<int> projected_indices(last_reduced_axis, 0);
                for (size_t i = 0, current_step = 0; i < projection_size; ++i)
                {
                    projected_steps[i] = static_cast<int>(current_step);
                    ++projected_indices[last_reduced_axis - 1];
                    current_step += steps_src[reduced_axes[last_reduced_axis - 1]];
                    for (int j = last_reduced_axis - 1; j > 0; --j)
                    {
                        if (projected_indices[j] < shape_src[reduced_axes[j]])
                        {
                            break;
                        }
                        projected_indices[j] = 0;
                        current_step -= shape_src[reduced_axes[j]] * steps_src[reduced_axes[j]];
                        ++projected_indices[j - 1];
                        current_step += steps_src[reduced_axes[j - 1]];
                    }
                }
            }

            // calculate unprojected_steps
            std::vector<int> unreduced_axes;
            for (int i = 0; i < static_cast<int>(shape_src.size()); ++i)
            {
                if (std::find(reduced_axes.begin(), reduced_axes.end(), i) == reduced_axes.end())
                {
                    unreduced_axes.push_back(i);
                }
            }
            size_t unprojection_size = 1;
            for (auto axis : unreduced_axes)
            {
                unprojection_size *= shape_src[axis];
            }
            last_unreduced_dim = shape_src[unreduced_axes.back()];
            last_unreduced_step = steps_src[unreduced_axes.back()];
            unprojection_size /= last_unreduced_dim;

            std::vector<int> unprojected_indices(unreduced_axes.size(), 0);
            unprojected_steps.reserve(unprojection_size);
            if (unprojected_indices.size() <= 1)
            {
                unprojected_steps.push_back(0);
            }
            else
            {
                for (size_t i = 0, current_step = 0; i < unprojection_size; ++i)
                {
                    unprojected_steps.push_back(static_cast<int>(current_step));
                    ++unprojected_indices[unprojected_indices.size() - 2];
                    current_step += steps_src[unreduced_axes[unreduced_axes.size() - 2]];
                    for (int j = static_cast<int>(unreduced_axes.size()) - 2; j > 0; --j)
                    {
                        if (unprojected_indices[j] < shape_src[unreduced_axes[j]])
                        {
                            break;
                        }
                        unprojected_indices[j] -= shape_src[unreduced_axes[j]];
                        current_step -= shape_src[unreduced_axes[j]] * steps_src[unreduced_axes[j]];
                        ++unprojected_indices[j - 1];
                        current_step += steps_src[unreduced_axes[j - 1]];
                    }
                }
            }

            auto shape_dst = shape(dst);
            total = std::accumulate(shape_dst.begin(), shape_dst.end(), 1, std::multiplies<int>());
            cost_per_thread = static_cast<int>(projected_steps.size() * last_reduced_step);
        }

        static void run(const Mat& src, Mat& dst, std::vector<int> axes, bool noop_with_empty_axes)
        {
            CV_Assert(src.isContinuous());
            CV_Assert(dst.isContinuous());

            if (axes.empty())
            {
                if (noop_with_empty_axes)
                {
                    // copyTo is not used here for the reason that we want a
                    // copy for the case when dims at all axes are 1
                    const dtype* p_src = src.ptr<dtype>();
                    dtype* p_dst = dst.ptr<dtype>();
                    std::memcpy(p_dst, p_src, sizeof(dtype) * dst.total());
                    return;
                }

                ReduceAllInvoker<Op> p(src, dst);
                double nstripes = (size_t)p.total * (size_t)p.cost_per_thread * (1 / 1024.0);
                parallel_for_(Range(0, p.total), p, nstripes);
                return;
            }

            ReduceInvoker<Op> p(src, dst, axes);
            double nstripes = (size_t)p.total * (size_t)p.cost_per_thread * (1 / 1024.0);
            parallel_for_(Range(0, p.total), p, nstripes);
        }
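
        // Each parallel job covers the output range [r.start, r.end). An
        // output index i decomposes into main_index (which combination of
        // the kept axes, excluding the last one) and loop (the position
        // along the last kept axis); origin is the matching source offset,
        // advanced incrementally instead of being recomputed per element.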
        void operator()(const Range& r) const CV_OVERRIDE
        {
            int start = r.start;
            int end = r.end;

            const dtype* p_src = src.ptr<dtype>();
            dtype* p_dst = dst.ptr<dtype>();

            size_t main_index = start / last_unreduced_dim;
            size_t loop = start % last_unreduced_dim;
            size_t origin = unprojected_steps[main_index] + loop * last_unreduced_step;
            for (int i = start; i < end; ++i)
            {
                Op accumulator(n_reduce, p_src[origin + projected_steps[0]]);
                for (auto projected_step : projected_steps)
                {
                    const dtype* loop_p_src = p_src + origin + projected_step;
                    for (int l = 0; l < loop_size; l += last_reduced_step)
                    {
                        accumulator.update(loop_p_src[l]);
                    }
                }
                p_dst[i] = accumulator.get_value();

                ++loop;
                if (loop >= (size_t)last_unreduced_dim)
                {
                    loop = 0;
                    ++main_index;
                    if (main_index < unprojected_steps.size())
                    {
                        origin = unprojected_steps[main_index];
                    }
                }
                else
                {
                    origin += last_unreduced_step;
                }
            }
        }
    };

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        if (inputs_arr.depth() == CV_16F)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        typeDispatch(outputs[0].type(), inputs[0], outputs[0], axes, noop_with_empty_axes);
    }

    template <typename T, typename... Args>
    inline void opDispatch(Args&&... args)
    {
        switch (reduce_type)
        {
            case ReduceType::MAX: ReduceInvoker<ReduceMax<T>>::run(std::forward<Args>(args)...); break;
            case ReduceType::MIN: ReduceInvoker<ReduceMin<T>>::run(std::forward<Args>(args)...); break;
            case ReduceType::MEAN: ReduceInvoker<ReduceMean<T>>::run(std::forward<Args>(args)...); break;
            case ReduceType::SUM: ReduceInvoker<ReduceSum<T>>::run(std::forward<Args>(args)...); break;
            case ReduceType::L1: ReduceInvoker<ReduceL1<T>>::run(std::forward<Args>(args)...); break;
            case ReduceType::L2: ReduceInvoker<ReduceL2<T>>::run(std::forward<Args>(args)...); break;
            case ReduceType::PROD: ReduceInvoker<ReduceProd<T>>::run(std::forward<Args>(args)...); break;
            case ReduceType::SUM_SQUARE: ReduceInvoker<ReduceSumSquare<T>>::run(std::forward<Args>(args)...); break;
            case ReduceType::LOG_SUM: ReduceInvoker<ReduceLogSum<T>>::run(std::forward<Args>(args)...); break;
            case ReduceType::LOG_SUM_EXP: ReduceInvoker<ReduceLogSumExp<T>>::run(std::forward<Args>(args)...); break;
            default: CV_Error(Error::StsBadArg, "DNN/Reduce: Unsupported operation.");
        }
    }

    template <typename... Args>
    inline void typeDispatch(const int type, Args&&... args)
    {
        switch (type)
        {
            case CV_8U: opDispatch<uint8_t>(std::forward<Args>(args)...); break;
            case CV_32S: opDispatch<int32_t>(std::forward<Args>(args)...); break;
            case CV_32F: opDispatch<float>(std::forward<Args>(args)...); break;
            default: CV_Error(cv::Error::BadDepth, "DNN/Reduce: Unsupported type.");
        }
    }

private:
    enum ReduceType
    {
        MAX,
        MIN,
        MEAN,
        SUM,
        L1,
        L2,
        PROD,
        SUM_SQUARE,
        LOG_SUM,
        LOG_SUM_EXP
    } reduce_type;
    bool keepdims;
    bool noop_with_empty_axes;
    std::vector<int> axes;
};

Ptr<ReduceLayer> ReduceLayer::create(const LayerParams& params)
{
    return Ptr<ReduceLayer>(new ReduceLayerImpl(params));
}

}} // namespace cv::dnn
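
// A minimal usage sketch (illustrative only; the parameter names follow
// the conventions parsed by ReduceLayerImpl above):
//
//     LayerParams lp;
//     lp.type = "Reduce";
//     lp.name = "reduce_mean";
//     lp.set("reduce", "mean");   // one of max/min/mean/sum/sum_square/
//                                 // l1/l2/log_sum/log_sum_exp/prod
//     lp.set("axes", 1);          // reduce over axis 1
//     lp.set("keepdims", true);   // keep the reduced axis as size 1
//     Ptr<ReduceLayer> layer = ReduceLayer::create(lp);
//
// With an input of shape [2, 3, 4], getMemoryShapes() reports an output
// shape of [2, 1, 4] (or [2, 4] with keepdims = false).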