// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.

#include "precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include "op_halide.hpp"
#include "net_impl.hpp"

#ifdef HAVE_HALIDE
#include "halide_scheduler.hpp"
#include <algorithm>
#include <atomic>
#include <functional>
#include <thread>
#endif  // HAVE_HALIDE

namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN

void Net::Impl::setHalideScheduler(const String& scheduler)
{
    halideConfigFile = scheduler;
}

#ifdef HAVE_HALIDE
void Net::Impl::compileHalide()
{
    CV_TRACE_FUNCTION();

    CV_Assert(preferableBackend == DNN_BACKEND_HALIDE);

    HalideScheduler scheduler(halideConfigFile);
    std::vector<std::reference_wrapper<LayerData> > compileList;
    compileList.reserve(64);
    for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
    {
        LayerData& ld = it->second;
        Ptr<Layer> layer = ld.layerInstance;
        if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skip)
        {
            CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty());
            bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]);
            if (!scheduled)
            {
                // Use automatic scheduling provided by layer.
                layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
                                            ld.inputBlobs, ld.outputBlobs,
                                            preferableTarget);
            }
            compileList.emplace_back(ld);
        }
    }
    std::atomic<int> progress(0);
    auto fn = ([&] () -> void
    {
        for (;;)
        {
            int id = progress.fetch_add(1);
            if ((size_t)id >= compileList.size())
                return;
            const LayerData& ld = compileList[id].get();
            Ptr<BackendNode> node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second;
            dnn::compileHalide(ld.outputBlobs, node, preferableTarget);
        }
    });
    size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency());
    num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads));
    std::vector<std::thread> threads(num_threads - 1);
    for (auto& t: threads) t = std::thread(fn);
    fn();  // process own tasks
    for (auto& t: threads) t.join();
}
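// A minimal usage sketch of the path above (illustrative only; the model file
// names and the "scheduler.yml" config name are hypothetical). Selecting the
// Halide backend makes the first forward pass run initHalideBackend() and
// compileHalide():
//
//     Net net = readNetFromCaffe("model.prototxt", "model.caffemodel");
//     net.setPreferableBackend(DNN_BACKEND_HALIDE);
//     net.setPreferableTarget(DNN_TARGET_CPU);
//     net.setHalideScheduler("scheduler.yml");  // optional manual schedules
//     net.setInput(blob);
//     Mat out = net.forward();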
void Net::Impl::initHalideBackend()
{
    CV_TRACE_FUNCTION();
    CV_Assert_N(preferableBackend == DNN_BACKEND_HALIDE, haveHalide());

    // Iterator to current layer.
    MapIdToLayerData::iterator it = layers.begin();
    // Iterator to base layer for fusion. For example, in the case of
    // conv+bn+relu it'll be a conv layer.
    MapIdToLayerData::iterator baseIt = layers.begin();
    for (; it != layers.end(); it++)
    {
        LayerData& ldTop = it->second;
        Ptr<Layer> layerTop = ldTop.layerInstance;
        if (!layerTop->supportBackend(preferableBackend))
        {
            // Move the base iterator to a layer that doesn't support the
            // preferable backend to prevent fusion over a layer of a
            // different backend.
            baseIt = it;
            continue;
        }
        // Try to do layers fusion.
        LayerData& ldBot = baseIt->second;
        Ptr<Layer> layerBot = ldBot.layerInstance;
        // 1. Check that bottom and top layers are from the same backend.
        if (it != layers.begin() && layerBot->supportBackend(preferableBackend))
        {
            // 2. Check that current layer works in-place.
            bool inPlace = ldTop.inputBlobs.size() == 1 &&
                           ldBot.outputBlobs.size() == 1 &&
                           ldTop.inputBlobs[0]->data == ldBot.outputBlobs[0].data;
            if (inPlace)
            {
                // 3. Try to attach node.
                CV_Assert(!ldBot.backendNodes[preferableBackend].empty());
                Ptr<BackendNode> fusedNode =
                    layerTop->tryAttach(ldBot.backendNodes[preferableBackend]);
                if (!fusedNode.empty())
                {
                    ldTop.skip = true;
                    ldBot.backendNodes[preferableBackend] = fusedNode;
                    ldBot.outputBlobsWrappers = ldTop.outputBlobsWrappers;
                    continue;
                }
            }
        }
        // No layers fusion.
        ldTop.skip = false;
        ldTop.backendNodes[DNN_BACKEND_HALIDE] =
            layerTop->initHalide(ldTop.inputBlobsWrappers);
        baseIt = it;
    }
}
#endif  // HAVE_HALIDE

CV__DNN_INLINE_NS_END

#ifdef HAVE_HALIDE
static MatShape getBufferShape(const MatShape& shape)
{
    if (shape.size() == 2 || shape.size() == 4)
    {
        int w, h, c, n;
        getCanonicalSize(shape, &w, &h, &c, &n);
        return {w, h, c, n};
    }
    else
    {
        MatShape bufferShape(shape);
        std::reverse(bufferShape.begin(), bufferShape.end());
        return bufferShape;
    }
}

static MatShape getBufferShape(const MatSize& size)
{
    return getBufferShape(shape(size));
}

Halide::Buffer<float> wrapToHalideBuffer(const Mat& mat)
{
    return wrapToHalideBuffer(mat, getBufferShape(mat.size));
}

Halide::Buffer<float> wrapToHalideBuffer(const Mat& mat,
                                         const std::vector<int>& sizes)
{
    Halide::Buffer<float> buffer((float*)mat.data, sizes);
    buffer.set_host_dirty();  // Indicate that data is on CPU.
    return buffer;
}

Halide::Buffer<> halideBuffer(const Ptr<BackendWrapper>& ptr)
{
    CV_Assert(!ptr.empty());
    return ptr.dynamicCast<HalideBackendWrapper>()->buffer;
}

std::vector<Halide::Buffer<> > halideBuffers(const std::vector<Ptr<BackendWrapper> >& ptrs)
{
    std::vector<Halide::Buffer<> > vec;
    vec.reserve(ptrs.size());
    for (const Ptr<BackendWrapper>& ptr : ptrs)
    {
        vec.push_back(halideBuffer(ptr));
    }
    return vec;
}

void getCanonicalSize(const Halide::Buffer<>& buffer, int* width, int* height,
                      int* channels, int* batch)
{
    CV_Assert(buffer.dimensions() == 4);
    *width = buffer.extent(0);
    *height = buffer.extent(1);
    *channels = buffer.extent(2);
    *batch = buffer.extent(3);
}

HalideBackendNode::HalideBackendNode(const Halide::Func& func)
    : BackendNode(DNN_BACKEND_HALIDE), funcs(1, func) {}

HalideBackendNode::HalideBackendNode(const std::vector<Halide::Func>& funcs)
    : BackendNode(DNN_BACKEND_HALIDE), funcs(funcs) {}

HalideBackendNode::HalideBackendNode(const Ptr<HalideBackendNode>& base,
                                     const Halide::Func& top)
    : BackendNode(DNN_BACKEND_HALIDE), funcs(base->funcs)
{
    funcs.back() = top;
}

HalideBackendWrapper::HalideBackendWrapper(int targetId, const cv::Mat& m)
    : BackendWrapper(DNN_BACKEND_HALIDE, targetId)
{
    managesDevMemory = true;
    buffer = wrapToHalideBuffer(m);
    if (targetId == DNN_TARGET_CPU)
    {
        return;
    }
    else if (targetId == DNN_TARGET_OPENCL)
    {
        Halide::Target t = Halide::get_host_target();
        t.set_feature(Halide::Target::OpenCL);
        buffer.copy_to_device(t);
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown target identifier");
}

HalideBackendWrapper::HalideBackendWrapper(const Ptr<BackendWrapper>& base,
                                           const MatShape& shape)
    : BackendWrapper(DNN_BACKEND_HALIDE, base->targetId)
{
    managesDevMemory = false;
    Halide::Buffer<float> baseBuffer = halideBuffer(base);
    buffer = Halide::Buffer<float>((float*)baseBuffer.raw_buffer()->host,
                                   getBufferShape(shape));
    if (baseBuffer.has_device_allocation())
    {
        buffer.raw_buffer()->device = baseBuffer.raw_buffer()->device;
        buffer.raw_buffer()->device_interface = baseBuffer.raw_buffer()->device_interface;
        buffer.set_device_dirty();
    }
    else
    {
        buffer.set_host_dirty();  // Indicate that data is on CPU.
        CV_Assert(targetId == DNN_TARGET_CPU);
    }
}
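// Illustrative sketch of the wrapping convention implemented above, assuming a
// hypothetical NCHW blob of shape 1x3x224x224: getBufferShape() reverses the
// dimension order, so the resulting Halide buffer has extents
// (W, H, C, N) = (224, 224, 3, 1), with the fastest-varying dimension first.
//
//     int sz[] = {1, 3, 224, 224};
//     Mat blob(4, sz, CV_32F);
//     Halide::Buffer<float> buf = wrapToHalideBuffer(blob);
//     CV_Assert(buf.extent(0) == 224 && buf.extent(2) == 3 && buf.extent(3) == 1);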
HalideBackendWrapper::~HalideBackendWrapper()
{
    if (buffer.has_device_allocation() && !managesDevMemory)
    {
        buffer.raw_buffer()->device = 0;
        buffer.raw_buffer()->device_interface = 0;
        buffer.set_device_dirty(false);
    }
}

void HalideBackendWrapper::copyToHost()
{
    if (buffer.device_dirty())
    {
        buffer.device_sync();
        buffer.copy_to_host();
    }
}

void HalideBackendWrapper::setHostDirty()
{
    buffer.set_device_dirty(false);
    buffer.set_host_dirty();
}
#endif  // HAVE_HALIDE

void getCanonicalSize(const MatSize& size, int* w, int* h, int* c, int* n)
{
    getCanonicalSize(shape(size), w, h, c, n);
}

void getCanonicalSize(const MatShape& shape, int* width, int* height,
                      int* channels, int* batch)
{
    const int dims = shape.size();
    CV_Assert(dims == 2 || dims == 4);
    *batch = shape[0];
    *channels = shape[1];
    if (dims == 4)
    {
        *width = shape[3];
        *height = shape[2];
    }
    else
    {
        *width = 1;
        *height = 1;
    }
}

void compileHalide(const std::vector<Mat>& outputs, Ptr<BackendNode>& node, int targetId)
{
#ifdef HAVE_HALIDE
    CV_Assert(!node.empty());
    Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

    int outW, outH, outC, outN;
    Halide::Var x("x"), y("y"), c("c"), n("n");
    getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
    top.bound(x, 0, outW).bound(y, 0, outH)
       .bound(c, 0, outC).bound(n, 0, outN);

    Halide::Target target = Halide::get_host_target();
    target.set_feature(Halide::Target::NoAsserts);
    if (targetId == DNN_TARGET_OPENCL)
    {
        target.set_feature(Halide::Target::OpenCL);
    }
    CV_Assert(target.supported());
    top.compile_jit(target);
#endif  // HAVE_HALIDE
}

void forwardHalide(std::vector<Ptr<BackendWrapper> >& outputs,
                   const Ptr<BackendNode>& node)
{
#ifdef HAVE_HALIDE
    CV_Assert(!node.empty());
    Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
    auto outputBuffers = halideBuffers(outputs);
    top.realize(Halide::Realization(outputBuffers));
#endif  // HAVE_HALIDE
}

bool haveHalide()
{
#ifdef HAVE_HALIDE
    return true;
#else
    return false;
#endif  // HAVE_HALIDE
}
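// Illustrative sketch of what compileHalide() above does for a hypothetical
// 1x16x8x8 (NCHW) output: the output Func is bounded to exactly
// (W, H, C, N) = (8, 8, 16, 1), which, together with Target::NoAsserts,
// lets Halide elide bounds checks and specialize the JIT-compiled code.
//
//     Halide::Var x("x"), y("y"), c("c"), n("n");
//     top.bound(x, 0, 8).bound(y, 0, 8).bound(c, 0, 16).bound(n, 0, 1);
//     Halide::Target target = Halide::get_host_target();
//     target.set_feature(Halide::Target::NoAsserts);
//     top.compile_jit(target);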
CV__DNN_INLINE_NS_BEGIN

void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*>& inputs,
                                 const std::vector<Mat>& outputs, int targetId) const
{
#ifndef HAVE_HALIDE
    CV_Error(Error::StsNotImplemented, "");
#else
    CV_TRACE_FUNCTION();
    Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
                xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
    Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

    int outW, outH, outC, outN;
    getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

    if (targetId == DNN_TARGET_CPU)
    {
        if (outW == 1 && outH == 1)
        {
            if (outC + outN == 1)
                return;

            if (outC > 8)
                top.split(c, co, ci, 8)
                   .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(ci, 8);
            else
                top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                   .parallel(tile);
        }
        else
        {
            if (outH > 2)
            {
                top.reorder(x, c, y)
                   .split(y, yo, yi, 2)
                   .fuse(yo, n, tile)
                   .parallel(tile)
                   .unroll(yi)
                   .vectorize(x, outW >= 16 ? 16 : outW);
            }
        }
    }
    else if (targetId == DNN_TARGET_OPENCL)
    {
        if (outW == 1 && outH == 1)
        {
            int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
            top.split(c, co, ci, c_split)
               .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
               .gpu_blocks(tile)
               .gpu_threads(ci);
        }
        else
        {
            int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
            int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
            // Supported vectorization widths: 2, 3, 4, 8, 16
            int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : std::min(4, outC);
            top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
               .split(c, co, ci, c_split)
               .gpu_blocks(xo, yo, co)
               .gpu_threads(xi, yi)
               .reorder(xi, yi, ci, xo, yo, co)
               .vectorize(ci);
        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown target identifier");
#endif  // HAVE_HALIDE
}

CV__DNN_INLINE_NS_END
}}  // namespace cv::dnn
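// Worked example of the CPU branch of Layer::applyHalideScheduler above
// (illustrative), for a hypothetical 1x32x1x1 output (outW == outH == 1,
// outC == 32 > 8): the channel loop is split into groups of 8, all outer
// loops are fused into a single "tile" loop that is parallelized across
// threads, and the 8 inner channels are vectorized:
//
//     top.split(c, co, ci, 8)
//        .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
//        .parallel(tile)
//        .vectorize(ci, 8);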