diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index d7d541474e..9994677cb5 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -160,6 +160,10 @@ public: void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { +#ifdef HAVE_OPENCL + ocl_exec_cache.clear(); +#endif + std::vector inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); @@ -214,26 +218,33 @@ public: } #ifdef HAVE_OPENCL - bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + struct OpenCLExecInfo { - std::vector inputs; - std::vector outputs; + std::string kernel_name; + std::string build_opts; + size_t local_size[2]; + size_t global_size[2]; - inputs_.getUMatVector(inputs); - outputs_.getUMatVector(outputs); + OpenCLExecInfo() + { + local_size[0] = local_size[1] = 0; + global_size[0] = global_size[1] = 0; + } + }; + std::vector ocl_exec_cache; + + void ocl_prepare(const std::vector& inputs, const std::vector& outputs) + { + CV_TRACE_FUNCTION(); CV_Assert(outputs.size() == finalSliceRanges.size()); + ocl_exec_cache.resize(outputs.size()); const UMat& input = inputs[0]; - if (input.dims > 5) - { - CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU"); - return false; - } + const int dims = input.dims; size_t WSZ = 128; - const int dims = input.dims; const int elemSize = (int)input.elemSize(); String opts0 = cv::format( "-DDIMS=%d -DELEMSIZE=%d", @@ -243,10 +254,11 @@ public: { opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]); } - String kname = cv::format("slice_%d", dims); for (size_t i = 0; i < outputs.size(); i++) { - UMat& output = outputs[i]; + OpenCLExecInfo& ocl = ocl_exec_cache[i]; + + const UMat& output = outputs[i]; const std::vector& range = finalSliceRanges[i]; String opts = opts0; @@ -262,6 +274,8 @@ public: CV_CheckEQ(range[d].size(), (int)output.size[d], ""); } + const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64; + int block_dims = 0; size_t block_size = elemSize; for (int i = dims - 1; i >= 0; --i) @@ -270,12 +284,14 @@ public: break; block_size *= output.size[i]; block_dims++; + if (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG) + break; } const size_t total = output.total() * elemSize; size_t num_blocks = total / block_size; - if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64)) + if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG)) { // use 1D copy mode opts += cv::format(" -DUSE_COPY_1D=1"); @@ -345,23 +361,98 @@ public: opts += cv::format(" -DWSZ=%d", (int)WSZ); - size_t local[] = { WSZ, 1 }; - size_t global[] = { WSZ, num_blocks }; + std::ostringstream kernel_suffix; + kernel_suffix << dims << 'x' << elemSize << "_bsz" << block_size; + kernel_suffix << "__src_"; + for (int d = 0; d < dims; d++) + { + kernel_suffix << input.size[dims - 1 - d] << '_'; + } + kernel_suffix << '_'; + /*for (int d = 0; d < dims; d++) + { + kernel_suffix << input.step[dims - 1 - d] << '_'; + } + kernel_suffix << '_';*/ - ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts); + kernel_suffix << "dst_"; + for (int d = 0; d < dims; d++) + { + kernel_suffix << output.size[dims - 1 - d] << '_'; + } + /*kernel_suffix << '_'; + for (int d = 0; d < dims; d++) + { + kernel_suffix << output.step[dims - 1 - d] << '_'; + }*/ + kernel_suffix << "_slice_"; + for (int d = 0; d < dims; d++) + { + kernel_suffix << range[dims - 1 - d].start << '_'; + } + for (int d = 0; d < dims; d++) + { + kernel_suffix << '_' << range[dims - 1 - d].end; + } + + std::string kernel_suffix_str = kernel_suffix.str(); + opts += cv::format(" -DSLICE_KERNEL_SUFFIX=%s", kernel_suffix_str.c_str()); + + ocl.kernel_name = cv::format("slice_%s", kernel_suffix_str.c_str()); + ocl.build_opts = opts; + ocl.local_size[0] = WSZ; + ocl.local_size[1] = 1; + ocl.global_size[0] = WSZ; + ocl.global_size[1] = num_blocks; + } // for outputs.size() + } // ocl_prepare + + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + { + CV_TRACE_FUNCTION(); + + std::vector inputs; + std::vector outputs; + + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + CV_Assert(outputs.size() == finalSliceRanges.size()); + + const UMat& input = inputs[0]; + const int dims = input.dims; + if (dims > 5) + { + CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << dims << ". Fallback to CPU"); + return false; + } + + if (ocl_exec_cache.empty()) + { + ocl_prepare(inputs, outputs); + } + CV_CheckEQ(ocl_exec_cache.size(), outputs.size(), ""); + + for (size_t i = 0; i < outputs.size(); i++) + { + const OpenCLExecInfo& ocl = ocl_exec_cache[i]; + + UMat& output = outputs[i]; + + ocl::Kernel kernel(ocl.kernel_name.c_str(), ocl::dnn::slice_oclsrc, ocl.build_opts); if (kernel.empty()) return false; bool ret = kernel.args( ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output) ) - .run(2, global, local, false); + .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false); if (!ret) return false; } // for outputs.size() return true; - } + } // forward_ocl #endif void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE diff --git a/modules/dnn/src/opencl/slice.cl b/modules/dnn/src/opencl/slice.cl index d468dbc16a..f32d66a9ca 100644 --- a/modules/dnn/src/opencl/slice.cl +++ b/modules/dnn/src/opencl/slice.cl @@ -48,19 +48,85 @@ global: #define BLOCK_COLS_X4 (BLOCK_COLS / 4) #define BLOCK_COLS_X16 (BLOCK_COLS / 16) -#ifdef USE_COPY_1D - -static inline -__attribute__((always_inline)) -void copy_block_1d( +__attribute__((reqd_work_group_size(WSZ, 1, 1))) +__kernel void +CONCAT(slice_, SLICE_KERNEL_SUFFIX)( __global const uchar* src0, - const uint src_offset, - __global uchar* dst0, - const uint dst_offset + __global uchar* dst0 ) { - __global const uchar* src = src0 + src_offset; - __global uchar* dst = dst0 + dst_offset; + uint block_id = get_global_id(1); + uint dst_offset0 = block_id * BLOCK_SIZE; + uint src_offset0 = 0; + + { // calculate src_offset0 + +#define CALC_SRC_INDEX(dim) \ + { \ + uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \ + CONCAT(idx_, dim) = block_id / plane_sz; \ + block_id = block_id - CONCAT(idx_, dim) * plane_sz; \ + } +#define UPDATE_SRC_OFFSET(dim) \ + src_offset0 = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset0); +/* + if (get_global_id(0) == 0 && get_global_id(1) == 0) \ + printf("(%d, %d): @%d src_offset0=%d idx_dim=%d block_id=%d\n", \ + get_global_id(0), get_global_id(1), \ + dim, src_offset0, CONCAT(idx_, dim), block_id \ + ); +*/ + +#if DIMS > 5 +#error "invalid configuration" +#endif +#if DIMS > 4 + uint idx_4 = 0; +#if BLOCK_DIMS <= 4 + CALC_SRC_INDEX(4) +#endif + UPDATE_SRC_OFFSET(4) +#endif +#if DIMS > 3 + uint idx_3 = 0; +#if BLOCK_DIMS <= 3 + CALC_SRC_INDEX(3) +#endif + UPDATE_SRC_OFFSET(3) +#endif +#if DIMS > 2 + uint idx_2 = 0; +#if BLOCK_DIMS <= 2 + CALC_SRC_INDEX(2) +#endif + UPDATE_SRC_OFFSET(2) +#endif +#if DIMS > 1 + uint idx_1 = 0; +#if BLOCK_DIMS <= 1 + CALC_SRC_INDEX(1) +#endif + UPDATE_SRC_OFFSET(1) +#endif +#if DIMS > 0 + uint idx_0 = 0; + UPDATE_SRC_OFFSET(0) +#endif + +/* + if (get_global_id(0) == 0) + printf("(%d, %d): src_offset0=%d dst_offset0=%d\n", + get_global_id(0), get_global_id(1), + src_offset0, dst_offset0 + ); +*/ + + } // calculate src_offset0 + +#ifdef USE_COPY_1D + { // copy_block_1d + __global const uchar* src = src0 + src_offset0; + __global uchar* dst = dst0 + dst_offset0; uint processed = 0; @@ -70,8 +136,9 @@ void copy_block_1d( uint i = get_local_id(0) * 16; // uchar16 while (i < BLOCK_COLS_X16 * 16) { - uint4 idx = (uint4)(i, i + 16 * WSZ, i + 32 * WSZ, i + 48 * WSZ); - idx = select((uint4)i, idx, idx < (BLOCK_COLS_X16 * 16)); + uint4 idx0 = (uint4)i; + uint4 idx = idx0 + (uint4)(0, 16 * WSZ, 32 * WSZ, 48 * WSZ); + idx = select(idx0, idx, idx < (BLOCK_COLS_X16 * 16)); uchar16 a0 = vload16(0, src + idx.s0); uchar16 a1 = vload16(0, src + idx.s1); @@ -97,8 +164,9 @@ void copy_block_1d( uint i = get_local_id(0) * 4 + processed; // uchar4 while (i < BLOCK_COLS_X4 * 4) { - uint4 idx = (uint4)(i, i + 4 * WSZ, i + 8 * WSZ, i + 12 * WSZ); - idx = select((uint4)i, idx, idx < (BLOCK_COLS_X4 * 4)); + uint4 idx0 = (uint4)i; + uint4 idx = idx0 + (uint4)(0, 4 * WSZ, 8 * WSZ, 12 * WSZ); + idx = select(idx0, idx, idx < (BLOCK_COLS_X4 * 4)); uchar4 a0 = vload4(0, src + idx.s0); uchar4 a1 = vload4(0, src + idx.s1); @@ -130,19 +198,11 @@ void copy_block_1d( } } #endif -} + } // copy_block_1d -#else // USE_COPY_1D +#else -static inline -__attribute__((always_inline)) -void copy_block_2d( - __global const uchar* src0, - const uint src_offset0, - __global uchar* dst0, - const uint dst_offset0 -) -{ + { // copy_block_2d __global const uchar* src = src0 + src_offset0; __global uchar* dst = dst0 + dst_offset0; @@ -199,85 +259,6 @@ void copy_block_2d( #endif // BLOCK_COLS_FILL_X4 != BLOCK_COLS i += WSZ * 4; } -} - -#endif // USE_COPY_1D - -__kernel void -CONCAT(slice_, DIMS)( - __global const uchar* src, - __global uchar* dst -) -{ - uint block_id = get_global_id(1); - - uint dst_offset = block_id * BLOCK_SIZE; - - uint src_offset = 0; - -#define CALC_SRC_INDEX(dim) \ - { \ - uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \ - CONCAT(idx_, dim) = block_id / plane_sz; \ - block_id = block_id - CONCAT(idx_, dim) * plane_sz; \ - } -#define UPDATE_SRC_OFFSET(dim) \ - src_offset = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset); -/* - if (get_global_id(0) == 0 && get_global_id(1) == 0) \ - printf("(%d, %d): @%d src_offset=%d idx_dim=%d block_id=%d\n", \ - get_global_id(0), get_global_id(1), \ - dim, src_offset, CONCAT(idx_, dim), block_id \ - ); -*/ - -#if DIMS > 5 -#error "invalid configuration" -#endif -#if DIMS > 4 - uint idx_4 = 0; -#if BLOCK_DIMS <= 4 - CALC_SRC_INDEX(4) -#endif - UPDATE_SRC_OFFSET(4) -#endif -#if DIMS > 3 - uint idx_3 = 0; -#if BLOCK_DIMS <= 3 - CALC_SRC_INDEX(3) -#endif - UPDATE_SRC_OFFSET(3) -#endif -#if DIMS > 2 - uint idx_2 = 0; -#if BLOCK_DIMS <= 2 - CALC_SRC_INDEX(2) -#endif - UPDATE_SRC_OFFSET(2) -#endif -#if DIMS > 1 - uint idx_1 = 0; -#if BLOCK_DIMS <= 1 - CALC_SRC_INDEX(1) -#endif - UPDATE_SRC_OFFSET(1) -#endif -#if DIMS > 0 - uint idx_0 = 0; - UPDATE_SRC_OFFSET(0) -#endif - -/* - if (get_global_id(0) == 0) - printf("(%d, %d): src_offset=%d dst_offset=%d\n", - get_global_id(0), get_global_id(1), - src_offset, dst_offset - ); -*/ - -#ifdef USE_COPY_1D - copy_block_1d(src, src_offset, dst, dst_offset); -#else - copy_block_2d(src, src_offset, dst, dst_offset); + } // copy_block_2d #endif }