// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" namespace cv { typedef void (*MinMaxIdxFunc)(const uchar* data, const uchar* mask, void* minval, void* maxval, size_t* minidx, size_t* maxidx, int len, size_t startidx); CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN MinMaxIdxFunc getMinMaxIdxFunc(int depth); #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY template static void minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx ) { WT minVal = *_minVal, maxVal = *_maxVal; size_t minIdx = *_minIdx, maxIdx = *_maxIdx; int i = 0; if (minIdx == 0 || maxIdx == 0) { if (mask) { for (; i < len; i++) { if (mask[i]) { minVal = maxVal = (WT)src[i]; minIdx = maxIdx = startIdx + i; i++; break; } } } else if (len > 0) { minVal = maxVal = (WT)src[0]; minIdx = maxIdx = startIdx; i++; } } if( !mask ) { for( ; i < len; i++ ) { WT val = (WT)src[i]; if( val < minVal ) { minVal = val; minIdx = startIdx + i; } if( val > maxVal ) { maxVal = val; maxIdx = startIdx + i; } } } else { for( ; i < len; i++ ) { WT val = (WT)src[i]; uchar m = mask[i]; if( m && val < minVal ) { minVal = val; minIdx = startIdx + i; } if( m && val > maxVal ) { maxVal = val; maxIdx = startIdx + i; } } } *_minIdx = minIdx; *_maxIdx = maxIdx; *_minVal = minVal; *_maxVal = maxVal; } #undef SIMD_ONLY #if (CV_SIMD || CV_SIMD_SCALABLE) #define SIMD_ONLY(expr) expr #else #define SIMD_ONLY(expr) #endif static int minMaxInit(const uchar* mask, int len) { int i = 0; SIMD_ONLY( int vlanes = VTraits::vlanes(); v_uint8 v_zero = vx_setzero_u8(); for (; i < len; i += vlanes) { if (i + vlanes > len) { if (i == 0) break; i = len - vlanes; } v_uint8 mask_i = v_ne(vx_load(mask + i), v_zero); if (v_check_any(mask_i)) return i + v_scan_forward(mask_i); }) for (; i < len; i++) { if (mask[i] != 0) return i; } return -1; } // vectorized implementation for u8, s8, u16 and s16 // uses blocks to decrease the lane size necessary to store indices #undef DEFINE_MINMAXIDX_SMALLINT_FUNC #define DEFINE_MINMAXIDX_SMALLINT_FUNC(funcname, suffix, usuffix, T, UT, VT, UVT, WT, BLOCK_SIZE, load_mask) \ static void funcname(const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, \ size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx) \ { \ T minVal = T(*_minVal), maxVal = T(*_maxVal); \ size_t minIdx = *_minIdx, maxIdx = *_maxIdx; \ int i = 0; \ /* initialize minVal/maxVal/minIdx/maxIdx to the proper values in the beginning */ \ if (minIdx == 0) { \ if (mask) { \ i = minMaxInit(mask, len); \ if (i < 0) \ return; \ } \ minVal = maxVal = src[i]; \ minIdx = maxIdx = startIdx + i; \ i++; \ } \ SIMD_ONLY( \ const int vlanes = VTraits::vlanes(); \ const int block_size0 = BLOCK_SIZE - vlanes; \ if (len-i >= vlanes && block_size0 > 0 && block_size0 % vlanes == 0) { \ UT idxbuf[VTraits::max_nlanes]; \ for (int j = 0; j < vlanes; j++) \ idxbuf[j] = (UT)j; \ UVT v_idx0 = vx_load(idxbuf); \ UVT v_idx_delta = vx_setall_##usuffix((UT)vlanes); \ UVT v_invalid_idx = vx_setall_##usuffix((UT)-1); \ VT v_minval = vx_setall_##suffix(minVal); \ VT v_maxval = vx_setall_##suffix(maxVal); \ int block_size = block_size0; \ /* process data by blocks: */ \ /* - for u8/s8 data each block contains up to 256-vlanes elements */ \ /* - for u16/s16 data each block contains up to 65536-vlanes elements */ \ /* inside each block we can store the relative (local) index (v_locidx) */ \ /* in a compact way: 8 bits per lane for u8/s8 data, */ \ /* 16 bits per lane for u16/s16 data */ \ /* 0b111...111 is "invalid index", meaning that this */ \ /* particular lane has not been updated. */ \ /* after each block we update minVal, maxVal, minIdx and maxIdx */ \ for (; i <= len - vlanes; i += block_size) { \ block_size = std::min(block_size, (len - i) & -vlanes); \ UVT v_locidx = v_idx0; \ UVT v_minidx = v_invalid_idx; \ UVT v_maxidx = v_invalid_idx; \ if (!mask) { \ for (int j = 0; j < block_size; j += vlanes) { \ VT data = vx_load(src + i + j); \ UVT lt_min = v_reinterpret_as_##usuffix(v_lt(data, v_minval)); \ UVT gt_max = v_reinterpret_as_##usuffix(v_gt(data, v_maxval)); \ v_minidx = v_select(lt_min, v_locidx, v_minidx); \ v_maxidx = v_select(gt_max, v_locidx, v_maxidx); \ v_minval = v_min(v_minval, data); \ v_maxval = v_max(v_maxval, data); \ v_locidx = v_add(v_locidx, v_idx_delta); \ } \ } else { \ UVT v_zero = vx_setzero_##usuffix(); \ for (int j = 0; j < block_size; j += vlanes) { \ VT data = vx_load(src + i + j); \ UVT msk = v_ne(load_mask(mask + i + j), v_zero); \ UVT lt_min = v_reinterpret_as_##usuffix(v_lt(data, v_minval)); \ UVT gt_max = v_reinterpret_as_##usuffix(v_gt(data, v_maxval)); \ lt_min = v_and(lt_min, msk); \ gt_max = v_and(gt_max, msk); \ v_minidx = v_select(lt_min, v_locidx, v_minidx); \ v_maxidx = v_select(gt_max, v_locidx, v_maxidx); \ VT lt_min_data = v_reinterpret_as_##suffix(lt_min); \ VT gt_max_data = v_reinterpret_as_##suffix(gt_max); \ v_minval = v_select(lt_min_data, data, v_minval); \ v_maxval = v_select(gt_max_data, data, v_maxval); \ v_locidx = v_add(v_locidx, v_idx_delta); \ } \ } \ /* for both minimum and maximum we check whether global extremum */ \ /* and its index need to be updated. If yes, we compute */ \ /* the smallest index within the block where the new global \ /* extremum value occurs */ \ UVT idxmask = v_ne(v_minidx, v_invalid_idx); \ if (v_check_any(idxmask)) { \ minVal = (T)v_reduce_min(v_minval); \ VT invmask = v_ne(v_minval, vx_setall_##suffix(minVal)); \ v_minidx = v_or(v_minidx, v_reinterpret_as_##usuffix(invmask)); \ minIdx = startIdx + i + v_reduce_min(v_minidx); \ v_minval = vx_setall_##suffix(minVal); \ } \ idxmask = v_ne(v_maxidx, v_invalid_idx); \ if (v_check_any(idxmask)) { \ maxVal = (T)v_reduce_max(v_maxval); \ VT invmask = v_ne(v_maxval, vx_setall_##suffix(maxVal)); \ v_maxidx = v_or(v_maxidx, v_reinterpret_as_##usuffix(invmask)); \ maxIdx = startIdx + i + v_reduce_min(v_maxidx); \ v_maxval = vx_setall_##suffix(maxVal); \ } \ } \ }) \ *_minVal = (WT)minVal; \ *_maxVal = (WT)maxVal; \ *_minIdx = minIdx; \ *_maxIdx = maxIdx; \ /* [TODO]: unlike sum, countNonZero and other reduce operations, */ \ /* in the case of minMaxIdx we can process the tail using */ \ /* vector overlapping technique (as in arithmetic operations) */ \ if (i < len) { \ src += i; \ if (mask) mask += i; \ startIdx += i; \ len -= i; \ minMaxIdx_(src, mask, _minVal, _maxVal, _minIdx, _maxIdx, len, startIdx); \ } \ } // vectorized implementation for s32, f32, f16 and bf16 // (potentially can be extended for u32) // no need to use blocks here #undef DEFINE_MINMAXIDX_FUNC #define DEFINE_MINMAXIDX_FUNC(funcname, suffix, usuffix, T, UT, VT, UVT, WT, load_op) \ static void funcname(const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, \ size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx) \ { \ WT minVal = *_minVal, maxVal = *_maxVal; \ size_t minIdx = *_minIdx, maxIdx = *_maxIdx; \ int i = 0; \ /* initialize minVal/maxVal/minIdx/maxIdx to the proper values in the beginning */ \ if (minIdx == 0) { \ if (mask) { \ i = minMaxInit(mask, len); \ if (i < 0) \ return; \ } \ minVal = maxVal = src[i]; \ minIdx = maxIdx = startIdx + i; \ i++; \ } \ SIMD_ONLY( \ const int vlanes = VTraits::vlanes(); \ UT idxbuf[VTraits::max_nlanes]; \ for (int j = 0; j < vlanes; j++) \ idxbuf[j] = (UT)(i+j); \ UVT v_locidx = vx_load(idxbuf); \ UVT v_idx_delta = vx_setall_##usuffix((UT)vlanes); \ UVT v_invalid_idx = vx_setall_##usuffix((UT)-1); \ VT v_minval = vx_setall_##suffix(minVal); \ VT v_maxval = vx_setall_##suffix(maxVal); \ UVT v_minidx = v_invalid_idx; \ UVT v_maxidx = v_invalid_idx; \ /* process data by blocks: */ \ /* - for u8/s8 data each block contains up to 256-vlanes elements */ \ /* - for u16/s16 data each block contains up to 65536-vlanes elements */ \ /* inside each block we can store the relative (local) index (v_locidx) */ \ /* in a compact way: 8 bits per lane for u8/s8 data, */ \ /* 16 bits per lane for u16/s16 data */ \ /* 0b111...111 is "invalid index", meaning that this */ \ /* particular lane has not been updated. */ \ /* after each block we update minVal, maxVal, minIdx and maxIdx */ \ if (!mask) { \ for (; i <= len - vlanes; i += vlanes) { \ VT data = load_op(src + i); \ UVT lt_min = v_reinterpret_as_##usuffix(v_lt(data, v_minval)); \ UVT gt_max = v_reinterpret_as_##usuffix(v_gt(data, v_maxval)); \ v_minidx = v_select(lt_min, v_locidx, v_minidx); \ v_maxidx = v_select(gt_max, v_locidx, v_maxidx); \ v_minval = v_min(v_minval, data); \ v_maxval = v_max(v_maxval, data); \ v_locidx = v_add(v_locidx, v_idx_delta); \ } \ } else { \ UVT v_zero = vx_setzero_##usuffix(); \ for (; i <= len - vlanes; i += vlanes) { \ VT data = load_op(src + i); \ UVT msk = v_ne(vx_load_expand_q(mask + i), v_zero); \ UVT lt_min = v_reinterpret_as_##usuffix(v_lt(data, v_minval)); \ UVT gt_max = v_reinterpret_as_##usuffix(v_gt(data, v_maxval)); \ lt_min = v_and(lt_min, msk); \ gt_max = v_and(gt_max, msk); \ v_minidx = v_select(lt_min, v_locidx, v_minidx); \ v_maxidx = v_select(gt_max, v_locidx, v_maxidx); \ VT lt_min_data = v_reinterpret_as_##suffix(lt_min); \ VT gt_max_data = v_reinterpret_as_##suffix(gt_max); \ v_minval = v_select(lt_min_data, data, v_minval); \ v_maxval = v_select(gt_max_data, data, v_maxval); \ v_locidx = v_add(v_locidx, v_idx_delta); \ } \ } \ /* for both minimum and maximum we check whether global extremum */ \ /* and its index need to be updated. If yes, we compute */ \ /* the smallest index within the block where the new global \ /* extremum value occurs */ \ UVT idxmask = v_ne(v_minidx, v_invalid_idx); \ if (v_check_any(idxmask)) { \ minVal = v_reduce_min(v_minval); \ VT invmask = v_ne(v_minval, vx_setall_##suffix(minVal)); \ v_minidx = v_or(v_minidx, v_reinterpret_as_##usuffix(invmask)); \ minIdx = startIdx + v_reduce_min(v_minidx); \ v_minval = vx_setall_##suffix(minVal); \ } \ idxmask = v_ne(v_maxidx, v_invalid_idx); \ if (v_check_any(idxmask)) { \ maxVal = v_reduce_max(v_maxval); \ VT invmask = v_ne(v_maxval, vx_setall_##suffix(maxVal)); \ v_maxidx = v_or(v_maxidx, v_reinterpret_as_##usuffix(invmask)); \ maxIdx = startIdx + v_reduce_min(v_maxidx); \ v_maxval = vx_setall_##suffix(maxVal); \ }) \ *_minVal = minVal; \ *_maxVal = maxVal; \ *_minIdx = minIdx; \ *_maxIdx = maxIdx; \ /* [TODO]: unlike sum, countNonZero and other reduce operations, */ \ /* in the case of minMaxIdx we can process the tail using */ \ /* vector overlapping technique (as in arithmetic operations) */ \ if (i < len) { \ src += i; \ if (mask) mask += i; \ startIdx += i; \ len -= i; \ minMaxIdx_(src, mask, _minVal, _maxVal, _minIdx, _maxIdx, len, startIdx); \ } \ } #undef DEFINE_MINMAXIDX_FUNC_NOSIMD #define DEFINE_MINMAXIDX_FUNC_NOSIMD(funcname, T, WT) \ static void funcname(const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, \ size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx) \ { \ minMaxIdx_(src, mask, _minVal, _maxVal, _minIdx, _maxIdx, len, startIdx); \ } DEFINE_MINMAXIDX_SMALLINT_FUNC(minMaxIdx8u, u8, u8, uchar, uchar, v_uint8, v_uint8, int, 256, vx_load) DEFINE_MINMAXIDX_SMALLINT_FUNC(minMaxIdx8s, s8, u8, schar, uchar, v_int8, v_uint8, int, 256, vx_load) DEFINE_MINMAXIDX_SMALLINT_FUNC(minMaxIdx16u, u16, u16, ushort, ushort, v_uint16, v_uint16, int, 65536, vx_load_expand) DEFINE_MINMAXIDX_SMALLINT_FUNC(minMaxIdx16s, s16, u16, short, ushort, v_int16, v_uint16, int, 65536, vx_load_expand) DEFINE_MINMAXIDX_FUNC(minMaxIdx32s, s32, u32, int, unsigned, v_int32, v_uint32, int, vx_load) DEFINE_MINMAXIDX_FUNC(minMaxIdx32f, f32, u32, float, unsigned, v_float32, v_uint32, float, vx_load) DEFINE_MINMAXIDX_FUNC(minMaxIdx16f, f32, u32, hfloat, unsigned, v_float32, v_uint32, float, vx_load_expand) DEFINE_MINMAXIDX_FUNC(minMaxIdx16bf, f32, u32, bfloat, unsigned, v_float32, v_uint32, float, vx_load_expand) //DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx32s, int, int) //DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx32f, float, float) DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx64f, double, double) //DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx16f, hfloat, float) //DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx16bf, bfloat, float) DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx64u, uint64, uint64) DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx64s, int64, int64) DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx32u, unsigned, int64) MinMaxIdxFunc getMinMaxIdxFunc(int depth) { static MinMaxIdxFunc minMaxIdxTab[CV_DEPTH_MAX] = { (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx8s), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx16s), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx32s), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx32f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx64f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx16f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx16bf), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx64u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx64s), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx32u), 0 }; return minMaxIdxTab[depth]; } #endif CV_CPU_OPTIMIZATION_NAMESPACE_END } // namespace