opencv/modules/core/src/minmax.simd.hpp
Alexander Smorkalov cb6d295f15 Merge branch 4.x
2024-04-02 16:39:54 +03:00

395 lines
16 KiB
C++

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
namespace cv {
typedef void (*MinMaxIdxFunc)(const uchar* data, const uchar* mask,
void* minval, void* maxval,
size_t* minidx, size_t* maxidx,
int len, size_t startidx);
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
MinMaxIdxFunc getMinMaxIdxFunc(int depth);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
template<typename T, typename WT> static void
minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal,
size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx )
{
WT minVal = *_minVal, maxVal = *_maxVal;
size_t minIdx = *_minIdx, maxIdx = *_maxIdx;
int i = 0;
if (minIdx == 0 || maxIdx == 0) {
if (mask) {
for (; i < len; i++) {
if (mask[i]) {
minVal = maxVal = (WT)src[i];
minIdx = maxIdx = startIdx + i;
i++;
break;
}
}
}
else if (len > 0) {
minVal = maxVal = (WT)src[0];
minIdx = maxIdx = startIdx;
i++;
}
}
if( !mask )
{
for( ; i < len; i++ )
{
WT val = (WT)src[i];
if( val < minVal )
{
minVal = val;
minIdx = startIdx + i;
}
if( val > maxVal )
{
maxVal = val;
maxIdx = startIdx + i;
}
}
}
else
{
for( ; i < len; i++ )
{
WT val = (WT)src[i];
uchar m = mask[i];
if( m && val < minVal )
{
minVal = val;
minIdx = startIdx + i;
}
if( m && val > maxVal )
{
maxVal = val;
maxIdx = startIdx + i;
}
}
}
*_minIdx = minIdx;
*_maxIdx = maxIdx;
*_minVal = minVal;
*_maxVal = maxVal;
}
#undef SIMD_ONLY
#if (CV_SIMD || CV_SIMD_SCALABLE)
#define SIMD_ONLY(expr) expr
#else
#define SIMD_ONLY(expr)
#endif
static int minMaxInit(const uchar* mask, int len)
{
int i = 0;
SIMD_ONLY(
int vlanes = VTraits<v_uint8>::vlanes();
v_uint8 v_zero = vx_setzero_u8();
for (; i < len; i += vlanes) {
if (i + vlanes > len) {
if (i == 0)
break;
i = len - vlanes;
}
v_uint8 mask_i = v_ne(vx_load(mask + i), v_zero);
if (v_check_any(mask_i))
return i + v_scan_forward(mask_i);
})
for (; i < len; i++) {
if (mask[i] != 0)
return i;
}
return -1;
}
// vectorized implementation for u8, s8, u16 and s16
// uses blocks to decrease the lane size necessary to store indices
#undef DEFINE_MINMAXIDX_SMALLINT_FUNC
#define DEFINE_MINMAXIDX_SMALLINT_FUNC(funcname, suffix, usuffix, T, UT, VT, UVT, WT, BLOCK_SIZE, load_mask) \
static void funcname(const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, \
size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx) \
{ \
T minVal = T(*_minVal), maxVal = T(*_maxVal); \
size_t minIdx = *_minIdx, maxIdx = *_maxIdx; \
int i = 0; \
/* initialize minVal/maxVal/minIdx/maxIdx to the proper values in the beginning */ \
if (minIdx == 0) { \
if (mask) { \
i = minMaxInit(mask, len); \
if (i < 0) \
return; \
} \
minVal = maxVal = src[i]; \
minIdx = maxIdx = startIdx + i; \
i++; \
} \
SIMD_ONLY( \
const int vlanes = VTraits<VT>::vlanes(); \
const int block_size0 = BLOCK_SIZE - vlanes; \
if (len-i >= vlanes && block_size0 > 0 && block_size0 % vlanes == 0) { \
UT idxbuf[VTraits<UVT>::max_nlanes]; \
for (int j = 0; j < vlanes; j++) \
idxbuf[j] = (UT)j; \
UVT v_idx0 = vx_load(idxbuf); \
UVT v_idx_delta = vx_setall_##usuffix((UT)vlanes); \
UVT v_invalid_idx = vx_setall_##usuffix((UT)-1); \
VT v_minval = vx_setall_##suffix(minVal); \
VT v_maxval = vx_setall_##suffix(maxVal); \
int block_size = block_size0; \
/* process data by blocks: */ \
/* - for u8/s8 data each block contains up to 256-vlanes elements */ \
/* - for u16/s16 data each block contains up to 65536-vlanes elements */ \
/* inside each block we can store the relative (local) index (v_locidx) */ \
/* in a compact way: 8 bits per lane for u8/s8 data, */ \
/* 16 bits per lane for u16/s16 data */ \
/* 0b111...111 is "invalid index", meaning that this */ \
/* particular lane has not been updated. */ \
/* after each block we update minVal, maxVal, minIdx and maxIdx */ \
for (; i <= len - vlanes; i += block_size) { \
block_size = std::min(block_size, (len - i) & -vlanes); \
UVT v_locidx = v_idx0; \
UVT v_minidx = v_invalid_idx; \
UVT v_maxidx = v_invalid_idx; \
if (!mask) { \
for (int j = 0; j < block_size; j += vlanes) { \
VT data = vx_load(src + i + j); \
UVT lt_min = v_reinterpret_as_##usuffix(v_lt(data, v_minval)); \
UVT gt_max = v_reinterpret_as_##usuffix(v_gt(data, v_maxval)); \
v_minidx = v_select(lt_min, v_locidx, v_minidx); \
v_maxidx = v_select(gt_max, v_locidx, v_maxidx); \
v_minval = v_min(v_minval, data); \
v_maxval = v_max(v_maxval, data); \
v_locidx = v_add(v_locidx, v_idx_delta); \
} \
} else { \
UVT v_zero = vx_setzero_##usuffix(); \
for (int j = 0; j < block_size; j += vlanes) { \
VT data = vx_load(src + i + j); \
UVT msk = v_ne(load_mask(mask + i + j), v_zero); \
UVT lt_min = v_reinterpret_as_##usuffix(v_lt(data, v_minval)); \
UVT gt_max = v_reinterpret_as_##usuffix(v_gt(data, v_maxval)); \
lt_min = v_and(lt_min, msk); \
gt_max = v_and(gt_max, msk); \
v_minidx = v_select(lt_min, v_locidx, v_minidx); \
v_maxidx = v_select(gt_max, v_locidx, v_maxidx); \
VT lt_min_data = v_reinterpret_as_##suffix(lt_min); \
VT gt_max_data = v_reinterpret_as_##suffix(gt_max); \
v_minval = v_select(lt_min_data, data, v_minval); \
v_maxval = v_select(gt_max_data, data, v_maxval); \
v_locidx = v_add(v_locidx, v_idx_delta); \
} \
} \
/* for both minimum and maximum we check whether global extremum */ \
/* and its index need to be updated. If yes, we compute */ \
/* the smallest index within the block where the new global \
/* extremum value occurs */ \
UVT idxmask = v_ne(v_minidx, v_invalid_idx); \
if (v_check_any(idxmask)) { \
minVal = (T)v_reduce_min(v_minval); \
VT invmask = v_ne(v_minval, vx_setall_##suffix(minVal)); \
v_minidx = v_or(v_minidx, v_reinterpret_as_##usuffix(invmask)); \
minIdx = startIdx + i + v_reduce_min(v_minidx); \
v_minval = vx_setall_##suffix(minVal); \
} \
idxmask = v_ne(v_maxidx, v_invalid_idx); \
if (v_check_any(idxmask)) { \
maxVal = (T)v_reduce_max(v_maxval); \
VT invmask = v_ne(v_maxval, vx_setall_##suffix(maxVal)); \
v_maxidx = v_or(v_maxidx, v_reinterpret_as_##usuffix(invmask)); \
maxIdx = startIdx + i + v_reduce_min(v_maxidx); \
v_maxval = vx_setall_##suffix(maxVal); \
} \
} \
}) \
*_minVal = (WT)minVal; \
*_maxVal = (WT)maxVal; \
*_minIdx = minIdx; \
*_maxIdx = maxIdx; \
/* [TODO]: unlike sum, countNonZero and other reduce operations, */ \
/* in the case of minMaxIdx we can process the tail using */ \
/* vector overlapping technique (as in arithmetic operations) */ \
if (i < len) { \
src += i; \
if (mask) mask += i; \
startIdx += i; \
len -= i; \
minMaxIdx_(src, mask, _minVal, _maxVal, _minIdx, _maxIdx, len, startIdx); \
} \
}
// vectorized implementation for s32, f32, f16 and bf16
// (potentially can be extended for u32)
// no need to use blocks here
#undef DEFINE_MINMAXIDX_FUNC
#define DEFINE_MINMAXIDX_FUNC(funcname, suffix, usuffix, T, UT, VT, UVT, WT, load_op) \
static void funcname(const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, \
size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx) \
{ \
WT minVal = *_minVal, maxVal = *_maxVal; \
size_t minIdx = *_minIdx, maxIdx = *_maxIdx; \
int i = 0; \
/* initialize minVal/maxVal/minIdx/maxIdx to the proper values in the beginning */ \
if (minIdx == 0) { \
if (mask) { \
i = minMaxInit(mask, len); \
if (i < 0) \
return; \
} \
minVal = maxVal = src[i]; \
minIdx = maxIdx = startIdx + i; \
i++; \
} \
SIMD_ONLY( \
const int vlanes = VTraits<VT>::vlanes(); \
UT idxbuf[VTraits<UVT>::max_nlanes]; \
for (int j = 0; j < vlanes; j++) \
idxbuf[j] = (UT)(i+j); \
UVT v_locidx = vx_load(idxbuf); \
UVT v_idx_delta = vx_setall_##usuffix((UT)vlanes); \
UVT v_invalid_idx = vx_setall_##usuffix((UT)-1); \
VT v_minval = vx_setall_##suffix(minVal); \
VT v_maxval = vx_setall_##suffix(maxVal); \
UVT v_minidx = v_invalid_idx; \
UVT v_maxidx = v_invalid_idx; \
/* process data by blocks: */ \
/* - for u8/s8 data each block contains up to 256-vlanes elements */ \
/* - for u16/s16 data each block contains up to 65536-vlanes elements */ \
/* inside each block we can store the relative (local) index (v_locidx) */ \
/* in a compact way: 8 bits per lane for u8/s8 data, */ \
/* 16 bits per lane for u16/s16 data */ \
/* 0b111...111 is "invalid index", meaning that this */ \
/* particular lane has not been updated. */ \
/* after each block we update minVal, maxVal, minIdx and maxIdx */ \
if (!mask) { \
for (; i <= len - vlanes; i += vlanes) { \
VT data = load_op(src + i); \
UVT lt_min = v_reinterpret_as_##usuffix(v_lt(data, v_minval)); \
UVT gt_max = v_reinterpret_as_##usuffix(v_gt(data, v_maxval)); \
v_minidx = v_select(lt_min, v_locidx, v_minidx); \
v_maxidx = v_select(gt_max, v_locidx, v_maxidx); \
v_minval = v_min(v_minval, data); \
v_maxval = v_max(v_maxval, data); \
v_locidx = v_add(v_locidx, v_idx_delta); \
} \
} else { \
UVT v_zero = vx_setzero_##usuffix(); \
for (; i <= len - vlanes; i += vlanes) { \
VT data = load_op(src + i); \
UVT msk = v_ne(vx_load_expand_q(mask + i), v_zero); \
UVT lt_min = v_reinterpret_as_##usuffix(v_lt(data, v_minval)); \
UVT gt_max = v_reinterpret_as_##usuffix(v_gt(data, v_maxval)); \
lt_min = v_and(lt_min, msk); \
gt_max = v_and(gt_max, msk); \
v_minidx = v_select(lt_min, v_locidx, v_minidx); \
v_maxidx = v_select(gt_max, v_locidx, v_maxidx); \
VT lt_min_data = v_reinterpret_as_##suffix(lt_min); \
VT gt_max_data = v_reinterpret_as_##suffix(gt_max); \
v_minval = v_select(lt_min_data, data, v_minval); \
v_maxval = v_select(gt_max_data, data, v_maxval); \
v_locidx = v_add(v_locidx, v_idx_delta); \
} \
} \
/* for both minimum and maximum we check whether global extremum */ \
/* and its index need to be updated. If yes, we compute */ \
/* the smallest index within the block where the new global \
/* extremum value occurs */ \
UVT idxmask = v_ne(v_minidx, v_invalid_idx); \
if (v_check_any(idxmask)) { \
minVal = v_reduce_min(v_minval); \
VT invmask = v_ne(v_minval, vx_setall_##suffix(minVal)); \
v_minidx = v_or(v_minidx, v_reinterpret_as_##usuffix(invmask)); \
minIdx = startIdx + v_reduce_min(v_minidx); \
v_minval = vx_setall_##suffix(minVal); \
} \
idxmask = v_ne(v_maxidx, v_invalid_idx); \
if (v_check_any(idxmask)) { \
maxVal = v_reduce_max(v_maxval); \
VT invmask = v_ne(v_maxval, vx_setall_##suffix(maxVal)); \
v_maxidx = v_or(v_maxidx, v_reinterpret_as_##usuffix(invmask)); \
maxIdx = startIdx + v_reduce_min(v_maxidx); \
v_maxval = vx_setall_##suffix(maxVal); \
}) \
*_minVal = minVal; \
*_maxVal = maxVal; \
*_minIdx = minIdx; \
*_maxIdx = maxIdx; \
/* [TODO]: unlike sum, countNonZero and other reduce operations, */ \
/* in the case of minMaxIdx we can process the tail using */ \
/* vector overlapping technique (as in arithmetic operations) */ \
if (i < len) { \
src += i; \
if (mask) mask += i; \
startIdx += i; \
len -= i; \
minMaxIdx_(src, mask, _minVal, _maxVal, _minIdx, _maxIdx, len, startIdx); \
} \
}
#undef DEFINE_MINMAXIDX_FUNC_NOSIMD
#define DEFINE_MINMAXIDX_FUNC_NOSIMD(funcname, T, WT) \
static void funcname(const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, \
size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx) \
{ \
minMaxIdx_(src, mask, _minVal, _maxVal, _minIdx, _maxIdx, len, startIdx); \
}
DEFINE_MINMAXIDX_SMALLINT_FUNC(minMaxIdx8u, u8, u8, uchar, uchar, v_uint8, v_uint8, int, 256, vx_load)
DEFINE_MINMAXIDX_SMALLINT_FUNC(minMaxIdx8s, s8, u8, schar, uchar, v_int8, v_uint8, int, 256, vx_load)
DEFINE_MINMAXIDX_SMALLINT_FUNC(minMaxIdx16u, u16, u16, ushort, ushort, v_uint16, v_uint16, int, 65536, vx_load_expand)
DEFINE_MINMAXIDX_SMALLINT_FUNC(minMaxIdx16s, s16, u16, short, ushort, v_int16, v_uint16, int, 65536, vx_load_expand)
DEFINE_MINMAXIDX_FUNC(minMaxIdx32s, s32, u32, int, unsigned, v_int32, v_uint32, int, vx_load)
DEFINE_MINMAXIDX_FUNC(minMaxIdx32f, f32, u32, float, unsigned, v_float32, v_uint32, float, vx_load)
DEFINE_MINMAXIDX_FUNC(minMaxIdx16f, f32, u32, hfloat, unsigned, v_float32, v_uint32, float, vx_load_expand)
DEFINE_MINMAXIDX_FUNC(minMaxIdx16bf, f32, u32, bfloat, unsigned, v_float32, v_uint32, float, vx_load_expand)
//DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx32s, int, int)
//DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx32f, float, float)
DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx64f, double, double)
//DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx16f, hfloat, float)
//DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx16bf, bfloat, float)
DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx64u, uint64, uint64)
DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx64s, int64, int64)
DEFINE_MINMAXIDX_FUNC_NOSIMD(minMaxIdx32u, unsigned, int64)
MinMaxIdxFunc getMinMaxIdxFunc(int depth)
{
static MinMaxIdxFunc minMaxIdxTab[CV_DEPTH_MAX] =
{
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx8u),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx8s),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx16u),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx16s),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx32s),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx32f),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx64f),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx16f),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx16bf),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx8u),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx64u),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx64s),
(MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx32u),
0
};
return minMaxIdxTab[depth];
}
#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
} // namespace