mirror of
https://github.com/opencv/opencv.git
synced 2025-06-11 11:45:30 +08:00
core: dispatch count_non_zero
This commit is contained in:
parent
0b49680339
commit
e3633ec4a2
@ -5,6 +5,7 @@ ocv_add_dispatched_file(stat SSE4_2 AVX2)
|
||||
ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
|
||||
ocv_add_dispatched_file(convert SSE2 AVX2)
|
||||
ocv_add_dispatched_file(convert_scale SSE2 AVX2)
|
||||
ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
|
||||
ocv_add_dispatched_file(sum SSE2 AVX2)
|
||||
|
||||
# dispatching for accuracy tests
|
||||
|
@ -7,190 +7,18 @@
|
||||
#include "opencl_kernels_core.hpp"
|
||||
#include "stat.hpp"
|
||||
|
||||
#include "count_non_zero.simd.hpp"
|
||||
#include "count_non_zero.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
|
||||
|
||||
namespace cv {
|
||||
|
||||
template<typename T>
|
||||
static int countNonZero_(const T* src, int len )
|
||||
{
|
||||
int i=0, nz = 0;
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for(; i <= len - 4; i += 4 )
|
||||
nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
nz += src[i] != 0;
|
||||
return nz;
|
||||
}
|
||||
|
||||
static int countNonZero8u( const uchar* src, int len )
|
||||
{
|
||||
int i=0, nz = 0;
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_uint8::nlanes;
|
||||
v_uint8 v_zero = vx_setzero_u8();
|
||||
v_uint8 v_one = vx_setall_u8(1);
|
||||
|
||||
v_uint32 v_sum32 = vx_setzero_u32();
|
||||
while (i < len0)
|
||||
{
|
||||
v_uint16 v_sum16 = vx_setzero_u16();
|
||||
int j = i;
|
||||
while (j < std::min(len0, i + 65280 * v_uint16::nlanes))
|
||||
{
|
||||
v_uint8 v_sum8 = vx_setzero_u8();
|
||||
int k = j;
|
||||
for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes)
|
||||
v_sum8 += v_one & (vx_load(src + k) == v_zero);
|
||||
v_uint16 part1, part2;
|
||||
v_expand(v_sum8, part1, part2);
|
||||
v_sum16 += part1 + part2;
|
||||
j = k;
|
||||
}
|
||||
v_uint32 part1, part2;
|
||||
v_expand(v_sum16, part1, part2);
|
||||
v_sum32 += part1 + part2;
|
||||
i = j;
|
||||
}
|
||||
nz = i - v_reduce_sum(v_sum32);
|
||||
v_cleanup();
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
nz += src[i] != 0;
|
||||
return nz;
|
||||
}
|
||||
|
||||
static int countNonZero16u( const ushort* src, int len )
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int8::nlanes;
|
||||
v_uint16 v_zero = vx_setzero_u16();
|
||||
v_int8 v_one = vx_setall_s8(1);
|
||||
|
||||
v_int32 v_sum32 = vx_setzero_s32();
|
||||
while (i < len0)
|
||||
{
|
||||
v_int16 v_sum16 = vx_setzero_s16();
|
||||
int j = i;
|
||||
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
|
||||
{
|
||||
v_int8 v_sum8 = vx_setzero_s8();
|
||||
int k = j;
|
||||
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
|
||||
v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero));
|
||||
v_int16 part1, part2;
|
||||
v_expand(v_sum8, part1, part2);
|
||||
v_sum16 += part1 + part2;
|
||||
j = k;
|
||||
}
|
||||
v_int32 part1, part2;
|
||||
v_expand(v_sum16, part1, part2);
|
||||
v_sum32 += part1 + part2;
|
||||
i = j;
|
||||
}
|
||||
nz = i - v_reduce_sum(v_sum32);
|
||||
v_cleanup();
|
||||
#endif
|
||||
return nz + countNonZero_(src + i, len - i);
|
||||
}
|
||||
|
||||
static int countNonZero32s( const int* src, int len )
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int8::nlanes;
|
||||
v_int32 v_zero = vx_setzero_s32();
|
||||
v_int8 v_one = vx_setall_s8(1);
|
||||
|
||||
v_int32 v_sum32 = vx_setzero_s32();
|
||||
while (i < len0)
|
||||
{
|
||||
v_int16 v_sum16 = vx_setzero_s16();
|
||||
int j = i;
|
||||
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
|
||||
{
|
||||
v_int8 v_sum8 = vx_setzero_s8();
|
||||
int k = j;
|
||||
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
|
||||
v_sum8 += v_one & v_pack(
|
||||
v_pack(vx_load(src + k ) == v_zero, vx_load(src + k + v_int32::nlanes) == v_zero),
|
||||
v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero)
|
||||
);
|
||||
v_int16 part1, part2;
|
||||
v_expand(v_sum8, part1, part2);
|
||||
v_sum16 += part1 + part2;
|
||||
j = k;
|
||||
}
|
||||
v_int32 part1, part2;
|
||||
v_expand(v_sum16, part1, part2);
|
||||
v_sum32 += part1 + part2;
|
||||
i = j;
|
||||
}
|
||||
nz = i - v_reduce_sum(v_sum32);
|
||||
v_cleanup();
|
||||
#endif
|
||||
return nz + countNonZero_(src + i, len - i);
|
||||
}
|
||||
|
||||
static int countNonZero32f( const float* src, int len )
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int8::nlanes;
|
||||
v_float32 v_zero = vx_setzero_f32();
|
||||
v_int8 v_one = vx_setall_s8(1);
|
||||
|
||||
v_int32 v_sum32 = vx_setzero_s32();
|
||||
while (i < len0)
|
||||
{
|
||||
v_int16 v_sum16 = vx_setzero_s16();
|
||||
int j = i;
|
||||
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
|
||||
{
|
||||
v_int8 v_sum8 = vx_setzero_s8();
|
||||
int k = j;
|
||||
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
|
||||
v_sum8 += v_one & v_pack(
|
||||
v_pack(v_reinterpret_as_s32(vx_load(src + k ) == v_zero), v_reinterpret_as_s32(vx_load(src + k + v_float32::nlanes) == v_zero)),
|
||||
v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero))
|
||||
);
|
||||
v_int16 part1, part2;
|
||||
v_expand(v_sum8, part1, part2);
|
||||
v_sum16 += part1 + part2;
|
||||
j = k;
|
||||
}
|
||||
v_int32 part1, part2;
|
||||
v_expand(v_sum16, part1, part2);
|
||||
v_sum32 += part1 + part2;
|
||||
i = j;
|
||||
}
|
||||
nz = i - v_reduce_sum(v_sum32);
|
||||
v_cleanup();
|
||||
#endif
|
||||
return nz + countNonZero_(src + i, len - i);
|
||||
}
|
||||
|
||||
static int countNonZero64f( const double* src, int len )
|
||||
{
|
||||
return countNonZero_(src, len);
|
||||
}
|
||||
|
||||
typedef int (*CountNonZeroFunc)(const uchar*, int);
|
||||
|
||||
static CountNonZeroFunc getCountNonZeroTab(int depth)
|
||||
{
|
||||
static CountNonZeroFunc countNonZeroTab[] =
|
||||
{
|
||||
(CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),
|
||||
(CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),
|
||||
(CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f),
|
||||
(CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0
|
||||
};
|
||||
|
||||
return countNonZeroTab[depth];
|
||||
CV_INSTRUMENT_REGION();
|
||||
CV_CPU_DISPATCH(getCountNonZeroTab, (depth),
|
||||
CV_CPU_DISPATCH_MODES_ALL);
|
||||
}
|
||||
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
static bool ocl_countNonZero( InputArray _src, int & res )
|
||||
{
|
||||
@ -288,9 +116,7 @@ static bool ipp_countNonZero( Mat &src, int &res )
|
||||
}
|
||||
#endif
|
||||
|
||||
} // cv::
|
||||
|
||||
int cv::countNonZero( InputArray _src )
|
||||
int countNonZero(InputArray _src)
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
|
||||
@ -324,7 +150,7 @@ int cv::countNonZero( InputArray _src )
|
||||
return nz;
|
||||
}
|
||||
|
||||
void cv::findNonZero( InputArray _src, OutputArray _idx )
|
||||
void findNonZero(InputArray _src, OutputArray _idx)
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
|
||||
@ -351,3 +177,5 @@ void cv::findNonZero( InputArray _src, OutputArray _idx )
|
||||
*idx_ptr++ = Point(j, i);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
@ -2,13 +2,20 @@
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "opencl_kernels_core.hpp"
|
||||
#include "stat.hpp"
|
||||
|
||||
namespace cv {
|
||||
|
||||
typedef int (*CountNonZeroFunc)(const uchar*, int);
|
||||
|
||||
|
||||
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
|
||||
|
||||
CountNonZeroFunc getCountNonZeroTab(int depth);
|
||||
|
||||
|
||||
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||
|
||||
template<typename T>
|
||||
static int countNonZero_(const T* src, int len )
|
||||
{
|
||||
@ -175,9 +182,7 @@ static int countNonZero64f( const double* src, int len )
|
||||
return countNonZero_(src, len);
|
||||
}
|
||||
|
||||
typedef int (*CountNonZeroFunc)(const uchar*, int);
|
||||
|
||||
static CountNonZeroFunc getCountNonZeroTab(int depth)
|
||||
CountNonZeroFunc getCountNonZeroTab(int depth)
|
||||
{
|
||||
static CountNonZeroFunc countNonZeroTab[] =
|
||||
{
|
||||
@ -190,164 +195,7 @@ static CountNonZeroFunc getCountNonZeroTab(int depth)
|
||||
return countNonZeroTab[depth];
|
||||
}
|
||||
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
static bool ocl_countNonZero( InputArray _src, int & res )
|
||||
{
|
||||
int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
|
||||
if (depth == CV_64F && !doubleSupport)
|
||||
return false;
|
||||
|
||||
int dbsize = ocl::Device::getDefault().maxComputeUnits();
|
||||
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
|
||||
|
||||
int wgs2_aligned = 1;
|
||||
while (wgs2_aligned < (int)wgs)
|
||||
wgs2_aligned <<= 1;
|
||||
wgs2_aligned >>= 1;
|
||||
|
||||
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
||||
format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
|
||||
" -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
|
||||
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
|
||||
ocl::typeToStr(depth), (int)wgs, kercn,
|
||||
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||
_src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
|
||||
dbsize, ocl::KernelArg::PtrWriteOnly(db));
|
||||
|
||||
size_t globalsize = dbsize * wgs;
|
||||
if (k.run(1, &globalsize, &wgs, true))
|
||||
return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined HAVE_IPP
|
||||
static bool ipp_countNonZero( Mat &src, int &res )
|
||||
{
|
||||
CV_INSTRUMENT_REGION_IPP();
|
||||
|
||||
#if IPP_VERSION_X100 < 201801
|
||||
// Poor performance of SSE42
|
||||
if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
|
||||
return false;
|
||||
#endif
|
||||
|
||||
Ipp32s count = 0;
|
||||
int depth = src.depth();
|
||||
|
||||
if(src.dims <= 2)
|
||||
{
|
||||
IppStatus status;
|
||||
IppiSize size = {src.cols*src.channels(), src.rows};
|
||||
|
||||
if(depth == CV_8U)
|
||||
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
|
||||
else if(depth == CV_32F)
|
||||
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);
|
||||
else
|
||||
return false;
|
||||
|
||||
if(status < 0)
|
||||
return false;
|
||||
|
||||
res = size.width*size.height - count;
|
||||
}
|
||||
else
|
||||
{
|
||||
IppStatus status;
|
||||
const Mat *arrays[] = {&src, NULL};
|
||||
Mat planes[1];
|
||||
NAryMatIterator it(arrays, planes, 1);
|
||||
IppiSize size = {(int)it.size*src.channels(), 1};
|
||||
res = 0;
|
||||
for (size_t i = 0; i < it.nplanes; i++, ++it)
|
||||
{
|
||||
if(depth == CV_8U)
|
||||
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);
|
||||
else if(depth == CV_32F)
|
||||
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);
|
||||
else
|
||||
return false;
|
||||
|
||||
if(status < 0 || (int)it.planes->total()*src.channels() < count)
|
||||
return false;
|
||||
|
||||
res += (int)it.planes->total()*src.channels() - count;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // cv::
|
||||
|
||||
int cv::countNonZero( InputArray _src )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
|
||||
int type = _src.type(), cn = CV_MAT_CN(type);
|
||||
CV_Assert( cn == 1 );
|
||||
|
||||
#if defined HAVE_OPENCL || defined HAVE_IPP
|
||||
int res = -1;
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
|
||||
ocl_countNonZero(_src, res),
|
||||
res)
|
||||
#endif
|
||||
|
||||
Mat src = _src.getMat();
|
||||
CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);
|
||||
|
||||
CountNonZeroFunc func = getCountNonZeroTab(src.depth());
|
||||
CV_Assert( func != 0 );
|
||||
|
||||
const Mat* arrays[] = {&src, 0};
|
||||
uchar* ptrs[1] = {};
|
||||
NAryMatIterator it(arrays, ptrs);
|
||||
int total = (int)it.size, nz = 0;
|
||||
|
||||
for( size_t i = 0; i < it.nplanes; i++, ++it )
|
||||
nz += func( ptrs[0], total );
|
||||
|
||||
return nz;
|
||||
}
|
||||
|
||||
void cv::findNonZero( InputArray _src, OutputArray _idx )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
|
||||
Mat src = _src.getMat();
|
||||
CV_Assert( src.type() == CV_8UC1 );
|
||||
int n = countNonZero(src);
|
||||
if( n == 0 )
|
||||
{
|
||||
_idx.release();
|
||||
return;
|
||||
}
|
||||
if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
|
||||
_idx.release();
|
||||
_idx.create(n, 1, CV_32SC2);
|
||||
Mat idx = _idx.getMat();
|
||||
CV_Assert(idx.isContinuous());
|
||||
Point* idx_ptr = idx.ptr<Point>();
|
||||
|
||||
for( int i = 0; i < src.rows; i++ )
|
||||
{
|
||||
const uchar* bin_ptr = src.ptr(i);
|
||||
for( int j = 0; j < src.cols; j++ )
|
||||
if( bin_ptr[j] )
|
||||
*idx_ptr++ = Point(j, i);
|
||||
}
|
||||
}
|
||||
CV_CPU_OPTIMIZATION_NAMESPACE_END
|
||||
} // namespace
|
||||
|
Loading…
Reference in New Issue
Block a user