opencv/3rdparty/carotene/src/minmaxloc.cpp

1341 lines
41 KiB
C++

/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <limits>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
void minMaxVals(const Size2D &size,
const T * srcBase, ptrdiff_t srcStride,
T * pMinVal, T * pMaxVal)
{
using namespace internal;
typedef typename VecTraits<T>::vec128 vec128;
typedef typename VecTraits<T>::vec64 vec64;
u32 step_base = 32 / sizeof(T), step_tail = 8 / sizeof(T);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
T maxVal = std::numeric_limits<T>::min();
T minVal = std::numeric_limits<T>::max();
vec128 v_min_base = vdupq_n(minVal), v_max_base = vdupq_n(maxVal);
vec64 v_min_tail = vdup_n(minVal), v_max_tail = vdup_n(maxVal);
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for (; j < roiw_base; j += step_base)
{
prefetch(src + j);
vec128 v_src0 = vld1q(src + j), v_src1 = vld1q(src + j + 16 / sizeof(T));
v_min_base = vminq(v_min_base, v_src0);
v_max_base = vmaxq(v_max_base, v_src0);
v_min_base = vminq(v_min_base, v_src1);
v_max_base = vmaxq(v_max_base, v_src1);
}
for (; j < roiw_tail; j += step_tail)
{
vec64 v_src0 = vld1(src + j);
v_min_tail = vmin(v_min_tail, v_src0);
v_max_tail = vmax(v_max_tail, v_src0);
}
for (; j < size.width; j++)
{
T srcval = src[j];
minVal = std::min(srcval, minVal);
maxVal = std::max(srcval, maxVal);
}
}
// collect min & max values
T ar[16 / sizeof(T)];
vst1q(ar, vcombine(vmin(v_min_tail, vmin(vget_low(v_min_base), vget_high(v_min_base))),
vmax(v_max_tail, vmax(vget_low(v_max_base), vget_high(v_max_base)))));
for (size_t x = 0; x < 8u / sizeof(T); ++x)
{
minVal = std::min(minVal, ar[x]);
maxVal = std::max(maxVal, ar[x + 8 / sizeof(T)]);
}
if (pMaxVal)
*pMaxVal = maxVal;
if (pMinVal)
*pMinVal = minVal;
}
} // namespace
#endif
// u8 entry point: forwards to the templated NEON implementation.
// internal::assertSupportedConfiguration() runs first in every build; without
// CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxVals(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * pMinVal, u8 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON backend: only mark the arguments as used.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<u8>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}
// s16 entry point: forwards to the templated NEON implementation.
// internal::assertSupportedConfiguration() runs first in every build; without
// CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxVals(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * pMinVal, s16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON backend: only mark the arguments as used.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<s16>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}
// u16 entry point: forwards to the templated NEON implementation.
// internal::assertSupportedConfiguration() runs first in every build; without
// CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxVals(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * pMinVal, u16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON backend: only mark the arguments as used.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<u16>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}
// s32 entry point: forwards to the templated NEON implementation.
// internal::assertSupportedConfiguration() runs first in every build; without
// CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxVals(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * pMinVal, s32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON backend: only mark the arguments as used.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<s32>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}
// u32 entry point: forwards to the templated NEON implementation.
// internal::assertSupportedConfiguration() runs first in every build; without
// CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxVals(const Size2D &size,
                const u32 * srcBase, ptrdiff_t srcStride,
                u32 * pMinVal, u32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON backend: only mark the arguments as used.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<u32>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}
// Finds the minimum and maximum f32 value of a size.height x size.width region and the
// column/row at which each was recorded.
// All outputs start from element srcBase[0] at (0, 0), which is read unconditionally, and
// are only replaced on a strict '<' / '>' comparison.
// Without CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxLoc(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 &minVal, size_t &minCol, size_t &minRow,
f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
minVal = srcBase[0];
minCol = 0;
minRow = 0;
maxVal = srcBase[0];
maxCol = 0;
maxRow = 0;
// l is the row, i the column; i restarts at 0 for every row.
for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
{
const f32 * src = internal::getRowPtr( srcBase, srcStride, l);
// The vector path is only used for rows of at least 16 elements.
if (size.width >= 16)
{
u32 tmp0123[4] = { 0, 1, 2, 3 };
uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
// Lane indices are 32-bit, so when size_t is wider the row is walked in chunks that
// start at column b; indices inside a chunk are relative to b.
size_t boundAll = size.width - (4 - 1);
for(size_t b = 0; i < boundAll; b = i)
{
// Cap the chunk so relative indices stay below the 0xffffFFFC sentinel.
size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
{
size_t bound = size.width - (4 - 1);
#endif
uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
// Every lane starts from the current global extremes; index 0xffffFFFC marks a lane
// that has not improved on them yet.
float32x4_t n_min = vdupq_n_f32(minVal);
uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
float32x4_t n_max = vdupq_n_f32(maxVal);
uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);
for(; i < bound; i+=4)
{
internal::prefetch(src + i);
float32x4_t line = vld1q_f32(src + i);
// Strict comparisons: a lane keeps the first position at which it saw its extreme.
uint32x4_t minmask = vcltq_f32(line, n_min);
uint32x4_t maxmask = vcgtq_f32(line, n_max);
n_min = vbslq_f32(minmask, line, n_min);
n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
n_max = vbslq_f32(maxmask, line, n_max);
n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);
// idx[] +=4
lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
}
// Spill the four lanes and reduce them: the better value wins, equal values prefer
// the smaller index.
f32 fmin[4], fmax[4];
u32 fminIdx[4], fmaxIdx[4];
vst1q_f32(fmin, n_min);
vst1q_f32(fmax, n_max);
vst1q_u32(fminIdx, n_minIdx);
vst1q_u32(fmaxIdx, n_maxIdx);
size_t minIdx = fminIdx[0];
size_t maxIdx = fmaxIdx[0];
minVal = fmin[0];
maxVal = fmax[0];
for (s32 j = 1; j < 4; ++j)
{
f32 minval = fmin[j];
f32 maxval = fmax[j];
if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
{
minIdx = fminIdx[j];
minVal = minval;
}
if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
{
maxIdx = fmaxIdx[j];
maxVal = maxval;
}
}
// Update the location only if some lane actually improved (index is not the sentinel).
if(minIdx < 0xffffFFFC)
{
#if SIZE_MAX > UINT32_MAX
minCol = b + minIdx;
#else
minCol = minIdx;
#endif
minRow = l;
}
if(maxIdx < 0xffffFFFC)
{
#if SIZE_MAX > UINT32_MAX
maxCol = b + maxIdx;
#else
maxCol = maxIdx;
#endif
maxRow = l;
}
}
}
// Scalar pass over the columns the vector path did not cover (the whole row when
// size.width < 16).
for(; i < size.width; ++i )
{
float val = src[i];
if( val < minVal )
{
minVal = val;
minCol = i;
minRow = l;
}
else if( val > maxVal )
{
maxVal = val;
maxCol = i;
maxRow = l;
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minCol;
(void)minRow;
(void)maxVal;
(void)maxCol;
(void)maxRow;
#endif
}
// Masked variant: finds the minimum and maximum f32 value among the elements whose mask
// byte is non-zero, together with the column/row at which each was recorded.
// Outputs start as minVal = FLT_MAX, maxVal = -FLT_MAX, columns = size.width and
// rows = size.height; they keep these values when no selected element improves on them.
// Values are only replaced on a strict '<' / '>' comparison.
// Without CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxLoc(const Size2D &size,
               const f32 * srcBase, ptrdiff_t srcStride,
               const u8 * maskBase, ptrdiff_t maskStride,
               f32 &minVal, size_t &minCol, size_t &minRow,
               f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = std::numeric_limits<f32>::max();
    minCol = size.width;
    minRow = size.height;
    maxVal = -std::numeric_limits<f32>::max();
    maxCol = size.width;
    maxRow = size.height;
    // l is the row, i the column; i restarts at 0 for every row.
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const f32 * src = internal::getRowPtr( srcBase, srcStride, l);
        const u8 * mask = internal::getRowPtr( maskBase, maskStride, l);
        // The vector path is only used for rows of at least 16 elements.
        if (size.width >= 16)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t uOne = vdupq_n_u32(1);
            uint32x4_t c4 = vdupq_n_u32(4);
            // Each iteration consumes 4 elements but loads 8 mask bytes (vld1_u8), so the
            // vector loop must stop while i + 8 <= size.width; otherwise the mask load runs
            // past the end of the row (and past the buffer on the last row). The scalar
            // loop below handles the remaining columns.
#if SIZE_MAX > UINT32_MAX
            // Lane indices are 32-bit, so when size_t is wider the row is walked in chunks
            // that start at column b; indices inside a chunk are relative to b.
            size_t boundAll = size.width - (8 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                // Cap the chunk so relative indices stay below the 0xffffFFFC sentinel.
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (8 - 1);
#endif
                uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
                // Every lane starts from the current global extremes; index 0xffffFFFC
                // marks a lane that has not improved on them yet.
                float32x4_t n_min = vdupq_n_f32(minVal);
                uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
                float32x4_t n_max = vdupq_n_f32(maxVal);
                uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);
                for(; i < bound; i+=4)
                {
                    internal::prefetch(src + i);
                    internal::prefetch(mask + i);
                    float32x4_t line = vld1q_f32(src + i);
                    // Only the low 4 of the 8 loaded mask bytes are used; widen them to
                    // 32 bits and turn "non-zero" into an all-ones lane mask.
                    uint8x8_t maskLine = vld1_u8(mask + i);
                    uint32x4_t maskLine4 = vmovl_u16(vget_low_u16(vmovl_u8(maskLine)));
                    maskLine4 = vcgeq_u32(maskLine4, uOne);
                    uint32x4_t minmask = vcltq_f32(line, n_min);
                    uint32x4_t maxmask = vcgtq_f32(line, n_max);
                    // Masked-out elements never update a lane.
                    minmask = vandq_u32(minmask, maskLine4);
                    maxmask = vandq_u32(maxmask, maskLine4);
                    n_min = vbslq_f32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max = vbslq_f32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);
                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }
                // Spill the four lanes and reduce them: the better value wins, equal
                // values prefer the smaller index.
                f32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];
                vst1q_f32(fmin, n_min);
                vst1q_f32(fmax, n_max);
                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);
                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];
                for (s32 j = 1; j < 4; ++j)
                {
                    f32 minval = fmin[j];
                    f32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // Update the location only if some lane actually improved
                // (index is not the sentinel).
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar pass over the columns the vector path did not cover (the whole row when
        // size.width < 16). Min and max are tested independently because both start from
        // sentinels, so the first selected element may update both.
        for(; i < size.width; i++ )
        {
            if (!mask[i])
                continue;
            f32 val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)maskBase;
    (void)maskStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// Finds the minimum and maximum s32 value of a size.height x size.width region and the
// column/row at which each was recorded.
// All outputs start from element srcBase[0] at (0, 0), which is read unconditionally, and
// are only replaced on a strict '<' / '>' comparison.
// Without CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxLoc(const Size2D &size,
const s32 * srcBase, ptrdiff_t srcStride,
s32 &minVal, size_t &minCol, size_t &minRow,
s32 &maxVal, size_t &maxCol, size_t &maxRow)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
minVal = srcBase[0];
minCol = 0;
minRow = 0;
maxVal = srcBase[0];
maxCol = 0;
maxRow = 0;
// l is the row, i the column; i restarts at 0 for every row.
for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
{
const s32 * src = internal::getRowPtr( srcBase, srcStride, l);
// The vector path is only used for rows of at least 16 elements.
if (size.width >= 16)
{
u32 tmp0123[4] = { 0, 1, 2, 3 };
uint32x4_t c4 = vdupq_n_u32(4);
#if SIZE_MAX > UINT32_MAX
// Lane indices are 32-bit, so when size_t is wider the row is walked in chunks that
// start at column b; indices inside a chunk are relative to b.
size_t boundAll = size.width - (4 - 1);
for(size_t b = 0; i < boundAll; b = i)
{
// Cap the chunk so relative indices stay below the 0xffffFFFC sentinel.
size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
{
size_t bound = size.width - (4 - 1);
#endif
uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
// Every lane starts from the current global extremes; index 0xffffFFFC marks a lane
// that has not improved on them yet.
int32x4_t n_min = vdupq_n_s32(minVal);
uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);
int32x4_t n_max = vdupq_n_s32(maxVal);
uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);
for(; i < bound; i+=4 )
{
internal::prefetch(src + i);
int32x4_t line = vld1q_s32(src + i);
// Strict comparisons: a lane keeps the first position at which it saw its extreme.
uint32x4_t minmask = vcltq_s32(line, n_min);
uint32x4_t maxmask = vcgtq_s32(line, n_max);
n_min = vbslq_s32(minmask, line, n_min);
n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
n_max = vbslq_s32(maxmask, line, n_max);
n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);
// idx[] +=4
lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
}
// Spill the four lanes and reduce them: the better value wins, equal values prefer
// the smaller index.
s32 fmin[4], fmax[4];
u32 fminIdx[4], fmaxIdx[4];
vst1q_s32(fmin, n_min);
vst1q_s32(fmax, n_max);
vst1q_u32(fminIdx, n_minIdx);
vst1q_u32(fmaxIdx, n_maxIdx);
size_t minIdx = fminIdx[0];
size_t maxIdx = fmaxIdx[0];
minVal = fmin[0];
maxVal = fmax[0];
for (s32 j = 1; j < 4; ++j)
{
s32 minval = fmin[j];
s32 maxval = fmax[j];
if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
{
minIdx = fminIdx[j];
minVal = minval;
}
if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
{
maxIdx = fmaxIdx[j];
maxVal = maxval;
}
}
// Update the location only if some lane actually improved (index is not the sentinel).
if(minIdx < 0xffffFFFC)
{
#if SIZE_MAX > UINT32_MAX
minCol = b + minIdx;
#else
minCol = minIdx;
#endif
minRow = l;
}
if(maxIdx < 0xffffFFFC)
{
#if SIZE_MAX > UINT32_MAX
maxCol = b + maxIdx;
#else
maxCol = maxIdx;
#endif
maxRow = l;
}
}
}
// Scalar pass over the columns the vector path did not cover (the whole row when
// size.width < 16).
for(; i < size.width; ++i )
{
s32 val = src[i];
if( val < minVal )
{
minVal = val;
minCol = i;
minRow = l;
}
else if( val > maxVal )
{
maxVal = val;
maxCol = i;
maxRow = l;
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minCol;
(void)minRow;
(void)maxVal;
(void)maxCol;
(void)maxRow;
#endif
}
// Finds the minimum and maximum s16 value of a size.height x size.width region and the
// column/row at which each was recorded.
// All outputs start from element srcBase[0] at (0, 0), which is read unconditionally, and
// are only replaced on a strict '<' / '>' comparison.
// Without CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxLoc(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
s16 &minVal, size_t &minCol, size_t &minRow,
s16 &maxVal, size_t &maxCol, size_t &maxRow)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
minVal = srcBase[0];
minCol = 0;
minRow = 0;
maxVal = srcBase[0];
maxCol = 0;
maxRow = 0;
// l is the row, i the column; i restarts at 0 for every row.
for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
{
const s16 * src = internal::getRowPtr( srcBase, srcStride, l);
// The vector path is only used for rows of at least 32 elements.
if (size.width >= 32)
{
u32 tmp0123[4] = { 0, 1, 2, 3 };
uint32x4_t c8 = vdupq_n_u32(8);
#if SIZE_MAX > UINT32_MAX
// Lane indices are 32-bit, so when size_t is wider the row is walked in chunks that
// start at column b; indices inside a chunk are relative to b.
size_t boundAll = size.width - (8 - 1);
for(size_t b = 0; i < boundAll; b = i)
{
// Cap the chunk so relative indices stay below the 0xffffFFF8 sentinel.
size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
{
size_t bound = size.width - (8 - 1);
#endif
// Eight values are processed per step, but indices are tracked in two 4-lane
// registers (l = lanes 0..3, h = lanes 4..7). Both record lineIdxOffset, i.e. the
// index of the matching low lane; the high half is corrected by +4 after the loop.
uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
// Every lane starts from the current global extremes; index 0xffffFFF8 marks a lane
// that has not improved on them yet.
int16x8_t n_min = vdupq_n_s16(minVal);
uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
int16x8_t n_max = vdupq_n_s16(maxVal);
uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);
for(; i < bound; i+=8 )
{
internal::prefetch(src + i);
int16x8_t line = vld1q_s16(src + i);
uint16x8_t minmask = vcltq_s16(line, n_min);
uint16x8_t maxmask = vcgtq_s16(line, n_max);
n_min = vbslq_s16(minmask, line, n_min);
// Widen the 16-bit compare mask to 32-bit select masks: vmovl zero-extends
// 0xFFFF to 0x0000FFFF and the saturating shift by 31 turns that into 0xFFFFFFFF
// while leaving 0 unchanged.
uint16x4_t minml = vget_low_u16(minmask);
uint16x4_t minmh = vget_high_u16(minmask);
uint32x4_t minml2 = vmovl_u16(minml);
uint32x4_t minmh2 = vmovl_u16(minmh);
minml2 = vqshlq_n_u32(minml2, 31);
minmh2 = vqshlq_n_u32(minmh2, 31);
n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);
n_max = vbslq_s16(maxmask, line, n_max);
// Same mask widening for the maximum.
uint16x4_t maxml = vget_low_u16(maxmask);
uint16x4_t maxmh = vget_high_u16(maxmask);
uint32x4_t maxml2 = vmovl_u16(maxml);
uint32x4_t maxmh2 = vmovl_u16(maxmh);
maxml2 = vqshlq_n_u32(maxml2, 31);
maxmh2 = vqshlq_n_u32(maxmh2, 31);
n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);
// idx[] +=8
lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
}
// fix high part of indexes
// (untouched high lanes become 0xffffFFFC, which still fails the '< 0xffffFFF8' test below)
uint32x4_t c4 = vdupq_n_u32((int32_t) 4);
n_minIdxh = vaddq_u32(n_minIdxh, c4);
n_maxIdxh = vaddq_u32(n_maxIdxh, c4);
// Spill the eight lanes and reduce them: the better value wins, equal values prefer
// the smaller index.
s16 fmin[8], fmax[8];
u32 fminIdx[8], fmaxIdx[8];
vst1q_s16(fmin, n_min);
vst1q_s16(fmax, n_max);
vst1q_u32(fminIdx+0, n_minIdxl);
vst1q_u32(fmaxIdx+0, n_maxIdxl);
vst1q_u32(fminIdx+4, n_minIdxh);
vst1q_u32(fmaxIdx+4, n_maxIdxh);
size_t minIdx = fminIdx[0];
size_t maxIdx = fmaxIdx[0];
minVal = fmin[0];
maxVal = fmax[0];
for (s32 j = 1; j < 8; ++j)
{
s16 minval = fmin[j];
s16 maxval = fmax[j];
if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
{
minIdx = fminIdx[j];
minVal = minval;
}
if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
{
maxIdx = fmaxIdx[j];
maxVal = maxval;
}
}
// Update the location only if some lane actually improved (index is not a sentinel).
if(minIdx < 0xffffFFF8)
{
#if SIZE_MAX > UINT32_MAX
minCol = b + minIdx;
#else
minCol = minIdx;
#endif
minRow = l;
}
if(maxIdx < 0xffffFFF8)
{
#if SIZE_MAX > UINT32_MAX
maxCol = b + maxIdx;
#else
maxCol = maxIdx;
#endif
maxRow = l;
}
}
}
// Scalar pass over the columns the vector path did not cover (the whole row when
// size.width < 32).
for(; i < size.width; ++i )
{
short val = src[i];
if( val < minVal )
{
minVal = val;
minCol = i;
minRow = l;
}
else if( val > maxVal )
{
maxVal = val;
maxCol = i;
maxRow = l;
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minCol;
(void)minRow;
(void)maxVal;
(void)maxCol;
(void)maxRow;
#endif
}
// Finds the minimum and maximum u16 value of a size.height x size.width region and the
// column/row at which each was recorded.
// All outputs start from element srcBase[0] at (0, 0), which is read unconditionally, and
// are only replaced on a strict '<' / '>' comparison.
// Without CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxLoc(const Size2D &size,
const u16 * srcBase, ptrdiff_t srcStride,
u16 &minVal, size_t &minCol, size_t &minRow,
u16 &maxVal, size_t &maxCol, size_t &maxRow)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
minVal = srcBase[0];
minCol = 0;
minRow = 0;
maxVal = srcBase[0];
maxCol = 0;
maxRow = 0;
// l is the row, i the column; i restarts at 0 for every row.
for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
{
const u16 * src = internal::getRowPtr( srcBase, srcStride, l);
// The vector path is only used for rows of at least 32 elements.
if (size.width >= 32)
{
u32 tmp0123[4] = { 0, 1, 2, 3 };
uint32x4_t c8 = vdupq_n_u32(8);
#if SIZE_MAX > UINT32_MAX
// Lane indices are 32-bit, so when size_t is wider the row is walked in chunks that
// start at column b; indices inside a chunk are relative to b.
size_t boundAll = size.width - (8 - 1);
for(size_t b = 0; i < boundAll; b = i)
{
// Cap the chunk so relative indices stay below the 0xffffFFF8 sentinel.
size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
{
size_t bound = size.width - (8 - 1);
#endif
// Eight values are processed per step, but indices are tracked in two 4-lane
// registers (l = lanes 0..3, h = lanes 4..7). Both record lineIdxOffset, i.e. the
// index of the matching low lane; the high half is corrected by +4 after the loop.
uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);
// Every lane starts from the current global extremes; index 0xffffFFF8 marks a lane
// that has not improved on them yet.
uint16x8_t n_min = vdupq_n_u16(minVal);
uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
uint16x8_t n_max = vdupq_n_u16(maxVal);
uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);
for(; i < bound; i+=8 )
{
internal::prefetch(src + i);
uint16x8_t line = vld1q_u16(src + i);
uint16x8_t minmask = vcltq_u16(line, n_min);
uint16x8_t maxmask = vcgtq_u16(line, n_max);
n_min = vbslq_u16(minmask, line, n_min);
// Widen the 16-bit compare mask to 32-bit select masks: vmovl zero-extends
// 0xFFFF to 0x0000FFFF and the saturating shift by 31 turns that into 0xFFFFFFFF
// while leaving 0 unchanged.
uint16x4_t minml = vget_low_u16(minmask);
uint16x4_t minmh = vget_high_u16(minmask);
uint32x4_t minml2 = vmovl_u16(minml);
uint32x4_t minmh2 = vmovl_u16(minmh);
minml2 = vqshlq_n_u32(minml2, 31);
minmh2 = vqshlq_n_u32(minmh2, 31);
n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);
n_max = vbslq_u16(maxmask, line, n_max);
// Same mask widening for the maximum.
uint16x4_t maxml = vget_low_u16(maxmask);
uint16x4_t maxmh = vget_high_u16(maxmask);
uint32x4_t maxml2 = vmovl_u16(maxml);
uint32x4_t maxmh2 = vmovl_u16(maxmh);
maxml2 = vqshlq_n_u32(maxml2, 31);
maxmh2 = vqshlq_n_u32(maxmh2, 31);
n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);
// idx[] +=8
lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
}
// fix high part of indexes
// (untouched high lanes become 0xffffFFFC, which still fails the '< 0xffffFFF8' test below)
uint32x4_t c4 = vdupq_n_u32(4);
n_minIdxh = vaddq_u32(n_minIdxh, c4);
n_maxIdxh = vaddq_u32(n_maxIdxh, c4);
// Spill the eight lanes and reduce them: the better value wins, equal values prefer
// the smaller index.
u16 fmin[8], fmax[8];
u32 fminIdx[8], fmaxIdx[8];
vst1q_u16(fmin, n_min);
vst1q_u16(fmax, n_max);
vst1q_u32(fminIdx+0, n_minIdxl);
vst1q_u32(fmaxIdx+0, n_maxIdxl);
vst1q_u32(fminIdx+4, n_minIdxh);
vst1q_u32(fmaxIdx+4, n_maxIdxh);
size_t minIdx = fminIdx[0];
size_t maxIdx = fmaxIdx[0];
minVal = fmin[0];
maxVal = fmax[0];
for (s32 j = 1; j < 8; ++j)
{
u16 minval = fmin[j];
u16 maxval = fmax[j];
if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
{
minIdx = fminIdx[j];
minVal = minval;
}
if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
{
maxIdx = fmaxIdx[j];
maxVal = maxval;
}
}
// Update the location only if some lane actually improved (index is not a sentinel).
if(minIdx < 0xffffFFF8)
{
#if SIZE_MAX > UINT32_MAX
minCol = b + minIdx;
#else
minCol = minIdx;
#endif
minRow = l;
}
if(maxIdx < 0xffffFFF8)
{
#if SIZE_MAX > UINT32_MAX
maxCol = b + maxIdx;
#else
maxCol = maxIdx;
#endif
maxRow = l;
}
}
}
// Scalar pass over the columns the vector path did not cover (the whole row when
// size.width < 32).
for(; i < size.width; ++i )
{
u16 val = src[i];
if( val < minVal )
{
minVal = val;
minCol = i;
minRow = l;
}
else if( val > maxVal )
{
maxVal = val;
maxCol = i;
maxRow = l;
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minCol;
(void)minRow;
(void)maxVal;
(void)maxCol;
(void)maxRow;
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Finds the minimum and maximum of the len bytes at src and the position of each inside
// the block. Both results start from src[0] at index 0, so src[0] is always read.
// Complete 16-byte groups are handled with NEON, the remaining bytes with a scalar loop.
// NOTE(review): positions are kept in 16 bits, so len presumably must not exceed 65536;
// the callers in this file pass at most USHORT_BLOCK_MAX_SIZE.
void minMaxLocBlock(const u8 * src, u32 len,
u8 &minVal, u16 &minIdx,
u8 &maxVal, u16 &maxIdx)
{
u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
// Every lane starts as src[0] with index 0.
uint8x16_t n_min = vdupq_n_u8(src[0]);
uint16x8_t n_minIdxl = vdupq_n_u16(0);
uint16x8_t n_minIdxh = vdupq_n_u16(0);
uint8x16_t n_max = vdupq_n_u8(src[0]);
uint16x8_t n_maxIdxl = vdupq_n_u16(0);
uint16x8_t n_maxIdxh = vdupq_n_u16(0);
uint16x8_t c16 = vdupq_n_u16(16);
// Indices are tracked in two 8-lane registers (l = lanes 0..7, h = lanes 8..15). Both
// record lineIdxOffset, i.e. the index of the matching low lane; the high half is
// corrected by +8 after the loop.
uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);
s32 i = 0;
// Signed bound: for len < 16 it is not positive and the vector loop is skipped.
s32 bound = len - (16 - 1);
for(; i < bound; i+=16 )
{
internal::prefetch(src + i);
uint8x16_t line = vld1q_u8(src + i);
// Strict comparisons: a lane keeps the first position at which it saw its extreme.
uint8x16_t minmask = vcltq_u8(line, n_min);
uint8x16_t maxmask = vcgtq_u8(line, n_max);
n_min = vbslq_u8(minmask, line, n_min);
// Widen the 8-bit compare mask to 16-bit select masks: vmovl zero-extends 0xFF to
// 0x00FF and the saturating shift by 15 turns that into 0xFFFF while leaving 0 unchanged.
uint8x8_t minml = vget_low_u8(minmask);
uint8x8_t minmh = vget_high_u8(minmask);
uint16x8_t minml2 = vmovl_u8(minml);
uint16x8_t minmh2 = vmovl_u8(minmh);
minml2 = vqshlq_n_u16(minml2, 15);
minmh2 = vqshlq_n_u16(minmh2, 15);
n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);
n_max = vbslq_u8(maxmask, line, n_max);
// Same mask widening for the maximum.
uint8x8_t maxml = vget_low_u8(maxmask);
uint8x8_t maxmh = vget_high_u8(maxmask);
uint16x8_t maxml2 = vmovl_u8(maxml);
uint16x8_t maxmh2 = vmovl_u8(maxmh);
maxml2 = vqshlq_n_u16(maxml2, 15);
maxmh2 = vqshlq_n_u16(maxmh2, 15);
n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);
// idx[] +=16
lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
}
// fix high part of indexes
uint16x8_t c8 = vdupq_n_u16(8);
n_minIdxh = vaddq_u16(n_minIdxh, c8);
n_maxIdxh = vaddq_u16(n_maxIdxh, c8);
u8 fmin[16], fmax[16];
u16 fminIdx[16], fmaxIdx[16];
// Disabled alternative that pre-reduces the two halves in registers; kept as found.
/*{
uint8x8_t min_low = vget_low_u8(n_min);
uint8x8_t min_high = vget_high_u8(n_min);
uint8x8_t max_low = vget_low_u8(n_max);
uint8x8_t max_high = vget_high_u8(n_max);
uint8x8_t minmask = vclt_u8(min_low, min_high);
uint8x8_t maxmask = vcgt_u8(max_low, max_high);
uint8x8_t min2 = vbsl_u8(minmask, min_low, min_high);
uint8x8_t max2 = vbsl_u8(maxmask, max_low, max_high);
uint16x8_t minidxmask = vmovl_u8(minmask);
uint16x8_t maxidxmask = vmovl_u8(maxmask);
minidxmask = vqshlq_n_u16(minidxmask, 15);
maxidxmask = vqshlq_n_u16(maxidxmask, 15);
uint16x8_t n_minIdx = vbslq_u16(minidxmask, n_minIdxl, n_minIdxh);
uint16x8_t n_maxIdx = vbslq_u16(maxidxmask, n_maxIdxl, n_maxIdxh);
vst1_u8((uint8_t*)fmin, min2);
vst1_u8((uint8_t*)fmax, max2);
vst1q_u16((uint16_t*)(fminIdx), n_minIdx);
vst1q_u16((uint16_t*)(fmaxIdx), n_maxIdx);
}*/
// Spill the sixteen lanes and reduce them: the better value wins, equal values prefer
// the smaller index.
vst1q_u8(fmin, n_min);
vst1q_u8(fmax, n_max);
vst1q_u16(fminIdx+0, n_minIdxl);
vst1q_u16(fmaxIdx+0, n_maxIdxl);
vst1q_u16(fminIdx+8, n_minIdxh);
vst1q_u16(fmaxIdx+8, n_maxIdxh);
minIdx = fminIdx[0];
maxIdx = fmaxIdx[0];
minVal = fmin[0];
maxVal = fmax[0];
for (s32 j = 1; j < 16; ++j)
{
u8 minval = fmin[j];
u8 maxval = fmax[j];
if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
{
minIdx = fminIdx[j];
minVal = minval;
}
if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
{
maxIdx = fmaxIdx[j];
maxVal = maxval;
}
}
// Scalar pass over the bytes the vector loop did not cover.
for(; i < (s32)len; ++i )
{
u8 val = src[i];
if( val < minVal )
{
minVal = val;
minIdx = (u16)i;
}
else if( val > maxVal )
{
maxVal = val;
maxIdx = (u16)i;
}
}
}
// Finds the minimum and maximum of the len signed bytes at src and the position of each
// inside the block. Both results start from src[0] at index 0, so src[0] is always read.
// Complete 16-byte groups are handled with NEON, the remaining bytes with a scalar loop.
// NOTE(review): positions are kept in 16 bits, so len presumably must not exceed 65536;
// the callers in this file pass at most USHORT_BLOCK_MAX_SIZE.
void minMaxLocBlock(const s8 * src, u32 len,
                    s8 &minVal, u16 &minIdx,
                    s8 &maxVal, u16 &maxIdx)
{
    // Only eight start indices are needed: vld1q_u16 loads exactly eight u16 lanes.
    u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    // Every lane starts as src[0] with index 0.
    int8x16_t n_min = vdupq_n_s8(src[0]);
    uint16x8_t n_minIdxl = vdupq_n_u16(0);
    uint16x8_t n_minIdxh = vdupq_n_u16(0);
    int8x16_t n_max = vdupq_n_s8(src[0]);
    uint16x8_t n_maxIdxl = vdupq_n_u16(0);
    uint16x8_t n_maxIdxh = vdupq_n_u16(0);
    uint16x8_t c16 = vdupq_n_u16(16);
    // Indices are tracked in two 8-lane registers (l = lanes 0..7, h = lanes 8..15). Both
    // record lineIdxOffset, i.e. the index of the matching low lane; the high half is
    // corrected by +8 after the loop.
    uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);
    s32 i = 0;
    // Signed bound: for len < 16 it is not positive and the vector loop is skipped.
    s32 bound = len - (16 - 1);
    for(; i < bound; i+=16 )
    {
        internal::prefetch(src + i);
        int8x16_t line = vld1q_s8(src + i);
        // Strict comparisons: a lane keeps the first position at which it saw its extreme.
        uint8x16_t minmask = vcltq_s8(line, n_min);
        uint8x16_t maxmask = vcgtq_s8(line, n_max);
        n_min = vbslq_s8(minmask, line, n_min);
        // Widen the 8-bit compare mask to 16-bit select masks: vmovl zero-extends 0xFF to
        // 0x00FF and the saturating shift by 15 turns that into 0xFFFF while leaving 0
        // unchanged.
        uint8x8_t minml = vget_low_u8(minmask);
        uint8x8_t minmh = vget_high_u8(minmask);
        uint16x8_t minml2 = vmovl_u8(minml);
        uint16x8_t minmh2 = vmovl_u8(minmh);
        minml2 = vqshlq_n_u16(minml2, 15);
        minmh2 = vqshlq_n_u16(minmh2, 15);
        n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
        n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);
        n_max = vbslq_s8(maxmask, line, n_max);
        // Same mask widening for the maximum.
        uint8x8_t maxml = vget_low_u8(maxmask);
        uint8x8_t maxmh = vget_high_u8(maxmask);
        uint16x8_t maxml2 = vmovl_u8(maxml);
        uint16x8_t maxmh2 = vmovl_u8(maxmh);
        maxml2 = vqshlq_n_u16(maxml2, 15);
        maxmh2 = vqshlq_n_u16(maxmh2, 15);
        n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
        n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);
        // idx[] +=16
        lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
    }
    // fix high part of indexes
    uint16x8_t c8 = vdupq_n_u16(8);
    n_minIdxh = vaddq_u16(n_minIdxh, c8);
    n_maxIdxh = vaddq_u16(n_maxIdxh, c8);
    // Spill the sixteen lanes and reduce them: the better value wins, equal values prefer
    // the smaller index.
    s8 fmin[16], fmax[16];
    u16 fminIdx[16], fmaxIdx[16];
    vst1q_s8(fmin, n_min);
    vst1q_s8(fmax, n_max);
    vst1q_u16(fminIdx+0, n_minIdxl);
    vst1q_u16(fmaxIdx+0, n_maxIdxl);
    vst1q_u16(fminIdx+8, n_minIdxh);
    vst1q_u16(fmaxIdx+8, n_maxIdxh);
    minIdx = fminIdx[0];
    maxIdx = fmaxIdx[0];
    minVal = fmin[0];
    maxVal = fmax[0];
    for (s32 j = 1; j < 16; ++j)
    {
        s8 minval = fmin[j];
        s8 maxval = fmax[j];
        if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
        {
            minIdx = fminIdx[j];
            minVal = minval;
        }
        if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
        {
            maxIdx = fmaxIdx[j];
            maxVal = maxval;
        }
    }
    // Scalar pass over the bytes the vector loop did not cover.
    for(; i < (s32)len; ++i )
    {
        s8 val = src[i];
        if( val < minVal )
        {
            minVal = val;
            minIdx = (u16)i;
        }
        else if( val > maxVal )
        {
            maxVal = val;
            maxIdx = (u16)i;
        }
    }
}
} // namespace
#endif // CAROTENE_NEON
// Largest number of elements the u8/s8 minMaxLoc() overloads hand to minMaxLocBlock() in
// one call; block-relative positions 0..65535 fit the u16 indices that helper reports.
#define USHORT_BLOCK_MAX_SIZE (1 << 16)
// Finds the minimum and maximum u8 value of a size.height x size.width region and the
// column/row at which each was recorded.
// All outputs start from element srcBase[0] at (0, 0), which is read unconditionally, and
// are only replaced on a strict '<' / '>' comparison, so the location recorded first for a
// given extreme value is the one reported.
// Rows wider than 128 elements are scanned in blocks of at most USHORT_BLOCK_MAX_SIZE
// elements through minMaxLocBlock(); narrower rows use a scalar loop.
// Without CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxLoc(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 &minVal, size_t &minCol, size_t &minRow,
               u8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0; l < size.height; ++l)
    {
        const u8 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width > 128)
        {
            for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                u8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);
                // Strict comparisons: a location already recorded for the same value is
                // never replaced by a later block.
                if (locMinVal < minVal)
                {
                    minCol = blockStart + locMinIdx;
                    minRow = l;
                    minVal = locMinVal;
                }
                if (locMaxVal > maxVal)
                {
                    maxCol = blockStart + locMaxIdx;
                    maxRow = l;
                    maxVal = locMaxVal;
                }
                // Both ends of the u8 range have been found: no later element can pass
                // the strict comparisons, so the scan can stop.
                if (minVal == 0 && maxVal == 255)
                    return;
            }
        }
        else
        {
            for(size_t i = 0; i < size.width; ++i )
            {
                u8 val = src[i];
                if( val < minVal )
                {
                    minVal = val;
                    minCol = i;
                    minRow = l;
                }
                else if( val > maxVal )
                {
                    maxVal = val;
                    maxCol = i;
                    maxRow = l;
                }
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
// Finds the minimum and maximum s8 value of a size.height x size.width region and the
// column/row at which each was recorded.
// All outputs start from element srcBase[0] at (0, 0), which is read unconditionally, and
// are only replaced on a strict '<' / '>' comparison, so the location recorded first for a
// given extreme value is the one reported.
// Rows wider than 128 elements are scanned in blocks of at most USHORT_BLOCK_MAX_SIZE
// elements through minMaxLocBlock(); narrower rows use a scalar loop.
// Without CAROTENE_NEON nothing is computed and the outputs are left untouched.
void minMaxLoc(const Size2D &size,
               const s8 * srcBase, ptrdiff_t srcStride,
               s8 &minVal, size_t &minCol, size_t &minRow,
               s8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0; l < size.height; ++l)
    {
        const s8 * src = internal::getRowPtr( srcBase, srcStride, l);
        if (size.width > 128)
        {
            for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                s8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);
                // Strict comparisons: a location already recorded for the same value is
                // never replaced by a later block.
                if (locMinVal < minVal)
                {
                    minCol = blockStart + locMinIdx;
                    minRow = l;
                    minVal = locMinVal;
                }
                if (locMaxVal > maxVal)
                {
                    maxCol = blockStart + locMaxIdx;
                    maxRow = l;
                    maxVal = locMaxVal;
                }
                // Both ends of the s8 range have been found: no later element can pass
                // the strict comparisons, so the scan can stop.
                if (minVal == -128 && maxVal == 127)
                    return;
            }
        }
        else
        {
            for(size_t i = 0; i < size.width; ++i )
            {
                s8 val = src[i];
                if( val < minVal )
                {
                    minVal = val;
                    minRow = l;
                    minCol = i;
                }
                else if( val > maxVal )
                {
                    maxVal = val;
                    maxRow = l;
                    maxCol = i;
                }
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}
} // namespace CAROTENE_NS