opencv/3rdparty/carotene/src/norm.cpp

1311 lines
36 KiB
C++

/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
// Upper bound on the number of row elements that the f32 NEON accumulators in this
// file add up before their four lane sums are flushed into the f64 running total.
//magic number; must be multiple of 4
#define NORM32F_BLOCK_SIZE 2048
// Infinity norm of a u8 image: returns the largest element value found.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normInf(const Size2D &_size,
            const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 peak = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 16)
        {
            // Seed the running maximum with the first 16 bytes, then fold in
            // every further complete 16-byte group.
            uint8x16_t vpeak = vld1q_u8(line);
            x = 16;
            while (x <= roi.width - 16)
            {
                internal::prefetch(line + x);
                vpeak = vmaxq_u8(vld1q_u8(line + x), vpeak);
                x += 16;
            }

            // Halve the vector to 8 lanes and reduce those in scalar code.
            u8 lanes[8];
            vst1_u8(lanes, vmax_u8(vget_low_u8(vpeak), vget_high_u8(vpeak)));
            for (u32 lane = 0; lane < 8; ++lane)
                peak = std::max((s32)(lanes[lane]), peak);
        }

        // Remaining elements (fewer than 16).
        while (x < roi.width)
        {
            peak = std::max((s32)(line[x]), peak);
            ++x;
        }
    }
    return peak;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// Infinity norm of an s8 image: returns the largest absolute element value.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normInf(const Size2D &_size,
            const s8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 peak = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 16)
        {
            // Absolute values are reinterpreted and compared as unsigned bytes.
            uint8x16_t vpeak = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(line)));
            x = 16;
            while (x <= roi.width - 16)
            {
                internal::prefetch(line + x);
                const uint8x16_t vcur = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(line + x)));
                vpeak = vmaxq_u8(vcur, vpeak);
                x += 16;
            }

            // Halve the vector to 8 lanes and reduce those in scalar code.
            u8 lanes[8];
            vst1_u8(lanes, vmax_u8(vget_low_u8(vpeak), vget_high_u8(vpeak)));
            for (u32 lane = 0; lane < 8; ++lane)
                peak = std::max((s32)(lanes[lane]), peak);
        }

        // Remaining elements (fewer than 16).
        while (x < roi.width)
        {
            peak = std::max((s32)(std::abs(line[x])), peak);
            ++x;
        }
    }
    return peak;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// Infinity norm of a u16 image: returns the largest element value found.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normInf(const Size2D &_size,
            const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed u16 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 peak = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 8)
        {
            // Seed with the first 8 elements, then fold in each further full group.
            uint16x8_t vpeak = vld1q_u16(line);
            x = 8;
            while (x <= roi.width - 8)
            {
                internal::prefetch(line + x);
                vpeak = vmaxq_u16(vld1q_u16(line + x), vpeak);
                x += 8;
            }

            // Halve the vector to 4 lanes and reduce those in scalar code.
            u16 lanes[4];
            vst1_u16(lanes, vmax_u16(vget_low_u16(vpeak), vget_high_u16(vpeak)));
            for (u32 lane = 0; lane < 4; ++lane)
                peak = std::max((s32)(lanes[lane]), peak);
        }

        // Remaining elements (fewer than 8).
        while (x < roi.width)
        {
            peak = std::max((s32)(line[x]), peak);
            ++x;
        }
    }
    return peak;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// Infinity norm of an s16 image: returns the largest absolute element value.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normInf(const Size2D &_size,
            const s16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed s16 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 peak = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 8)
        {
            // Absolute values are reinterpreted and compared as unsigned 16-bit lanes.
            uint16x8_t vpeak = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(line)));
            x = 8;
            while (x <= roi.width - 8)
            {
                internal::prefetch(line + x);
                const uint16x8_t vcur = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(line + x)));
                vpeak = vmaxq_u16(vcur, vpeak);
                x += 8;
            }

            // Halve the vector to 4 lanes and reduce those in scalar code.
            u16 lanes[4];
            vst1_u16(lanes, vmax_u16(vget_low_u16(vpeak), vget_high_u16(vpeak)));
            for (u32 lane = 0; lane < 4; ++lane)
                peak = std::max((s32)(lanes[lane]), peak);
        }

        // Remaining elements (fewer than 8); widen before taking the absolute value.
        while (x < roi.width)
        {
            peak = std::max(std::abs((s32)(line[x])), peak);
            ++x;
        }
    }
    return peak;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// Infinity norm of an s32 image: returns the largest absolute element value.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// The magnitude of the most negative s32 value is not representable in the s32
// return type, so it is reported saturated as 0x7fFFffFF. Previously the vector
// path ignored such an element and the scalar tail called std::abs on it, which
// is undefined.
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normInf(const Size2D &_size,
            const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed s32 rows - confirm.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const s32 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        if (size.width >= 4)
        {
            // vqabsq_s32 is the saturating absolute value, so every lane stays a
            // non-negative s32 and a signed maximum can be used.
            int32x4_t s = vqabsq_s32(vld1q_s32(src));
            for (i = 4; i <= size.width - 4; i += 4)
            {
                internal::prefetch(src + i);
                int32x4_t s1 = vqabsq_s32(vld1q_s32(src + i));
                s = vmaxq_s32(s1, s);
            }

            // Halve the vector to 2 lanes and reduce those in scalar code.
            s32 s2[2];
            vst1_s32(s2, vmax_s32(vget_low_s32(s), vget_high_s32(s)));
            for (u32 j = 0; j < 2; j++)
                result = std::max(s2[j], result);
        }

        // Remaining elements (fewer than 4), with the same saturation rule.
        for ( ; i < size.width; i++)
        {
            const s32 v = src[i];
            // Only the most negative value is below -0x7fFFffFF; negating it would overflow.
            const s32 magnitude = v < -0x7fFFffFF ? 0x7fFFffFF : (v < 0 ? -v : v);
            result = std::max(magnitude, result);
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// Infinity norm of an f32 image: returns the largest absolute element value.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f32 normInf(const Size2D &_size,
            const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed f32 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    f32 peak = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 4)
        {
            // Seed with |first 4 elements|, then fold in each further full group.
            float32x4_t vpeak = vabsq_f32(vld1q_f32(line));
            x = 4;
            while (x <= roi.width - 4)
            {
                internal::prefetch(line + x);
                const float32x4_t vmag = vabsq_f32(vld1q_f32(line + x));
                vpeak = vmaxq_f32(vmag, vpeak);
                x += 4;
            }

            // Halve the vector to 2 lanes and reduce those in scalar code.
            f32 lanes[2];
            vst1_f32(lanes, vmax_f32(vget_low_f32(vpeak), vget_high_f32(vpeak)));
            for (u32 lane = 0; lane < 2; ++lane)
                peak = std::max(lanes[lane], peak);
        }

        // Remaining elements (fewer than 4).
        while (x < roi.width)
        {
            peak = std::max(std::abs(line[x]), peak);
            ++x;
        }
    }
    return peak;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0.;
#endif
}
// L1 norm of a u8 image: returns the sum of all element values as s32.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normL1(const Size2D &_size,
           const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete 8-byte group ahead of them.
    const size_t vecEnd = roi.width >= 7 ? roi.width - 7 : 0;

    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;
        uint32x4_t vrow = vmovq_n_u32(0);

        while (x < vecEnd)
        {
            // Sum at most 256 bytes in 16-bit lanes before widening to 32 bits.
            const size_t chunkLast = std::min(roi.width, x + 256) - 8;
            uint16x8_t vchunk = vmovl_u8(vld1_u8(line + x));
            x += 8;
            while (x <= chunkLast)
            {
                internal::prefetch(line + x);
                vchunk = vaddw_u8(vchunk, vld1_u8(line + x));
                x += 8;
            }
            const uint16x4_t vhalf = vadd_u16(vget_low_u16(vchunk), vget_high_u16(vchunk));
            vrow = vaddw_u16(vrow, vhalf);
        }

        // Fold the four 32-bit lanes to two and add them to the total.
        u32 lanes[2];
        vst1_u32(lanes, vadd_u32(vget_low_u32(vrow), vget_high_u32(vrow)));
        total += (s32)(lanes[0] + lanes[1]);

        // Remaining elements (fewer than 8).
        while (x < roi.width)
        {
            total += (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// L1 norm of an s8 image: returns the sum of absolute element values as s32.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normL1(const Size2D &_size,
           const s8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete 8-byte group ahead of them.
    const size_t vecEnd = roi.width >= 7 ? roi.width - 7 : 0;

    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;
        uint32x4_t vrow = vmovq_n_u32(0);

        while (x < vecEnd)
        {
            // Sum at most 256 magnitudes in 16-bit lanes before widening to 32 bits.
            const size_t chunkLast = std::min(roi.width, x + 256) - 8;
            const uint8x8_t vfirst = vreinterpret_u8_s8(vabs_s8(vld1_s8(line + x)));
            uint16x8_t vchunk = vmovl_u8(vfirst);
            x += 8;
            while (x <= chunkLast)
            {
                internal::prefetch(line + x);
                const uint8x8_t vmag = vreinterpret_u8_s8(vabs_s8(vld1_s8(line + x)));
                vchunk = vaddw_u8(vchunk, vmag);
                x += 8;
            }
            const uint16x4_t vhalf = vadd_u16(vget_low_u16(vchunk), vget_high_u16(vchunk));
            vrow = vaddw_u16(vrow, vhalf);
        }

        // Fold the four 32-bit lanes to two and add them to the total.
        u32 lanes[2];
        vst1_u32(lanes, vadd_u32(vget_low_u32(vrow), vget_high_u32(vrow)));
        total += (s32)(lanes[0] + lanes[1]);

        // Remaining elements (fewer than 8).
        while (x < roi.width)
        {
            total += (s32)(std::abs(line[x]));
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// L1 norm of a u16 image: returns the sum of all element values as s32.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normL1(const Size2D &_size,
           const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed u16 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete group of 4 ahead of them.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;

    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Widening add of 4 elements at a time into 32-bit lanes.
        uint32x4_t vrow = vmovq_n_u32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            vrow = vaddw_u16(vrow, vld1_u16(line + x));
            x += 4;
        }

        u32 lanes[4];
        vst1q_u32(lanes, vrow);
        for (u32 lane = 0; lane < 4; ++lane)
            total += lanes[lane];

        // Remaining elements (fewer than 4).
        while (x < roi.width)
        {
            total += (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// L1 norm of an s16 image: returns the sum of absolute element values as s32.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normL1(const Size2D &_size,
           const s16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed s16 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete group of 4 ahead of them.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;

    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Magnitudes are reinterpreted as unsigned and widened into 32-bit lanes.
        uint32x4_t vrow = vmovq_n_u32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            const uint16x4_t vmag = vreinterpret_u16_s16(vabs_s16(vld1_s16(line + x)));
            vrow = vaddw_u16(vrow, vmag);
            x += 4;
        }

        u32 lanes[4];
        vst1q_u32(lanes, vrow);
        for (u32 lane = 0; lane < 4; ++lane)
            total += lanes[lane];

        // Remaining elements (fewer than 4).
        while (x < roi.width)
        {
            total += (s32)(std::abs(line[x]));
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// L1 norm of an s32 image: returns the sum of absolute element values as f64.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Magnitudes are summed in f32 lanes over chunks of at most NORM32F_BLOCK_SIZE
// elements; each chunk's four lane sums are then added to the f64 total.
// The most negative s32 value contributes +2^31. Previously the vector path
// added -2^31 for it and the scalar tail called std::abs on it, which is undefined.
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f64 normL1(const Size2D &_size,
           const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed s32 rows - confirm.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    // Positions below this bound still have a complete group of 4 ahead of them.
    size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;

    f64 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const s32 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;
        for (; i < roiw4;)
        {
            size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
            // vabsq_s32 leaves the most negative value as bit pattern 0x80000000;
            // converting the result as unsigned turns that into +2^31 instead of -2^31.
            float32x4_t s = vcvtq_f32_u32(vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src + i))));
            for (i += 4; i <= limit; i += 4)
            {
                internal::prefetch(src + i);
                float32x4_t s1 = vcvtq_f32_u32(vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src + i))));
                s = vaddq_f32(s, s1);
            }

            // Flush the chunk's lane sums into the double-precision total.
            f32 s2[4];
            vst1q_f32(s2, s);
            for (u32 j = 0; j < 4; j++)
                result += (f64)(s2[j]);
        }

        // Remaining elements (fewer than 4); widen first so every s32 value has
        // a representable magnitude.
        for ( ; i < size.width; i++)
            result += std::abs((f64)(src[i]));
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0.;
#endif
}
// L1 norm of an f32 image: returns the sum of absolute element values as f64.
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Magnitudes are summed in f32 lanes over chunks of at most NORM32F_BLOCK_SIZE
// elements; each chunk's four lane sums are then added to the f64 total.
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f64 normL1(const Size2D &_size,
           const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed f32 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete group of 4 ahead of them.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;

    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t chunkLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            float32x4_t vchunk = vabsq_f32(vld1q_f32(line + x));
            x += 4;
            while (x <= chunkLast)
            {
                internal::prefetch(line + x);
                const float32x4_t vmag = vabsq_f32(vld1q_f32(line + x));
                vchunk = vaddq_f32(vmag, vchunk);
                x += 4;
            }

            // Flush the chunk's lane sums into the double-precision total.
            f32 lanes[4];
            vst1q_f32(lanes, vchunk);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Remaining elements (fewer than 4).
        while (x < roi.width)
        {
            total += std::abs((f64)(line[x]));
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0.;
#endif
}
// Squared L2 norm of a u8 image: returns the sum of squared element values as
// s32 (no square root is taken here).
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normL2(const Size2D &_size,
           const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete 8-byte group ahead of them.
    const size_t vecEnd = roi.width >= 7 ? roi.width - 7 : 0;

    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Two 32-bit accumulators: one per half of the 8 widened squares.
        uint32x4_t vaccLo = vmovq_n_u32(0);
        uint32x4_t vaccHi = vmovq_n_u32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            const uint8x8_t vpix = vld1_u8(line + x);
            const uint16x8_t vsq = vmull_u8(vpix, vpix);
            vaccLo = vaddw_u16(vaccLo, vget_low_u16(vsq));
            vaccHi = vaddw_u16(vaccHi, vget_high_u16(vsq));
            x += 8;
        }

        // Combine the accumulators, fold to two lanes and add to the total.
        const uint32x4_t vsum = vaddq_u32(vaccLo, vaccHi);
        u32 lanes[2];
        vst1_u32(lanes, vadd_u32(vget_low_u32(vsum), vget_high_u32(vsum)));
        total += (s32)(lanes[0] + lanes[1]);

        // Remaining elements (fewer than 8).
        while (x < roi.width)
        {
            total += (s32)(line[x]) * (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// Squared L2 norm of an s8 image: returns the sum of squared element values as
// s32 (no square root is taken here).
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 normL2(const Size2D &_size,
           const s8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete 8-byte group ahead of them.
    const size_t vecEnd = roi.width >= 7 ? roi.width - 7 : 0;

    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Two 32-bit accumulators: one per half of the 8 widened squares.
        int32x4_t vaccLo = vmovq_n_s32(0);
        int32x4_t vaccHi = vmovq_n_s32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            const int8x8_t vpix = vld1_s8(line + x);
            const int16x8_t vsq = vmull_s8(vpix, vpix);
            vaccLo = vaddw_s16(vaccLo, vget_low_s16(vsq));
            vaccHi = vaddw_s16(vaccHi, vget_high_s16(vsq));
            x += 8;
        }

        // Combine the accumulators, fold to two lanes and add to the total.
        const int32x4_t vsum = vaddq_s32(vaccLo, vaccHi);
        s32 lanes[2];
        vst1_s32(lanes, vadd_s32(vget_low_s32(vsum), vget_high_s32(vsum)));
        total += lanes[0] + lanes[1];

        // Remaining elements (fewer than 8).
        while (x < roi.width)
        {
            total += (s32)(line[x]) * (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0;
#endif
}
// Squared L2 norm of a u16 image: returns the sum of squared element values as
// f64 (no square root is taken here).
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Squares are summed in f32 lanes over chunks of at most NORM32F_BLOCK_SIZE
// elements; each chunk's four lane sums are then added to the f64 total.
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f64 normL2(const Size2D &_size,
           const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed u16 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete group of 4 ahead of them.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;

    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t chunkLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            // Square in 32-bit integers, then convert the products to f32.
            const uint16x4_t vfirst = vld1_u16(line + x);
            float32x4_t vchunk = vcvtq_f32_u32(vmull_u16(vfirst, vfirst));
            x += 4;
            while (x <= chunkLast)
            {
                internal::prefetch(line + x);
                const uint16x4_t vpix = vld1_u16(line + x);
                const float32x4_t vsq = vcvtq_f32_u32(vmull_u16(vpix, vpix));
                vchunk = vaddq_f32(vchunk, vsq);
                x += 4;
            }

            // Flush the chunk's lane sums into the double-precision total.
            f32 lanes[4];
            vst1q_f32(lanes, vchunk);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Remaining elements (fewer than 4).
        while (x < roi.width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0.;
#endif
}
// Squared L2 norm of an s16 image: returns the sum of squared element values as
// f64 (no square root is taken here).
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Squares are summed in f32 lanes over chunks of at most NORM32F_BLOCK_SIZE
// elements; each chunk's four lane sums are then added to the f64 total.
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f64 normL2(const Size2D &_size,
           const s16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed s16 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete group of 4 ahead of them.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;

    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t chunkLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            // Square in 32-bit integers, then convert the products to f32.
            const int16x4_t vfirst = vld1_s16(line + x);
            float32x4_t vchunk = vcvtq_f32_s32(vmull_s16(vfirst, vfirst));
            x += 4;
            while (x <= chunkLast)
            {
                internal::prefetch(line + x);
                const int16x4_t vpix = vld1_s16(line + x);
                const float32x4_t vsq = vcvtq_f32_s32(vmull_s16(vpix, vpix));
                vchunk = vaddq_f32(vchunk, vsq);
                x += 4;
            }

            // Flush the chunk's lane sums into the double-precision total.
            f32 lanes[4];
            vst1q_f32(lanes, vchunk);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Remaining elements (fewer than 4).
        while (x < roi.width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0.;
#endif
}
// Squared L2 norm of an s32 image: returns the sum of squared element values as
// f64 (no square root is taken here).
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Elements are converted to f32, squared and summed in f32 lanes over chunks of
// at most NORM32F_BLOCK_SIZE elements; each chunk's four lane sums are then
// added to the f64 total.
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f64 normL2(const Size2D &_size,
           const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed s32 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete group of 4 ahead of them.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;

    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t chunkLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            const float32x4_t vfirst = vcvtq_f32_s32(vld1q_s32(line + x));
            float32x4_t vchunk = vmulq_f32(vfirst, vfirst);
            x += 4;
            while (x <= chunkLast)
            {
                internal::prefetch(line + x);
                const float32x4_t vval = vcvtq_f32_s32(vld1q_s32(line + x));
                vchunk = vmlaq_f32(vchunk, vval, vval);
                x += 4;
            }

            // Flush the chunk's lane sums into the double-precision total.
            f32 lanes[4];
            vst1q_f32(lanes, vchunk);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Remaining elements (fewer than 4), squared in double precision.
        while (x < roi.width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0.;
#endif
}
// Squared L2 norm of an f32 image: returns the sum of squared element values as
// f64 (no square root is taken here).
//   _size     - width/height of the region to scan
//   srcBase   - pointer to the first row
//   srcStride - row stride handed to internal::getRowPtr
// Squares are summed in f32 lanes over chunks of at most NORM32F_BLOCK_SIZE
// elements; each chunk's four lane sums are then added to the f64 total.
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f64 normL2(const Size2D &_size,
           const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, scan everything as one long row.
    // NOTE(review): the stride is compared with the width in elements; if the
    // stride is in bytes this shortcut cannot apply to packed f32 rows - confirm.
    if (srcStride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Positions below this bound still have a complete group of 4 ahead of them.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;

    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t chunkLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            const float32x4_t vfirst = vld1q_f32(line + x);
            float32x4_t vchunk = vmulq_f32(vfirst, vfirst);
            x += 4;
            while (x <= chunkLast)
            {
                internal::prefetch(line + x);
                const float32x4_t vval = vld1q_f32(line + x);
                vchunk = vmlaq_f32(vchunk, vval, vval);
                x += 4;
            }

            // Flush the chunk's lane sums into the double-precision total.
            f32 lanes[4];
            vst1q_f32(lanes, vchunk);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Remaining elements (fewer than 4), squared in double precision.
        while (x < roi.width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    return 0.;
#endif
}
// Infinity norm of the difference of two u8 images: returns the largest
// absolute element-wise difference.
//   _size                 - width/height of the region to scan
//   src0Base / src0Stride - first image: base pointer and stride for internal::getRowPtr
//   src1Base / src1Stride - second image: base pointer and stride for internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 diffNormInf(const Size2D &_size,
                const u8 * src0Base, ptrdiff_t src0Stride,
                const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When both strides equal the width, scan everything as one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const u8 * src1 = internal::getRowPtr(src0Base, src0Stride, k);
        const u8 * src2 = internal::getRowPtr(src1Base, src1Stride, k);
        size_t i = 0;

        if (size.width >= 16)
        {
            uint8x16_t vs3 = vdupq_n_u8(0);
            // '<=' so that the last complete 16-byte group is also vectorised;
            // with '<' it was always left to the scalar tail below.
            for (; i <= size.width - 16; i += 16)
            {
                internal::prefetch(src1 + i);
                internal::prefetch(src2 + i);
                uint8x16_t vs1 = vld1q_u8(src1 + i);
                uint8x16_t vs2 = vld1q_u8(src2 + i);
                vs3 = vmaxq_u8(vs3, vabdq_u8(vs1, vs2));
            }

            // Pairwise maximum folds 16 lanes to 8; reduce those in scalar code.
            u8 s2[8];
            vst1_u8(s2, vpmax_u8(vget_low_u8(vs3), vget_high_u8(vs3)));
            for (u32 j = 0; j < 8; j++)
                result = std::max((s32)(s2[j]), result);
        }

        // Remaining elements (fewer than 16).
        for (; i < size.width; i++)
        {
            result = std::max(std::abs((s32)(src1[i]) - (s32)(src2[i])), result);
        }
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
// Infinity norm of the difference of two f32 images: returns the largest
// absolute element-wise difference.
//   _size                 - width/height of the region to scan
//   src0Base / src0Stride - first image: base pointer and stride for internal::getRowPtr
//   src1Base / src1Stride - second image: base pointer and stride for internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f32 diffNormInf(const Size2D &_size,
                const f32 * src0Base, ptrdiff_t src0Stride,
                const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the width, scan everything as one long row.
    // NOTE(review): the strides are compared with the width in elements; if they
    // are in bytes this shortcut cannot apply to packed f32 rows - confirm.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    f32 peak = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (roi.width >= 4)
        {
            // Seed with the first 4 absolute differences, then fold in the rest.
            float32x4_t vpeak = vabdq_f32(vld1q_f32(lineA), vld1q_f32(lineB));
            x += 4;
            while (x <= roi.width - 4)
            {
                internal::prefetch(lineA + x);
                internal::prefetch(lineB + x);
                const float32x4_t va = vld1q_f32(lineA + x);
                const float32x4_t vb = vld1q_f32(lineB + x);
                vpeak = vmaxq_f32(vpeak, vabdq_f32(vb, va));
                x += 4;
            }

            f32 lanes[4];
            vst1q_f32(lanes, vpeak);
            for (u32 lane = 0; lane < 4; ++lane)
            {
                if (lanes[lane] > peak)
                    peak = lanes[lane];
            }
        }

        // Remaining elements (fewer than 4).
        while (x < roi.width)
        {
            const f32 delta = std::abs(lineA[x] - lineB[x]);
            if (delta > peak)
                peak = delta;
            ++x;
        }
    }
    return peak;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0.;
#endif
}
// L1 norm of the difference of two u8 images: returns the sum of absolute
// element-wise differences, saturated at 0x7fFFffFF.
//   _size                 - width/height of the region to scan
//   src0Base / src0Stride - first image: base pointer and stride for internal::getRowPtr
//   src1Base / src1Stride - second image: base pointer and stride for internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 diffNormL1(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the width, scan everything as one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8 * lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (roi.width >= 16)
        {
            while (x <= roi.width - 16)
            {
                // Accumulate at most 2*256 bytes in 16-bit lanes per chunk.
                const size_t chunkLast = std::min(roi.width, x + 2*256) - 16;
                uint16x8_t vaccLo = vmovq_n_u16(0);
                uint16x8_t vaccHi = vmovq_n_u16(0);
                while (x <= chunkLast)
                {
                    internal::prefetch(lineA + x);
                    internal::prefetch(lineB + x);
                    const uint8x16_t va = vld1q_u8(lineA + x);
                    const uint8x16_t vb = vld1q_u8(lineB + x);
                    vaccLo = vabal_u8(vaccLo, vget_low_u8(va), vget_low_u8(vb));
                    vaccHi = vabal_u8(vaccHi, vget_high_u8(va), vget_high_u8(vb));
                    x += 16;
                }

                // Widen the chunk sums to 32 bits and add them with a saturation check.
                u32 lanes[4];
                vst1q_u32(lanes, vaddq_u32(vpaddlq_u16(vaccLo), vpaddlq_u16(vaccHi)));
                for (u32 lane = 0; lane < 4; ++lane)
                {
                    if ((s32)(0x7fFFffFFu - lanes[lane]) <= total)
                    {
                        return 0x7fFFffFF; //result already saturated
                    }
                    total = (s32)((u32)(total) + lanes[lane]);
                }
            }
        }

        // Remaining elements (fewer than 16), with the same saturation check.
        while (x < roi.width)
        {
            const u32 delta = std::abs((s32)(lineA[x]) - (s32)(lineB[x]));
            if ((s32)(0x7fFFffFFu - delta) <= total)
            {
                return 0x7fFFffFF; //result already saturated
            }
            total = (s32)((u32)(total) + delta);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
// L1 norm of the difference of two f32 images: returns the sum of absolute
// element-wise differences as f64.
//   _size                 - width/height of the region to scan
//   src0Base / src0Stride - first image: base pointer and stride for internal::getRowPtr
//   src1Base / src1Stride - second image: base pointer and stride for internal::getRowPtr
// Differences are summed in f32 lanes over chunks of at most NORM32F_BLOCK_SIZE
// elements; each chunk's four lane sums are then added to the f64 total.
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f64 diffNormL1(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the width, scan everything as one long row.
    // NOTE(review): the strides are compared with the width in elements; if they
    // are in bytes this shortcut cannot apply to packed f32 rows - confirm.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (roi.width >= 4)
        {
            while (x <= roi.width - 4)
            {
                const size_t chunkLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
                float32x4_t vchunk = vmovq_n_f32(0.0f);
                while (x <= chunkLast)
                {
                    internal::prefetch(lineA + x);
                    internal::prefetch(lineB + x);
                    const float32x4_t va = vld1q_f32(lineA + x);
                    const float32x4_t vb = vld1q_f32(lineB + x);
                    vchunk = vaddq_f32(vchunk, vabdq_f32(vb, va));
                    x += 4;
                }

                // Flush the chunk's lane sums into the double-precision total.
                f32 lanes[4];
                vst1q_f32(lanes, vchunk);
                for (u32 lane = 0; lane < 4; ++lane)
                    total += (f64)(lanes[lane]);
            }
        }

        // Remaining elements (fewer than 4).
        while (x < roi.width)
        {
            const f32 delta = std::abs(lineA[x] - lineB[x]);
            total += (f64)(delta);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0.;
#endif
}
// Squared L2 norm of the difference of two u8 images: returns the sum of
// squared element-wise differences, saturated at 0x7fFFffFF (no square root
// is taken here).
//   _size                 - width/height of the region to scan
//   src0Base / src0Stride - first image: base pointer and stride for internal::getRowPtr
//   src1Base / src1Stride - second image: base pointer and stride for internal::getRowPtr
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
s32 diffNormL2(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the width, scan everything as one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(roi.width))
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8 * lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;
#define NORML28U_BLOCK_SIZE (33024*2) //bigger block size can result in integer overflow
        if (roi.width >= 16)
        {
            while (x <= roi.width - 16)
            {
                // Each chunk is summed in two 32-bit accumulators before being
                // added to the total.
                const size_t chunkLast = std::min(roi.width, x + NORML28U_BLOCK_SIZE) - 16;
                uint32x4_t vaccA = vmovq_n_u32(0);
                uint32x4_t vaccB = vmovq_n_u32(0);
                while (x <= chunkLast)
                {
                    internal::prefetch(lineA + x);
                    internal::prefetch(lineB + x);
                    const uint8x16_t va = vld1q_u8(lineA + x);
                    const uint8x16_t vb = vld1q_u8(lineB + x);
                    // Absolute differences widened to 16 bits, then squared into 32 bits.
                    const uint16x8_t vdiffLo = vabdl_u8(vget_low_u8(va), vget_low_u8(vb));
                    const uint16x8_t vdiffHi = vabdl_u8(vget_high_u8(va), vget_high_u8(vb));
                    vaccA = vmlal_u16(vaccA, vget_low_u16(vdiffLo), vget_low_u16(vdiffLo));
                    vaccB = vmlal_u16(vaccB, vget_high_u16(vdiffLo), vget_high_u16(vdiffLo));
                    vaccA = vmlal_u16(vaccA, vget_low_u16(vdiffHi), vget_low_u16(vdiffHi));
                    vaccB = vmlal_u16(vaccB, vget_high_u16(vdiffHi), vget_high_u16(vdiffHi));
                    x += 16;
                }

                // Add the chunk sums to the total with a saturation check.
                u32 lanes[4];
                vst1q_u32(lanes, vqaddq_u32(vaccA, vaccB));
                for (u32 lane = 0; lane < 4; ++lane)
                {
                    if ((s32)(0x7fFFffFFu - lanes[lane]) <= total)
                    {
                        return 0x7fFFffFF; //result already saturated
                    }
                    total += (s32)lanes[lane];
                }
            }
        }

        // Remaining elements (fewer than 16), with the same saturation check.
        while (x < roi.width)
        {
            const s32 delta = (s32)(lineA[x]) - (s32)(lineB[x]);
            const s32 squared = delta * delta;
            if ((s32)(0x7fFFffFFu - (u32)(squared)) <= total)
            {
                return 0x7fFFffFF; //result already saturated
            }
            total += squared;
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
// Squared L2 norm of the difference of two f32 images: returns the sum of
// squared element-wise differences as f64 (no square root is taken here).
//   _size                 - width/height of the region to scan
//   src0Base / src0Stride - first image: base pointer and stride for internal::getRowPtr
//   src1Base / src1Stride - second image: base pointer and stride for internal::getRowPtr
// Squared differences are summed in f32 lanes over chunks of at most
// NORM32F_BLOCK_SIZE elements; each chunk's four lane sums are then added to
// the f64 total.
// Without CAROTENE_NEON the arguments are ignored and 0 is returned.
f64 diffNormL2(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When both strides equal the width, scan everything as one long row.
    // NOTE(review): the strides are compared with the width in elements; if they
    // are in bytes this shortcut cannot apply to packed f32 rows - confirm.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    f64 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const f32 * src1 = internal::getRowPtr(src0Base, src0Stride, k);
        const f32 * src2 = internal::getRowPtr(src1Base, src1Stride, k);
        size_t i = 0;

        if (size.width >= 4)
        {
            for (; i <= size.width - 4;)
            {
                size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
                float32x4_t s = vmovq_n_f32(0.0f);
                for (; i <= limit; i += 4)
                {
                    internal::prefetch(src1 + i);
                    internal::prefetch(src2 + i);
                    float32x4_t vs1 = vld1q_f32(src1 + i);
                    float32x4_t vs2 = vld1q_f32(src2 + i);
                    float32x4_t vd = vsubq_f32(vs2, vs1);
                    s = vmlaq_f32(s, vd, vd);
                }

                // Flush the chunk's lane sums into the double-precision total.
                f32 s2[4];
                vst1q_f32(s2, s);
                for (u32 j = 0; j < 4; j++)
                    result += (f64)(s2[j]);
            }
        }

        // Remaining elements (fewer than 4). The square is formed in f64, like
        // the single-image normL2 tail, so it neither rounds nor overflows in f32.
        for (; i < size.width; i++)
        {
            f32 v = src1[i] - src2[i];
            result += (f64)(v) * (f64)(v);
        }
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0.;
#endif
}
} // namespace CAROTENE_NS