// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" #include "opencl_kernels_core.hpp" #include "convert.hpp" /****************************************************************************************\ * convertScale[Abs] * \****************************************************************************************/ namespace cv { template struct cvtScaleAbs_SIMD { int operator () (const T *, DT *, int, WT, WT) const { return 0; } }; #if CV_SIMD128 static inline void v_load_expand_from_u8_f32(const uchar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) { v_uint32x4 v_src0, v_src1; v_expand(v_load_expand(src), v_src0, v_src1); a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); } static inline void v_load_expand_from_s8_f32(const schar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) { v_int32x4 v_src0, v_src1; v_expand(v_load_expand(src), v_src0, v_src1); a = v_shift + v_scale * v_cvt_f32(v_src0); b = v_shift + v_scale * v_cvt_f32(v_src1); } static inline void v_load_expand_from_u16_f32(const ushort* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) { v_uint32x4 v_src0, v_src1; v_expand(v_load(src), v_src0, v_src1); a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); } static inline void v_load_expand_from_s16_f32(const short* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) { v_int32x4 v_src0, v_src1; v_expand(v_load(src), v_src0, v_src1); a = v_shift + v_scale * v_cvt_f32(v_src0); b = v_shift + v_scale * v_cvt_f32(v_src1); } static inline void v_load_expand_from_s32_f32(const int* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) { a = v_shift + v_scale * v_cvt_f32(v_load(src)); b = v_shift + v_scale * v_cvt_f32(v_load(src + v_int32x4::nlanes)); } template <> struct cvtScaleAbs_SIMD { int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift); v_float32x4 v_scale = v_setall_f32(scale); const int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); v_load_expand_from_u8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); v_dst_0 = v_abs(v_dst_0); v_dst_1 = v_abs(v_dst_1); v_dst_2 = v_abs(v_dst_2); v_dst_3 = v_abs(v_dst_3); v_int16x8 v_dsti_0 = v_pack(v_round(v_dst_0), v_round(v_dst_1)); v_int16x8 v_dsti_1 = v_pack(v_round(v_dst_2), v_round(v_dst_3)); v_store(dst + x, v_pack_u(v_dsti_0, v_dsti_1)); } } return x; } }; template <> struct cvtScaleAbs_SIMD { int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift); v_float32x4 v_scale = v_setall_f32(scale); const int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth*2; x += cWidth*2) { v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); v_load_expand_from_s8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); v_dst_0 = v_abs(v_dst_0); v_dst_1 = v_abs(v_dst_1); v_dst_2 = v_abs(v_dst_2); v_dst_3 = v_abs(v_dst_3); v_uint16x8 v_dsti_0 = v_pack_u(v_round(v_dst_0), v_round(v_dst_1)); v_uint16x8 v_dsti_1 = v_pack_u(v_round(v_dst_2), v_round(v_dst_3)); v_store(dst + x, v_pack(v_dsti_0, v_dsti_1)); } } return x; } }; template <> struct cvtScaleAbs_SIMD { int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift); v_float32x4 v_scale = v_setall_f32(scale); const int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_dst0, v_dst1; v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); v_dst0 = v_abs(v_dst0); v_dst1 = v_abs(v_dst1); v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); v_pack_u_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScaleAbs_SIMD { int operator () (const short * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift); v_float32x4 v_scale = v_setall_f32(scale); const int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_dst0, v_dst1; v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); v_dst0 = v_abs(v_dst0); v_dst1 = v_abs(v_dst1); v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); v_pack_u_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScaleAbs_SIMD { int operator () (const int * src, uchar * dst, int width, float scale, float shift) const { int x = 0; v_float32x4 v_shift = v_setall_f32(shift); v_float32x4 v_scale = v_setall_f32(scale); const int cWidth = v_int32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_dst_0 = v_cvt_f32(v_load(src + x)) * v_scale; v_dst_0 = v_abs(v_dst_0 + v_shift); v_float32x4 v_dst_1 = v_cvt_f32(v_load(src + x + cWidth)) * v_scale; v_dst_1 = v_abs(v_dst_1 + v_shift); v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); v_pack_u_store(dst + x, v_dst); } return x; } }; template <> struct cvtScaleAbs_SIMD { int operator () (const float * src, uchar * dst, int width, float scale, float shift) const { int x = 0; v_float32x4 v_shift = v_setall_f32(shift); v_float32x4 v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_dst_0 = v_load(src + x) * v_scale; v_dst_0 = v_abs(v_dst_0 + v_shift); v_float32x4 v_dst_1 = v_load(src + x + cWidth) * v_scale; v_dst_1 = v_abs(v_dst_1 + v_shift); v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); v_pack_u_store(dst + x, v_dst); } return x; } }; #if CV_SIMD128_64F template <> struct cvtScaleAbs_SIMD { int operator () (const double * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_scale = v_setall_f32(scale); v_float32x4 v_shift = v_setall_f32(shift); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 4; x += cWidth * 4) { v_float32x4 v_src1, v_src2, v_dummy; v_recombine(v_cvt_f32(v_load(src + x)), v_cvt_f32(v_load(src + x + cWidth)), v_src1, v_dummy); v_recombine(v_cvt_f32(v_load(src + x + cWidth * 2)), v_cvt_f32(v_load(src + x + cWidth * 3)), v_src2, v_dummy); v_float32x4 v_dst1 = v_abs((v_src1 * v_scale) + v_shift); v_float32x4 v_dst2 = v_abs((v_src2 * v_scale) + v_shift); v_int16x8 v_dst_i = v_pack(v_round(v_dst1), v_round(v_dst2)); v_pack_u_store(dst + x, v_dst_i); } } return x; } }; #endif // CV_SIMD128_64F #endif template static void cvtScaleAbs_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size, WT scale, WT shift ) { sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); cvtScaleAbs_SIMD vop; for( ; size.height--; src += sstep, dst += dstep ) { int x = vop(src, dst, size.width, scale, shift); #if CV_ENABLE_UNROLLED for( ; x <= size.width - 4; x += 4 ) { DT t0, t1; t0 = saturate_cast
(std::abs(src[x]*scale + shift)); t1 = saturate_cast
(std::abs(src[x+1]*scale + shift)); dst[x] = t0; dst[x+1] = t1; t0 = saturate_cast
(std::abs(src[x+2]*scale + shift)); t1 = saturate_cast
(std::abs(src[x+3]*scale + shift)); dst[x+2] = t0; dst[x+3] = t1; } #endif for( ; x < size.width; x++ ) dst[x] = saturate_cast
(std::abs(src[x]*scale + shift)); } } template struct cvtScale_SIMD { int operator () (const T *, DT *, int, WT, WT) const { return 0; } }; #if CV_SIMD128 // from uchar template <> struct cvtScale_SIMD { int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_pack_u_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store_low(dst + x, v_pack(v_dst, v_dst)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const { int x = 0; #if CV_TRY_SSE4_1 if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_u8u16f32_SSE41(src, dst, width, scale, shift); #endif if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const uchar * src, short * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const uchar * src, int * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_src1, v_src2; v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_store(dst + x, v_round(v_src1)); v_store(dst + x + cWidth, v_round(v_src2)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const uchar * src, float * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_src1, v_src2; v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_store(dst + x, v_src1); v_store(dst + x + cWidth, v_src2); } } return x; } }; // from schar template <> struct cvtScale_SIMD { int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_pack_u_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const schar * src, schar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store_low(dst + x, v_pack(v_dst, v_dst)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const { int x = 0; #if CV_TRY_SSE4_1 if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_s8u16f32_SSE41(src, dst, width, scale, shift); #endif if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const schar * src, short * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const schar * src, int * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_src1, v_src2; v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_store(dst + x, v_round(v_src1)); v_store(dst + x + cWidth, v_round(v_src2)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const schar * src, float * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_src1, v_src2; v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_store(dst + x, v_src1); v_store(dst + x + cWidth, v_src2); } } return x; } }; // from ushort template <> struct cvtScale_SIMD { int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_pack_u_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store_low(dst + x, v_pack(v_dst, v_dst)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const { int x = 0; #if CV_TRY_SSE4_1 if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_u16u16f32_SSE41(src, dst, width, scale, shift); #endif if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const ushort * src, short * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const ushort * src, int * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_src1, v_src2; v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_store(dst + x, v_round(v_src1)); v_store(dst + x + cWidth, v_round(v_src2)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const ushort * src, float * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_src1, v_src2; v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_store(dst + x, v_src1); v_store(dst + x + cWidth, v_src2); } } return x; } }; // from short template <> struct cvtScale_SIMD { int operator () (const short * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_pack_u_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const short * src, schar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store_low(dst + x, v_pack(v_dst, v_dst)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const short * src, ushort * dst, int width, float scale, float shift) const { int x = 0; #if CV_TRY_SSE4_1 if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_s16u16f32_SSE41(src, dst, width, scale, shift); #endif if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const short * src, short * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const short * src, float * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_src1, v_src2; v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_store(dst + x, v_src1); v_store(dst + x + cWidth, v_src2); } } return x; } }; // from int template <> struct cvtScale_SIMD { int operator () (const int * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_pack_u_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const int * src, schar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store_low(dst + x, v_pack(v_dst, v_dst)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const int * src, ushort * dst, int width, float scale, float shift) const { int x = 0; #if CV_TRY_SSE4_1 if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_s32u16f32_SSE41(src, dst, width, scale, shift); #endif if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const int * src, short * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_src1, v_src2; v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); v_store(dst + x, v_dst); } } return x; } }; #if CV_SIMD128_64F template <> struct cvtScale_SIMD { int operator () (const int * src, int * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_int32x4::nlanes; for (; x <= width - cWidth; x += cWidth) { double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const int * src, float * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_int32x4::nlanes; for (; x <= width - cWidth; x += cWidth) { double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); v_store(dst + x, v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2))); } } return x; } }; #endif //CV_SIMD128_64F // from float template <> struct cvtScale_SIMD { int operator () (const float * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); v_pack_u_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const float * src, schar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); v_store_low(dst + x, v_pack(v_dst, v_dst)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const float * src, ushort * dst, int width, float scale, float shift) const { int x = 0; #if CV_TRY_SSE4_1 if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_f32u16f32_SSE41(src, dst, width, scale, shift); #endif if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const float * src, short * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const float * src, int * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth; x += cWidth) v_store(dst + x, v_round(v_load(src + x) * v_scale + v_shift)); } return x; } }; template <> struct cvtScale_SIMD { int operator () (const float * src, float * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth; x += cWidth) v_store(dst + x, v_load(src + x) * v_scale + v_shift); } return x; } }; #if CV_SIMD128_64F static inline void v_load_scale_shift(const double* src, const v_float64x2& v_scale, const v_float64x2 &v_shift, v_float32x4& v_dst1, v_float32x4 &v_dst2) { int cWidth = v_float64x2::nlanes; v_float64x2 v_src1 = v_shift + v_scale * v_load(src); v_float64x2 v_src2 = v_shift + v_scale * v_load(src + cWidth); v_float64x2 v_src3 = v_shift + v_scale * v_load(src + cWidth * 2); v_float64x2 v_src4 = v_shift + v_scale * v_load(src + cWidth * 3); v_dst1 = v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2)); v_dst2 = v_combine_low(v_cvt_f32(v_src3), v_cvt_f32(v_src4)); } static inline void v_store_scale_shift_s32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_int32x4 &v1, const v_int32x4 &v2) { v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); v_store(dst, v_dst1); v_store(dst + v_float64x2::nlanes, v_dst2); v_store(dst + v_float64x2::nlanes * 2, v_dst3); v_store(dst + v_float64x2::nlanes * 3, v_dst4); } static inline void v_store_scale_shift_f32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_float32x4 &v1, const v_float32x4 &v2) { v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); v_store(dst, v_dst1); v_store(dst + v_float64x2::nlanes, v_dst2); v_store(dst + v_float64x2::nlanes * 2, v_dst3); v_store(dst + v_float64x2::nlanes * 3, v_dst4); } // from double template <> struct cvtScale_SIMD { int operator () (const double * src, uchar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 4; x += cWidth * 4) { v_float32x4 v_dst1, v_dst2; v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); v_pack_u_store(dst + x, v_pack(v_round(v_dst1), v_round(v_dst2))); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const double * src, schar * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 4; x += cWidth * 4) { v_float32x4 v_dst1, v_dst2; v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); v_pack_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const double * src, ushort * dst, int width, float scale, float shift) const { int x = 0; #if CV_TRY_SSE4_1 if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_f64u16f32_SSE41(src, dst, width, scale, shift); #endif if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); int cWidth = v_uint16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_dst1, v_dst2; v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const double * src, short * dst, int width, float scale, float shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); int cWidth = v_int16x8::nlanes; for (; x <= width - cWidth; x += cWidth) { v_float32x4 v_dst1, v_dst2; v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); v_store(dst + x, v_dst); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const double * src, int * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const double * src, float * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); v_float32x4 v_dst1 = v_cvt_f32(v_src1); v_float32x4 v_dst2 = v_cvt_f32(v_src2); v_store(dst + x, v_combine_low(v_dst1, v_dst2)); } } return x; } }; // to double template <> struct cvtScale_SIMD { int operator () (const uchar * src, double * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 4; x += cWidth * 4) { v_uint32x4 v_src1, v_src2; v_expand(v_load_expand(src + x), v_src1, v_src2); v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const schar * src, double * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 4; x += cWidth * 4) { v_int32x4 v_src1, v_src2; v_expand(v_load_expand(src + x), v_src1, v_src2); v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const ushort * src, double * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 4; x += cWidth * 4) { v_uint32x4 v_src1, v_src2; v_expand(v_load(src + x), v_src1, v_src2); v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const short * src, double * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 4; x += cWidth * 4) { v_int32x4 v_src1, v_src2; v_expand(v_load(src + x), v_src1, v_src2); v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const int * src, double * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_int32x4 v_src1 = v_load(src + x); v_int32x4 v_src2 = v_load(src + x + cWidth); v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const float * src, double * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_float32x4::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float32x4 v_src1 = v_load(src + x); v_float32x4 v_src2 = v_load(src + x + cWidth); v_store_scale_shift_f32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); } } return x; } }; template <> struct cvtScale_SIMD { int operator () (const double * src, double * dst, int width, double scale, double shift) const { int x = 0; if (hasSIMD128()) { v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); int cWidth = v_float64x2::nlanes; for (; x <= width - cWidth * 2; x += cWidth * 2) { v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); v_store(dst + x, v_src1); v_store(dst + x + cWidth, v_src2); } } return x; } }; #endif #endif template static void cvtScale_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size, WT scale, WT shift ) { sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); cvtScale_SIMD vop; for( ; size.height--; src += sstep, dst += dstep ) { int x = vop(src, dst, size.width, scale, shift); #if CV_ENABLE_UNROLLED for( ; x <= size.width - 4; x += 4 ) { DT t0, t1; t0 = saturate_cast
(src[x]*scale + shift); t1 = saturate_cast
(src[x+1]*scale + shift); dst[x] = t0; dst[x+1] = t1; t0 = saturate_cast
(src[x+2]*scale + shift); t1 = saturate_cast
(src[x+3]*scale + shift); dst[x+2] = t0; dst[x+3] = t1; } #endif for( ; x < size.width; x++ ) dst[x] = saturate_cast
(src[x]*scale + shift); } } template<> void cvtScale_( const short* src, size_t sstep, int* dst, size_t dstep, Size size, float scale, float shift ) { sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); for( ; size.height--; src += sstep, dst += dstep ) { int x = 0; #if CV_TRY_AVX2 if (CV_CPU_HAS_SUPPORT_AVX2) { opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width); continue; } #endif #if CV_SIMD128 if (hasSIMD128()) { v_float32x4 v_shift = v_setall_f32(shift); v_float32x4 v_scale = v_setall_f32(scale); int cWidth = v_int32x4::nlanes; for (; x <= size.width - cWidth * 2; x += cWidth * 2) { v_int16x8 v_src = v_load(src + x); v_int32x4 v_src1, v_src2; v_expand(v_src, v_src1, v_src2); v_float32x4 v_tmp1 = v_cvt_f32(v_src1); v_float32x4 v_tmp2 = v_cvt_f32(v_src2); v_tmp1 = v_tmp1 * v_scale + v_shift; v_tmp2 = v_tmp2 * v_scale + v_shift; v_store(dst + x, v_round(v_tmp1)); v_store(dst + x + cWidth, v_round(v_tmp2)); } } #endif for(; x < size.width; x++ ) dst[x] = saturate_cast(src[x]*scale + shift); } } //================================================================================================== #define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ dtype* dst, size_t dstep, Size size, double* scale) \ { \ tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ } #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \ static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ dtype* dst, size_t dstep, Size size, double* scale) \ { \ cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ } DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float) DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float) DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float) DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float) DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float) DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float) DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float) DEF_CVT_SCALE_FUNC(8u, uchar, uchar, float) DEF_CVT_SCALE_FUNC(8s8u, schar, uchar, float) DEF_CVT_SCALE_FUNC(16u8u, ushort, uchar, float) DEF_CVT_SCALE_FUNC(16s8u, short, uchar, float) DEF_CVT_SCALE_FUNC(32s8u, int, uchar, float) DEF_CVT_SCALE_FUNC(32f8u, float, uchar, float) DEF_CVT_SCALE_FUNC(64f8u, double, uchar, float) DEF_CVT_SCALE_FUNC(8u8s, uchar, schar, float) DEF_CVT_SCALE_FUNC(8s, schar, schar, float) DEF_CVT_SCALE_FUNC(16u8s, ushort, schar, float) DEF_CVT_SCALE_FUNC(16s8s, short, schar, float) DEF_CVT_SCALE_FUNC(32s8s, int, schar, float) DEF_CVT_SCALE_FUNC(32f8s, float, schar, float) DEF_CVT_SCALE_FUNC(64f8s, double, schar, float) DEF_CVT_SCALE_FUNC(8u16u, uchar, ushort, float) DEF_CVT_SCALE_FUNC(8s16u, schar, ushort, float) DEF_CVT_SCALE_FUNC(16u, ushort, ushort, float) DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float) DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float) DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float) DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float) DEF_CVT_SCALE_FUNC(8u16s, uchar, short, float) DEF_CVT_SCALE_FUNC(8s16s, schar, short, float) DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float) DEF_CVT_SCALE_FUNC(16s, short, short, float) DEF_CVT_SCALE_FUNC(32s16s, int, short, float) DEF_CVT_SCALE_FUNC(32f16s, float, short, float) DEF_CVT_SCALE_FUNC(64f16s, double, short, float) DEF_CVT_SCALE_FUNC(8u32s, uchar, int, float) DEF_CVT_SCALE_FUNC(8s32s, schar, int, float) DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float) DEF_CVT_SCALE_FUNC(16s32s, short, int, float) DEF_CVT_SCALE_FUNC(32s, int, int, double) DEF_CVT_SCALE_FUNC(32f32s, float, int, float) DEF_CVT_SCALE_FUNC(64f32s, double, int, double) DEF_CVT_SCALE_FUNC(8u32f, uchar, float, float) DEF_CVT_SCALE_FUNC(8s32f, schar, float, float) DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float) DEF_CVT_SCALE_FUNC(16s32f, short, float, float) DEF_CVT_SCALE_FUNC(32s32f, int, float, double) DEF_CVT_SCALE_FUNC(32f, float, float, float) DEF_CVT_SCALE_FUNC(64f32f, double, float, double) DEF_CVT_SCALE_FUNC(8u64f, uchar, double, double) DEF_CVT_SCALE_FUNC(8s64f, schar, double, double) DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double) DEF_CVT_SCALE_FUNC(16s64f, short, double, double) DEF_CVT_SCALE_FUNC(32s64f, int, double, double) DEF_CVT_SCALE_FUNC(32f64f, float, double, double) DEF_CVT_SCALE_FUNC(64f, double, double, double) static BinaryFunc getCvtScaleAbsFunc(int depth) { static BinaryFunc cvtScaleAbsTab[] = { (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, (BinaryFunc)cvtScaleAbs16u8u, (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u, (BinaryFunc)cvtScaleAbs64f8u, 0 }; return cvtScaleAbsTab[depth]; } BinaryFunc getConvertScaleFunc(int sdepth, int ddepth) { static BinaryFunc cvtScaleTab[][8] = { { (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u), (BinaryFunc)cvtScale64f8u, 0 }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s), (BinaryFunc)cvtScale64f8s, 0 }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u), (BinaryFunc)cvtScale64f16u, 0 }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s), (BinaryFunc)cvtScale64f16s, 0 }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s), (BinaryFunc)cvtScale64f32s, 0 }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f), (BinaryFunc)cvtScale64f32f, 0 }, { (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f, (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f, (BinaryFunc)cvtScale64f, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } }; return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; } #ifdef HAVE_OPENCL static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) { const ocl::Device & d = ocl::Device::getDefault(); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); bool doubleSupport = d.doubleFPConfig() > 0; if (!doubleSupport && depth == CV_64F) return false; _dst.create(_src.size(), CV_8UC(cn)); int kercn = 1; if (d.isIntel()) { static const int vectorWidths[] = {4, 4, 4, 4, 4, 4, 4, -1}; kercn = ocl::checkOptimalVectorWidth( vectorWidths, _src, _dst, noArray(), noArray(), noArray(), noArray(), noArray(), noArray(), noArray(), ocl::OCL_VECTOR_MAX); } else kercn = ocl::predictOptimalVectorWidthMax(_src, _dst); int rowsPerWI = d.isIntel() ? 4 : 1; char cvt[2][50]; int wdepth = std::max(depth, CV_32F); String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s" " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s" " -D workT1=%s -D rowsPerWI=%d%s", ocl::typeToStr(CV_8UC(kercn)), ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth, ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]), ocl::typeToStr(wdepth), rowsPerWI, doubleSupport ? " -D DOUBLE_SUPPORT" : ""); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, build_opt); if (k.empty()) return false; UMat src = _src.getUMat(); UMat dst = _dst.getUMat(); ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); if (wdepth == CV_32F) k.args(srcarg, dstarg, (float)alpha, (float)beta); else if (wdepth == CV_64F) k.args(srcarg, dstarg, alpha, beta); size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } #endif } //cv:: void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) { CV_INSTRUMENT_REGION() CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), ocl_convertScaleAbs(_src, _dst, alpha, beta)) Mat src = _src.getMat(); int cn = src.channels(); double scale[] = {alpha, beta}; _dst.create( src.dims, src.size, CV_8UC(cn) ); Mat dst = _dst.getMat(); BinaryFunc func = getCvtScaleAbsFunc(src.depth()); CV_Assert( func != 0 ); if( src.dims <= 2 ) { Size sz = getContinuousSize(src, dst, cn); func( src.ptr(), src.step, 0, 0, dst.ptr(), dst.step, sz, scale ); } else { const Mat* arrays[] = {&src, &dst, 0}; uchar* ptrs[2]; NAryMatIterator it(arrays, ptrs); Size sz((int)it.size*cn, 1); for( size_t i = 0; i < it.nplanes; i++, ++it ) func( ptrs[0], 0, 0, 0, ptrs[1], 0, sz, scale ); } } //================================================================================================== namespace cv { #ifdef HAVE_OPENCL static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype, double scale, double delta ) { UMat src = _src.getUMat(); if( _mask.empty() ) src.convertTo( _dst, dtype, scale, delta ); else if (src.channels() <= 4) { const ocl::Device & dev = ocl::Device::getDefault(); int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)), rowsPerWI = dev.isIntel() ? 4 : 1; float fscale = static_cast(scale), fdelta = static_cast(delta); bool haveScale = std::fabs(scale - 1) > DBL_EPSILON, haveZeroScale = !(std::fabs(scale) > DBL_EPSILON), haveDelta = std::fabs(delta) > DBL_EPSILON, doubleSupport = dev.doubleFPConfig() > 0; if (!haveScale && !haveDelta && stype == dtype) { _src.copyTo(_dst, _mask); return true; } if (haveZeroScale) { _dst.setTo(Scalar(delta), _mask); return true; } if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport) return false; char cvt[2][40]; String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d" " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s", ocl::typeToStr(stype), ocl::typeToStr(dtype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn, rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", haveScale ? " -D HAVE_SCALE" : "", haveDelta ? " -D HAVE_DELTA" : "", ocl::typeToStr(sdepth), ocl::typeToStr(ddepth)); ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts); if (k.empty()) return false; UMat mask = _mask.getUMat(), dst = _dst.getUMat(); ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), maskarg = ocl::KernelArg::ReadOnlyNoSize(mask), dstarg = ocl::KernelArg::ReadWrite(dst); if (haveScale) { if (haveDelta) k.args(srcarg, maskarg, dstarg, fscale, fdelta); else k.args(srcarg, maskarg, dstarg, fscale); } else { if (haveDelta) k.args(srcarg, maskarg, dstarg, fdelta); else k.args(srcarg, maskarg, dstarg); } size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } else { UMat temp; src.convertTo( temp, dtype, scale, delta ); temp.copyTo( _dst, _mask ); } return true; } #endif } // cv:: void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b, int norm_type, int rtype, InputArray _mask ) { CV_INSTRUMENT_REGION() double scale = 1, shift = 0; if( norm_type == CV_MINMAX ) { double smin = 0, smax = 0; double dmin = MIN( a, b ), dmax = MAX( a, b ); minMaxIdx( _src, &smin, &smax, 0, 0, _mask ); scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0); shift = dmin - smin*scale; } else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C ) { scale = norm( _src, norm_type, _mask ); scale = scale > DBL_EPSILON ? a/scale : 0.; shift = 0; } else CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" ); int type = _src.type(), depth = CV_MAT_DEPTH(type); if( rtype < 0 ) rtype = _dst.fixedType() ? _dst.depth() : depth; CV_OCL_RUN(_dst.isUMat(), ocl_normalize(_src, _dst, _mask, rtype, scale, shift)) Mat src = _src.getMat(); if( _mask.empty() ) src.convertTo( _dst, rtype, scale, shift ); else { Mat temp; src.convertTo( temp, rtype, scale, shift ); temp.copyTo( _dst, _mask ); } }