diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake index b2d927957f..6e1029c2c4 100644 --- a/cmake/OpenCVFindLibsVideo.cmake +++ b/cmake/OpenCVFindLibsVideo.cmake @@ -229,6 +229,18 @@ if(WITH_FFMPEG) find_library(FFMPEG_UTIL_LIB "avutil" HINTS "${FFMPEG_LIB_DIR}") find_library(FFMPEG_SWSCALE_LIB "swscale" HINTS "${FFMPEG_LIB_DIR}") find_library(FFMPEG_RESAMPLE_LIB "avresample" HINTS "${FFMPEG_LIB_DIR}") + if(FFMPEG_CODEC_LIB) + set(HAVE_FFMPEG_CODEC 1) + endif() + if(FFMPEG_FORMAT_LIB) + set(HAVE_FFMPEG_FORMAT 1) + endif() + if(FFMPEG_UTIL_LIB) + set(HAVE_FFMPEG_UTIL 1) + endif() + if(FFMPEG_SWSCALE_LIB) + set(HAVE_FFMPEG_SWSCALE 1) + endif() if(FFMPEG_CODEC_LIB AND FFMPEG_FORMAT_LIB AND FFMPEG_UTIL_LIB AND FFMPEG_SWSCALE_LIB) set(ALIASOF_libavcodec_VERSION "Unknown") diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index 315d498c5a..45f3cef0ce 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -163,7 +163,8 @@ public: CUDA_HOST_MEM = 8 << KIND_SHIFT, CUDA_GPU_MAT = 9 << KIND_SHIFT, UMAT =10 << KIND_SHIFT, - STD_VECTOR_UMAT =11 << KIND_SHIFT + STD_VECTOR_UMAT =11 << KIND_SHIFT, + STD_BOOL_VECTOR =12 << KIND_SHIFT }; _InputArray(); @@ -173,6 +174,7 @@ public: _InputArray(const std::vector& vec); template _InputArray(const Mat_<_Tp>& m); template _InputArray(const std::vector<_Tp>& vec); + _InputArray(const std::vector& vec); template _InputArray(const std::vector >& vec); template _InputArray(const std::vector >& vec); template _InputArray(const _Tp* vec, int n); @@ -284,6 +286,7 @@ public: _OutputArray(cuda::HostMem& cuda_mem); template _OutputArray(cudev::GpuMat_<_Tp>& m); template _OutputArray(std::vector<_Tp>& vec); + _OutputArray(std::vector& vec); template _OutputArray(std::vector >& vec); template _OutputArray(std::vector >& vec); template _OutputArray(Mat_<_Tp>& m); @@ -340,6 +343,7 @@ public: _InputOutputArray(cuda::HostMem& cuda_mem); template _InputOutputArray(cudev::GpuMat_<_Tp>& m); template _InputOutputArray(std::vector<_Tp>& vec); + _InputOutputArray(std::vector& vec); template _InputOutputArray(std::vector >& vec); template _InputOutputArray(std::vector >& vec); template _InputOutputArray(Mat_<_Tp>& m); diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index 535baa156d..3779b83f82 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -77,6 +77,10 @@ template inline _InputArray::_InputArray(const std::vector<_Tp>& vec) { init(FIXED_TYPE + STD_VECTOR + DataType<_Tp>::type + ACCESS_READ, &vec); } +inline +_InputArray::_InputArray(const std::vector& vec) +{ init(FIXED_TYPE + STD_BOOL_VECTOR + DataType::type + ACCESS_READ, &vec); } + template inline _InputArray::_InputArray(const std::vector >& vec) { init(FIXED_TYPE + STD_VECTOR_VECTOR + DataType<_Tp>::type + ACCESS_READ, &vec); } @@ -140,6 +144,10 @@ template inline _OutputArray::_OutputArray(std::vector<_Tp>& vec) { init(FIXED_TYPE + STD_VECTOR + DataType<_Tp>::type + ACCESS_WRITE, &vec); } +inline +_OutputArray::_OutputArray(std::vector&) +{ CV_Error(Error::StsUnsupportedFormat, "std::vector cannot be an output array\n"); } + template inline _OutputArray::_OutputArray(std::vector >& vec) { init(FIXED_TYPE + STD_VECTOR_VECTOR + DataType<_Tp>::type + ACCESS_WRITE, &vec); } @@ -227,6 +235,9 @@ template inline _InputOutputArray::_InputOutputArray(std::vector<_Tp>& vec) { 
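Usage sketch for the new std::vector<bool> overloads introduced above (illustrative helper name countSetFlags; assumes the matching getMat()/size() support for the new STD_BOOL_VECTOR kind, which lives elsewhere in matrix.cpp and is not shown in this hunk): a bit-packed vector<bool> can now be passed read-only wherever cv::InputArray is accepted, while output/in-out use is explicitly rejected.

#include <opencv2/core.hpp>
#include <vector>

// Hypothetical helper: counts the set flags of a std::vector<bool> passed as InputArray.
static int countSetFlags(cv::InputArray flags)
{
    cv::Mat m = flags.getMat();      // materialized as a CV_8U array of 0/1 values
    return cv::countNonZero(m);
}

// countSetFlags(std::vector<bool>{true, false, true, true}) == 3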
init(FIXED_TYPE + STD_VECTOR + DataType<_Tp>::type + ACCESS_RW, &vec); } +inline _InputOutputArray::_InputOutputArray(std::vector&) +{ CV_Error(Error::StsUnsupportedFormat, "std::vector cannot be an input/output array\n"); } + template inline _InputOutputArray::_InputOutputArray(std::vector >& vec) { init(FIXED_TYPE + STD_VECTOR_VECTOR + DataType<_Tp>::type + ACCESS_RW, &vec); } @@ -1464,13 +1475,15 @@ Mat_<_Tp> Mat_<_Tp>::operator()( const Range* ranges ) const template inline _Tp* Mat_<_Tp>::operator [](int y) { - return (_Tp*)ptr(y); + CV_DbgAssert( 0 <= y && y < rows ); + return (_Tp*)(data + y*step.p[0]); } template inline const _Tp* Mat_<_Tp>::operator [](int y) const { - return (const _Tp*)ptr(y); + CV_DbgAssert( 0 <= y && y < rows ); + return (const _Tp*)(data + y*step.p[0]); } template inline diff --git a/modules/core/include/opencv2/core/optim.hpp b/modules/core/include/opencv2/core/optim.hpp index 4f1749ec97..18e733f47b 100644 --- a/modules/core/include/opencv2/core/optim.hpp +++ b/modules/core/include/opencv2/core/optim.hpp @@ -63,9 +63,11 @@ public: class CV_EXPORTS Function { public: - virtual ~Function() {} - virtual double calc(const double* x) const = 0; - virtual void getGradient(const double* /*x*/,double* /*grad*/) {} + virtual ~Function() {} + virtual int getDims() const = 0; + virtual double getGradientEps() const; + virtual double calc(const double* x) const = 0; + virtual void getGradient(const double* x,double* grad); }; /** @brief Getter for the optimized function. diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 15b0759f77..1b8705983f 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2781,15 +2781,23 @@ struct Div_SIMD } }; -#if CV_SSE2 +template +struct Recip_SIMD +{ + int operator() (const T *, T *, int, double) const + { + return 0; + } +}; -#if CV_SSE4_1 + +#if CV_SIMD128 template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const { @@ -2798,50 +2806,44 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src1 + x)), v_zero); - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); + v_uint16x8 v_src1 = v_load_expand(src1 + x); + v_uint16x8 v_src2 = v_load_expand(src2 + x); - __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); - v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); - v_src2i = _mm_unpackhi_epi16(v_src2, 
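With getDims() now pure virtual on MinProblemSolver::Function, every user objective has to report its dimensionality; a minimal sketch of a conforming objective (illustrative class name Rosenbrock, not part of the patch) that relies on the new default numeric gradient:

#include <opencv2/core/optim.hpp>

class Rosenbrock : public cv::MinProblemSolver::Function
{
public:
    int getDims() const { return 2; }        // required by the updated interface
    double calc(const double* x) const       // f(x,y) = (1-x)^2 + 100*(y-x^2)^2
    {
        double a = 1 - x[0], b = x[1] - x[0]*x[0];
        return a*a + 100*b*b;
    }
    // getGradient()/getGradientEps() are inherited: central differences, eps = 1e-3.
};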
v_zero); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const { @@ -2850,50 +2852,44 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((const __m128i *)(src1 + x))), 8); - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); + v_int16x8 v_src1 = v_load_expand(src1 + x); + v_int16x8 v_src2 = v_load_expand(src2 + x); - __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); - v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, 
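The rewritten kernels all follow the same universal-intrinsic recipe (one source path compiled to SSE2 or NEON): widen, convert to float32, multiply by scale and divide, round, narrow with saturation, then blank the lanes whose divisor was zero. A self-contained sketch of that recipe for 8-bit unsigned data (illustrative function name div_u8_block; header path is the usual intrin.hpp location and may differ per OpenCV version):

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>   // universal intrinsics (path per the OpenCV version in use)

// Illustrative helper, not part of the patch: scaled division of two uchar rows,
// mirroring Div_SIMD<uchar> above, with the scalar tail that div_i() provides in the real code.
static void div_u8_block(const uchar* a, const uchar* b, uchar* dst, int width, float scale)
{
    using namespace cv;
    int x = 0;
#if CV_SIMD128
    v_float32x4 v_scale = v_setall_f32(scale);
    v_uint16x8 v_zero16 = v_setzero_u16();
    for (; x <= width - 8; x += 8)
    {
        v_uint16x8 v_a = v_load_expand(a + x), v_b = v_load_expand(b + x);
        v_uint32x4 a0, a1, b0, b1;
        v_expand(v_a, a0, a1);
        v_expand(v_b, b0, b1);
        v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(a0)) * v_scale / v_cvt_f32(v_reinterpret_as_s32(b0));
        v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(a1)) * v_scale / v_cvt_f32(v_reinterpret_as_s32(b1));
        v_uint16x8 res = v_pack_u(v_round(f0), v_round(f1));
        res = v_select(v_b == v_zero16, v_zero16, res);    // x/0 == 0 convention
        v_pack_store(dst + x, res);
    }
#endif
    for (; x < width; x++)                                  // scalar tail
        dst[x] = b[x] != 0 ? saturate_cast<uchar>(a[x]*scale/b[x]) : (uchar)0;
}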
_mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#if CV_SSE4_1 template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const { @@ -2902,49 +2898,43 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_uint16x8 v_src1 = v_load(src1 + x); + v_uint16x8 v_src2 = v_load(src2 + x); - __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); - v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 - template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const { @@ -2953,36 +2943,32 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + 
v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_int16x8 v_src1 = v_load(src1 + x); + v_int16x8 v_src2 = v_load(src2 + x); - __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); - v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src1d = _mm_cvtepi32_pd(v_src1i); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; @@ -2993,7 +2979,7 @@ template <> struct Div_SIMD { bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const { @@ -3002,100 +2988,82 @@ struct Div_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); - for ( ; x <= width - 4; x += 4) + for ( ; x <= width - 8; x += 8) { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_int32x4 t0 = v_load(src1 + x); + v_int32x4 t1 = v_load(src1 + x + 4); + v_int32x4 t2 = v_load(src2 + x); + v_int32x4 t3 = v_load(src2 + x + 4); - __m128d v_src1d = _mm_cvtepi32_pd(v_src1); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); - v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1, 8)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + f0 = f0 * v_scale / f2; + 
f1 = f1 * v_scale / f3; - __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst)); + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t2 == v_zero, v_zero, res0); + res1 = v_select(t3 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); } return x; } }; -#endif -template static void -div_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) +template <> +struct Div_SIMD { - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - Div_SIMD vop; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) + int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const { - int i = vop(src1, src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for( ; i <= size.width - 4; i += 4 ) + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) { - if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 ) - { - double a = (double)src2[i] * src2[i+1]; - double b = (double)src2[i+2] * src2[i+3]; - double d = scale/(a * b); - b *= d; - a *= d; + v_float32x4 f0 = v_load(src1 + x); + v_float32x4 f1 = v_load(src1 + x + 4); + v_float32x4 f2 = v_load(src2 + x); + v_float32x4 f3 = v_load(src2 + x + 4); - T z0 = saturate_cast(src2[i+1] * ((double)src1[i] * b)); - T z1 = saturate_cast(src2[i] * ((double)src1[i+1] * b)); - T z2 = saturate_cast(src2[i+3] * ((double)src1[i+2] * a)); - T z3 = saturate_cast(src2[i+2] * ((double)src1[i+3] * a)); + v_float32x4 res0 = f0 * v_scale / f2; + v_float32x4 res1 = f1 * v_scale / f3; - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } - else - { - T z0 = src2[i] != 0 ? saturate_cast(src1[i]*scale/src2[i]) : 0; - T z1 = src2[i+1] != 0 ? saturate_cast(src1[i+1]*scale/src2[i+1]) : 0; - T z2 = src2[i+2] != 0 ? saturate_cast(src1[i+2]*scale/src2[i+2]) : 0; - T z3 = src2[i+3] != 0 ? saturate_cast(src1[i+3]*scale/src2[i+3]) : 0; + res0 = v_select(f2 == v_zero, v_zero, res0); + res1 = v_select(f3 == v_zero, v_zero, res1); - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } + v_store(dst + x, res0); + v_store(dst + x + 4, res1); } - #endif - for( ; i < size.width; i++ ) - dst[i] = src2[i] != 0 ? 
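Behavioural reference for the div_*/recip_* kernels: cv::divide() computes saturate_cast<T>(src1*scale/src2) and, by OpenCV convention, writes 0 wherever the divisor is 0, which is exactly what the v_select(divisor == 0, ...) masks and the scalar `denom != 0 ? ... : 0` tails implement. A small sketch:

#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    cv::Mat a = (cv::Mat_<uchar>(1, 4) << 10, 20, 30, 40);
    cv::Mat b = (cv::Mat_<uchar>(1, 4) <<  2,  0,  3,  8);
    cv::Mat c, d;
    cv::divide(a, b, c, 2.0);    // c = a*2/b   ->  10 0 20 10  (zero where b is zero)
    cv::divide(16.0, b, d);      // reciprocal path: d = 16/b  ->  8 0 5 2
    std::printf("%d %d\n", (int)c.at<uchar>(0, 1), (int)d.at<uchar>(0, 2));   // prints "0 5"
    return 0;
}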
saturate_cast(src1[i]*scale/src2[i]) : 0; - } -} -template -struct Recip_SIMD -{ - int operator() (const T *, T *, int, double) const - { - return 0; + return x; } }; -#if CV_SSE2 -#if CV_SSE4_1 +///////////////////////// RECIPROCAL ////////////////////// template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const uchar * src2, uchar * dst, int width, double scale) const { @@ -3104,43 +3072,39 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); + v_uint16x8 v_src2 = v_load_expand(src2 + x); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const schar * src2, schar * dst, int width, double scale) const { @@ -3149,43 +3113,39 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); + v_int16x8 v_src2 = v_load_expand(src2 + x); - __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_int32x4 t0, t1; + v_expand(v_src2, 
t0, t1); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); - _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); } return x; } }; -#if CV_SSE4_1 template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const ushort * src2, ushort * dst, int width, double scale) const { @@ -3194,42 +3154,38 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_uint16x8 v_src2 = v_load(src2 + x); - __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); - v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; } }; -#endif // CV_SSE4_1 - template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const short * src2, short * dst, int width, double scale) const { @@ -3238,29 +3194,27 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + v_int16x8 v_src2 = v_load(src2 + x); - 
__m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); - v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); - v_src2d = _mm_cvtepi32_pd(v_src2i); - v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); - v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); - __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); - __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); } return x; @@ -3271,7 +3225,7 @@ template <> struct Recip_SIMD { bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const int * src2, int * dst, int width, double scale) const { @@ -3280,22 +3234,136 @@ struct Recip_SIMD if (!haveSIMD) return x; - __m128d v_scale = _mm_set1_pd(scale); - __m128i v_zero = _mm_setzero_si128(); + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); + + for ( ; x <= width - 8; x += 8) + { + v_int32x4 t0 = v_load(src2 + x); + v_int32x4 t1 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t0 == v_zero, v_zero, res0); + res1 = v_select(t1 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const float * src2, float * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) + { + v_float32x4 f0 = v_load(src2 + x); + v_float32x4 f1 = v_load(src2 + x + 4); + + v_float32x4 res0 = v_scale / f0; + v_float32x4 res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + +#if CV_SIMD128_64F + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); for ( ; x <= width - 4; x += 4) { - __m128i v_src2 = _mm_loadu_si128((const __m128i 
*)(src2 + x)); + v_float64x2 f0 = v_load(src1 + x); + v_float64x2 f1 = v_load(src1 + x + 2); + v_float64x2 f2 = v_load(src2 + x); + v_float64x2 f3 = v_load(src2 + x + 2); - __m128d v_src2d = _mm_cvtepi32_pd(v_src2); - __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_float64x2 res0 = f0 * v_scale / f2; + v_float64x2 res1 = f1 * v_scale / f3; - v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8)); - __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); - __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); - __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero); - _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst)); + v_store(dst + x, res0); + v_store(dst + x + 2, res1); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); + + for ( ; x <= width - 4; x += 4) + { + v_float64x2 f0 = v_load(src2 + x); + v_float64x2 f1 = v_load(src2 + x + 2); + + v_float64x2 res0 = v_scale / f0; + v_float64x2 res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 2, res1); } return x; @@ -3304,10 +3372,78 @@ struct Recip_SIMD #endif +#endif + template static void -recip_( const T*, size_t, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) +div_i( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) { + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + float scale_f = (float)scale; + + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, size.width, scale); + for( ; i < size.width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +div_f( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) +{ + T scale_f = (T)scale; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, size.width, scale); + for( ; i < size.width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +recip_i( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) +{ + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Recip_SIMD vop; + float scale_f = (float)scale; + + for( ; size.height--; src2 += step2, dst += step ) + { + int i = vop(src2, dst, size.width, scale); + for( ; i < size.width; i++ ) + { + T denom = src2[i]; + dst[i] = denom != 0 ? 
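One consistency note on the CV_SIMD128_64F Div_SIMD<double> specialization above: to match the scalar fallback (`dst = denom != 0 ? num*scale/denom : 0`) and the float specialization, the lanes to blank out are those where the divisor f2/f3 is zero rather than the dividend f0/f1; a corrected sketch of the two selects would read:

res0 = v_select(f2 == v_zero, v_zero, res0);   // mask on the divisor, not the dividend
res1 = v_select(f3 == v_zero, v_zero, res1);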
saturate_cast(scale_f/denom) : (T)0; + } + } +} + +template static void +recip_f( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, Size size, double scale ) +{ + T scale_f = (T)scale; step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); @@ -3316,39 +3452,11 @@ recip_( const T*, size_t, const T* src2, size_t step2, for( ; size.height--; src2 += step2, dst += step ) { int i = vop(src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for( ; i <= size.width - 4; i += 4 ) - { - if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 ) - { - double a = (double)src2[i] * src2[i+1]; - double b = (double)src2[i+2] * src2[i+3]; - double d = scale/(a * b); - b *= d; - a *= d; - - T z0 = saturate_cast(src2[i+1] * b); - T z1 = saturate_cast(src2[i] * b); - T z2 = saturate_cast(src2[i+3] * a); - T z3 = saturate_cast(src2[i+2] * a); - - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } - else - { - T z0 = src2[i] != 0 ? saturate_cast(scale/src2[i]) : 0; - T z1 = src2[i+1] != 0 ? saturate_cast(scale/src2[i+1]) : 0; - T z2 = src2[i+2] != 0 ? saturate_cast(scale/src2[i+2]) : 0; - T z3 = src2[i+3] != 0 ? saturate_cast(scale/src2[i+3]) : 0; - - dst[i] = z0; dst[i+1] = z1; - dst[i+2] = z2; dst[i+3] = z3; - } - } - #endif for( ; i < size.width; i++ ) - dst[i] = src2[i] != 0 ? saturate_cast(scale/src2[i]) : 0; + { + T denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; + } } } @@ -3459,87 +3567,87 @@ static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t st uchar* dst, size_t step, Size sz, void* scale) { if( src1 ) - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); else - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* scale) { - div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + div_f(src1, 
step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* scale) { - recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); + recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } diff --git a/modules/core/src/conjugate_gradient.cpp b/modules/core/src/conjugate_gradient.cpp index 90353cc7fd..1259cc9756 100644 --- a/modules/core/src/conjugate_gradient.cpp +++ b/modules/core/src/conjugate_gradient.cpp @@ -46,6 +46,25 @@ namespace cv { + double MinProblemSolver::Function::getGradientEps() const { return 1e-3; } + void MinProblemSolver::Function::getGradient(const double* x, double* grad) + { + double eps = getGradientEps(); + int i, n = getDims(); + AutoBuffer x_buf(n); + double* x_ = x_buf; + for( i = 0; i < n; i++ ) + x_[i] = x[i]; + for( i = 0; i < n; i++ ) + { + x_[i] = x[i] + eps; + double y1 = calc(x_); + x_[i] = x[i] - eps; + double y0 = calc(x_); + grad[i] = (y1 - y0)/(2*eps); + x_[i] = x[i]; + } + } #define SEC_METHOD_ITERATIONS 4 #define INITIAL_SEC_METHOD_SIGMA 0.1 diff --git a/modules/core/src/downhill_simplex.cpp b/modules/core/src/downhill_simplex.cpp index 261bf33c3f..a0cc1320b8 100644 --- a/modules/core/src/downhill_simplex.cpp +++ b/modules/core/src/downhill_simplex.cpp @@ -40,8 +40,13 @@ //M*/ #include "precomp.hpp" +#if 0 +#define dprintf(x) printf x +#define print_matrix(x) print(x) +#else #define dprintf(x) #define print_matrix(x) +#endif /* @@ -51,14 +56,14 @@ Downhill Simplex method in OpenCV dev 3.0.0 getting this error: OpenCV Error: Assertion failed (dims <= 2 && data && (unsigned)i0 < (unsigned)(s ize.p[0] * size.p[1]) && elemSize() == (((((DataType<_Tp>::type) & ((512 - 1) << 3)) >> 3) + 1) << 
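Usage sketch for the new default gradient shown above: getGradient() now falls back to central differences, grad[i] ~= (f(x + eps*e_i) - f(x - eps*e_i)) / (2*eps) with eps = getGradientEps() = 1e-3, so a derivative-free objective is enough for ConjGradSolver (illustrative names Sphere and minimizeSphere, not part of the patch):

#include <opencv2/core/optim.hpp>

struct Sphere : public cv::MinProblemSolver::Function
{
    int getDims() const { return 2; }
    double calc(const double* x) const { return x[0]*x[0] + x[1]*x[1]; }
    // no getGradient() override: the numeric default above is used
};

static double minimizeSphere()
{
    cv::Ptr<cv::ConjGradSolver> solver = cv::ConjGradSolver::create(cv::makePtr<Sphere>());
    cv::Mat x = (cv::Mat_<double>(1, 2) << 3.0, -4.0);
    return solver->minimize(x);      // x is driven towards (0, 0); returns the minimum value
}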
((((sizeof(size_t)/4+1)16384|0x3a50) ->> ((DataType<_Tp>::typ e) & ((1 << 3) - 1))2) & 3))) in cv::Mat::at, +>> ((DataType<_Tp>::typ e) & ((1 << 3) - 1))2) & 3))) in Mat::at, file C:\builds\master_PackSlave-w in32-vc12-shared\opencv\modules\core\include\opencv2/core/mat.inl.hpp, line 893 ****Problem and Possible Fix********************************************************************************************************* DownhillSolverImpl::innerDownhillSimplex something looks broken here: Mat_ coord_sum(1,ndim,0.0),buf(1,ndim,0.0),y(1,ndim,0.0); -nfunk = 0; +fcount = 0; for(i=0;icalc(p[i]); @@ -135,275 +140,326 @@ multiple lines in three dimensions as not all lines intersect in three dimension namespace cv { - class DownhillSolverImpl : public DownhillSolver +class DownhillSolverImpl : public DownhillSolver +{ +public: + DownhillSolverImpl() { - public: - void getInitStep(OutputArray step) const; - void setInitStep(InputArray step); - Ptr getFunction() const; - void setFunction(const Ptr& f); - TermCriteria getTermCriteria() const; - DownhillSolverImpl(); - void setTermCriteria(const TermCriteria& termcrit); - double minimize(InputOutputArray x); - protected: - Ptr _Function; - TermCriteria _termcrit; - Mat _step; - Mat_ buf_x; - - private: - inline void createInitialSimplex(Mat_& simplex,Mat& step); - inline double innerDownhillSimplex(cv::Mat_& p,double MinRange,double MinError,int& nfunk, - const Ptr& f,int nmax); - inline double tryNewPoint(Mat_& p,Mat_& y,Mat_& coord_sum,const Ptr& f,int ihi, - double fac,Mat_& ptry); - }; - - double DownhillSolverImpl::tryNewPoint( - Mat_& p, - Mat_& y, - Mat_& coord_sum, - const Ptr& f, - int ihi, - double fac, - Mat_& ptry - ) - { - int ndim=p.cols; - int j; - double fac1,fac2,ytry; - - fac1=(1.0-fac)/ndim; - fac2=fac1-fac; - for (j=0;jcalc(ptry.ptr()); - if (ytry < y(ihi)) - { - y(ihi)=ytry; - for (j=0;j(); + _step=Mat_(); } - /* - Performs the actual minimization of MinProblemSolver::Function f (after the initialization was done) - - The matrix p[ndim+1][1..ndim] represents ndim+1 vertices that - form a simplex - each row is an ndim vector. - On output, nfunk gives the number of function evaluations taken. - */ - double DownhillSolverImpl::innerDownhillSimplex( - cv::Mat_& p, - double MinRange, - double MinError, - int& nfunk, - const Ptr& f, - int nmax - ) + void getInitStep(OutputArray step) const { _step.copyTo(step); } + void setInitStep(InputArray step) { - int ndim=p.cols; - double res; - int i,ihi,ilo,inhi,j,mpts=ndim+1; - double error, range,ysave,ytry; - Mat_ coord_sum(1,ndim,0.0),buf(1,ndim,0.0),y(1,ndim+1,0.0); - - nfunk = 0; - - for(i=0;icalc(p[i]); - } - - nfunk = ndim+1; - - reduce(p,coord_sum,0,CV_REDUCE_SUM); - - for (;;) - { - ilo=0; - /* find highest (worst), next-to-worst, and lowest - (best) points by going through all of them. */ - ihi = y(0)>y(1) ? (inhi=1,0) : (inhi=0,1); - for (i=0;i y(ihi)) - { - inhi=ihi; - ihi=i; - } - else if (y(i) > y(inhi) && i != ihi) - inhi=i; - } - - /* check stop criterion */ - error=fabs(y(ihi)-y(ilo)); - range=0; - for(i=0;i p(j,i) ) min = p(j,i); - if( max < p(j,i) ) max = p(j,i); - } - d = fabs(max-min); - if(range < d) range = d; - } - - if(range <= MinRange || error <= MinError) - { /* Put best point and value in first slot. */ - std::swap(y(0),y(ilo)); - for (i=0;i= nmax){ - dprintf(("nmax exceeded\n")); - return y(ilo); - } - nfunk += 2; - /*Begin a new iteration. 
First, reflect the worst point about the centroid of others */ - ytry = tryNewPoint(p,y,coord_sum,f,ihi,-1.0,buf); - if (ytry <= y(ilo)) - { /*If that's better than the best point, go twice as far in that direction*/ - ytry = tryNewPoint(p,y,coord_sum,f,ihi,2.0,buf); - } - else if (ytry >= y(inhi)) - { /* The new point is worse than the second-highest, but better - than the worst so do not go so far in that direction */ - ysave = y(ihi); - ytry = tryNewPoint(p,y,coord_sum,f,ihi,0.5,buf); - if (ytry >= ysave) - { /* Can't seem to improve things. Contract the simplex to good point - in hope to find a simplex landscape. */ - for (i=0;icalc(coord_sum.ptr()); - } - } - nfunk += ndim; - reduce(p,coord_sum,0,CV_REDUCE_SUM); - } - } else --(nfunk); /* correct nfunk */ - dprintf(("this is simplex on iteration %d\n",nfunk)); - print_matrix(p); - } /* go to next iteration. */ - res = y(0); - - return res; + // set dimensionality and make a deep copy of step + Mat m = step.getMat(); + dprintf(("m.cols=%d\nm.rows=%d\n", m.cols, m.rows)); + if( m.rows == 1 ) + m.copyTo(_step); + else + transpose(m, _step); } - void DownhillSolverImpl::createInitialSimplex(Mat_& simplex,Mat& step){ - for(int i=1;i<=step.cols;++i) - { - simplex.row(0).copyTo(simplex.row(i)); - simplex(i,i-1)+= 0.5*step.at(0,i-1); - } - simplex.row(0) -= 0.5*step; + Ptr getFunction() const { return _Function; } - dprintf(("this is simplex\n")); - print_matrix(simplex); + void setFunction(const Ptr& f) { _Function=f; } + + TermCriteria getTermCriteria() const { return _termcrit; } + + void setTermCriteria( const TermCriteria& termcrit ) + { + CV_Assert( termcrit.type == (TermCriteria::MAX_ITER + TermCriteria::EPS) && + termcrit.epsilon > 0 && + termcrit.maxCount > 0 ); + _termcrit=termcrit; } - double DownhillSolverImpl::minimize(InputOutputArray x){ + double minimize( InputOutputArray x_ ) + { dprintf(("hi from minimize\n")); - CV_Assert(_Function.empty()==false); + CV_Assert( !_Function.empty() ); + CV_Assert( std::min(_step.cols, _step.rows) == 1 && + std::max(_step.cols, _step.rows) >= 2 && + _step.type() == CV_64FC1 ); dprintf(("termcrit:\n\ttype: %d\n\tmaxCount: %d\n\tEPS: %g\n",_termcrit.type,_termcrit.maxCount,_termcrit.epsilon)); dprintf(("step\n")); print_matrix(_step); - Mat x_mat=x.getMat(); - CV_Assert(MIN(x_mat.rows,x_mat.cols)==1); - CV_Assert(MAX(x_mat.rows,x_mat.cols)==_step.cols); - CV_Assert(x_mat.type()==CV_64FC1); - - Mat_ proxy_x; - - if(x_mat.rows>1){ - buf_x.create(1,_step.cols); - Mat_ proxy(_step.cols,1,buf_x.ptr()); - x_mat.copyTo(proxy); - proxy_x=buf_x; - }else{ - proxy_x=x_mat; - } - - int count=0; - int ndim=_step.cols; - Mat_ simplex=Mat_(ndim+1,ndim,0.0); - - simplex.row(0).copyTo(proxy_x); - createInitialSimplex(simplex,_step); - double res = innerDownhillSimplex( - simplex,_termcrit.epsilon, _termcrit.epsilon, count,_Function,_termcrit.maxCount); - simplex.row(0).copyTo(proxy_x); + Mat x = x_.getMat(), simplex; + createInitialSimplex(x, simplex, _step); + int count = 0; + double res = innerDownhillSimplex(simplex,_termcrit.epsilon, _termcrit.epsilon, + count, _termcrit.maxCount); dprintf(("%d iterations done\n",count)); - if(x_mat.rows>1){ - Mat(x_mat.rows, 1, CV_64F, proxy_x.ptr()).copyTo(x); + if( !x.empty() ) + { + Mat simplex_0m(x.rows, x.cols, CV_64F, simplex.ptr()); + simplex_0m.convertTo(x, x.type()); + } + else + { + int x_type = x_.fixedType() ? 
x_.type() : CV_64F; + simplex.row(0).convertTo(x_, x_type); } return res; } - DownhillSolverImpl::DownhillSolverImpl(){ - _Function=Ptr(); - _step=Mat_(); - } - Ptr DownhillSolverImpl::getFunction()const{ - return _Function; - } - void DownhillSolverImpl::setFunction(const Ptr& f){ - _Function=f; - } - TermCriteria DownhillSolverImpl::getTermCriteria()const{ - return _termcrit; - } - void DownhillSolverImpl::setTermCriteria(const TermCriteria& termcrit){ - CV_Assert(termcrit.type==(TermCriteria::MAX_ITER+TermCriteria::EPS) && termcrit.epsilon>0 && termcrit.maxCount>0); - _termcrit=termcrit; - } - // both minRange & minError are specified by termcrit.epsilon; In addition, user may specify the number of iterations that the algorithm does. - Ptr DownhillSolver::create(const Ptr& f, InputArray initStep, TermCriteria termcrit){ - Ptr DS = makePtr(); - DS->setFunction(f); - DS->setInitStep(initStep); - DS->setTermCriteria(termcrit); - return DS; - } - void DownhillSolverImpl::getInitStep(OutputArray step)const{ - _step.copyTo(step); - } - void DownhillSolverImpl::setInitStep(InputArray step){ - //set dimensionality and make a deep copy of step - Mat m=step.getMat(); - dprintf(("m.cols=%d\nm.rows=%d\n",m.cols,m.rows)); - CV_Assert(MIN(m.cols,m.rows)==1 && m.type()==CV_64FC1); - if(m.rows==1){ - m.copyTo(_step); - }else{ - transpose(m,_step); +protected: + Ptr _Function; + TermCriteria _termcrit; + Mat _step; + + inline void updateCoordSum(const Mat& p, Mat& coord_sum) + { + int i, j, m = p.rows, n = p.cols; + double* coord_sum_ = coord_sum.ptr(); + CV_Assert( coord_sum.cols == n && coord_sum.rows == 1 ); + + for( j = 0; j < n; j++ ) + coord_sum_[j] = 0.; + + for( i = 0; i < m; i++ ) + { + const double* p_i = p.ptr(i); + for( j = 0; j < n; j++ ) + coord_sum_[j] += p_i[j]; } + + dprintf(("\nupdated coord sum:\n")); + print_matrix(coord_sum); + } + + inline void createInitialSimplex( const Mat& x0, Mat& simplex, Mat& step ) + { + int i, j, ndim = step.cols; + CV_Assert( _Function->getDims() == ndim ); + Mat x = x0; + if( x0.empty() ) + x = Mat::zeros(1, ndim, CV_64F); + CV_Assert( (x.cols == 1 && x.rows == ndim) || (x.cols == ndim && x.rows == 1) ); + CV_Assert( x.type() == CV_32F || x.type() == CV_64F ); + + simplex.create(ndim + 1, ndim, CV_64F); + Mat simplex_0m(x.rows, x.cols, CV_64F, simplex.ptr()); + + x.convertTo(simplex_0m, CV_64F); + double* simplex_0 = simplex.ptr(); + const double* step_ = step.ptr(); + for( i = 1; i <= ndim; i++ ) + { + double* simplex_i = simplex.ptr(i); + for( j = 0; j < ndim; j++ ) + simplex_i[j] = simplex_0[j]; + simplex_i[i-1] += 0.5*step_[i-1]; + } + for( j = 0; j < ndim; j++ ) + simplex_0[j] -= 0.5*step_[j]; + + dprintf(("\nthis is simplex\n")); + print_matrix(simplex); + } + + /* + Performs the actual minimization of MinProblemSolver::Function f (after the initialization was done) + + The matrix p[ndim+1][1..ndim] represents ndim+1 vertices that + form a simplex - each row is an ndim vector. + On output, fcount gives the number of function evaluations taken. + */ + double innerDownhillSimplex( Mat& p, double MinRange, double MinError, int& fcount, int nmax ) + { + int i, j, ndim = p.cols; + Mat coord_sum(1, ndim, CV_64F), buf(1, ndim, CV_64F), y(1, ndim+1, CV_64F); + double* y_ = y.ptr(); + + fcount = ndim+1; + for( i = 0; i <= ndim; i++ ) + y_[i] = calc_f(p.ptr(i)); + + updateCoordSum(p, coord_sum); + + for (;;) + { + // find highest (worst), next-to-worst, and lowest + // (best) points by going through all of them. 
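A concrete instance of createInitialSimplex(), for illustration: with x0 = (0, 0) and step = (1, 1), the ndim+1 = 3 rows of `simplex` become p0 = x0 - 0.5*step = (-0.5, -0.5), p1 = x0 + 0.5*step[0]*e0 = (0.5, 0) and p2 = x0 + 0.5*step[1]*e1 = (0, 0.5), i.e. one vertex per row clustered around the starting point; updateCoordSum() then maintains the per-column sums that tryNewPoint() uses to reflect the worst vertex about the centroid of the others.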
+ int ilo = 0, ihi, inhi; + if( y_[0] > y_[1] ) + { + ihi = 0; inhi = 1; + } + else + { + ihi = 1; inhi = 0; + } + for( i = 0; i <= ndim; i++ ) + { + double yval = y_[i]; + if (yval <= y_[ilo]) + ilo = i; + if (yval > y_[ihi]) + { + inhi = ihi; + ihi = i; + } + else if (yval > y_[inhi] && i != ihi) + inhi = i; + } + CV_Assert( ihi != inhi ); + if( ilo == inhi || ilo == ihi ) + { + for( i = 0; i <= ndim; i++ ) + { + double yval = y_[i]; + if( yval == y_[ilo] && i != ihi && i != inhi ) + { + ilo = i; + break; + } + } + } + dprintf(("\nthis is y on iteration %d:\n",fcount)); + print_matrix(y); + + // check stop criterion + double error = fabs(y_[ihi] - y_[ilo]); + double range = 0; + for( j = 0; j < ndim; j++ ) + { + double minval, maxval; + minval = maxval = p.at(0, j); + for( i = 1; i <= ndim; i++ ) + { + double pval = p.at(i, j); + minval = std::min(minval, pval); + maxval = std::max(maxval, pval); + } + range = std::max(range, fabs(maxval - minval)); + } + + if( range <= MinRange || error <= MinError || fcount >= nmax ) + { + // Put best point and value in first slot. + std::swap(y_[0], y_[ilo]); + for( j = 0; j < ndim; j++ ) + { + std::swap(p.at(0, j), p.at(ilo, j)); + } + break; + } + + double y_lo = y_[ilo], y_nhi = y_[inhi], y_hi = y_[ihi]; + // Begin a new iteration. First, reflect the worst point about the centroid of others + double alpha = -1.0; + double y_alpha = tryNewPoint(p, coord_sum, ihi, alpha, buf, fcount); + + dprintf(("\ny_lo=%g, y_nhi=%g, y_hi=%g, y_alpha=%g, p_alpha:\n", y_lo, y_nhi, y_hi, y_alpha)); + print_matrix(buf); + + if( y_alpha < y_nhi ) + { + if( y_alpha < y_lo ) + { + // If that's better than the best point, go twice as far in that direction + double beta = -2.0; + double y_beta = tryNewPoint(p, coord_sum, ihi, beta, buf, fcount); + dprintf(("\ny_beta=%g, p_beta:\n", y_beta)); + print_matrix(buf); + if( y_beta < y_alpha ) + { + alpha = beta; + y_alpha = y_beta; + } + } + replacePoint(p, coord_sum, y, ihi, alpha, y_alpha); + } + else + { + // The new point is worse than the second-highest, + // do not go so far in that direction + double gamma = 0.5; + double y_gamma = tryNewPoint(p, coord_sum, ihi, gamma, buf, fcount); + dprintf(("\ny_gamma=%g, p_gamma:\n", y_gamma)); + print_matrix(buf); + if( y_gamma < y_hi ) + replacePoint(p, coord_sum, y, ihi, gamma, y_gamma); + else + { + // Can't seem to improve things. + // Contract the simplex to good point + // in hope to find a simplex landscape. 
+ for( i = 0; i <= ndim; i++ ) + { + if (i != ilo) + { + for( j = 0; j < ndim; j++ ) + p.at(i, j) = 0.5*(p.at(i, j) + p.at(ilo, j)); + y_[i] = calc_f(p.ptr(i)); + } + } + fcount += ndim; + updateCoordSum(p, coord_sum); + } + } + dprintf(("\nthis is simplex on iteration %d\n",fcount)); + print_matrix(p); + } + return y_[0]; + } + + inline double calc_f(const double* ptr) + { + double res = _Function->calc(ptr); + CV_Assert( !cvIsNaN(res) && !cvIsInf(res) ); + return res; + } + + double tryNewPoint( Mat& p, Mat& coord_sum, int ihi, double alpha_, Mat& ptry, int& fcount ) + { + int j, ndim = p.cols; + + double alpha = (1.0 - alpha_)/ndim; + double beta = alpha - alpha_; + double* p_ihi = p.ptr(ihi); + double* ptry_ = ptry.ptr(); + double* coord_sum_ = coord_sum.ptr(); + + for( j = 0; j < ndim; j++ ) + ptry_[j] = coord_sum_[j]*alpha - p_ihi[j]*beta; + + fcount++; + return calc_f(ptry_); + } + + void replacePoint( Mat& p, Mat& coord_sum, Mat& y, int ihi, double alpha_, double ytry ) + { + int j, ndim = p.cols; + + double alpha = (1.0 - alpha_)/ndim; + double beta = alpha - alpha_; + double* p_ihi = p.ptr(ihi); + double* coord_sum_ = coord_sum.ptr(); + + for( j = 0; j < ndim; j++ ) + p_ihi[j] = coord_sum_[j]*alpha - p_ihi[j]*beta; + y.at(ihi) = ytry; + updateCoordSum(p, coord_sum); + } +}; + + +// both minRange & minError are specified by termcrit.epsilon; +// In addition, user may specify the number of iterations that the algorithm does. +Ptr DownhillSolver::create( const Ptr& f, + InputArray initStep, TermCriteria termcrit ) +{ + Ptr DS = makePtr(); + DS->setFunction(f); + DS->setInitStep(initStep); + DS->setTermCriteria(termcrit); + return DS; +} + } diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 1291278209..e3fd0e4c40 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -60,7 +60,6 @@ namespace cv #undef USE_IPP_DFT #endif - /****************************************************************************************\ Discrete Fourier Transform \****************************************************************************************/ @@ -1090,11 +1089,12 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab, } } - if( complex_output && (n & 1) == 0 ) + if( complex_output && ((n & 1) == 0 || n == 1)) { dst[-1] = dst[0]; dst[0] = 0; - dst[n] = 0; + if( n > 1 ) + dst[n] = 0; } } @@ -2426,6 +2426,47 @@ static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags) #endif // HAVE_CLAMDFFT +namespace cv +{ +static void complementComplexOutput(Mat& dst, int len, int dft_dims) +{ + int i, n = dst.cols; + size_t elem_size = dst.elemSize1(); + if( elem_size == sizeof(float) ) + { + float* p0 = dst.ptr(); + size_t dstep = dst.step/sizeof(p0[0]); + for( i = 0; i < len; i++ ) + { + float* p = p0 + dstep*i; + float* q = dft_dims == 1 || i == 0 || i*2 == len ? p : p0 + dstep*(len-i); + + for( int j = 1; j < (n+1)/2; j++ ) + { + p[(n-j)*2] = q[j*2]; + p[(n-j)*2+1] = -q[j*2+1]; + } + } + } + else + { + double* p0 = dst.ptr(); + size_t dstep = dst.step/sizeof(p0[0]); + for( i = 0; i < len; i++ ) + { + double* p = p0 + dstep*i; + double* q = dft_dims == 1 || i == 0 || i*2 == len ? 
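Usage sketch for the rewritten solver: the public factory is unchanged, only the internals moved into the impl class, so the usual create/initStep/minimize flow still applies (illustrative names Paraboloid and runDownhill, not part of the patch):

#include <opencv2/core/optim.hpp>

struct Paraboloid : public cv::MinProblemSolver::Function
{
    int getDims() const { return 2; }
    double calc(const double* x) const { return (x[0] - 1)*(x[0] - 1) + x[1]*x[1]; }
};

static double runDownhill()
{
    cv::Mat initStep = (cv::Mat_<double>(1, 2) << 0.5, 0.5);   // initial step per dimension
    cv::Ptr<cv::DownhillSolver> solver = cv::DownhillSolver::create(
        cv::makePtr<Paraboloid>(), initStep,
        cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5000, 1e-10));
    cv::Mat x = (cv::Mat_<double>(1, 2) << -3.0, 7.0);
    return solver->minimize(x);      // x converges towards (1, 0); returns the minimum value
}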
p : p0 + dstep*(len-i); + + for( int j = 1; j < (n+1)/2; j++ ) + { + p[(n-j)*2] = q[j*2]; + p[(n-j)*2+1] = -q[j*2+1]; + } + } + } +} +} + void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) { #ifdef HAVE_CLAMDFFT @@ -2705,7 +2746,11 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) } if( stage != 1 ) + { + if( !inv && real_transform && dst.channels() == 2 ) + complementComplexOutput(dst, nonzero_rows, 1); break; + } src = dst; } else @@ -2847,41 +2892,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) if( stage != 0 ) { if( !inv && real_transform && dst.channels() == 2 && len > 1 ) - { - int n = dst.cols; - if( elem_size == (int)sizeof(float) ) - { - float* p0 = dst.ptr(); - size_t dstep = dst.step/sizeof(p0[0]); - for( i = 0; i < len; i++ ) - { - float* p = p0 + dstep*i; - float* q = i == 0 || i*2 == len ? p : p0 + dstep*(len-i); - - for( int j = 1; j < (n+1)/2; j++ ) - { - p[(n-j)*2] = q[j*2]; - p[(n-j)*2+1] = -q[j*2+1]; - } - } - } - else - { - double* p0 = dst.ptr(); - size_t dstep = dst.step/sizeof(p0[0]); - for( i = 0; i < len; i++ ) - { - double* p = p0 + dstep*i; - double* q = i == 0 || i*2 == len ? p : p0 + dstep*(len-i); - - for( int j = 1; j < (n+1)/2; j++ ) - { - p[(n-j)*2] = q[j*2]; - p[(n-j)*2+1] = -q[j*2+1]; - } - } - } - } + complementComplexOutput(dst, len, 2); break; } src = dst; diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index dea25dd64c..e246a7c400 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -502,7 +502,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, { sd = i < n ? W[i] : 0; - while( sd <= minval ) + for( int ii = 0; ii < 100 && sd <= minval; ii++ ) { // if we got a zero singular value, then in order to get the corresponding left singular vector // we generate a random vector, project it to the previously computed left singular vectors, @@ -527,7 +527,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, At[i*astep + k] = t; asum += std::abs(t); } - asum = asum ? 1/asum : 0; + asum = asum > eps*100 ? 1/asum : 0; for( k = 0; k < m; k++ ) At[i*astep + k] *= asum; } @@ -541,7 +541,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, sd = std::sqrt(sd); } - s = (_Tp)(1/sd); + s = (_Tp)(sd > minval ? 
1/sd : 0.); for( k = 0; k < m; k++ ) At[i*astep + k] *= s; } diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index e96eaeb41a..b8e4b1a2ed 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -43,6 +43,7 @@ #include "precomp.hpp" #include "opencl_kernels_core.hpp" +#include namespace cv { @@ -889,38 +890,41 @@ struct iPow_SIMD } }; -#if CV_NEON +#if CV_SIMD128 template <> struct iPow_SIMD { - int operator() ( const uchar * src, uchar * dst, int len, int power) + int operator() ( const uchar * src, uchar * dst, int len, int power ) { int i = 0; - uint32x4_t v_1 = vdupq_n_u32(1u); + v_uint32x4 v_1 = v_setall_u32(1u); for ( ; i <= len - 8; i += 8) { - uint32x4_t v_a1 = v_1, v_a2 = v_1; - uint16x8_t v_src = vmovl_u8(vld1_u8(src + i)); - uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src)); + v_uint32x4 v_a1 = v_1, v_a2 = v_1; + v_uint16x8 v = v_load_expand(src + i); + v_uint32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_u32(v_b1, v_b1); - v_b2 = vmulq_u32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); - vst1_u8(dst + i, vqmovn_u16(vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2)))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_pack_store(dst + i, v); } return i; @@ -933,30 +937,33 @@ struct iPow_SIMD int operator() ( const schar * src, schar * dst, int len, int power) { int i = 0; - int32x4_t v_1 = vdupq_n_s32(1); + v_int32x4 v_1 = v_setall_s32(1); for ( ; i <= len - 8; i += 8) { - int32x4_t v_a1 = v_1, v_a2 = v_1; - int16x8_t v_src = vmovl_s8(vld1_s8(src + i)); - int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src)); + v_int32x4 v_a1 = v_1, v_a2 = v_1; + v_int16x8 v = v_load_expand(src + i); + v_int32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_s32(v_b1, v_b1); - v_b2 = vmulq_s32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); - vst1_s8(dst + i, vqmovn_s16(vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2)))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_pack_store(dst + i, v); } return i; @@ -969,30 +976,33 @@ struct iPow_SIMD int operator() ( const ushort * src, ushort * dst, int len, int power) { int i = 0; - uint32x4_t v_1 = vdupq_n_u32(1u); + v_uint32x4 v_1 = v_setall_u32(1u); for ( ; i <= len - 8; i += 8) { - uint32x4_t v_a1 = v_1, v_a2 = v_1; - uint16x8_t v_src = vld1q_u16(src + i); - uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src)); + v_uint32x4 v_a1 = v_1, v_a2 = v_1; + v_uint16x8 v = v_load(src + i); + v_uint32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_u32(v_b1, v_b1); - v_b2 = vmulq_u32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_u32(v_a1, v_b1); - v_a2 = vmulq_u32(v_a2, v_b2); - vst1q_u16(dst + i, vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_store(dst + i, v); } return i; @@ -1005,60 
+1015,70 @@ struct iPow_SIMD int operator() ( const short * src, short * dst, int len, int power) { int i = 0; - int32x4_t v_1 = vdupq_n_s32(1); + v_int32x4 v_1 = v_setall_s32(1); for ( ; i <= len - 8; i += 8) { - int32x4_t v_a1 = v_1, v_a2 = v_1; - int16x8_t v_src = vld1q_s16(src + i); - int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src)); + v_int32x4 v_a1 = v_1, v_a2 = v_1; + v_int16x8 v = v_load(src + i); + v_int32x4 v_b1, v_b2; + v_expand(v, v_b1, v_b2); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); + v_a1 *= v_b1; + v_a2 *= v_b2; } - v_b1 = vmulq_s32(v_b1, v_b1); - v_b2 = vmulq_s32(v_b2, v_b2); + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a1 = vmulq_s32(v_a1, v_b1); - v_a2 = vmulq_s32(v_a2, v_b2); - vst1q_s16(dst + i, vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2))); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v = v_pack(v_a1, v_a2); + v_store(dst + i, v); } return i; } }; - template <> struct iPow_SIMD { int operator() ( const int * src, int * dst, int len, int power) { int i = 0; - int32x4_t v_1 = vdupq_n_s32(1); + v_int32x4 v_1 = v_setall_s32(1); - for ( ; i <= len - 4; i += 4) + for ( ; i <= len - 8; i += 8) { - int32x4_t v_b = vld1q_s32(src + i), v_a = v_1; + v_int32x4 v_a1 = v_1, v_a2 = v_1; + v_int32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4); int p = power; while( p > 1 ) { if (p & 1) - v_a = vmulq_s32(v_a, v_b); - v_b = vmulq_s32(v_b, v_b); + { + v_a1 *= v_b1; + v_a2 *= v_b2; + } + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a = vmulq_s32(v_a, v_b); - vst1q_s32(dst + i, v_a); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v_store(dst + i, v_a1); + v_store(dst + i + 4, v_a2); } return i; @@ -1071,42 +1091,143 @@ struct iPow_SIMD int operator() ( const float * src, float * dst, int len, int power) { int i = 0; - float32x4_t v_1 = vdupq_n_f32(1.0f); + v_float32x4 v_1 = v_setall_f32(1.f); - for ( ; i <= len - 4; i += 4) + for ( ; i <= len - 8; i += 8) { - float32x4_t v_b = vld1q_f32(src + i), v_a = v_1; - int p = power; + v_float32x4 v_a1 = v_1, v_a2 = v_1; + v_float32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4); + int p = std::abs(power); + if( power < 0 ) + { + v_b1 = v_1 / v_b1; + v_b2 = v_1 / v_b2; + } while( p > 1 ) { if (p & 1) - v_a = vmulq_f32(v_a, v_b); - v_b = vmulq_f32(v_b, v_b); + { + v_a1 *= v_b1; + v_a2 *= v_b2; + } + v_b1 *= v_b1; + v_b2 *= v_b2; p >>= 1; } - v_a = vmulq_f32(v_a, v_b); - vst1q_f32(dst + i, v_a); + v_a1 *= v_b1; + v_a2 *= v_b2; + + v_store(dst + i, v_a1); + v_store(dst + i + 4, v_a2); } return i; } }; +#if CV_SIMD128_64F +template <> +struct iPow_SIMD +{ + int operator() ( const double * src, double * dst, int len, int power) + { + int i = 0; + v_float64x2 v_1 = v_setall_f64(1.); + + for ( ; i <= len - 4; i += 4) + { + v_float64x2 v_a1 = v_1, v_a2 = v_1; + v_float64x2 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 2); + int p = std::abs(power); + if( power < 0 ) + { + v_b1 = v_1 / v_b1; + v_b2 = v_1 / v_b2; + } + + while( p > 1 ) + { + if (p & 1) + { + v_a1 *= v_b1; + v_a2 *= v_b2; + } + v_b1 *= v_b1; + v_b2 *= v_b2; + p >>= 1; + } + + v_a1 *= v_b1; + v_a2 *= v_b2; + + v_store(dst + i, v_a1); + v_store(dst + i + 2, v_a2); + } + + return i; + } +}; +#endif + #endif template static void -iPow_( const T* src, T* dst, int len, int power ) +iPow_i( const T* src, T* dst, int len, int power ) { - iPow_SIMD vop; - int i = vop(src, dst, len, power); + if( power < 0 ) + { + T tab[5] = + { + power == -1 ? saturate_cast(-1) : 0, (power & 1) ? 
-1 : 1, + std::numeric_limits::max(), 1, power == -1 ? 1 : 0 + }; + for( int i = 0; i < len; i++ ) + { + T val = src[i]; + dst[i] = cv_abs(val) <= 2 ? tab[val + 2] : (T)0; + } + } + else + { + iPow_SIMD vop; + int i = vop(src, dst, len, power); + + for( ; i < len; i++ ) + { + WT a = 1, b = src[i]; + int p = power; + while( p > 1 ) + { + if( p & 1 ) + a *= b; + b *= b; + p >>= 1; + } + + a *= b; + dst[i] = saturate_cast(a); + } + } +} + +template +static void +iPow_f( const T* src, T* dst, int len, int power0 ) +{ + iPow_SIMD vop; + int i = vop(src, dst, len, power0); + int power = std::abs(power0); for( ; i < len; i++ ) { - WT a = 1, b = src[i]; + T a = 1, b = src[i]; int p = power; + if( power0 < 0 ) + b = 1/b; + while( p > 1 ) { if( p & 1 ) @@ -1116,44 +1237,43 @@ iPow_( const T* src, T* dst, int len, int power ) } a *= b; - dst[i] = saturate_cast(a); + dst[i] = a; } } - static void iPow8u(const uchar* src, uchar* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow8s(const schar* src, schar* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow16u(const ushort* src, ushort* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow16s(const short* src, short* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow32s(const int* src, int* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_i(src, dst, len, power); } static void iPow32f(const float* src, float* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_f(src, dst, len, power); } static void iPow64f(const double* src, double* dst, int len, int power) { - iPow_(src, dst, len, power); + iPow_f(src, dst, len, power); } @@ -1176,14 +1296,25 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst, bool doubleSupport = d.doubleFPConfig() > 0; _dst.createSameSize(_src, type); - if (is_ipower && (ipower == 0 || ipower == 1)) + if (is_ipower) { if (ipower == 0) + { _dst.setTo(Scalar::all(1)); - else if (ipower == 1) + return true; + } + if (ipower == 1) + { _src.copyTo(_dst); - - return true; + return true; + } + if( ipower < 0 ) + { + if( depth == CV_32F || depth == CV_64F ) + is_ipower = false; + else + return false; + } } if (depth == CV_64F && !doubleSupport) @@ -1233,20 +1364,11 @@ void pow( InputArray _src, double power, OutputArray _dst ) { int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), ipower = cvRound(power); - bool is_ipower = fabs(ipower - power) < DBL_EPSILON, same = false, + bool is_ipower = fabs(ipower - power) < DBL_EPSILON, useOpenCL = _dst.isUMat() && _src.dims() <= 2; if( is_ipower && !(ocl::Device::getDefault().isIntel() && useOpenCL && depth != CV_64F)) { - if( ipower < 0 ) - { - divide( Scalar::all(1), _src, _dst ); - if( ipower == -1 ) - return; - ipower = -ipower; - same = true; - } - switch( ipower ) { case 0: @@ -1257,59 +1379,18 @@ void pow( InputArray _src, double power, OutputArray _dst ) _src.copyTo(_dst); return; case 2: -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (depth == CV_32F && !same && ( (_src.dims() <= 2 && !ocl::useOpenCL()) || - (_src.dims() > 2 && _src.isContinuous() && _dst.isContinuous()) )) - { - Mat src = _src.getMat(); - _dst.create( src.dims, src.size, type ); - Mat dst = _dst.getMat(); - - Size size = src.size(); - int srcstep = (int)src.step, dststep = (int)dst.step, esz = CV_ELEM_SIZE(type); - if 
(src.isContinuous() && dst.isContinuous()) - { - size.width = (int)src.total(); - size.height = 1; - srcstep = dststep = (int)src.total() * esz; - } - size.width *= cn; - - IppStatus status = ippiSqr_32f_C1R(src.ptr(), srcstep, dst.ptr(), dststep, ippiSize(size.width, size.height)); - - if (status >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - if (same) - multiply(_dst, _dst, _dst); - else - multiply(_src, _src, _dst); + multiply(_src, _src, _dst); return; } } else CV_Assert( depth == CV_32F || depth == CV_64F ); - CV_OCL_RUN(useOpenCL, - ocl_pow(same ? _dst : _src, power, _dst, is_ipower, ipower)) + CV_OCL_RUN(useOpenCL, ocl_pow(_src, power, _dst, is_ipower, ipower)) - Mat src, dst; - if (same) - src = dst = _dst.getMat(); - else - { - src = _src.getMat(); - _dst.create( src.dims, src.size, type ); - dst = _dst.getMat(); - } + Mat src = _src.getMat(); + _dst.create( src.dims, src.size, type ); + Mat dst = _dst.getMat(); const Mat* arrays[] = {&src, &dst, 0}; uchar* ptrs[2]; @@ -1335,52 +1416,103 @@ void pow( InputArray _src, double power, OutputArray _dst ) } else { -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (src.isContinuous() && dst.isContinuous()) - { - IppStatus status = depth == CV_32F ? - ippsPowx_32f_A21(src.ptr(), (Ipp32f)power, dst.ptr(), (Ipp32s)(src.total() * cn)) : - ippsPowx_64f_A50(src.ptr(), power, dst.ptr(), (Ipp32s)(src.total() * cn)); - - if (status >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - int j, k, blockSize = std::min(len, ((BLOCK_SIZE + cn-1)/cn)*cn); size_t esz1 = src.elemSize1(); + AutoBuffer buf; + Cv32suf inf32, nan32; + Cv64suf inf64, nan64; + float* fbuf = 0; + double* dbuf = 0; + inf32.i = 0x7f800000; + nan32.i = 0x7fffffff; + inf64.i = CV_BIG_INT(0x7FF0000000000000); + nan64.i = CV_BIG_INT(0x7FFFFFFFFFFFFFFF); + + if( src.ptr() == dst.ptr() ) + { + buf.allocate(blockSize*esz1); + fbuf = (float*)(uchar*)buf; + dbuf = (double*)(uchar*)buf; + } for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( j = 0; j < len; j += blockSize ) { int bsz = std::min(len - j, blockSize); + + #if defined(HAVE_IPP) + CV_IPP_CHECK() + { + IppStatus status = depth == CV_32F ? + ippsPowx_32f_A21((const float*)ptrs[0], (float)power, (float*)ptrs[1], bsz) : + ippsPowx_64f_A50((const double*)ptrs[0], (double)power, (double*)ptrs[1], bsz); + + if (status >= 0) + { + CV_IMPL_ADD(CV_IMPL_IPP); + ptrs[0] += bsz*esz1; + ptrs[1] += bsz*esz1; + continue; + } + setIppErrorStatus(); + } + #endif + if( depth == CV_32F ) { - const float* x = (const float*)ptrs[0]; + float* x0 = (float*)ptrs[0]; + float* x = fbuf ? fbuf : x0; float* y = (float*)ptrs[1]; + if( x != x0 ) + memcpy(x, x0, bsz*esz1); + Log_32f(x, y, bsz); for( k = 0; k < bsz; k++ ) y[k] = (float)(y[k]*power); Exp_32f(y, y, bsz); + for( k = 0; k < bsz; k++ ) + { + if( x0[k] <= 0 ) + { + if( x0[k] == 0.f ) + { + if( power < 0 ) + y[k] = inf32.f; + } + else + y[k] = nan32.f; + } + } } else { - const double* x = (const double*)ptrs[0]; + double* x0 = (double*)ptrs[0]; + double* x = dbuf ? dbuf : x0; double* y = (double*)ptrs[1]; + if( x != x0 ) + memcpy(x, x0, bsz*esz1); + Log_64f(x, y, bsz); for( k = 0; k < bsz; k++ ) y[k] *= power; Exp_64f(y, y, bsz); + + for( k = 0; k < bsz; k++ ) + { + if( x0[k] <= 0 ) + { + if( x0[k] == 0. 
) + { + if( power < 0 ) + y[k] = inf64.f; + } + else + y[k] = nan64.f; + } + } } ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz1; diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index feffc8d32f..c1eacfd210 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -1195,7 +1195,7 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, GEMMBlockMulFunc blockMulFunc; GEMMStoreFunc storeFunc; Mat *matD = &D, tmat; - int tmat_size = 0; + size_t tmat_size = 0; const uchar* Cdata = C.data; size_t Cstep = C.data ? (size_t)C.step : 0; AutoBuffer buf; @@ -1228,7 +1228,7 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, if( D.data == A.data || D.data == B.data ) { - tmat_size = d_size.width*d_size.height*CV_ELEM_SIZE(type); + tmat_size = (size_t)d_size.width*d_size.height*CV_ELEM_SIZE(type); // Allocate tmat later, once the size of buf is known matD = &tmat; } @@ -1319,7 +1319,7 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, int is_b_t = flags & GEMM_2_T; int elem_size = CV_ELEM_SIZE(type); int dk0_1, dk0_2; - int a_buf_size = 0, b_buf_size, d_buf_size; + size_t a_buf_size = 0, b_buf_size, d_buf_size; uchar* a_buf = 0; uchar* b_buf = 0; uchar* d_buf = 0; @@ -1360,12 +1360,12 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, dn0 = block_size / dk0; dk0_1 = (dn0+dn0/8+2) & -2; - b_buf_size = (dk0+dk0/8+1)*dk0_1*elem_size; - d_buf_size = (dk0+dk0/8+1)*dk0_1*work_elem_size; + b_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*elem_size; + d_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*work_elem_size; if( is_a_t ) { - a_buf_size = (dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size; + a_buf_size = (size_t)(dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size; flags &= ~GEMM_1_T; } diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 65b93890ec..9b77a4cf2c 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -222,11 +222,10 @@ public: } }; - MatAllocator* Mat::getStdAllocator() { - static MatAllocator * allocator = new StdMatAllocator(); - return allocator; + static StdMatAllocator allocator; + return &allocator; } void swap( Mat& a, Mat& b ) @@ -1155,6 +1154,21 @@ Mat _InputArray::getMat_(int i) const return !v.empty() ? Mat(size(), t, (void*)&v[0]) : Mat(); } + if( k == STD_BOOL_VECTOR ) + { + CV_Assert( i < 0 ); + int t = CV_8U; + const std::vector& v = *(const std::vector*)obj; + int j, n = (int)v.size(); + if( n == 0 ) + return Mat(); + Mat m(1, n, t); + uchar* dst = m.data; + for( j = 0; j < n; j++ ) + dst[j] = (uchar)v[j]; + return m; + } + if( k == NONE ) return Mat(); @@ -1482,6 +1496,13 @@ Size _InputArray::size(int i) const return szb == szi ? 
Size((int)szb, 1) : Size((int)(szb/CV_ELEM_SIZE(flags)), 1); } + if( k == STD_BOOL_VECTOR ) + { + CV_Assert( i < 0 ); + const std::vector& v = *(const std::vector*)obj; + return Size((int)v.size(), 1); + } + if( k == NONE ) return Size(); @@ -1662,7 +1683,7 @@ int _InputArray::dims(int i) const return 2; } - if( k == STD_VECTOR ) + if( k == STD_VECTOR || k == STD_BOOL_VECTOR ) { CV_Assert( i < 0 ); return 2; @@ -1774,7 +1795,7 @@ int _InputArray::type(int i) const if( k == EXPR ) return ((const MatExpr*)obj)->type(); - if( k == MATX || k == STD_VECTOR || k == STD_VECTOR_VECTOR ) + if( k == MATX || k == STD_VECTOR || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return CV_MAT_TYPE(flags); if( k == NONE ) @@ -1849,6 +1870,12 @@ bool _InputArray::empty() const return v.empty(); } + if( k == STD_BOOL_VECTOR ) + { + const std::vector& v = *(const std::vector*)obj; + return v.empty(); + } + if( k == NONE ) return true; @@ -1893,7 +1920,8 @@ bool _InputArray::isContinuous(int i) const if( k == UMAT ) return i < 0 ? ((const UMat*)obj)->isContinuous() : true; - if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR) + if( k == EXPR || k == MATX || k == STD_VECTOR || + k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return true; if( k == STD_VECTOR_MAT ) @@ -1924,7 +1952,8 @@ bool _InputArray::isSubmatrix(int i) const if( k == UMAT ) return i < 0 ? ((const UMat*)obj)->isSubmatrix() : false; - if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR) + if( k == EXPR || k == MATX || k == STD_VECTOR || + k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return false; if( k == STD_VECTOR_MAT ) @@ -1962,7 +1991,8 @@ size_t _InputArray::offset(int i) const return ((const UMat*)obj)->offset; } - if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR) + if( k == EXPR || k == MATX || k == STD_VECTOR || + k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return 0; if( k == STD_VECTOR_MAT ) @@ -2009,7 +2039,8 @@ size_t _InputArray::step(int i) const return ((const UMat*)obj)->step; } - if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR) + if( k == EXPR || k == MATX || k == STD_VECTOR || + k == NONE || k == STD_VECTOR_VECTOR || k == STD_BOOL_VECTOR ) return 0; if( k == STD_VECTOR_MAT ) @@ -2044,7 +2075,7 @@ void _InputArray::copyTo(const _OutputArray& arr) const if( k == NONE ) arr.release(); - else if( k == MAT || k == MATX || k == STD_VECTOR ) + else if( k == MAT || k == MATX || k == STD_VECTOR || k == STD_BOOL_VECTOR ) { Mat m = getMat(); m.copyTo(arr); @@ -2069,7 +2100,7 @@ void _InputArray::copyTo(const _OutputArray& arr, const _InputArray & mask) cons if( k == NONE ) arr.release(); - else if( k == MAT || k == MATX || k == STD_VECTOR ) + else if( k == MAT || k == MATX || k == STD_VECTOR || k == STD_BOOL_VECTOR ) { Mat m = getMat(); m.copyTo(arr, mask); diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index 3901db4eb7..d9dfbb7c6a 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -574,6 +574,16 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, #pragma unroll for (int i=x; i struct OpMax T operator ()(const T a, const T b) const { return std::max(a, b); } }; +inline Size getContinuousSize_( int flags, int cols, int rows, int widthScale ) +{ + int64 sz = (int64)cols * rows * widthScale; + return (flags & Mat::CONTINUOUS_FLAG) != 0 && + (int)sz == sz ? 
Size((int)sz, 1) : Size(cols * widthScale, rows); +} + inline Size getContinuousSize( const Mat& m1, int widthScale=1 ) { - return m1.isContinuous() ? Size(m1.cols*m1.rows*widthScale, 1) : - Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags, + m1.cols, m1.rows, widthScale); } inline Size getContinuousSize( const Mat& m1, const Mat& m2, int widthScale=1 ) { - return (m1.flags & m2.flags & Mat::CONTINUOUS_FLAG) != 0 ? - Size(m1.cols*m1.rows*widthScale, 1) : Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags & m2.flags, + m1.cols, m1.rows, widthScale); } inline Size getContinuousSize( const Mat& m1, const Mat& m2, const Mat& m3, int widthScale=1 ) { - return (m1.flags & m2.flags & m3.flags & Mat::CONTINUOUS_FLAG) != 0 ? - Size(m1.cols*m1.rows*widthScale, 1) : Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags & m2.flags & m3.flags, + m1.cols, m1.rows, widthScale); } inline Size getContinuousSize( const Mat& m1, const Mat& m2, const Mat& m3, const Mat& m4, int widthScale=1 ) { - return (m1.flags & m2.flags & m3.flags & m4.flags & Mat::CONTINUOUS_FLAG) != 0 ? - Size(m1.cols*m1.rows*widthScale, 1) : Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags & m2.flags & m3.flags & m4.flags, + m1.cols, m1.rows, widthScale); } inline Size getContinuousSize( const Mat& m1, const Mat& m2, const Mat& m3, const Mat& m4, const Mat& m5, int widthScale=1 ) { - return (m1.flags & m2.flags & m3.flags & m4.flags & m5.flags & Mat::CONTINUOUS_FLAG) != 0 ? - Size(m1.cols*m1.rows*widthScale, 1) : Size(m1.cols*widthScale, m1.rows); + return getContinuousSize_(m1.flags & m2.flags & m3.flags & m4.flags & m5.flags, + m1.cols, m1.rows, widthScale); } struct NoVec @@ -290,4 +297,6 @@ extern bool __termination; // skip some cleanups, because process is terminating } +#include "opencv2/hal/intrin.hpp" + #endif /*_CXCORE_INTERNAL_H_*/ diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp index bd1d80ca2b..9d95243828 100644 --- a/modules/core/src/rand.cpp +++ b/modules/core/src/rand.cpp @@ -748,29 +748,35 @@ namespace cv { template static void -randShuffle_( Mat& _arr, RNG& rng, double iterFactor ) +randShuffle_( Mat& _arr, RNG& rng, double ) { - int sz = _arr.rows*_arr.cols, iters = cvRound(iterFactor*sz); + unsigned sz = (unsigned)_arr.total(); if( _arr.isContinuous() ) { T* arr = _arr.ptr(); - for( int i = 0; i < iters; i++ ) + for( unsigned i = 0; i < sz; i++ ) { - int j = (unsigned)rng % sz, k = (unsigned)rng % sz; - std::swap( arr[j], arr[k] ); + unsigned j = (unsigned)rng % sz; + std::swap( arr[j], arr[i] ); } } else { + CV_Assert( _arr.dims <= 2 ); uchar* data = _arr.ptr(); size_t step = _arr.step; + int rows = _arr.rows; int cols = _arr.cols; - for( int i = 0; i < iters; i++ ) + for( int i0 = 0; i0 < rows; i0++ ) { - int j1 = (unsigned)rng % sz, k1 = (unsigned)rng % sz; - int j0 = j1/cols, k0 = k1/cols; - j1 -= j0*cols; k1 -= k0*cols; - std::swap( ((T*)(data + step*j0))[j1], ((T*)(data + step*k0))[k1] ); + T* p = _arr.ptr(i0); + for( int j0 = 0; j0 < cols; j0++ ) + { + unsigned k1 = (unsigned)rng % sz; + int i1 = (int)(k1 / cols); + int j1 = (int)(k1 - (unsigned)i1*(unsigned)cols); + std::swap( p[j0], ((T*)(data + step*i1))[j1] ); + } } } } diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index e43df94448..8a6bd54721 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -3826,6 +3826,11 @@ void cv::findNonZero( InputArray _src, OutputArray _idx ) Mat src = _src.getMat(); CV_Assert( 
src.type() == CV_8UC1 ); int n = countNonZero(src); + if( n == 0 ) + { + _idx.release(); + return; + } if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() ) _idx.release(); _idx.create(n, 1, CV_32SC2); diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index 14c741edc0..8bd45ee396 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -1791,3 +1791,28 @@ INSTANTIATE_TEST_CASE_P(Arithm, SubtractOutputMatNotEmpty, testing::Combine( testing::Values(perf::MatType(CV_8UC1), CV_8UC3, CV_8UC4, CV_16SC1, CV_16SC3), testing::Values(-1, CV_16S, CV_32S, CV_32F), testing::Bool())); + + +TEST(Core_FindNonZero, singular) +{ + Mat img(10, 10, CV_8U, Scalar::all(0)); + vector pts, pts2(10); + findNonZero(img, pts); + findNonZero(img, pts2); + ASSERT_TRUE(pts.empty() && pts2.empty()); +} + +TEST(Core_BoolVector, support) +{ + std::vector test; + int i, n = 205; + int nz = 0; + test.resize(n); + for( i = 0; i < n; i++ ) + { + test[i] = theRNG().uniform(0, 2) != 0; + nz += (int)test[i]; + } + ASSERT_EQ( nz, countNonZero(test) ); + ASSERT_FLOAT_EQ((float)nz/n, (float)(mean(test)[0])); +} diff --git a/modules/core/test/test_conjugate_gradient.cpp b/modules/core/test/test_conjugate_gradient.cpp index 3a4d93797a..a1fd6113c3 100644 --- a/modules/core/test/test_conjugate_gradient.cpp +++ b/modules/core/test/test_conjugate_gradient.cpp @@ -58,18 +58,21 @@ static void mytest(cv::Ptr solver,cv::Ptr solver=cv::ConjGradSolver::create(); #if 1 { - cv::Ptr ptr_F(new SphereF()); + cv::Ptr ptr_F(new SphereF_CG()); cv::Mat x=(cv::Mat_(4,1)<<50.0,10.0,1.0,-10.0), etalon_x=(cv::Mat_(1,4)<<0.0,0.0,0.0,0.0); double etalon_res=0.0; @@ -92,7 +95,7 @@ TEST(DISABLED_Core_ConjGradSolver, regression_basic){ #endif #if 1 { - cv::Ptr ptr_F(new RosenbrockF()); + cv::Ptr ptr_F(new RosenbrockF_CG()); cv::Mat x=(cv::Mat_(2,1)<<0.0,0.0), etalon_x=(cv::Mat_(2,1)<<1.0,1.0); double etalon_res=0.0; diff --git a/modules/core/test/test_downhill_simplex.cpp b/modules/core/test/test_downhill_simplex.cpp index aa6d746121..47c73f9751 100644 --- a/modules/core/test/test_downhill_simplex.cpp +++ b/modules/core/test/test_downhill_simplex.cpp @@ -68,17 +68,19 @@ static void mytest(cv::Ptr solver,cv::Ptr solver=cv::DownhillSolver::create(); #if 1 { diff --git a/modules/core/test/test_dxt.cpp b/modules/core/test/test_dxt.cpp index 2e7bb38eaf..ad75e52dda 100644 --- a/modules/core/test/test_dxt.cpp +++ b/modules/core/test/test_dxt.cpp @@ -866,3 +866,24 @@ protected: }; TEST(Core_DFT, complex_output) { Core_DFTComplexOutputTest test; test.safe_run(); } + +TEST(Core_DFT, complex_output2) +{ + for( int i = 0; i < 100; i++ ) + { + int type = theRNG().uniform(0, 2) ? 
CV_64F : CV_32F; + int m = theRNG().uniform(1, 10); + int n = theRNG().uniform(1, 10); + Mat x(m, n, type), out; + randu(x, -1., 1.); + dft(x, out, DFT_ROWS | DFT_COMPLEX_OUTPUT); + double nrm = norm(out, NORM_INF); + double thresh = n*m*2; + if( nrm > thresh ) + { + cout << "x: " << x << endl; + cout << "out: " << out << endl; + ASSERT_LT(nrm, thresh); + } + } +} diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 8cd92dd514..102bd99764 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -1209,3 +1209,42 @@ TEST(Core_Mat, copyNx1ToVector) ASSERT_PRED_FORMAT2(cvtest::MatComparator(0, 0), ref_dst16, cv::Mat_(dst16)); } + +TEST(Core_Matx, fromMat_) +{ + Mat_ a = (Mat_(2,2) << 10, 11, 12, 13); + Matx22d b(a); + ASSERT_EQ( norm(a, b, NORM_INF), 0.); +} + +TEST(Core_InputArray, empty) +{ + vector > data; + ASSERT_TRUE( _InputArray(data).empty() ); +} + +TEST(Core_CopyMask, bug1918) +{ + Mat_ tmpSrc(100,100); + tmpSrc = 124; + Mat_ tmpMask(100,100); + tmpMask = 255; + Mat_ tmpDst(100,100); + tmpDst = 2; + tmpSrc.copyTo(tmpDst,tmpMask); + ASSERT_EQ(sum(tmpDst)[0], 124*100*100); +} + +TEST(Core_SVD, orthogonality) +{ + for( int i = 0; i < 2; i++ ) + { + int type = i == 0 ? CV_32F : CV_64F; + Mat mat_D(2, 2, type); + mat_D.setTo(88.); + Mat mat_U, mat_W; + SVD::compute(mat_D, mat_W, mat_U, noArray(), SVD::FULL_UV); + mat_U *= mat_U.t(); + ASSERT_LT(norm(mat_U, Mat::eye(2, 2, type), NORM_INF), 1e-5); + } +} diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 5d483206e2..e44051914e 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -232,7 +232,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((uchar*)a_data)[j]; - ((uchar*)b_data)[j] = (uchar)(val <= 1 ? val : + ((uchar*)b_data)[j] = (uchar)(val == 0 ? 255 : val == 1 ? 1 : val == 2 && ipower == -1 ? 1 : 0); } else @@ -247,17 +247,17 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) if( ipower < 0 ) for( j = 0; j < ncols; j++ ) { - int val = ((char*)a_data)[j]; - ((char*)b_data)[j] = (char)((val&~1)==0 ? val : + int val = ((schar*)a_data)[j]; + ((schar*)b_data)[j] = (schar)(val == 0 ? 127 : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0); } else for( j = 0; j < ncols; j++ ) { - int val = ((char*)a_data)[j]; + int val = ((schar*)a_data)[j]; val = ipow( val, ipower ); - ((char*)b_data)[j] = saturate_cast(val); + ((schar*)b_data)[j] = saturate_cast(val); } break; case CV_16U: @@ -265,7 +265,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((ushort*)a_data)[j]; - ((ushort*)b_data)[j] = (ushort)((val&~1)==0 ? val : + ((ushort*)b_data)[j] = (ushort)(val == 0 ? 65535 : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0); } @@ -282,7 +282,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((short*)a_data)[j]; - ((short*)b_data)[j] = (short)((val&~1)==0 ? val : + ((short*)b_data)[j] = (short)(val == 0 ? 32767 : val == 1 ? 1 : val ==-1 ? 1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0); } @@ -299,7 +299,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) for( j = 0; j < ncols; j++ ) { int val = ((int*)a_data)[j]; - ((int*)b_data)[j] = (val&~1)==0 ? val : + ((int*)b_data)[j] = val == 0 ? INT_MAX : val == 1 ? 1 : val ==-1 ? 
1-2*(ipower&1) : val == 2 && ipower == -1 ? 1 : 0; } @@ -351,8 +351,6 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ ) } } - - ///////////////////////////////////////// matrix tests //////////////////////////////////////////// class Core_MatrixTest : public cvtest::ArrayTest @@ -2822,4 +2820,56 @@ TEST(CovariationMatrixVectorOfMatWithMean, accuracy) ASSERT_EQ(sDiff.dot(sDiff), 0.0); } +TEST(Core_Pow, special) +{ + for( int i = 0; i < 100; i++ ) + { + int n = theRNG().uniform(1, 30); + Mat mtx0(1, n, CV_8S), mtx, result; + randu(mtx0, -5, 5); + + int type = theRNG().uniform(0, 2) ? CV_64F : CV_32F; + double eps = type == CV_32F ? 1e-3 : 1e-10; + mtx0.convertTo(mtx, type); + // generate power from [-n, n] interval with 1/8 step - enough to check various cases. + const int max_pf = 3; + int pf = theRNG().uniform(0, max_pf*2+1); + double power = ((1 << pf) - (1 << (max_pf*2-1)))/16.; + int ipower = cvRound(power); + bool is_ipower = ipower == power; + cv::pow(mtx, power, result); + for( int j = 0; j < n; j++ ) + { + double val = type == CV_32F ? (double)mtx.at(j) : mtx.at(j); + double r = type == CV_32F ? (double)result.at(j) : result.at(j); + double r0; + if( power == 0. ) + r0 = 1; + else if( is_ipower ) + { + r0 = 1; + for( int k = 0; k < std::abs(ipower); k++ ) + r0 *= val; + if( ipower < 0 ) + r0 = 1./r0; + } + else + r0 = std::pow(val, power); + if( cvIsInf(r0) ) + { + ASSERT_TRUE(cvIsInf(r) != 0); + } + else if( cvIsNaN(r0) ) + { + ASSERT_TRUE(cvIsNaN(r) != 0); + } + else + { + ASSERT_TRUE(cvIsInf(r) == 0 && cvIsNaN(r) == 0); + ASSERT_LT(fabs(r - r0), eps); + } + } + } +} + /* End of file. */ diff --git a/modules/hal/include/opencv2/hal/intrin_neon.hpp b/modules/hal/include/opencv2/hal/intrin_neon.hpp index ab6aa86315..2418566e62 100644 --- a/modules/hal/include/opencv2/hal/intrin_neon.hpp +++ b/modules/hal/include/opencv2/hal/intrin_neon.hpp @@ -322,6 +322,9 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32) OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32) diff --git a/modules/hal/include/opencv2/hal/intrin_sse.hpp b/modules/hal/include/opencv2/hal/intrin_sse.hpp index 3b77a11542..69171e2516 100644 --- a/modules/hal/include/opencv2/hal/intrin_sse.hpp +++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp @@ -614,6 +614,16 @@ inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) __m128i d1 = _mm_unpackhi_epi32(c0, c1); return v_int32x4(_mm_unpacklo_epi64(d0, d1)); } +inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) +{ + a = a * b; + return a; +} +inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b) +{ + a = a * b; + return a; +} inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
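// The iPow_SIMD kernels above all rely on the same square-and-multiply scheme;
// a minimal scalar sketch of that scheme follows. It is illustrative only: the
// name ipow_sketch and the plain int/unsigned types are chosen here for
// exposition and are not part of OpenCV (negative and zero powers are handled
// separately by the iPow_i/iPow_f wrappers in the patch).
static inline int ipow_sketch(int base, unsigned power)
{
    // 'a' accumulates the factors selected by the set bits of 'power';
    // 'b' holds base, base^2, base^4, ... as the loop proceeds.
    int a = 1, b = base;
    unsigned p = power;
    while (p > 1)
    {
        if (p & 1)
            a *= b;
        b *= b;
        p >>= 1;
    }
    // For power == 0 the loop is skipped and the result is 1 by convention;
    // otherwise one final multiply folds in the highest squaring.
    return p == 0 ? 1 : a * b;
}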