From cd66f6e3db101bdcf1406ec3604592662c6aebef Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Thu, 14 Feb 2019 16:00:17 +0300 Subject: [PATCH] core: dispatch matmul - gemm: keep baseline only (lapack is 10x+ faster, lets reduce binary size) - transform / distTransform - scaleAdd (32f/64f only) - Mahalanobis: keep baseline only (no perf tests) - mulTransposed: keep baseline only (no perf tests) - dot --- modules/core/CMakeLists.txt | 1 + .../include/opencv2/core/cv_cpu_dispatch.h | 1 + modules/core/src/matmul.dispatch.cpp | 2559 +---------------- modules/core/src/matmul.simd.hpp | 1322 ++------- 4 files changed, 308 insertions(+), 3575 deletions(-) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 9559783d94..9d7a925dd0 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -6,6 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3) ocv_add_dispatched_file(convert SSE2 AVX2) ocv_add_dispatched_file(convert_scale SSE2 AVX2) ocv_add_dispatched_file(count_non_zero SSE2 AVX2) +ocv_add_dispatched_file(matmul SSE2 AVX2) ocv_add_dispatched_file(sum SSE2 AVX2) # dispatching for accuracy tests diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 57aa0ce2fb..08909f8b28 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -15,6 +15,7 @@ #define CV_CPU_OPTIMIZATION_NAMESPACE cpu_baseline #define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace cpu_baseline { #define CV_CPU_OPTIMIZATION_NAMESPACE_END } +#define CV_CPU_BASELINE_MODE 1 #endif diff --git a/modules/core/src/matmul.dispatch.cpp b/modules/core/src/matmul.dispatch.cpp index 19ab907c9e..6fcdb4c700 100644 --- a/modules/core/src/matmul.dispatch.cpp +++ b/modules/core/src/matmul.dispatch.cpp @@ -41,13 +41,15 @@ // //M*/ -#include #include "precomp.hpp" #include "opencl_kernels_core.hpp" #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" #include "opencv2/core/opencl/runtime/opencl_core.hpp" #include "intel_gpu_gemm.inl.hpp" +#include "matmul.simd.hpp" +#include "matmul.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + namespace cv { @@ -55,646 +57,6 @@ namespace cv * GEMM * \****************************************************************************************/ -static void -GEMM_CopyBlock( const uchar* src, size_t src_step, - uchar* dst, size_t dst_step, - Size size, size_t pix_size ) -{ - int j; - size.width *= (int)(pix_size / sizeof(int)); - - for( ; size.height--; src += src_step, dst += dst_step ) - { - j=0; - #if CV_ENABLE_UNROLLED - for( ; j <= size.width - 4; j += 4 ) - { - int t0 = ((const int*)src)[j]; - int t1 = ((const int*)src)[j+1]; - ((int*)dst)[j] = t0; - ((int*)dst)[j+1] = t1; - t0 = ((const int*)src)[j+2]; - t1 = ((const int*)src)[j+3]; - ((int*)dst)[j+2] = t0; - ((int*)dst)[j+3] = t1; - } - #endif - for( ; j < size.width; j++ ) - ((int*)dst)[j] = ((const int*)src)[j]; - } -} - - -static void -GEMM_TransposeBlock( const uchar* src, size_t src_step, - uchar* dst, size_t dst_step, - Size size, size_t pix_size ) -{ - int i, j; - for( i = 0; i < size.width; i++, dst += dst_step, src += pix_size ) - { - const uchar* _src = src; - switch( pix_size ) - { - case sizeof(int): - for( j = 0; j < size.height; j++, _src += src_step ) - ((int*)dst)[j] = ((int*)_src)[0]; - break; - case sizeof(int)*2: - for( j = 0; j < size.height*2; j += 2, _src += src_step ) - { - int 
t0 = ((int*)_src)[0]; - int t1 = ((int*)_src)[1]; - ((int*)dst)[j] = t0; - ((int*)dst)[j+1] = t1; - } - break; - case sizeof(int)*4: - for( j = 0; j < size.height*4; j += 4, _src += src_step ) - { - int t0 = ((int*)_src)[0]; - int t1 = ((int*)_src)[1]; - ((int*)dst)[j] = t0; - ((int*)dst)[j+1] = t1; - t0 = ((int*)_src)[2]; - t1 = ((int*)_src)[3]; - ((int*)dst)[j+2] = t0; - ((int*)dst)[j+3] = t1; - } - break; - default: - assert(0); - return; - } - } -} - - -template static void -GEMMSingleMul( const T* a_data, size_t a_step, - const T* b_data, size_t b_step, - const T* c_data, size_t c_step, - T* d_data, size_t d_step, - Size a_size, Size d_size, - double alpha, double beta, int flags ) -{ - int i, j, k, n = a_size.width, m = d_size.width, drows = d_size.height; - const T *_a_data = a_data, *_b_data = b_data, *_c_data = c_data; - cv::AutoBuffer _a_buf; - T* a_buf = 0; - size_t a_step0, a_step1, c_step0, c_step1, t_step; - - a_step /= sizeof(a_data[0]); - b_step /= sizeof(b_data[0]); - c_step /= sizeof(c_data[0]); - d_step /= sizeof(d_data[0]); - a_step0 = a_step; - a_step1 = 1; - - if( !c_data ) - c_step0 = c_step1 = 0; - else if( !(flags & GEMM_3_T) ) - c_step0 = c_step, c_step1 = 1; - else - c_step0 = 1, c_step1 = c_step; - - if( flags & GEMM_1_T ) - { - CV_SWAP( a_step0, a_step1, t_step ); - n = a_size.height; - if( a_step > 1 && n > 1 ) - { - _a_buf.allocate(n); - a_buf = _a_buf.data(); - } - } - - if( n == 1 ) /* external product */ - { - cv::AutoBuffer _b_buf; - T* b_buf = 0; - - if( a_step > 1 && a_size.height > 1 ) - { - _a_buf.allocate(drows); - a_buf = _a_buf.data(); - for( k = 0; k < drows; k++ ) - a_buf[k] = a_data[a_step*k]; - a_data = a_buf; - } - - if( b_step > 1 ) - { - _b_buf.allocate(d_size.width); - b_buf = _b_buf.data(); - for( j = 0; j < d_size.width; j++ ) - b_buf[j] = b_data[j*b_step]; - b_data = b_buf; - } - - for( i = 0; i < drows; i++, _c_data += c_step0, d_data += d_step ) - { - WT al = WT(a_data[i])*alpha; - c_data = _c_data; - for( j = 0; j <= d_size.width - 2; j += 2, c_data += 2*c_step1 ) - { - WT s0 = al*WT(b_data[j]); - WT s1 = al*WT(b_data[j+1]); - if( !c_data ) - { - d_data[j] = T(s0); - d_data[j+1] = T(s1); - } - else - { - d_data[j] = T(s0 + WT(c_data[0])*beta); - d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta); - } - } - - for( ; j < d_size.width; j++, c_data += c_step1 ) - { - WT s0 = al*WT(b_data[j]); - if( !c_data ) - d_data[j] = T(s0); - else - d_data[j] = T(s0 + WT(c_data[0])*beta); - } - } - } - else if( flags & GEMM_2_T ) /* A * Bt */ - { - for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step ) - { - a_data = _a_data; - b_data = _b_data; - c_data = _c_data; - - if( a_buf ) - { - for( k = 0; k < n; k++ ) - a_buf[k] = a_data[a_step1*k]; - a_data = a_buf; - } - - for( j = 0; j < d_size.width; j++, b_data += b_step, - c_data += c_step1 ) - { - WT s0(0), s1(0), s2(0), s3(0); - k = 0; - #if CV_ENABLE_UNROLLED - for( ; k <= n - 4; k += 4 ) - { - s0 += WT(a_data[k])*WT(b_data[k]); - s1 += WT(a_data[k+1])*WT(b_data[k+1]); - s2 += WT(a_data[k+2])*WT(b_data[k+2]); - s3 += WT(a_data[k+3])*WT(b_data[k+3]); - } - #endif - for( ; k < n; k++ ) - s0 += WT(a_data[k])*WT(b_data[k]); - s0 = (s0+s1+s2+s3)*alpha; - - if( !c_data ) - d_data[j] = T(s0); - else - d_data[j] = T(s0 + WT(c_data[0])*beta); - } - } - } - else if( d_size.width*sizeof(d_data[0]) <= 1600 ) - { - for( i = 0; i < drows; i++, _a_data += a_step0, - _c_data += c_step0, - d_data += d_step ) - { - a_data = _a_data, c_data = _c_data; - - if( a_buf ) - { - for( k = 0; k < 
n; k++ ) - a_buf[k] = a_data[a_step1*k]; - a_data = a_buf; - } - - for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 ) - { - const T* b = _b_data + j; - WT s0(0), s1(0), s2(0), s3(0); - - for( k = 0; k < n; k++, b += b_step ) - { - WT a(a_data[k]); - s0 += a * WT(b[0]); s1 += a * WT(b[1]); - s2 += a * WT(b[2]); s3 += a * WT(b[3]); - } - - if( !c_data ) - { - d_data[j] = T(s0*alpha); - d_data[j+1] = T(s1*alpha); - d_data[j+2] = T(s2*alpha); - d_data[j+3] = T(s3*alpha); - } - else - { - s0 = s0*alpha; s1 = s1*alpha; - s2 = s2*alpha; s3 = s3*alpha; - d_data[j] = T(s0 + WT(c_data[0])*beta); - d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta); - d_data[j+2] = T(s2 + WT(c_data[c_step1*2])*beta); - d_data[j+3] = T(s3 + WT(c_data[c_step1*3])*beta); - } - } - - for( ; j < m; j++, c_data += c_step1 ) - { - const T* b = _b_data + j; - WT s0(0); - - for( k = 0; k < n; k++, b += b_step ) - s0 += WT(a_data[k]) * WT(b[0]); - - s0 = s0*alpha; - if( !c_data ) - d_data[j] = T(s0); - else - d_data[j] = T(s0 + WT(c_data[0])*beta); - } - } - } - else - { - cv::AutoBuffer _d_buf(m); - WT* d_buf = _d_buf.data(); - - for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step ) - { - a_data = _a_data; - b_data = _b_data; - c_data = _c_data; - - if( a_buf ) - { - for( k = 0; k < n; k++ ) - a_buf[k] = _a_data[a_step1*k]; - a_data = a_buf; - } - - for( j = 0; j < m; j++ ) - d_buf[j] = WT(0); - - for( k = 0; k < n; k++, b_data += b_step ) - { - WT al(a_data[k]); - j=0; - #if CV_ENABLE_UNROLLED - for(; j <= m - 4; j += 4 ) - { - WT t0 = d_buf[j] + WT(b_data[j])*al; - WT t1 = d_buf[j+1] + WT(b_data[j+1])*al; - d_buf[j] = t0; - d_buf[j+1] = t1; - t0 = d_buf[j+2] + WT(b_data[j+2])*al; - t1 = d_buf[j+3] + WT(b_data[j+3])*al; - d_buf[j+2] = t0; - d_buf[j+3] = t1; - } - #endif - for( ; j < m; j++ ) - d_buf[j] += WT(b_data[j])*al; - } - - if( !c_data ) - for( j = 0; j < m; j++ ) - d_data[j] = T(d_buf[j]*alpha); - else - for( j = 0; j < m; j++, c_data += c_step1 ) - { - WT t = d_buf[j]*alpha; - d_data[j] = T(t + WT(c_data[0])*beta); - } - } - } -} - - -template static void -GEMMBlockMul( const T* a_data, size_t a_step, - const T* b_data, size_t b_step, - WT* d_data, size_t d_step, - Size a_size, Size d_size, int flags ) -{ - int i, j, k, n = a_size.width, m = d_size.width; - const T *_a_data = a_data, *_b_data = b_data; - cv::AutoBuffer _a_buf; - T* a_buf = 0; - size_t a_step0, a_step1, t_step; - int do_acc = flags & 16; - - a_step /= sizeof(a_data[0]); - b_step /= sizeof(b_data[0]); - d_step /= sizeof(d_data[0]); - - a_step0 = a_step; - a_step1 = 1; - - if( flags & GEMM_1_T ) - { - CV_SWAP( a_step0, a_step1, t_step ); - n = a_size.height; - _a_buf.allocate(n); - a_buf = _a_buf.data(); - } - - if( flags & GEMM_2_T ) - { - /* second operand is transposed */ - for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step ) - { - a_data = _a_data; b_data = _b_data; - - if( a_buf ) - { - for( k = 0; k < n; k++ ) - a_buf[k] = a_data[a_step1*k]; - a_data = a_buf; - } - - for( j = 0; j < d_size.width; j++, b_data += b_step ) - { - WT s0 = do_acc ? 
d_data[j]:WT(0), s1(0); - for( k = 0; k <= n - 2; k += 2 ) - { - s0 += WT(a_data[k])*WT(b_data[k]); - s1 += WT(a_data[k+1])*WT(b_data[k+1]); - } - - for( ; k < n; k++ ) - s0 += WT(a_data[k])*WT(b_data[k]); - - d_data[j] = s0 + s1; - } - } - } - else - { - for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step ) - { - a_data = _a_data, b_data = _b_data; - - if( a_buf ) - { - for( k = 0; k < n; k++ ) - a_buf[k] = a_data[a_step1*k]; - a_data = a_buf; - } - - for( j = 0; j <= m - 4; j += 4 ) - { - WT s0, s1, s2, s3; - const T* b = b_data + j; - - if( do_acc ) - { - s0 = d_data[j]; s1 = d_data[j+1]; - s2 = d_data[j+2]; s3 = d_data[j+3]; - } - else - s0 = s1 = s2 = s3 = WT(0); - - for( k = 0; k < n; k++, b += b_step ) - { - WT a(a_data[k]); - s0 += a * WT(b[0]); s1 += a * WT(b[1]); - s2 += a * WT(b[2]); s3 += a * WT(b[3]); - } - - d_data[j] = s0; d_data[j+1] = s1; - d_data[j+2] = s2; d_data[j+3] = s3; - } - - for( ; j < m; j++ ) - { - const T* b = b_data + j; - WT s0 = do_acc ? d_data[j] : WT(0); - - for( k = 0; k < n; k++, b += b_step ) - s0 += WT(a_data[k]) * WT(b[0]); - - d_data[j] = s0; - } - } - } -} - - -template static void -GEMMStore( const T* c_data, size_t c_step, - const WT* d_buf, size_t d_buf_step, - T* d_data, size_t d_step, Size d_size, - double alpha, double beta, int flags ) -{ - const T* _c_data = c_data; - int j; - size_t c_step0, c_step1; - - c_step /= sizeof(c_data[0]); - d_buf_step /= sizeof(d_buf[0]); - d_step /= sizeof(d_data[0]); - - if( !c_data ) - c_step0 = c_step1 = 0; - else if( !(flags & GEMM_3_T) ) - c_step0 = c_step, c_step1 = 1; - else - c_step0 = 1, c_step1 = c_step; - - for( ; d_size.height--; _c_data += c_step0, d_buf += d_buf_step, d_data += d_step ) - { - if( _c_data ) - { - c_data = _c_data; - j=0; - #if CV_ENABLE_UNROLLED - for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 ) - { - WT t0 = alpha*d_buf[j]; - WT t1 = alpha*d_buf[j+1]; - t0 += beta*WT(c_data[0]); - t1 += beta*WT(c_data[c_step1]); - d_data[j] = T(t0); - d_data[j+1] = T(t1); - t0 = alpha*d_buf[j+2]; - t1 = alpha*d_buf[j+3]; - t0 += beta*WT(c_data[c_step1*2]); - t1 += beta*WT(c_data[c_step1*3]); - d_data[j+2] = T(t0); - d_data[j+3] = T(t1); - } - #endif - for( ; j < d_size.width; j++, c_data += c_step1 ) - { - WT t0 = alpha*d_buf[j]; - d_data[j] = T(t0 + WT(c_data[0])*beta); - } - } - else - { - j = 0; - #if CV_ENABLE_UNROLLED - for( ; j <= d_size.width - 4; j += 4 ) - { - WT t0 = alpha*d_buf[j]; - WT t1 = alpha*d_buf[j+1]; - d_data[j] = T(t0); - d_data[j+1] = T(t1); - t0 = alpha*d_buf[j+2]; - t1 = alpha*d_buf[j+3]; - d_data[j+2] = T(t0); - d_data[j+3] = T(t1); - } - #endif - for( ; j < d_size.width; j++ ) - d_data[j] = T(alpha*d_buf[j]); - } - } -} - - -typedef void (*GEMMSingleMulFunc)( const void* src1, size_t step1, - const void* src2, size_t step2, const void* src3, size_t step3, - void* dst, size_t dststep, Size srcsize, Size dstsize, - double alpha, double beta, int flags ); - -typedef void (*GEMMBlockMulFunc)( const void* src1, size_t step1, - const void* src2, size_t step2, void* dst, size_t dststep, - Size srcsize, Size dstsize, int flags ); - -typedef void (*GEMMStoreFunc)( const void* src1, size_t step1, - const void* src2, size_t step2, void* dst, size_t dststep, - Size dstsize, double alpha, double beta, int flags ); - -static void GEMMSingleMul_32f( const float* a_data, size_t a_step, - const float* b_data, size_t b_step, - const float* c_data, size_t c_step, - float* d_data, size_t d_step, - Size a_size, Size d_size, - double alpha, double beta, int flags 
) -{ - GEMMSingleMul(a_data, a_step, b_data, b_step, c_data, - c_step, d_data, d_step, a_size, d_size, - alpha, beta, flags); -} - -static void GEMMSingleMul_64f( const double* a_data, size_t a_step, - const double* b_data, size_t b_step, - const double* c_data, size_t c_step, - double* d_data, size_t d_step, - Size a_size, Size d_size, - double alpha, double beta, int flags ) -{ - GEMMSingleMul(a_data, a_step, b_data, b_step, c_data, - c_step, d_data, d_step, a_size, d_size, - alpha, beta, flags); -} - - -static void GEMMSingleMul_32fc( const Complexf* a_data, size_t a_step, - const Complexf* b_data, size_t b_step, - const Complexf* c_data, size_t c_step, - Complexf* d_data, size_t d_step, - Size a_size, Size d_size, - double alpha, double beta, int flags ) -{ - GEMMSingleMul(a_data, a_step, b_data, b_step, c_data, - c_step, d_data, d_step, a_size, d_size, - alpha, beta, flags); -} - -static void GEMMSingleMul_64fc( const Complexd* a_data, size_t a_step, - const Complexd* b_data, size_t b_step, - const Complexd* c_data, size_t c_step, - Complexd* d_data, size_t d_step, - Size a_size, Size d_size, - double alpha, double beta, int flags ) -{ - GEMMSingleMul(a_data, a_step, b_data, b_step, c_data, - c_step, d_data, d_step, a_size, d_size, - alpha, beta, flags); -} - -static void GEMMBlockMul_32f( const float* a_data, size_t a_step, - const float* b_data, size_t b_step, - double* d_data, size_t d_step, - Size a_size, Size d_size, int flags ) -{ - GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); -} - - -static void GEMMBlockMul_64f( const double* a_data, size_t a_step, - const double* b_data, size_t b_step, - double* d_data, size_t d_step, - Size a_size, Size d_size, int flags ) -{ - GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); -} - - -static void GEMMBlockMul_32fc( const Complexf* a_data, size_t a_step, - const Complexf* b_data, size_t b_step, - Complexd* d_data, size_t d_step, - Size a_size, Size d_size, int flags ) -{ - GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); -} - - -static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step, - const Complexd* b_data, size_t b_step, - Complexd* d_data, size_t d_step, - Size a_size, Size d_size, int flags ) -{ - GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); -} - - -static void GEMMStore_32f( const float* c_data, size_t c_step, - const double* d_buf, size_t d_buf_step, - float* d_data, size_t d_step, Size d_size, - double alpha, double beta, int flags ) -{ - GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); -} - - -static void GEMMStore_64f( const double* c_data, size_t c_step, - const double* d_buf, size_t d_buf_step, - double* d_data, size_t d_step, Size d_size, - double alpha, double beta, int flags ) -{ - GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); -} - - -static void GEMMStore_32fc( const Complexf* c_data, size_t c_step, - const Complexd* d_buf, size_t d_buf_step, - Complexf* d_data, size_t d_step, Size d_size, - double alpha, double beta, int flags ) -{ - GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); -} - - -static void GEMMStore_64fc( const Complexd* c_data, size_t c_step, - const Complexd* d_buf, size_t d_buf_step, - Complexd* d_data, size_t d_step, Size d_size, - double alpha, double beta, int flags ) -{ - GEMMStore(c_data, c_step, d_buf, d_buf_step, 
d_data, d_step, d_size, alpha, beta, flags); -} - #ifdef HAVE_CLAMDBLAS static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha, @@ -893,652 +255,69 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, } #endif -static void gemmImpl( Mat A, Mat B, double alpha, - Mat C, double beta, Mat D, int flags ) + +namespace hal { + +void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) { CV_INSTRUMENT_REGION(); - - const int block_lin_size = 128; - const int block_size = block_lin_size * block_lin_size; - - static double zero[] = {0,0,0,0}; - static float zerof[] = {0,0,0,0}; - - Size a_size = A.size(), d_size; - int i, len = 0, type = A.type(); - - switch( flags & (GEMM_1_T|GEMM_2_T) ) - { - case 0: - d_size = Size( B.cols, a_size.height ); - len = B.rows; - break; - case 1: - d_size = Size( B.cols, a_size.width ); - len = B.rows; - break; - case 2: - d_size = Size( B.rows, a_size.height ); - len = B.cols; - break; - case 3: - d_size = Size( B.rows, a_size.width ); - len = B.cols; - break; - } - - if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) ) - { - if( type == CV_32F ) - { - float* d = D.ptr(); - const float *a = A.ptr(), - *b = B.ptr(), - *c = (const float*)C.data; - size_t d_step = D.step/sizeof(d[0]), - a_step = A.step/sizeof(a[0]), - b_step = B.step/sizeof(b[0]), - c_step = C.data ? C.step/sizeof(c[0]) : 0; - - if( !c ) - c = zerof; - - switch( len ) - { - case 2: - if( len == d_size.width && b != d ) - { - for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) - { - float t0 = a[0]*b[0] + a[1]*b[b_step]; - float t1 = a[0]*b[1] + a[1]*b[b_step+1]; - d[0] = (float)(t0*alpha + c[0]*beta); - d[1] = (float)(t1*alpha + c[1]*beta); - } - } - else if( a != d ) - { - int c_step0 = 1; - if( c == zerof ) - { - c_step0 = 0; - c_step = 1; - } - - for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) - { - float t0 = a[0]*b[0] + a[1]*b[b_step]; - float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step]; - d[0] = (float)(t0*alpha + c[0]*beta); - d[d_step] = (float)(t1*alpha + c[c_step]*beta); - } - } - else - break; - return; - case 3: - if( len == d_size.width && b != d ) - { - for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) - { - float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; - float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1]; - float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2]; - d[0] = (float)(t0*alpha + c[0]*beta); - d[1] = (float)(t1*alpha + c[1]*beta); - d[2] = (float)(t2*alpha + c[2]*beta); - } - } - else if( a != d ) - { - int c_step0 = 1; - if( c == zerof ) - { - c_step0 = 0; - c_step = 1; - } - - for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) - { - float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; - float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2]; - float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2]; - - d[0] = (float)(t0*alpha + c[0]*beta); - d[d_step] = (float)(t1*alpha + c[c_step]*beta); - d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta); - } - } - else - break; - return; - case 4: - if( len == d_size.width && b != d ) - { - for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) - { - float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; - 
float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1]; - float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2]; - float t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3]; - d[0] = (float)(t0*alpha + c[0]*beta); - d[1] = (float)(t1*alpha + c[1]*beta); - d[2] = (float)(t2*alpha + c[2]*beta); - d[3] = (float)(t3*alpha + c[3]*beta); - } - } - else if( len <= 16 && a != d ) - { - int c_step0 = 1; - if( c == zerof ) - { - c_step0 = 0; - c_step = 1; - } - - for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) - { - float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; - float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + - a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3]; - float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + - a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3]; - float t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] + - a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3]; - d[0] = (float)(t0*alpha + c[0]*beta); - d[d_step] = (float)(t1*alpha + c[c_step]*beta); - d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta); - d[d_step*3] = (float)(t3*alpha + c[c_step*3]*beta); - } - } - else - break; - return; - } - } - - if( type == CV_64F ) - { - double* d = D.ptr(); - const double *a = A.ptr(), - *b = B.ptr(), - *c = (const double*)C.data; - size_t d_step = D.step/sizeof(d[0]), - a_step = A.step/sizeof(a[0]), - b_step = B.step/sizeof(b[0]), - c_step = C.data ? C.step/sizeof(c[0]) : 0; - if( !c ) - c = zero; - - switch( len ) - { - case 2: - if( len == d_size.width && b != d ) - { - for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) - { - double t0 = a[0]*b[0] + a[1]*b[b_step]; - double t1 = a[0]*b[1] + a[1]*b[b_step+1]; - d[0] = t0*alpha + c[0]*beta; - d[1] = t1*alpha + c[1]*beta; - } - } - else if( a != d ) - { - int c_step0 = 1; - if( c == zero ) - { - c_step0 = 0; - c_step = 1; - } - - for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) - { - double t0 = a[0]*b[0] + a[1]*b[b_step]; - double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step]; - d[0] = t0*alpha + c[0]*beta; - d[d_step] = t1*alpha + c[c_step]*beta; - } - } - else - break; - return; - case 3: - if( len == d_size.width && b != d ) - { - for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) - { - double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; - double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1]; - double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2]; - d[0] = t0*alpha + c[0]*beta; - d[1] = t1*alpha + c[1]*beta; - d[2] = t2*alpha + c[2]*beta; - } - } - else if( a != d ) - { - int c_step0 = 1; - if( c == zero ) - { - c_step0 = 0; - c_step = 1; - } - - for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) - { - double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; - double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2]; - double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2]; - - d[0] = t0*alpha + c[0]*beta; - d[d_step] = t1*alpha + c[c_step]*beta; - d[d_step*2] = t2*alpha + c[c_step*2]*beta; - } - } - else - break; - return; - case 4: - if( len == d_size.width && b != d ) - { - for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) - { - double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; - double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1]; - double t2 = a[0]*b[2] + a[1]*b[b_step+2] + 
a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2]; - double t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3]; - d[0] = t0*alpha + c[0]*beta; - d[1] = t1*alpha + c[1]*beta; - d[2] = t2*alpha + c[2]*beta; - d[3] = t3*alpha + c[3]*beta; - } - } - else if( d_size.width <= 16 && a != d ) - { - int c_step0 = 1; - if( c == zero ) - { - c_step0 = 0; - c_step = 1; - } - - for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) - { - double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; - double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + - a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3]; - double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + - a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3]; - double t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] + - a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3]; - d[0] = t0*alpha + c[0]*beta; - d[d_step] = t1*alpha + c[c_step]*beta; - d[d_step*2] = t2*alpha + c[c_step*2]*beta; - d[d_step*3] = t3*alpha + c[c_step*3]*beta; - } - } - else - break; - return; - } - } - } - - { - size_t b_step = B.step; - GEMMSingleMulFunc singleMulFunc; - GEMMBlockMulFunc blockMulFunc; - GEMMStoreFunc storeFunc; - Mat *matD = &D; - const uchar* Cdata = C.data; - size_t Cstep = C.data ? (size_t)C.step : 0; - AutoBuffer buf; - - if( type == CV_32FC1 ) - { - singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32f; - blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32f; - storeFunc = (GEMMStoreFunc)GEMMStore_32f; - } - else if( type == CV_64FC1 ) - { - singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64f; - blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64f; - storeFunc = (GEMMStoreFunc)GEMMStore_64f; - } - else if( type == CV_32FC2 ) - { - singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32fc; - blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32fc; - storeFunc = (GEMMStoreFunc)GEMMStore_32fc; - } - else - { - CV_Assert( type == CV_64FC2 ); - singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64fc; - blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64fc; - storeFunc = (GEMMStoreFunc)GEMMStore_64fc; - } - - if( (d_size.width == 1 || len == 1) && !(flags & GEMM_2_T) && B.isContinuous() ) - { - b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type); - flags |= GEMM_2_T; - } - - /*if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 ) - { - blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p : - type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p : - type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p : - type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0; - } - - if( blas_func ) - { - const char* transa = flags & GEMM_1_T ? "t" : "n"; - const char* transb = flags & GEMM_2_T ? "t" : "n"; - int lda, ldb, ldd; - - if( C->data.ptr ) - { - if( C->data.ptr != D->data.ptr ) - { - if( !(flags & GEMM_3_T) ) - cvCopy( C, D ); - else - cvTranspose( C, D ); - } - } - - if( CV_MAT_DEPTH(type) == CV_32F ) - { - Complex32f _alpha, _beta; - - lda = A->step/sizeof(float); - ldb = b_step/sizeof(float); - ldd = D->step/sizeof(float); - _alpha.re = (float)alpha; - _alpha.im = 0; - _beta.re = C->data.ptr ? 
(float)beta : 0; - _beta.im = 0; - if( CV_MAT_CN(type) == 2 ) - lda /= 2, ldb /= 2, ldd /= 2; - - blas_func( transb, transa, &d_size.width, &d_size.height, &len, - &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda, - &_beta, D->data.ptr, &ldd ); - } - else - { - CvComplex64f _alpha, _beta; - - lda = A->step/sizeof(double); - ldb = b_step/sizeof(double); - ldd = D->step/sizeof(double); - _alpha.re = alpha; - _alpha.im = 0; - _beta.re = C->data.ptr ? beta : 0; - _beta.im = 0; - if( CV_MAT_CN(type) == 2 ) - lda /= 2, ldb /= 2, ldd /= 2; - - blas_func( transb, transa, &d_size.width, &d_size.height, &len, - &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda, - &_beta, D->data.ptr, &ldd ); - } - } - else*/ if( ((d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2) && - len <= 10000) || len <= 10 || - (d_size.width <= block_lin_size && - d_size.height <= block_lin_size && len <= block_lin_size) ) - { - singleMulFunc( A.ptr(), A.step, B.ptr(), b_step, Cdata, Cstep, - matD->ptr(), matD->step, a_size, d_size, alpha, beta, flags ); - } - else - { - int is_a_t = flags & GEMM_1_T; - int is_b_t = flags & GEMM_2_T; - int elem_size = CV_ELEM_SIZE(type); - int dk0_1, dk0_2; - size_t a_buf_size = 0, b_buf_size, d_buf_size; - uchar* a_buf = 0; - uchar* b_buf = 0; - uchar* d_buf = 0; - int j, k, di = 0, dj = 0, dk = 0; - int dm0, dn0, dk0; - size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1; - int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0); - - if( !is_a_t ) - a_step0 = A.step, a_step1 = elem_size; - else - a_step0 = elem_size, a_step1 = A.step; - - if( !is_b_t ) - b_step0 = b_step, b_step1 = elem_size; - else - b_step0 = elem_size, b_step1 = b_step; - - if( C.empty() ) - { - c_step0 = c_step1 = 0; - flags &= ~GEMM_3_T; - } - else if( !(flags & GEMM_3_T) ) - c_step0 = C.step, c_step1 = elem_size; - else - c_step0 = elem_size, c_step1 = C.step; - - dm0 = std::min( block_lin_size, d_size.height ); - dn0 = std::min( block_lin_size, d_size.width ); - dk0_1 = block_size / dm0; - dk0_2 = block_size / dn0; - dk0 = std::min( dk0_1, dk0_2 ); - dk0 = std::min( dk0, len ); - if( dk0*dm0 > block_size ) - dm0 = block_size / dk0; - if( dk0*dn0 > block_size ) - dn0 = block_size / dk0; - - dk0_1 = (dn0+dn0/8+2) & -2; - b_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*elem_size; - d_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*work_elem_size; - - if( is_a_t ) - { - a_buf_size = (size_t)(dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size; - flags &= ~GEMM_1_T; - } - - buf.allocate(d_buf_size + b_buf_size + a_buf_size); - d_buf = buf.data(); - b_buf = d_buf + d_buf_size; - - if( is_a_t ) - a_buf = b_buf + b_buf_size; - - for( i = 0; i < d_size.height; i += di ) - { - di = dm0; - if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height ) - di = d_size.height - i; - - for( j = 0; j < d_size.width; j += dj ) - { - uchar* _d = matD->ptr() + i*matD->step + j*elem_size; - const uchar* _c = Cdata + i*c_step0 + j*c_step1; - size_t _d_step = matD->step; - dj = dn0; - - if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width ) - dj = d_size.width - j; - - flags &= 15; - if( dk0 < len ) - { - _d = d_buf; - _d_step = dj*work_elem_size; - } - - for( k = 0; k < len; k += dk ) - { - const uchar* _a = A.ptr() + i*a_step0 + k*a_step1; - size_t _a_step = A.step; - const uchar* _b = B.ptr() + k*b_step0 + j*b_step1; - size_t _b_step = b_step; - Size a_bl_size; - - dk = dk0; - if( k + dk >= len || 8*(k + dk) + dk > 8*len ) - dk = len - k; - - if( !is_a_t ) - a_bl_size.width = dk, a_bl_size.height = di; - else - 
a_bl_size.width = di, a_bl_size.height = dk; - - if( a_buf && is_a_t ) - { - _a_step = dk*elem_size; - GEMM_TransposeBlock( _a, A.step, a_buf, _a_step, a_bl_size, elem_size ); - std::swap( a_bl_size.width, a_bl_size.height ); - _a = a_buf; - } - - if( dj < d_size.width ) - { - Size b_size; - if( !is_b_t ) - b_size.width = dj, b_size.height = dk; - else - b_size.width = dk, b_size.height = dj; - - _b_step = b_size.width*elem_size; - GEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size ); - _b = b_buf; - } - - if( dk0 < len ) - blockMulFunc( _a, _a_step, _b, _b_step, _d, _d_step, - a_bl_size, Size(dj,di), flags ); - else - singleMulFunc( _a, _a_step, _b, _b_step, _c, Cstep, - _d, _d_step, a_bl_size, Size(dj,di), alpha, beta, flags ); - flags |= 16; - } - - if( dk0 < len ) - storeFunc( _c, Cstep, _d, _d_step, - matD->ptr(i) + j*elem_size, - matD->step, Size(dj,di), alpha, beta, flags ); - } - } - } - } -} - -template inline static void -callGemmImpl(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha, - const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int m_a, int n_a, int n_d, int flags, int type) -{ - CV_StaticAssert(GEMM_1_T == CV_HAL_GEMM_1_T, "Incompatible GEMM_1_T flag in HAL"); - CV_StaticAssert(GEMM_2_T == CV_HAL_GEMM_2_T, "Incompatible GEMM_2_T flag in HAL"); - CV_StaticAssert(GEMM_3_T == CV_HAL_GEMM_3_T, "Incompatible GEMM_3_T flag in HAL"); - - int b_m, b_n, c_m, c_n, m_d; - - if(flags & GEMM_2_T) - { - b_m = n_d; - if(flags & GEMM_1_T ) - { - b_n = m_a; - m_d = n_a; - } - else - { - b_n = n_a; - m_d = m_a; - } - } - else - { - b_n = n_d; - if(flags & GEMM_1_T ) - { - b_m = m_a; - m_d = n_a; - } - else - { - m_d = m_a; - b_m = n_a; - } - } - - if(flags & GEMM_3_T) - { - c_m = n_d; - c_n = m_d; - } - else - { - c_m = m_d; - c_n = n_d; - } - - Mat A, B, C; - if(src1 != NULL) - A = Mat(m_a, n_a, type, (void*)src1, src1_step); - if(src2 != NULL) - B = Mat(b_m, b_n, type, (void*)src2, src2_step); - if(src3 != NULL && beta != 0.0) - C = Mat(c_m, c_n, type, (void*)src3, src3_step); - Mat D(m_d, n_d, type, (void*)dst, dst_step); - - gemmImpl(A, B, alpha, C, beta, D, flags); -} - -} - -void cv::hal::gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step, - float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, - int m_a, int n_a, int n_d, int flags) -{ - CALL_HAL(gemm32f, cv_hal_gemm32f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) - callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32F); +#ifdef CV_GEMM_BASELINE_ONLY + CV_CPU_CALL_BASELINE(gemm32f, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)); +#else + CV_CPU_DISPATCH(gemm32f, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags), + CV_CPU_DISPATCH_MODES_ALL); +#endif } -void cv::hal::gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step, - double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, - int m_a, int n_a, int n_d, int flags) +void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) { + CV_INSTRUMENT_REGION(); CALL_HAL(gemm64f, 
cv_hal_gemm64f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) - callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64F); +#ifdef CV_GEMM_BASELINE_ONLY + CV_CPU_CALL_BASELINE(gemm64f, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)); +#else + CV_CPU_DISPATCH(gemm64f, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags), + CV_CPU_DISPATCH_MODES_ALL); +#endif } -CV_EXPORTS void cv::hal::gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step, - float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, - int m_a, int n_a, int n_d, int flags) +void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) { + CV_INSTRUMENT_REGION(); CALL_HAL(gemm32fc, cv_hal_gemm32fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) - callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32FC2); +#ifdef CV_GEMM_BASELINE_ONLY + CV_CPU_CALL_BASELINE(gemm32fc, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)); +#else + CV_CPU_DISPATCH(gemm32fc, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags), + CV_CPU_DISPATCH_MODES_ALL); +#endif } -CV_EXPORTS void cv::hal::gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step, - double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, - int m_a, int n_a, int n_d, int flags) +void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) { + CV_INSTRUMENT_REGION(); CALL_HAL(gemm64fc, cv_hal_gemm64fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) - callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64FC2); +#ifdef CV_GEMM_BASELINE_ONLY + CV_CPU_CALL_BASELINE(gemm64fc, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)); +#else + CV_CPU_DISPATCH(gemm64fc, (src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags), + CV_CPU_DISPATCH_MODES_ALL); +#endif } -void cv::gemm( InputArray matA, InputArray matB, double alpha, - InputArray matC, double beta, OutputArray _matD, int flags ) +} // namespace hal + +void gemm(InputArray matA, InputArray matB, double alpha, + InputArray matC, double beta, OutputArray _matD, int flags) { #ifdef HAVE_CLAMDBLAS CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() && @@ -1631,457 +410,27 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, DProxyPtr->copyTo(D); } + + /****************************************************************************************\ * Transform * \****************************************************************************************/ -namespace cv -{ - -template static 
void -transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn ) -{ - int x; - - if( scn == 2 && dcn == 2 ) - { - for( x = 0; x < len*2; x += 2 ) - { - WT v0 = src[x], v1 = src[x+1]; - T t0 = saturate_cast(m[0]*v0 + m[1]*v1 + m[2]); - T t1 = saturate_cast(m[3]*v0 + m[4]*v1 + m[5]); - dst[x] = t0; dst[x+1] = t1; - } - } - else if( scn == 3 && dcn == 3 ) - { - for( x = 0; x < len*3; x += 3 ) - { - WT v0 = src[x], v1 = src[x+1], v2 = src[x+2]; - T t0 = saturate_cast(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]); - T t1 = saturate_cast(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]); - T t2 = saturate_cast(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]); - dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; - } - } - else if( scn == 3 && dcn == 1 ) - { - for( x = 0; x < len; x++, src += 3 ) - dst[x] = saturate_cast(m[0]*src[0] + m[1]*src[1] + m[2]*src[2] + m[3]); - } - else if( scn == 4 && dcn == 4 ) - { - for( x = 0; x < len*4; x += 4 ) - { - WT v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3]; - T t0 = saturate_cast(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]*v3 + m[4]); - T t1 = saturate_cast(m[5]*v0 + m[6]*v1 + m[7]*v2 + m[8]*v3 + m[9]); - dst[x] = t0; dst[x+1] = t1; - t0 = saturate_cast(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]); - t1 = saturate_cast(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]); - dst[x+2] = t0; dst[x+3] = t1; - } - } - else - { - for( x = 0; x < len; x++, src += scn, dst += dcn ) - { - const WT* _m = m; - int j, k; - for( j = 0; j < dcn; j++, _m += scn + 1 ) - { - WT s = _m[scn]; - for( k = 0; k < scn; k++ ) - s += _m[k]*src[k]; - dst[j] = saturate_cast(s); - } - } - } -} - -#if CV_SIMD128 && !defined(__aarch64__) -static inline void -load3x3Matrix(const float* m, v_float32x4& m0, v_float32x4& m1, v_float32x4& m2, v_float32x4& m3) -{ - m0 = v_float32x4(m[0], m[4], m[8], 0); - m1 = v_float32x4(m[1], m[5], m[9], 0); - m2 = v_float32x4(m[2], m[6], m[10], 0); - m3 = v_float32x4(m[3], m[7], m[11], 0); -} -#endif - -#if CV_SIMD128 -static inline v_int16x8 -v_matmulvec(const v_int16x8 &v0, const v_int16x8 &m0, const v_int16x8 &m1, const v_int16x8 &m2, const v_int32x4 &m3, const int BITS) -{ - // v0 : 0 b0 g0 r0 b1 g1 r1 ? - v_int32x4 t0 = v_dotprod(v0, m0); // a0 b0 a1 b1 - v_int32x4 t1 = v_dotprod(v0, m1); // c0 d0 c1 d1 - v_int32x4 t2 = v_dotprod(v0, m2); // e0 f0 e1 f1 - v_int32x4 t3 = v_setzero_s32(); - v_int32x4 s0, s1, s2, s3; - v_transpose4x4(t0, t1, t2, t3, s0, s1, s2, s3); - s0 = s0 + s1 + m3; // B0 G0 R0 ? - s2 = s2 + s3 + m3; // B1 G1 R1 ? 
- - s0 = s0 >> BITS; - s2 = s2 >> BITS; - - v_int16x8 result = v_pack(s0, v_setzero_s32()); // B0 G0 R0 0 0 0 0 0 - result = v_reinterpret_as_s16(v_reinterpret_as_s64(result) << 16); // 0 B0 G0 R0 0 0 0 0 - result = result | v_pack(v_setzero_s32(), s2); // 0 B0 G0 R0 B1 G1 R1 0 - return result; -} -#endif - -static void -transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn ) -{ -#if CV_SIMD128 - const int BITS = 10, SCALE = 1 << BITS; - const float MAX_M = (float)(1 << (15 - BITS)); - - if( hasSIMD128() && scn == 3 && dcn == 3 && - std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 && - std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 && - std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 ) - { - const int nChannels = 3; - const int cWidth = v_int16x8::nlanes; - // faster fixed-point transformation - short m00 = saturate_cast(m[0]*SCALE), m01 = saturate_cast(m[1]*SCALE), - m02 = saturate_cast(m[2]*SCALE), m10 = saturate_cast(m[4]*SCALE), - m11 = saturate_cast(m[5]*SCALE), m12 = saturate_cast(m[6]*SCALE), - m20 = saturate_cast(m[8]*SCALE), m21 = saturate_cast(m[9]*SCALE), - m22 = saturate_cast(m[10]*SCALE); - int m03 = saturate_cast((m[3]+0.5f)*SCALE), m13 = saturate_cast((m[7]+0.5f)*SCALE ), - m23 = saturate_cast((m[11]+0.5f)*SCALE); - - v_int16x8 m0 = v_int16x8(0, m00, m01, m02, m00, m01, m02, 0); - v_int16x8 m1 = v_int16x8(0, m10, m11, m12, m10, m11, m12, 0); - v_int16x8 m2 = v_int16x8(0, m20, m21, m22, m20, m21, m22, 0); - v_int32x4 m3 = v_int32x4(m03, m13, m23, 0); - int x = 0; - - for (; x <= (len - cWidth) * nChannels; x += cWidth * nChannels) - { - // load 8 pixels - v_int16x8 v0 = v_reinterpret_as_s16(v_load_expand(src + x)); - v_int16x8 v1 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth)); - v_int16x8 v2 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth * 2)); - v_int16x8 v3; - - // rotate and pack - v3 = v_rotate_right<1>(v2); // 0 b6 g6 r6 b7 g7 r7 0 - v2 = v_rotate_left <5>(v2, v1); // 0 b4 g4 r4 b5 g5 r5 0 - v1 = v_rotate_left <3>(v1, v0); // 0 b2 g2 r2 b3 g3 r3 0 - v0 = v_rotate_left <1>(v0); // 0 b0 g0 r0 b1 g1 r1 0 - - // multiply with matrix and normalize - v0 = v_matmulvec(v0, m0, m1, m2, m3, BITS); // 0 B0 G0 R0 B1 G1 R1 0 - v1 = v_matmulvec(v1, m0, m1, m2, m3, BITS); // 0 B2 G2 R2 B3 G3 R3 0 - v2 = v_matmulvec(v2, m0, m1, m2, m3, BITS); // 0 B4 G4 R4 B5 G5 R5 0 - v3 = v_matmulvec(v3, m0, m1, m2, m3, BITS); // 0 B6 G6 R6 B7 G7 R7 0 - - // narrow down as uint8x16 - v_uint8x16 z0 = v_pack_u(v0, v_setzero_s16()); // 0 B0 G0 R0 B1 G1 R1 0 0 0 0 0 0 0 0 0 - v_uint8x16 z1 = v_pack_u(v1, v_setzero_s16()); // 0 B2 G2 R2 B3 G3 R3 0 0 0 0 0 0 0 0 0 - v_uint8x16 z2 = v_pack_u(v2, v_setzero_s16()); // 0 B4 G4 R4 B5 G5 R5 0 0 0 0 0 0 0 0 0 - v_uint8x16 z3 = v_pack_u(v3, v_setzero_s16()); // 0 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0 0 - - // rotate and pack - z0 = v_reinterpret_as_u8(v_reinterpret_as_u64(z0) >> 8) | v_reinterpret_as_u8(v_reinterpret_as_u64(z1) << 40); // B0 G0 R0 B1 G1 R1 B2 G2 0 0 0 0 0 0 0 0 - z1 = v_reinterpret_as_u8(v_reinterpret_as_u64(z1) >> 24) | v_reinterpret_as_u8(v_reinterpret_as_u64(z2) << 24); // R2 B3 G3 R3 B4 G4 R4 B5 0 0 0 0 0 0 0 0 - z2 = v_reinterpret_as_u8(v_reinterpret_as_u64(z2) >> 40) | v_reinterpret_as_u8(v_reinterpret_as_u64(z3) << 8); // G5 R6 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0 - - // store on memory - v_store_low(dst + x, z0); - v_store_low(dst 
+ x + cWidth, z1); - v_store_low(dst + x + cWidth * 2, z2); - } - - for( ; x < len * nChannels; x += nChannels ) - { - int v0 = src[x], v1 = src[x+1], v2 = src[x+2]; - uchar t0 = saturate_cast((m00*v0 + m01*v1 + m02*v2 + m03)>>BITS); - uchar t1 = saturate_cast((m10*v0 + m11*v1 + m12*v2 + m13)>>BITS); - uchar t2 = saturate_cast((m20*v0 + m21*v1 + m22*v2 + m23)>>BITS); - dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; - } - return; - } -#endif - - transform_(src, dst, m, len, scn, dcn); -} - -static void -transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn ) -{ -#if CV_SIMD128 && !defined(__aarch64__) - if( hasSIMD128() && scn == 3 && dcn == 3 ) - { - const int nChannels = 3; - const int cWidth = v_float32x4::nlanes; - v_int16x8 delta = v_int16x8(0, -32768, -32768, -32768, -32768, -32768, -32768, 0); - v_float32x4 m0, m1, m2, m3; - load3x3Matrix(m, m0, m1, m2, m3); - m3 -= v_float32x4(32768.f, 32768.f, 32768.f, 0.f); - - int x = 0; - for( ; x <= (len - cWidth) * nChannels; x += cWidth * nChannels ) - { - // load 4 pixels - v_uint16x8 v0_16 = v_load(src + x); // b0 g0 r0 b1 g1 r1 b2 g2 - v_uint16x8 v2_16 = v_load_low(src + x + cWidth * 2); // r2 b3 g3 r3 ? ? ? ? - - // expand to 4 vectors - v_uint32x4 v0_32, v1_32, v2_32, v3_32, dummy_32; - v_expand(v_rotate_right<3>(v0_16), v1_32, dummy_32); // b1 g1 r1 - v_expand(v_rotate_right<1>(v2_16), v3_32, dummy_32); // b3 g3 r3 - v_expand(v_rotate_right<6>(v0_16, v2_16), v2_32, dummy_32); // b2 g2 r2 - v_expand(v0_16, v0_32, dummy_32); // b0 g0 r0 - - // convert to float32x4 - v_float32x4 x0 = v_cvt_f32(v_reinterpret_as_s32(v0_32)); // b0 g0 r0 - v_float32x4 x1 = v_cvt_f32(v_reinterpret_as_s32(v1_32)); // b1 g1 r1 - v_float32x4 x2 = v_cvt_f32(v_reinterpret_as_s32(v2_32)); // b2 g2 r2 - v_float32x4 x3 = v_cvt_f32(v_reinterpret_as_s32(v3_32)); // b3 g3 r3 - - // multiply and convert back to int32x4 - v_int32x4 y0, y1, y2, y3; - y0 = v_round(v_matmuladd(x0, m0, m1, m2, m3)); // B0 G0 R0 - y1 = v_round(v_matmuladd(x1, m0, m1, m2, m3)); // B1 G1 R1 - y2 = v_round(v_matmuladd(x2, m0, m1, m2, m3)); // B2 G2 R2 - y3 = v_round(v_matmuladd(x3, m0, m1, m2, m3)); // B3 G3 R3 - - // narrow down to int16x8 - v_int16x8 v0 = v_add_wrap(v_pack(v_rotate_left<1>(y0), y1), delta); // 0 B0 G0 R0 B1 G1 R1 0 - v_int16x8 v2 = v_add_wrap(v_pack(v_rotate_left<1>(y2), y3), delta); // 0 B2 G2 R2 B3 G3 R3 0 - - // rotate and pack - v0 = v_rotate_right<1>(v0) | v_rotate_left<5>(v2); // B0 G0 R0 B1 G1 R1 B2 G2 - v2 = v_rotate_right<3>(v2); // R2 B3 G3 R3 0 0 0 0 - - // store 4 pixels - v_store(dst + x, v_reinterpret_as_u16(v0)); - v_store_low(dst + x + cWidth * 2, v_reinterpret_as_u16(v2)); - } - - for( ; x < len * nChannels; x += nChannels ) - { - float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2]; - ushort t0 = saturate_cast(m[0] * v0 + m[1] * v1 + m[2] * v2 + m[3]); - ushort t1 = saturate_cast(m[4] * v0 + m[5] * v1 + m[6] * v2 + m[7]); - ushort t2 = saturate_cast(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]); - dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2; - } - return; - } -#endif - - transform_(src, dst, m, len, scn, dcn); -} - -static void -transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn ) -{ -#if CV_SIMD128 && !defined(__aarch64__) - if( hasSIMD128() ) - { - int x = 0; - if( scn == 3 && dcn == 3 ) - { - const int cWidth = 3; - v_float32x4 m0, m1, m2, m3; - load3x3Matrix(m, m0, m1, m2, m3); - - for( ; x < (len - 1)*cWidth; x += cWidth ) - { - v_float32x4 x0 = v_load(src + x); - v_float32x4 y0 = 
v_matmuladd(x0, m0, m1, m2, m3); - v_store_low(dst + x, y0); - dst[x + 2] = v_combine_high(y0, y0).get0(); - } - - for( ; x < len*cWidth; x += cWidth ) - { - float v0 = src[x], v1 = src[x+1], v2 = src[x+2]; - float t0 = saturate_cast(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]); - float t1 = saturate_cast(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]); - float t2 = saturate_cast(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]); - dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; - } - return; - } - - if( scn == 4 && dcn == 4 ) - { - const int cWidth = 4; - v_float32x4 m0 = v_float32x4(m[0], m[5], m[10], m[15]); - v_float32x4 m1 = v_float32x4(m[1], m[6], m[11], m[16]); - v_float32x4 m2 = v_float32x4(m[2], m[7], m[12], m[17]); - v_float32x4 m3 = v_float32x4(m[3], m[8], m[13], m[18]); - v_float32x4 m4 = v_float32x4(m[4], m[9], m[14], m[19]); - - for( ; x < len*cWidth; x += cWidth ) - { - v_float32x4 x0 = v_load(src + x); - v_float32x4 y0 = v_matmul(x0, m0, m1, m2, m3) + m4; - v_store(dst + x, y0); - } - return; - } - } -#endif - - transform_(src, dst, m, len, scn, dcn); -} - - -static void -transform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn) -{ - transform_(src, dst, m, len, scn, dcn); -} - -static void -transform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn) -{ - transform_(src, dst, m, len, scn, dcn); -} - -static void -transform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn) -{ - transform_(src, dst, m, len, scn, dcn); -} - -static void -transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) -{ - transform_(src, dst, m, len, scn, dcn); -} - -template static void -diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int ) -{ - int x; - - if( cn == 2 ) - { - for( x = 0; x < len*2; x += 2 ) - { - T t0 = saturate_cast(m[0]*src[x] + m[2]); - T t1 = saturate_cast(m[4]*src[x+1] + m[5]); - dst[x] = t0; dst[x+1] = t1; - } - } - else if( cn == 3 ) - { - for( x = 0; x < len*3; x += 3 ) - { - T t0 = saturate_cast(m[0]*src[x] + m[3]); - T t1 = saturate_cast(m[5]*src[x+1] + m[7]); - T t2 = saturate_cast(m[10]*src[x+2] + m[11]); - dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; - } - } - else if( cn == 4 ) - { - for( x = 0; x < len*4; x += 4 ) - { - T t0 = saturate_cast(m[0]*src[x] + m[4]); - T t1 = saturate_cast(m[6]*src[x+1] + m[9]); - dst[x] = t0; dst[x+1] = t1; - t0 = saturate_cast(m[12]*src[x+2] + m[14]); - t1 = saturate_cast(m[18]*src[x+3] + m[19]); - dst[x+2] = t0; dst[x+3] = t1; - } - } - else - { - for( x = 0; x < len; x++, src += cn, dst += cn ) - { - const WT* _m = m; - for( int j = 0; j < cn; j++, _m += cn + 1 ) - dst[j] = saturate_cast(src[j]*_m[j] + _m[cn]); - } - } -} - -static void -diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn) -{ - diagtransform_(src, dst, m, len, scn, dcn); -} - -static void -diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn) -{ - diagtransform_(src, dst, m, len, scn, dcn); -} - -static void -diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn) -{ - diagtransform_(src, dst, m, len, scn, dcn); -} - -static void -diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn) -{ - diagtransform_(src, dst, m, len, scn, dcn); -} - -static void -diagtransform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn) -{ - diagtransform_(src, dst, m, len, scn, dcn); -} - -static void -diagtransform_32f(const float* 
src, float* dst, const float* m, int len, int scn, int dcn) -{ - diagtransform_(src, dst, m, len, scn, dcn); -} - -static void -diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) -{ - diagtransform_(src, dst, m, len, scn, dcn); -} - - -typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int ); - static TransformFunc getTransformFunc(int depth) { - static TransformFunc transformTab[] = - { - (TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u, - (TransformFunc)transform_16s, (TransformFunc)transform_32s, (TransformFunc)transform_32f, - (TransformFunc)transform_64f, 0 - }; - - return transformTab[depth]; + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getTransformFunc, (depth), + CV_CPU_DISPATCH_MODES_ALL); } static TransformFunc getDiagTransformFunc(int depth) { - static TransformFunc diagTransformTab[] = - { - (TransformFunc)diagtransform_8u, (TransformFunc)diagtransform_8s, (TransformFunc)diagtransform_16u, - (TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f, - (TransformFunc)diagtransform_64f, 0 - }; - - return diagTransformTab[depth]; + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getDiagTransformFunc, (depth), + CV_CPU_DISPATCH_MODES_ALL); } -} - -void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx ) +void transform(InputArray _src, OutputArray _dst, InputArray _mtx) { CV_INSTRUMENT_REGION(); @@ -2154,114 +503,20 @@ void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx ) func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn ); } + + /****************************************************************************************\ * Perspective Transform * \****************************************************************************************/ -namespace cv +static TransformFunc getPerspectiveTransform(int depth) { - -template static void -perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, int dcn ) -{ - const double eps = FLT_EPSILON; - int i; - - if( scn == 2 && dcn == 2 ) - { - for( i = 0; i < len*2; i += 2 ) - { - T x = src[i], y = src[i + 1]; - double w = x*m[6] + y*m[7] + m[8]; - - if( fabs(w) > eps ) - { - w = 1./w; - dst[i] = (T)((x*m[0] + y*m[1] + m[2])*w); - dst[i+1] = (T)((x*m[3] + y*m[4] + m[5])*w); - } - else - dst[i] = dst[i+1] = (T)0; - } - } - else if( scn == 3 && dcn == 3 ) - { - for( i = 0; i < len*3; i += 3 ) - { - T x = src[i], y = src[i + 1], z = src[i + 2]; - double w = x*m[12] + y*m[13] + z*m[14] + m[15]; - - if( fabs(w) > eps ) - { - w = 1./w; - dst[i] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3]) * w); - dst[i+1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7]) * w); - dst[i+2] = (T)((x*m[8] + y*m[9] + z*m[10] + m[11]) * w); - } - else - dst[i] = dst[i+1] = dst[i+2] = (T)0; - } - } - else if( scn == 3 && dcn == 2 ) - { - for( i = 0; i < len; i++, src += 3, dst += 2 ) - { - T x = src[0], y = src[1], z = src[2]; - double w = x*m[8] + y*m[9] + z*m[10] + m[11]; - - if( fabs(w) > eps ) - { - w = 1./w; - dst[0] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3])*w); - dst[1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7])*w); - } - else - dst[0] = dst[1] = (T)0; - } - } - else - { - for( i = 0; i < len; i++, src += scn, dst += dcn ) - { - const double* _m = m + dcn*(scn + 1); - double w = _m[scn]; - int j, k; - for( k = 0; k < scn; k++ ) - w += _m[k]*src[k]; - if( fabs(w) > eps ) - { - _m = m; - for( j = 0; j < dcn; j++, _m += scn + 1 ) - { - double s = _m[scn]; - for( k = 0; k < scn; 
k++ ) - s += _m[k]*src[k]; - dst[j] = (T)(s*w); - } - } - else - for( j = 0; j < dcn; j++ ) - dst[j] = 0; - } - } + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getPerspectiveTransform, (depth), + CV_CPU_DISPATCH_MODES_ALL); } - -static void -perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn) -{ - perspectiveTransform_(src, dst, m, len, scn, dcn); -} - -static void -perspectiveTransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) -{ - perspectiveTransform_(src, dst, m, len, scn, dcn); -} - -} - -void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mtx ) +void perspectiveTransform(InputArray _src, OutputArray _dst, InputArray _mtx) { CV_INSTRUMENT_REGION(); @@ -2286,9 +541,7 @@ void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mt m = tmp; } - TransformFunc func = depth == CV_32F ? - (TransformFunc)perspectiveTransform_32f : - (TransformFunc)perspectiveTransform_64f; + TransformFunc func = getPerspectiveTransform(depth); CV_Assert( func != 0 ); const Mat* arrays[] = {&src, &dst, 0}; @@ -2304,44 +557,6 @@ void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mt * ScaleAdd * \****************************************************************************************/ -namespace cv -{ - -static void scaleAdd_32f(const float* src1, const float* src2, float* dst, - int len, float* _alpha) -{ - float alpha = *_alpha; - int i = 0; -#if CV_SIMD - v_float32 v_alpha = vx_setall_f32(alpha); - const int cWidth = v_float32::nlanes; - for (; i <= len - cWidth; i += cWidth) - v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i))); - vx_cleanup(); -#endif - for (; i < len; i++) - dst[i] = src1[i] * alpha + src2[i]; -} - - -static void scaleAdd_64f(const double* src1, const double* src2, double* dst, - int len, double* _alpha) -{ - double alpha = *_alpha; - int i = 0; -#if CV_SIMD_64F - v_float64 a2 = vx_setall_f64(alpha); - const int cWidth = v_float64::nlanes; - for (; i <= len - cWidth; i += cWidth) - v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i))); - vx_cleanup(); -#endif - for (; i < len; i++) - dst[i] = src1[i] * alpha + src2[i]; -} - -typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha); - #ifdef HAVE_OPENCL static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type ) @@ -2390,9 +605,14 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp #endif +static ScaleAddFunc getScaleAddFunc(int depth) +{ + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getScaleAddFunc, (depth), + CV_CPU_DISPATCH_MODES_ALL); } -void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst ) +void scaleAdd(InputArray _src1, double alpha, InputArray _src2, OutputArray _dst) { CV_INSTRUMENT_REGION(); @@ -2417,7 +637,8 @@ void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray float falpha = (float)alpha; void* palpha = depth == CV_32F ? (void*)&falpha : (void*)α - ScaleAddFunc func = depth == CV_32F ? 
(ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f; + ScaleAddFunc func = getScaleAddFunc(depth); + CV_Assert(func); if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous()) { @@ -2439,7 +660,7 @@ void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray * Covariation Matrix * \****************************************************************************************/ -void cv::calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype ) +void calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype ) { CV_INSTRUMENT_REGION(); @@ -2481,7 +702,7 @@ void cv::calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, _mean = mean.reshape(1, size.height); } -void cv::calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype ) +void calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype ) { CV_INSTRUMENT_REGION(); @@ -2566,20 +787,33 @@ void cv::calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray mean, (flags & CV_COVAR_SCALE) != 0 ? 1./nsamples : 1, ctype ); } + + /****************************************************************************************\ * Mahalanobis * \****************************************************************************************/ -double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar ) +static MahalanobisImplFunc getMahalanobisImplFunc(int depth) +{ +#ifdef CV_MAHALANOBIS_BASELINE_ONLY + CV_CPU_CALL_BASELINE(getMahalanobisImplFunc, (depth)); +#else + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getMahalanobisImplFunc, (depth), + CV_CPU_DISPATCH_MODES_ALL); +#endif +} + + +double Mahalanobis(InputArray _v1, InputArray _v2, InputArray _icovar) { CV_INSTRUMENT_REGION(); Mat v1 = _v1.getMat(), v2 = _v2.getMat(), icovar = _icovar.getMat(); int type = v1.type(), depth = v1.depth(); Size sz = v1.size(); - int i, j, len = sz.width*sz.height*v1.channels(); + int len = sz.width*sz.height*v1.channels(); AutoBuffer buf(len); - double result = 0; CV_Assert_N( type == v2.type(), type == icovar.type(), sz == v2.size(), len == icovar.rows && len == icovar.cols ); @@ -2591,278 +825,32 @@ double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar ) sz.height = 1; } - if( depth == CV_32F ) - { - const float* src1 = v1.ptr(); - const float* src2 = v2.ptr(); - size_t step1 = v1.step/sizeof(src1[0]); - size_t step2 = v2.step/sizeof(src2[0]); - double* diff = buf.data(); - const float* mat = icovar.ptr(); - size_t matstep = icovar.step/sizeof(mat[0]); - - for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width ) - { - for( i = 0; i < sz.width; i++ ) - diff[i] = src1[i] - src2[i]; - } - - diff = buf.data(); - for( i = 0; i < len; i++, mat += matstep ) - { - double row_sum = 0; - j = 0; - #if CV_ENABLE_UNROLLED - for(; j <= len - 4; j += 4 ) - row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] + - diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3]; - #endif - for( ; j < len; j++ ) - row_sum += diff[j]*mat[j]; - result += row_sum * diff[i]; - } - } - else if( depth == CV_64F ) - { - const double* src1 = v1.ptr(); - const double* src2 = v2.ptr(); - size_t step1 = v1.step/sizeof(src1[0]); - size_t step2 = v2.step/sizeof(src2[0]); - double* diff = buf.data(); - const double* mat = icovar.ptr(); - size_t matstep = icovar.step/sizeof(mat[0]); - - for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width ) - { - for( i = 0; i < 
sz.width; i++ ) - diff[i] = src1[i] - src2[i]; - } - - diff = buf.data(); - for( i = 0; i < len; i++, mat += matstep ) - { - double row_sum = 0; - j = 0; - #if CV_ENABLE_UNROLLED - for(; j <= len - 4; j += 4 ) - row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] + - diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3]; - #endif - for( ; j < len; j++ ) - row_sum += diff[j]*mat[j]; - result += row_sum * diff[i]; - } - } - else - CV_Error( CV_StsUnsupportedFormat, "" ); + MahalanobisImplFunc func = getMahalanobisImplFunc(depth); + CV_Assert(func); + double result = func(v1, v2, icovar, buf.data(), len); return std::sqrt(result); } + + /****************************************************************************************\ * MulTransposed * \****************************************************************************************/ -namespace cv +static MulTransposedFunc getMulTransposedFunc(int stype, int dtype, bool ata) { - -template static void -MulTransposedR( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale ) -{ - int i, j, k; - const sT* src = srcmat.ptr(); - dT* dst = dstmat.ptr
<dT>(); - const dT* delta = deltamat.ptr<dT>
(); - size_t srcstep = srcmat.step/sizeof(src[0]); - size_t dststep = dstmat.step/sizeof(dst[0]); - size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0; - int delta_cols = deltamat.cols; - Size size = srcmat.size(); - dT* tdst = dst; - dT* col_buf = 0; - dT* delta_buf = 0; - int buf_size = size.height*sizeof(dT); - AutoBuffer buf; - - if( delta && delta_cols < size.width ) - { - assert( delta_cols == 1 ); - buf_size *= 5; - } - buf.allocate(buf_size); - col_buf = (dT*)buf.data(); - - if( delta && delta_cols < size.width ) - { - delta_buf = col_buf + size.height; - for( i = 0; i < size.height; i++ ) - delta_buf[i*4] = delta_buf[i*4+1] = - delta_buf[i*4+2] = delta_buf[i*4+3] = delta[i*deltastep]; - delta = delta_buf; - deltastep = deltastep ? 4 : 0; - } - - if( !delta ) - for( i = 0; i < size.width; i++, tdst += dststep ) - { - for( k = 0; k < size.height; k++ ) - col_buf[k] = src[k*srcstep+i]; - - for( j = i; j <= size.width - 4; j += 4 ) - { - double s0 = 0, s1 = 0, s2 = 0, s3 = 0; - const sT *tsrc = src + j; - - for( k = 0; k < size.height; k++, tsrc += srcstep ) - { - double a = col_buf[k]; - s0 += a * tsrc[0]; - s1 += a * tsrc[1]; - s2 += a * tsrc[2]; - s3 += a * tsrc[3]; - } - - tdst[j] = (dT)(s0*scale); - tdst[j+1] = (dT)(s1*scale); - tdst[j+2] = (dT)(s2*scale); - tdst[j+3] = (dT)(s3*scale); - } - - for( ; j < size.width; j++ ) - { - double s0 = 0; - const sT *tsrc = src + j; - - for( k = 0; k < size.height; k++, tsrc += srcstep ) - s0 += (double)col_buf[k] * tsrc[0]; - - tdst[j] = (dT)(s0*scale); - } - } - else - for( i = 0; i < size.width; i++, tdst += dststep ) - { - if( !delta_buf ) - for( k = 0; k < size.height; k++ ) - col_buf[k] = src[k*srcstep+i] - delta[k*deltastep+i]; - else - for( k = 0; k < size.height; k++ ) - col_buf[k] = src[k*srcstep+i] - delta_buf[k*deltastep]; - - for( j = i; j <= size.width - 4; j += 4 ) - { - double s0 = 0, s1 = 0, s2 = 0, s3 = 0; - const sT *tsrc = src + j; - const dT *d = delta_buf ? delta_buf : delta + j; - - for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) - { - double a = col_buf[k]; - s0 += a * (tsrc[0] - d[0]); - s1 += a * (tsrc[1] - d[1]); - s2 += a * (tsrc[2] - d[2]); - s3 += a * (tsrc[3] - d[3]); - } - - tdst[j] = (dT)(s0*scale); - tdst[j+1] = (dT)(s1*scale); - tdst[j+2] = (dT)(s2*scale); - tdst[j+3] = (dT)(s3*scale); - } - - for( ; j < size.width; j++ ) - { - double s0 = 0; - const sT *tsrc = src + j; - const dT *d = delta_buf ? delta_buf : delta + j; - - for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) - s0 += (double)col_buf[k] * (tsrc[0] - d[0]); - - tdst[j] = (dT)(s0*scale); - } - } +#ifdef CV_MULTRANSPOSED_BASELINE_ONLY + CV_CPU_CALL_BASELINE(getMulTransposedFunc, (stype, dtype, ata)); +#else + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getMulTransposedFunc, (stype, dtype, ata), + CV_CPU_DISPATCH_MODES_ALL); +#endif } - -template static void -MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale ) -{ - int i, j, k; - const sT* src = srcmat.ptr(); - dT* dst = dstmat.ptr
<dT>(); - const dT* delta = deltamat.ptr<dT>
(); - size_t srcstep = srcmat.step/sizeof(src[0]); - size_t dststep = dstmat.step/sizeof(dst[0]); - size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0; - int delta_cols = deltamat.cols; - Size size = srcmat.size(); - dT* tdst = dst; - - if( !delta ) - for( i = 0; i < size.height; i++, tdst += dststep ) - for( j = i; j < size.height; j++ ) - { - double s = 0; - const sT *tsrc1 = src + i*srcstep; - const sT *tsrc2 = src + j*srcstep; - - for( k = 0; k <= size.width - 4; k += 4 ) - s += (double)tsrc1[k]*tsrc2[k] + (double)tsrc1[k+1]*tsrc2[k+1] + - (double)tsrc1[k+2]*tsrc2[k+2] + (double)tsrc1[k+3]*tsrc2[k+3]; - for( ; k < size.width; k++ ) - s += (double)tsrc1[k] * tsrc2[k]; - tdst[j] = (dT)(s*scale); - } - else - { - dT delta_buf[4]; - int delta_shift = delta_cols == size.width ? 4 : 0; - AutoBuffer buf(size.width*sizeof(dT)); - dT* row_buf = (dT*)buf.data(); - - for( i = 0; i < size.height; i++, tdst += dststep ) - { - const sT *tsrc1 = src + i*srcstep; - const dT *tdelta1 = delta + i*deltastep; - - if( delta_cols < size.width ) - for( k = 0; k < size.width; k++ ) - row_buf[k] = tsrc1[k] - tdelta1[0]; - else - for( k = 0; k < size.width; k++ ) - row_buf[k] = tsrc1[k] - tdelta1[k]; - - for( j = i; j < size.height; j++ ) - { - double s = 0; - const sT *tsrc2 = src + j*srcstep; - const dT *tdelta2 = delta + j*deltastep; - if( delta_cols < size.width ) - { - delta_buf[0] = delta_buf[1] = - delta_buf[2] = delta_buf[3] = tdelta2[0]; - tdelta2 = delta_buf; - } - for( k = 0; k <= size.width-4; k += 4, tdelta2 += delta_shift ) - s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]) + - (double)row_buf[k+1]*(tsrc2[k+1] - tdelta2[1]) + - (double)row_buf[k+2]*(tsrc2[k+2] - tdelta2[2]) + - (double)row_buf[k+3]*(tsrc2[k+3] - tdelta2[3]); - for( ; k < size.width; k++, tdelta2++ ) - s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]); - tdst[j] = (dT)(s*scale); - } - } - } -} - -typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale); - -} - -void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata, - InputArray _delta, double scale, int dtype ) +void mulTransposed(InputArray _src, OutputArray _dst, bool ata, + InputArray _delta, double scale, int dtype) { CV_INSTRUMENT_REGION(); @@ -2906,70 +894,7 @@ void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata, } else { - MulTransposedFunc func = 0; - if(stype == CV_8U && dtype == CV_32F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_8U && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_16U && dtype == CV_32F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_16U && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_16S && dtype == CV_32F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_16S && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_32F && dtype == CV_32F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_32F && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_64F && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } + MulTransposedFunc func = getMulTransposedFunc(stype, dtype, ata); if( !func ) 
CV_Error( CV_StsUnsupportedFormat, "" ); @@ -2982,273 +907,49 @@ void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata, * Dot Product * \****************************************************************************************/ -namespace cv -{ - -template double -dotProd_(const T* src1, const T* src2, int len) -{ - int i = 0; - double result = 0; - - #if CV_ENABLE_UNROLLED - for( ; i <= len - 4; i += 4 ) - result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] + - (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3]; - #endif - for( ; i < len; i++ ) - result += (double)src1[i]*src2[i]; - - return result; -} - - static double dotProd_8u(const uchar* src1, const uchar* src2, int len) { - double r = 0; -#if ARITHM_USE_IPP - CV_IPP_RUN(IPP_VERSION_X100 > 201800 || cv::ipp::getIppTopFeatures() != ippCPUID_SSE42, CV_INSTRUMENT_FUN_IPP(ippiDotProd_8u64f_C1R, src1, len*sizeof(uchar), src2, len*sizeof(uchar), ippiSize(len, 1), &r) >= 0, r); -#endif - int i = 0; - -#if CV_SIMD - int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize; - - while (i < len0) - { - blockSize = std::min(len0 - i, blockSize0); - v_int32 v_sum = vx_setzero_s32(); - const int cWidth = v_uint16::nlanes; - - int j = 0; - for (; j <= blockSize - cWidth * 2; j += cWidth * 2) - { - v_uint16 v_src10, v_src20, v_src11, v_src21; - v_expand(vx_load(src1 + j), v_src10, v_src11); - v_expand(vx_load(src2 + j), v_src20, v_src21); - - v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20)); - v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21)); - } - - for (; j <= blockSize - cWidth; j += cWidth) - { - v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j)); - v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j)); - - v_sum += v_dotprod(v_src10, v_src20); - } - r += (double)v_reduce_sum(v_sum); - - src1 += blockSize; - src2 += blockSize; - i += blockSize; - } - vx_cleanup(); -#elif CV_NEON - if( cv::checkHardwareSupport(CV_CPU_NEON) ) - { - int len0 = len & -8, blockSize0 = (1 << 15), blockSize; - uint32x4_t v_zero = vdupq_n_u32(0u); - CV_DECL_ALIGNED(16) uint buf[4]; - - while( i < len0 ) - { - blockSize = std::min(len0 - i, blockSize0); - uint32x4_t v_sum = v_zero; - - int j = 0; - for( ; j <= blockSize - 16; j += 16 ) - { - uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j); - - uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2)); - v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); - v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); - - v_src10 = vmovl_u8(vget_high_u8(v_src1)); - v_src20 = vmovl_u8(vget_high_u8(v_src2)); - v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); - v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); - } - - for( ; j <= blockSize - 8; j += 8 ) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j)); - v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2)); - v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2)); - } - - vst1q_u32(buf, v_sum); - r += buf[0] + buf[1] + buf[2] + buf[3]; - - src1 += blockSize; - src2 += blockSize; - i += blockSize; - } - } -#endif - return r + dotProd_(src1, src2, len - i); + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(dotProd_8u, (src1, src2, len), + CV_CPU_DISPATCH_MODES_ALL); } - - static double dotProd_8s(const schar* src1, const schar* src2, int len) { - 
double r = 0.0; - int i = 0; - -#if CV_SIMD - int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize; - - while (i < len0) - { - blockSize = std::min(len0 - i, blockSize0); - v_int32 v_sum = vx_setzero_s32(); - const int cWidth = v_int16::nlanes; - - int j = 0; - for (; j <= blockSize - cWidth * 2; j += cWidth * 2) - { - v_int16 v_src10, v_src20, v_src11, v_src21; - v_expand(vx_load(src1 + j), v_src10, v_src11); - v_expand(vx_load(src2 + j), v_src20, v_src21); - - v_sum += v_dotprod(v_src10, v_src20); - v_sum += v_dotprod(v_src11, v_src21); - } - - for (; j <= blockSize - cWidth; j += cWidth) - { - v_int16 v_src10 = vx_load_expand(src1 + j); - v_int16 v_src20 = vx_load_expand(src2 + j); - - v_sum += v_dotprod(v_src10, v_src20); - } - r += (double)v_reduce_sum(v_sum); - - src1 += blockSize; - src2 += blockSize; - i += blockSize; - } - vx_cleanup(); -#elif CV_NEON - if( cv::checkHardwareSupport(CV_CPU_NEON) ) - { - int len0 = len & -8, blockSize0 = (1 << 14), blockSize; - int32x4_t v_zero = vdupq_n_s32(0); - CV_DECL_ALIGNED(16) int buf[4]; - - while( i < len0 ) - { - blockSize = std::min(len0 - i, blockSize0); - int32x4_t v_sum = v_zero; - - int j = 0; - for( ; j <= blockSize - 16; j += 16 ) - { - int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j); - - int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2)); - v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); - v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); - - v_src10 = vmovl_s8(vget_high_s8(v_src1)); - v_src20 = vmovl_s8(vget_high_s8(v_src2)); - v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); - v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); - } - - for( ; j <= blockSize - 8; j += 8 ) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j)); - v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2)); - v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2)); - } - - vst1q_s32(buf, v_sum); - r += buf[0] + buf[1] + buf[2] + buf[3]; - - src1 += blockSize; - src2 += blockSize; - i += blockSize; - } - } -#endif - - return r + dotProd_(src1, src2, len - i); + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(dotProd_8s, (src1, src2, len), + CV_CPU_DISPATCH_MODES_ALL); } - static double dotProd_16u(const ushort* src1, const ushort* src2, int len) { -#if ARITHM_USE_IPP - double r = 0; - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16u64f_C1R, src1, len*sizeof(ushort), src2, len*sizeof(ushort), ippiSize(len, 1), &r) >= 0, r); -#endif - return dotProd_(src1, src2, len); + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(dotProd_16u, (src1, src2, len), + CV_CPU_DISPATCH_MODES_ALL); } - static double dotProd_16s(const short* src1, const short* src2, int len) { -#if ARITHM_USE_IPP && (IPP_VERSION_X100 != 900) // bug in IPP 9.0.0 - double r = 0; - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16s64f_C1R, src1, len*sizeof(short), src2, len*sizeof(short), ippiSize(len, 1), &r) >= 0, r); -#endif - return dotProd_(src1, src2, len); + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(dotProd_16s, (src1, src2, len), + CV_CPU_DISPATCH_MODES_ALL); } - static double dotProd_32s(const int* src1, const int* src2, int len) { -#if ARITHM_USE_IPP - double r = 0; - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32s64f_C1R, src1, len*sizeof(int), src2, len*sizeof(int), ippiSize(len, 1), &r) >= 0, r); -#endif - return dotProd_(src1, src2, len); + 
CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(dotProd_32s, (src1, src2, len), + CV_CPU_DISPATCH_MODES_ALL); } - static double dotProd_32f(const float* src1, const float* src2, int len) { - double r = 0.0; - -#if ARITHM_USE_IPP - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32f64f_C1R, src1, len*sizeof(float), src2, len*sizeof(float), ippiSize(len, 1), &r, ippAlgHintFast) >= 0, r); -#endif - int i = 0; - -#if CV_SIMD - int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize; - - while (i < len0) - { - blockSize = std::min(len0 - i, blockSize0); - v_float32 v_sum = vx_setzero_f32(); - - int j = 0; - int cWidth = v_float32::nlanes; - for (; j <= blockSize - cWidth; j += cWidth) - v_sum = v_muladd(vx_load(src1 + j), vx_load(src2 + j), v_sum); - - r += v_reduce_sum(v_sum); - - src1 += blockSize; - src2 += blockSize; - i += blockSize; - } - vx_cleanup(); -#endif - return r + dotProd_(src1, src2, len - i); + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(dotProd_32f, (src1, src2, len), + CV_CPU_DISPATCH_MODES_ALL); } - static double dotProd_64f(const double* src1, const double* src2, int len) { -#if ARITHM_USE_IPP - double r = 0; - CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsDotProd_64f, src1, src2, len, &r) >= 0, r); -#endif - - return dotProd_(src1, src2, len); + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(dotProd_64f, (src1, src2, len), + CV_CPU_DISPATCH_MODES_ALL); } - typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len); static DotProdFunc getDotProdFunc(int depth) @@ -3292,7 +993,7 @@ double Mat::dot(InputArray _mat) const return r; } -} +} // namespace cv:: /****************************************************************************************\ * Earlier API * diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp index 19ab907c9e..760bf39e0f 100644 --- a/modules/core/src/matmul.simd.hpp +++ b/modules/core/src/matmul.simd.hpp @@ -41,16 +41,62 @@ // //M*/ -#include #include "precomp.hpp" -#include "opencl_kernels_core.hpp" -#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" -#include "opencv2/core/opencl/runtime/opencl_core.hpp" -#include "intel_gpu_gemm.inl.hpp" -namespace cv -{ +#ifdef HAVE_LAPACK +#define CV_GEMM_BASELINE_ONLY +#endif +#define CV_MAHALANOBIS_BASELINE_ONLY +#define CV_MULTRANSPOSED_BASELINE_ONLY +namespace cv { + +// forward declarations +typedef void (*TransformFunc)(const uchar* src, uchar* dst, const uchar* m, int len, int scn, int dcn); +typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha); +typedef void (*MulTransposedFunc)(const Mat& src, const/*preallocated*/ Mat& dst, const Mat& delta, double scale); +typedef double (*MahalanobisImplFunc)(const Mat& v1, const Mat& v2, const Mat& icovar, double *diff_buffer /*[len]*/, int len /*=v1.total()*/); + +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +// forward declarations + +void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags); +void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags); +void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m_a, int 
n_a, int n_d, int flags); +void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags); + +TransformFunc getTransformFunc(int depth); +TransformFunc getDiagTransformFunc(int depth); +TransformFunc getPerspectiveTransform(int depth); + +ScaleAddFunc getScaleAddFunc(int depth); + +MahalanobisImplFunc getMahalanobisImplFunc(int depth); + +MulTransposedFunc getMulTransposedFunc(int stype, int dtype, bool ata); + +double dotProd_8u(const uchar* src1, const uchar* src2, int len); +double dotProd_8s(const schar* src1, const schar* src2, int len); +double dotProd_16u(const ushort* src1, const ushort* src2, int len); +double dotProd_16s(const short* src1, const short* src2, int len); +double dotProd_32s(const int* src1, const int* src2, int len); +double dotProd_32f(const float* src1, const float* src2, int len); +double dotProd_64f(const double* src1, const double* src2, int len); + + + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if !defined(CV_GEMM_BASELINE_ONLY) || defined(CV_CPU_BASELINE_MODE) /****************************************************************************************\ * GEMM * \****************************************************************************************/ @@ -695,204 +741,6 @@ static void GEMMStore_64fc( const Complexd* c_data, size_t c_step, GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); } -#ifdef HAVE_CLAMDBLAS - -static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha, - InputArray matC, double beta, OutputArray matD, int flags ) -{ - int type = matA.type(), esz = CV_ELEM_SIZE(type); - bool haveC = matC.kind() != cv::_InputArray::NONE; - Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0); - bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; - - if (atrans) - sizeA = Size(sizeA.height, sizeA.width); - if (btrans) - sizeB = Size(sizeB.height, sizeB.width); - if (haveC && ctrans) - sizeC = Size(sizeC.height, sizeC.width); - - Size sizeD(sizeB.width, sizeA.height); - - CV_Assert( matB.type() == type && (!haveC || matC.type() == type) ); - CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) ); - - matD.create(sizeD, type); - if ( matA.offset() % esz != 0 || matA.step() % esz != 0 || - matB.offset() % esz != 0 || matB.step() % esz != 0 || - (haveC && (matC.offset() % esz != 0 || matC.step() % esz != 0)) ) - return false; - - UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); - if (!ocl::internal::isCLBuffer(A) || !ocl::internal::isCLBuffer(B) || !ocl::internal::isCLBuffer(D)) - { - return false; - } - if (haveC) - { - UMat C = matC.getUMat(); - if (!ocl::internal::isCLBuffer(C)) - return false; - } - if (haveC) - ctrans ? transpose(matC, D) : matC.copyTo(D); - else - D.setTo(Scalar::all(0)); - - int M = sizeD.height, N = sizeD.width, K = sizeA.width; - int lda = (int)A.step / esz, ldb = (int)B.step / esz, ldc = (int)D.step / esz; - int offa = (int)A.offset / esz, offb = (int)B.offset / esz, offc = (int)D.offset / esz; - - cl_command_queue clq = (cl_command_queue)ocl::Queue::getDefault().ptr(); - clAmdBlasTranspose transA = atrans ? clAmdBlasTrans : clAmdBlasNoTrans; - clAmdBlasTranspose transB = btrans ? 
clAmdBlasTrans : clAmdBlasNoTrans; - clAmdBlasOrder order = clAmdBlasRowMajor; - clAmdBlasStatus status = clAmdBlasSuccess; - - if (type == CV_32FC1) - status = clAmdBlasSgemmEx(order, transA, transB, M, N, K, - (cl_float)alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda, - (const cl_mem)B.handle(ACCESS_READ), offb, ldb, - (cl_float)beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc, - 1, &clq, 0, NULL, NULL); - else if (type == CV_64FC1) - status = clAmdBlasDgemmEx(order, transA, transB, M, N, K, - alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda, - (const cl_mem)B.handle(ACCESS_READ), offb, ldb, - beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc, - 1, &clq, 0, NULL, NULL); - else if (type == CV_32FC2) - { - cl_float2 alpha_2 = { { (cl_float)alpha, 0 } }; - cl_float2 beta_2 = { { (cl_float)beta, 0 } }; - status = clAmdBlasCgemmEx(order, transA, transB, M, N, K, - alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda, - (const cl_mem)B.handle(ACCESS_READ), offb, ldb, - beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc, - 1, &clq, 0, NULL, NULL); - } - else if (type == CV_64FC2) - { - cl_double2 alpha_2 = { { alpha, 0 } }; - cl_double2 beta_2 = { { beta, 0 } }; - status = clAmdBlasZgemmEx(order, transA, transB, M, N, K, - alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda, - (const cl_mem)B.handle(ACCESS_READ), offb, ldb, - beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc, - 1, &clq, 0, NULL, NULL); - } - else - CV_Error(Error::StsUnsupportedFormat, ""); - - return status == clAmdBlasSuccess; -} - -#endif - -#ifdef HAVE_OPENCL -static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, - InputArray matC, double beta, OutputArray matD, int flags ) -{ - int depth = matA.depth(), cn = matA.channels(); - int type = CV_MAKETYPE(depth, cn); - - CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); - - const ocl::Device & dev = ocl::Device::getDefault(); - bool doubleSupport = dev.doubleFPConfig() > 0; - - if (!doubleSupport && depth == CV_64F) - return false; - - bool haveC = matC.kind() != cv::_InputArray::NONE; - Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0); - bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; - - CV_Assert( !haveC || matC.type() == type ); - - Size sizeD(((btrans)? sizeB.height : sizeB.width), - ((atrans)? sizeA.width : sizeA.height)); - matD.create(sizeD, type); - - UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); - - - if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1) - { - String opts; - - if (atrans) - sizeA = Size(sizeA.height, sizeA.width); - if (btrans) - sizeB = Size(sizeB.height, sizeB.width); - if (haveC && ctrans) - sizeC = Size(sizeC.height, sizeC.width); - - CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) ); - - int max_wg_size = (int)dev.maxWorkGroupSize(); - int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32; - - if (atrans) - A = A.t(); - - if (btrans) - B = B.t(); - - if (haveC) - ctrans ? 
transpose(matC, D) : matC.copyTo(D); - - int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 }; - int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D); - - opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s", - ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), - cn, kercn, block_size, - (sizeA.width % block_size !=0) ? " -D NO_MULT" : "", - haveC ? " -D HAVE_C" : "", - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); - - ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts); - if (k.empty()) - return false; - - if (depth == CV_64F) - k.args(ocl::KernelArg::ReadOnlyNoSize(A), - ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), - ocl::KernelArg::ReadWrite(D, cn, kercn), - sizeA.width, alpha, beta); - else - k.args(ocl::KernelArg::ReadOnlyNoSize(A), - ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), - ocl::KernelArg::ReadWrite(D, cn, kercn), - sizeA.width, (float)alpha, (float)beta); - - size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height}; - size_t localsize[2] = { (size_t)block_size, (size_t)block_size}; - - return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false); - } - else - { - if (haveC && beta != 0.0) - { - ctrans ? transpose(matC, D) : matC.copyTo(D); - } - else - { - beta = 0.0; - } - - return intel_gpu_gemm(A, sizeA, - B, sizeB, - D, sizeD, - alpha, - beta, - atrans, btrans); - } -} -#endif - static void gemmImpl( Mat A, Mat B, double alpha, Mat C, double beta, Mat D, int flags ) { @@ -1502,142 +1350,46 @@ callGemmImpl(const fptype *src1, size_t src1_step, const fptype *src2, size_t sr gemmImpl(A, B, alpha, C, beta, D, flags); } -} - -void cv::hal::gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step, - float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, - int m_a, int n_a, int n_d, int flags) +void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) { - - CALL_HAL(gemm32f, cv_hal_gemm32f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) + CV_INSTRUMENT_REGION(); callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32F); } -void cv::hal::gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step, - double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, - int m_a, int n_a, int n_d, int flags) +void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) { - CALL_HAL(gemm64f, cv_hal_gemm64f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) + CV_INSTRUMENT_REGION(); callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64F); } -CV_EXPORTS void cv::hal::gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step, - float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, - int m_a, int n_a, int n_d, int flags) +void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, 
float beta, float* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) { - CALL_HAL(gemm32fc, cv_hal_gemm32fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) + CV_INSTRUMENT_REGION(); callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32FC2); } -CV_EXPORTS void cv::hal::gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step, - double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, - int m_a, int n_a, int n_d, int flags) +void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) { - CALL_HAL(gemm64fc, cv_hal_gemm64fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) + CV_INSTRUMENT_REGION(); callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64FC2); } -void cv::gemm( InputArray matA, InputArray matB, double alpha, - InputArray matC, double beta, OutputArray _matD, int flags ) -{ -#ifdef HAVE_CLAMDBLAS - CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() && - matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes - ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags)) -#endif +#endif // !defined(CV_GEMM_BASELINE_ONLY) || defined(CV_CPU_BASELINE_MODE) -#ifdef HAVE_OPENCL - CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2, - ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags)) -#endif - Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0.0 ? 
matC.getMat() : Mat(); - Size a_size = A.size(), d_size; - int len = 0, type = A.type(); - - CV_Assert_N( type == B.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); - - switch( flags & (GEMM_1_T|GEMM_2_T) ) - { - case 0: - d_size = Size( B.cols, a_size.height ); - len = B.rows; - CV_Assert( a_size.width == len ); - break; - case 1: - d_size = Size( B.cols, a_size.width ); - len = B.rows; - CV_Assert( a_size.height == len ); - break; - case 2: - d_size = Size( B.rows, a_size.height ); - len = B.cols; - CV_Assert( a_size.width == len ); - break; - case 3: - d_size = Size( B.rows, a_size.width ); - len = B.cols; - CV_Assert( a_size.height == len ); - break; - } - - if( !C.empty() ) - { - CV_Assert_N( C.type() == type, - (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) || - ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height))); - } - - _matD.create( d_size.height, d_size.width, type ); - Mat D = _matD.getMat(); - if( (flags & GEMM_3_T) != 0 && C.data == D.data ) - { - transpose( C, C ); - flags &= ~GEMM_3_T; - } - - Mat *DProxyPtr = &D, DProxy; - if( D.data == A.data || D.data == B.data ) - { - DProxy = Mat(d_size.height, d_size.width, D.type()); - DProxyPtr = &DProxy; - } - - if( type == CV_32FC1 ) - hal::gemm32f(A.ptr(), A.step, B.ptr(), B.step, static_cast(alpha), - C.ptr(), C.step, static_cast(beta), - DProxyPtr->ptr(), DProxyPtr->step, - a_size.height, a_size.width, DProxyPtr->cols, flags); - else if( type == CV_64FC1 ) - hal::gemm64f(A.ptr(), A.step, B.ptr(), B.step, alpha, - C.ptr(), C.step, beta, - DProxyPtr->ptr(), DProxyPtr->step, - a_size.height, a_size.width, DProxyPtr->cols, flags); - else if( type == CV_32FC2 ) - hal::gemm32fc(A.ptr(), A.step, B.ptr(), B.step, static_cast(alpha), - C.ptr(), C.step, static_cast(beta), - DProxyPtr->ptr(), DProxyPtr->step, - a_size.height, a_size.width, DProxyPtr->cols, flags); - else - { - CV_Assert( type == CV_64FC2 ); - hal::gemm64fc(A.ptr(), A.step, B.ptr(), B.step, alpha, - C.ptr(), C.step, beta, - D.ptr(), D.step, - a_size.height, a_size.width, DProxyPtr->cols, flags); - } - - if(DProxyPtr != &D) - DProxyPtr->copyTo(D); -} /****************************************************************************************\ * Transform * \****************************************************************************************/ -namespace cv -{ - template static void transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn ) { @@ -2053,9 +1805,7 @@ diagtransform_64f(const double* src, double* dst, const double* m, int len, int } -typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int ); - -static TransformFunc getTransformFunc(int depth) +TransformFunc getTransformFunc(int depth) { static TransformFunc transformTab[] = { @@ -2067,7 +1817,7 @@ static TransformFunc getTransformFunc(int depth) return transformTab[depth]; } -static TransformFunc getDiagTransformFunc(int depth) +TransformFunc getDiagTransformFunc(int depth) { static TransformFunc diagTransformTab[] = { @@ -2079,88 +1829,12 @@ static TransformFunc getDiagTransformFunc(int depth) return diagTransformTab[depth]; } -} -void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx ) -{ - CV_INSTRUMENT_REGION(); - - Mat src = _src.getMat(), m = _mtx.getMat(); - int depth = src.depth(), scn = src.channels(), dcn = m.rows; - CV_Assert( scn == m.cols || scn + 1 == m.cols ); - bool isDiag = false; - - _dst.create( src.size(), CV_MAKETYPE(depth, dcn) 
); - Mat dst = _dst.getMat(); - - int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F; - AutoBuffer _mbuf; - double* mbuf; - - if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 ) - { - _mbuf.allocate(dcn*(scn+1)); - mbuf = _mbuf.data(); - Mat tmp(dcn, scn+1, mtype, mbuf); - memset(tmp.ptr(), 0, tmp.total()*tmp.elemSize()); - if( m.cols == scn+1 ) - m.convertTo(tmp, mtype); - else - { - Mat tmppart = tmp.colRange(0, m.cols); - m.convertTo(tmppart, mtype); - } - m = tmp; - } - else - mbuf = m.ptr(); - - if( scn == dcn ) - { - int i, j; - double eps = mtype == CV_32F ? FLT_EPSILON : DBL_EPSILON; - - if( scn == 1 ) - { - double alpha, beta; - if( mtype == CV_32F ) - alpha = m.at(0), beta = m.at(1); - else - alpha = m.at(0), beta = m.at(1); - src.convertTo(dst, dst.type(), alpha, beta); - return; - } - - for( i = 0, isDiag = true; isDiag && i < scn; i++ ) - { - for( j = 0; isDiag && j < scn; j++ ) - { - double v = mtype == CV_32F ? m.at(i, j) : m.at(i, j); - if( i != j && fabs(v) > eps ) - isDiag = false; - } - } - } - - TransformFunc func = isDiag ? getDiagTransformFunc(depth): getTransformFunc(depth); - CV_Assert( func != 0 ); - - const Mat* arrays[] = {&src, &dst, 0}; - uchar* ptrs[2] = {}; - NAryMatIterator it(arrays, ptrs); - size_t i, total = it.size; - - for( i = 0; i < it.nplanes; i++, ++it ) - func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn ); -} /****************************************************************************************\ * Perspective Transform * \****************************************************************************************/ -namespace cv -{ - template static void perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, int dcn ) { @@ -2246,7 +1920,6 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, } } - static void perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn) { @@ -2259,54 +1932,21 @@ perspectiveTransform_64f(const double* src, double* dst, const double* m, int le perspectiveTransform_(src, dst, m, len, scn, dcn); } -} - -void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mtx ) +TransformFunc getPerspectiveTransform(int depth) { - CV_INSTRUMENT_REGION(); - - Mat src = _src.getMat(), m = _mtx.getMat(); - int depth = src.depth(), scn = src.channels(), dcn = m.rows-1; - CV_Assert( scn + 1 == m.cols ); - CV_Assert( depth == CV_32F || depth == CV_64F ); - - _dst.create( src.size(), CV_MAKETYPE(depth, dcn) ); - Mat dst = _dst.getMat(); - - const int mtype = CV_64F; - AutoBuffer _mbuf; - double* mbuf = m.ptr(); - - if( !m.isContinuous() || m.type() != mtype ) - { - _mbuf.allocate((dcn+1)*(scn+1)); - mbuf = _mbuf.data(); - Mat tmp(dcn+1, scn+1, mtype, mbuf); - m.convertTo(tmp, mtype); - m = tmp; - } - - TransformFunc func = depth == CV_32F ? 
- (TransformFunc)perspectiveTransform_32f : - (TransformFunc)perspectiveTransform_64f; - CV_Assert( func != 0 ); - - const Mat* arrays[] = {&src, &dst, 0}; - uchar* ptrs[2] = {}; - NAryMatIterator it(arrays, ptrs); - size_t i, total = it.size; - - for( i = 0; i < it.nplanes; i++, ++it ) - func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn ); + if (depth == CV_32F) + return (TransformFunc)perspectiveTransform_32f; + if (depth == CV_64F) + return (TransformFunc)perspectiveTransform_64f; + CV_Assert(0 && "Not supported"); } + + /****************************************************************************************\ * ScaleAdd * \****************************************************************************************/ -namespace cv -{ - static void scaleAdd_32f(const float* src1, const float* src2, float* dst, int len, float* _alpha) { @@ -2340,338 +1980,90 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst, dst[i] = src1[i] * alpha + src2[i]; } -typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha); - -#ifdef HAVE_OPENCL - -static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type ) +ScaleAddFunc getScaleAddFunc(int depth) { - const ocl::Device & d = ocl::Device::getDefault(); - - bool doubleSupport = d.doubleFPConfig() > 0; - Size size = _src1.size(); - int depth = CV_MAT_DEPTH(type); - if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() ) - return false; - - _dst.create(size, type); - int cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F); - int kercn = ocl::predictOptimalVectorWidthMax(_src1, _src2, _dst), - rowsPerWI = d.isIntel() ? 4 : 1; - - char cvt[2][50]; - ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D DEPTH_dst=%d -D workT=%s -D convertToWT1=%s" - " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s" - " -D wdepth=%d%s -D rowsPerWI=%d", - ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), depth, - ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), - ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), - ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]), - ocl::typeToStr(wdepth), wdepth, - doubleSupport ? 
" -D DOUBLE_SUPPORT" : "", rowsPerWI)); - if (k.empty()) - return false; - - UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat(); - - ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1), - src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), - dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); - - if (wdepth == CV_32F) - k.args(src1arg, src2arg, dstarg, (float)alpha); - else - k.args(src1arg, src2arg, dstarg, alpha); - - size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, NULL, false); + if (depth == CV_32F) + return (ScaleAddFunc)scaleAdd_32f; + if (depth == CV_64F) + return (ScaleAddFunc)scaleAdd_64f; + CV_Assert(0 && "Not supported"); } -#endif -} - -void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst ) -{ - CV_INSTRUMENT_REGION(); - - int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - CV_Assert( type == _src2.type() ); - - CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(), - ocl_scaleAdd(_src1, alpha, _src2, _dst, type)) - - if( depth < CV_32F ) - { - addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth); - return; - } - - Mat src1 = _src1.getMat(), src2 = _src2.getMat(); - CV_Assert(src1.size == src2.size); - - _dst.create(src1.dims, src1.size, type); - Mat dst = _dst.getMat(); - - float falpha = (float)alpha; - void* palpha = depth == CV_32F ? (void*)&falpha : (void*)α - - ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f; - - if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous()) - { - size_t len = src1.total()*cn; - func(src1.ptr(), src2.ptr(), dst.ptr(), (int)len, palpha); - return; - } - - const Mat* arrays[] = {&src1, &src2, &dst, 0}; - uchar* ptrs[3] = {}; - NAryMatIterator it(arrays, ptrs); - size_t i, len = it.size*cn; - - for( i = 0; i < it.nplanes; i++, ++it ) - func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha ); -} - -/****************************************************************************************\ -* Covariation Matrix * -\****************************************************************************************/ - -void cv::calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype ) -{ - CV_INSTRUMENT_REGION(); - - CV_Assert_N( data, nsamples > 0 ); - Size size = data[0].size(); - int sz = size.width * size.height, esz = (int)data[0].elemSize(); - int type = data[0].type(); - Mat mean; - ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? 
ctype : type), _mean.depth()), CV_32F); - - if( (flags & CV_COVAR_USE_AVG) != 0 ) - { - CV_Assert( _mean.size() == size ); - if( _mean.isContinuous() && _mean.type() == ctype ) - mean = _mean.reshape(1, 1); - else - { - _mean.convertTo(mean, ctype); - mean = mean.reshape(1, 1); - } - } - - Mat _data(nsamples, sz, type); - - for( int i = 0; i < nsamples; i++ ) - { - CV_Assert_N( data[i].size() == size, data[i].type() == type ); - if( data[i].isContinuous() ) - memcpy( _data.ptr(i), data[i].ptr(), sz*esz ); - else - { - Mat dataRow(size.height, size.width, type, _data.ptr(i)); - data[i].copyTo(dataRow); - } - } - - calcCovarMatrix( _data, covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype ); - if( (flags & CV_COVAR_USE_AVG) == 0 ) - _mean = mean.reshape(1, size.height); -} - -void cv::calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype ) -{ - CV_INSTRUMENT_REGION(); - - if(_src.kind() == _InputArray::STD_VECTOR_MAT || _src.kind() == _InputArray::STD_ARRAY_MAT) - { - std::vector src; - _src.getMatVector(src); - - CV_Assert( src.size() > 0 ); - - Size size = src[0].size(); - int type = src[0].type(); - - ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F); - - Mat _data(static_cast(src.size()), size.area(), type); - - int i = 0; - for(std::vector::iterator each = src.begin(); each != src.end(); ++each, ++i ) - { - CV_Assert_N( (*each).size() == size, (*each).type() == type ); - Mat dataRow(size.height, size.width, type, _data.ptr(i)); - (*each).copyTo(dataRow); - } - - Mat mean; - if( (flags & CV_COVAR_USE_AVG) != 0 ) - { - CV_Assert( _mean.size() == size ); - - if( mean.type() != ctype ) - { - mean = _mean.getMat(); - _mean.create(mean.size(), ctype); - Mat tmp = _mean.getMat(); - mean.convertTo(tmp, ctype); - mean = tmp; - } - - mean = _mean.getMat().reshape(1, 1); - } - - calcCovarMatrix( _data, _covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype ); - - if( (flags & CV_COVAR_USE_AVG) == 0 ) - { - mean = mean.reshape(1, size.height); - mean.copyTo(_mean); - } - return; - } - - Mat data = _src.getMat(), mean; - CV_Assert( ((flags & CV_COVAR_ROWS) != 0) ^ ((flags & CV_COVAR_COLS) != 0) ); - bool takeRows = (flags & CV_COVAR_ROWS) != 0; - int type = data.type(); - int nsamples = takeRows ? data.rows : data.cols; - CV_Assert( nsamples > 0 ); - Size size = takeRows ? Size(data.cols, 1) : Size(1, data.rows); - - if( (flags & CV_COVAR_USE_AVG) != 0 ) - { - mean = _mean.getMat(); - ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), mean.depth()), CV_32F); - CV_Assert( mean.size() == size ); - if( mean.type() != ctype ) - { - _mean.create(mean.size(), ctype); - Mat tmp = _mean.getMat(); - mean.convertTo(tmp, ctype); - mean = tmp; - } - } - else - { - ctype = std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), CV_32F); - reduce( _src, _mean, takeRows ? 0 : 1, CV_REDUCE_AVG, ctype ); - mean = _mean.getMat(); - } - - mulTransposed( data, _covar, ((flags & CV_COVAR_NORMAL) == 0) ^ takeRows, - mean, (flags & CV_COVAR_SCALE) != 0 ? 
1./nsamples : 1, ctype ); -} /****************************************************************************************\ * Mahalanobis * \****************************************************************************************/ -double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar ) +template static inline +double MahalanobisImpl(const Mat& v1, const Mat& v2, const Mat& icovar, double *diff_buffer /*[len]*/, int len /*=v1.total()*/) { CV_INSTRUMENT_REGION(); - Mat v1 = _v1.getMat(), v2 = _v2.getMat(), icovar = _icovar.getMat(); - int type = v1.type(), depth = v1.depth(); Size sz = v1.size(); - int i, j, len = sz.width*sz.height*v1.channels(); - AutoBuffer buf(len); double result = 0; - CV_Assert_N( type == v2.type(), type == icovar.type(), - sz == v2.size(), len == icovar.rows && len == icovar.cols ); - sz.width *= v1.channels(); - if( v1.isContinuous() && v2.isContinuous() ) + if (v1.isContinuous() && v2.isContinuous()) { sz.width *= sz.height; sz.height = 1; } - if( depth == CV_32F ) { - const float* src1 = v1.ptr(); - const float* src2 = v2.ptr(); + const T* src1 = v1.ptr(); + const T* src2 = v2.ptr(); size_t step1 = v1.step/sizeof(src1[0]); size_t step2 = v2.step/sizeof(src2[0]); - double* diff = buf.data(); - const float* mat = icovar.ptr(); + double* diff = diff_buffer; + const T* mat = icovar.ptr(); size_t matstep = icovar.step/sizeof(mat[0]); - for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width ) + for (; sz.height--; src1 += step1, src2 += step2, diff += sz.width) { - for( i = 0; i < sz.width; i++ ) + for (int i = 0; i < sz.width; i++) diff[i] = src1[i] - src2[i]; } - diff = buf.data(); - for( i = 0; i < len; i++, mat += matstep ) + diff = diff_buffer; + for (int i = 0; i < len; i++, mat += matstep) { double row_sum = 0; - j = 0; - #if CV_ENABLE_UNROLLED + int j = 0; +#if CV_ENABLE_UNROLLED for(; j <= len - 4; j += 4 ) row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] + diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3]; - #endif - for( ; j < len; j++ ) +#endif + for (; j < len; j++) row_sum += diff[j]*mat[j]; result += row_sum * diff[i]; } } - else if( depth == CV_64F ) - { - const double* src1 = v1.ptr(); - const double* src2 = v2.ptr(); - size_t step1 = v1.step/sizeof(src1[0]); - size_t step2 = v2.step/sizeof(src2[0]); - double* diff = buf.data(); - const double* mat = icovar.ptr(); - size_t matstep = icovar.step/sizeof(mat[0]); - - for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width ) - { - for( i = 0; i < sz.width; i++ ) - diff[i] = src1[i] - src2[i]; - } - - diff = buf.data(); - for( i = 0; i < len; i++, mat += matstep ) - { - double row_sum = 0; - j = 0; - #if CV_ENABLE_UNROLLED - for(; j <= len - 4; j += 4 ) - row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] + - diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3]; - #endif - for( ; j < len; j++ ) - row_sum += diff[j]*mat[j]; - result += row_sum * diff[i]; - } - } - else - CV_Error( CV_StsUnsupportedFormat, "" ); - - return std::sqrt(result); + return result; } +MahalanobisImplFunc getMahalanobisImplFunc(int depth) +{ + if (depth == CV_32F) + return (MahalanobisImplFunc)MahalanobisImpl; + if (depth == CV_64F) + return (MahalanobisImplFunc)MahalanobisImpl; + CV_Assert(0 && "Not supported"); +} + + +#if !defined(CV_MULTRANSPOSED_BASELINE_ONLY) || defined(CV_CPU_BASELINE_MODE) /****************************************************************************************\ * MulTransposed * \****************************************************************************************/ -namespace cv -{ - template 
static void -MulTransposedR( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale ) +MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double scale) { int i, j, k; const sT* src = srcmat.ptr(); - dT* dst = dstmat.ptr
<dT>(); + dT* dst = (dT*)dstmat.ptr<dT>
(); const dT* delta = deltamat.ptr<dT>
(); size_t srcstep = srcmat.step/sizeof(src[0]); size_t dststep = dstmat.step/sizeof(dst[0]); @@ -2786,11 +2178,11 @@ MulTransposedR( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scal template static void -MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale ) +MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double scale) { int i, j, k; const sT* src = srcmat.ptr(); - dT* dst = dstmat.ptr
<dT>(); + dT* dst = (dT*)dstmat.ptr<dT>
(); const dT* delta = deltamat.ptr<dT>
(); size_t srcstep = srcmat.step/sizeof(src[0]); size_t dststep = dstmat.step/sizeof(dst[0]); @@ -2857,145 +2249,75 @@ MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scal } } -typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale); - -} - -void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata, - InputArray _delta, double scale, int dtype ) +MulTransposedFunc getMulTransposedFunc(int stype, int dtype, bool ata) { - CV_INSTRUMENT_REGION(); - - Mat src = _src.getMat(), delta = _delta.getMat(); - const int gemm_level = 100; // boundary above which GEMM is faster. - int stype = src.type(); - dtype = std::max(std::max(CV_MAT_DEPTH(dtype >= 0 ? dtype : stype), delta.depth()), CV_32F); - CV_Assert( src.channels() == 1 ); - - if( !delta.empty() ) + MulTransposedFunc func = NULL; + if (stype == CV_8U && dtype == CV_32F) { - CV_Assert_N( delta.channels() == 1, - (delta.rows == src.rows || delta.rows == 1), - (delta.cols == src.cols || delta.cols == 1)); - if( delta.type() != dtype ) - delta.convertTo(delta, dtype); + func = ata ? MulTransposedR + : MulTransposedL; } - - int dsize = ata ? src.cols : src.rows; - _dst.create( dsize, dsize, dtype ); - Mat dst = _dst.getMat(); - - if( src.data == dst.data || (stype == dtype && - (dst.cols >= gemm_level && dst.rows >= gemm_level && - src.cols >= gemm_level && src.rows >= gemm_level))) + else if (stype == CV_8U && dtype == CV_64F) { - Mat src2; - const Mat* tsrc = &src; - if( !delta.empty() ) - { - if( delta.size() == src.size() ) - subtract( src, delta, src2 ); - else - { - repeat(delta, src.rows/delta.rows, src.cols/delta.cols, src2); - subtract( src, src2, src2 ); - } - tsrc = &src2; - } - gemm( *tsrc, *tsrc, scale, Mat(), 0, dst, ata ? GEMM_1_T : GEMM_2_T ); + func = ata ? MulTransposedR + : MulTransposedL; } - else + else if(stype == CV_16U && dtype == CV_32F) { - MulTransposedFunc func = 0; - if(stype == CV_8U && dtype == CV_32F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_8U && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_16U && dtype == CV_32F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_16U && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_16S && dtype == CV_32F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_16S && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_32F && dtype == CV_32F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_32F && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - else if(stype == CV_64F && dtype == CV_64F) - { - if(ata) - func = MulTransposedR; - else - func = MulTransposedL; - } - if( !func ) - CV_Error( CV_StsUnsupportedFormat, "" ); - - func( src, dst, delta, scale ); - completeSymm( dst, false ); + func = ata ? MulTransposedR + : MulTransposedL; } + else if(stype == CV_16U && dtype == CV_64F) + { + func = ata ? MulTransposedR + : MulTransposedL; + } + else if(stype == CV_16S && dtype == CV_32F) + { + func = ata ? MulTransposedR + : MulTransposedL; + } + else if(stype == CV_16S && dtype == CV_64F) + { + func = ata ? 
MulTransposedR
+                   : MulTransposedL;
+    }
+    else if(stype == CV_32F && dtype == CV_32F)
+    {
+        func = ata ? MulTransposedR<float, float>
+                   : MulTransposedL<float, float>;
+    }
+    else if(stype == CV_32F && dtype == CV_64F)
+    {
+        func = ata ? MulTransposedR<float, double>
+                   : MulTransposedL<float, double>;
+    }
+    else if(stype == CV_64F && dtype == CV_64F)
+    {
+        func = ata ? MulTransposedR<double, double>
+                   : MulTransposedL<double, double>;
+    }
+    CV_Assert(func && "Not supported");
+    return func;
 }
+#endif // !defined(CV_MULTRANSPOSED_BASELINE_ONLY) || defined(CV_CPU_BASELINE_MODE)
+
 /****************************************************************************************\
 *                                        Dot Product                                    *
 \****************************************************************************************/
 
-namespace cv
-{
-
-template<typename T> double
-dotProd_(const T* src1, const T* src2, int len)
+template<typename T> static inline
+double dotProd_(const T* src1, const T* src2, int len)
 {
     int i = 0;
     double result = 0;
-    #if CV_ENABLE_UNROLLED
+#if CV_ENABLE_UNROLLED
     for( ; i <= len - 4; i += 4 )
         result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] +
             (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
-    #endif
+#endif
     for( ; i < len; i++ )
         result += (double)src1[i]*src2[i];
@@ -3003,12 +2325,9 @@ dotProd_(const T* src1, const T* src2, int len)
 }
 
-static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
+double dotProd_8u(const uchar* src1, const uchar* src2, int len)
 {
     double r = 0;
-#if ARITHM_USE_IPP
-    CV_IPP_RUN(IPP_VERSION_X100 > 201800 || cv::ipp::getIppTopFeatures() != ippCPUID_SSE42, CV_INSTRUMENT_FUN_IPP(ippiDotProd_8u64f_C1R, src1, len*sizeof(uchar), src2, len*sizeof(uchar), ippiSize(len, 1), &r) >= 0, r);
-#endif
     int i = 0;
 
 #if CV_SIMD
@@ -3092,7 +2411,7 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
 }
 
-static double dotProd_8s(const schar* src1, const schar* src2, int len)
+double dotProd_8s(const schar* src1, const schar* src2, int len)
 {
     double r = 0.0;
     int i = 0;
 
@@ -3178,40 +2497,24 @@ static double dotProd_8s(const schar* src1, const schar* src2, int len)
     return r + dotProd_(src1, src2, len - i);
 }
 
-static double dotProd_16u(const ushort* src1, const ushort* src2, int len)
+double dotProd_16u(const ushort* src1, const ushort* src2, int len)
 {
-#if ARITHM_USE_IPP
-    double r = 0;
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16u64f_C1R, src1, len*sizeof(ushort), src2, len*sizeof(ushort), ippiSize(len, 1), &r) >= 0, r);
-#endif
     return dotProd_(src1, src2, len);
 }
 
-static double dotProd_16s(const short* src1, const short* src2, int len)
+double dotProd_16s(const short* src1, const short* src2, int len)
 {
-#if ARITHM_USE_IPP && (IPP_VERSION_X100 != 900) // bug in IPP 9.0.0
-    double r = 0;
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16s64f_C1R, src1, len*sizeof(short), src2, len*sizeof(short), ippiSize(len, 1), &r) >= 0, r);
-#endif
     return dotProd_(src1, src2, len);
 }
 
-static double dotProd_32s(const int* src1, const int* src2, int len)
+double dotProd_32s(const int* src1, const int* src2, int len)
 {
-#if ARITHM_USE_IPP
-    double r = 0;
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32s64f_C1R, src1, len*sizeof(int), src2, len*sizeof(int), ippiSize(len, 1), &r) >= 0, r);
-#endif
     return dotProd_(src1, src2, len);
 }
 
-static double dotProd_32f(const float* src1, const float* src2, int len)
+double dotProd_32f(const float* src1, const float* src2, int len)
 {
     double r = 0.0;
-
-#if ARITHM_USE_IPP
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32f64f_C1R, src1, len*sizeof(float), src2, len*sizeof(float), ippiSize(len, 1), &r, ippAlgHintFast) >= 0, r);
-#endif
    int i = 0;
 
 #if CV_SIMD
@@ -3238,284 +2541,11 @@ static double dotProd_32f(const float* src1, const float* src2, int len)
     return r + dotProd_(src1, src2, len - i);
 }
 
-static double dotProd_64f(const double* src1, const double* src2, int len)
+double dotProd_64f(const double* src1, const double* src2, int len)
 {
-#if ARITHM_USE_IPP
-    double r = 0;
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsDotProd_64f, src1, src2, len, &r) >= 0, r);
-#endif
-
     return dotProd_(src1, src2, len);
 }
-
-typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);
-
-static DotProdFunc getDotProdFunc(int depth)
-{
-    static DotProdFunc dotProdTab[] =
-    {
-        (DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s),
-        (DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
-        (DotProdFunc)dotProd_32s, (DotProdFunc)GET_OPTIMIZED(dotProd_32f),
-        (DotProdFunc)dotProd_64f, 0
-    };
-
-    return dotProdTab[depth];
-}
-
-double Mat::dot(InputArray _mat) const
-{
-    CV_INSTRUMENT_REGION();
-
-    Mat mat = _mat.getMat();
-    int cn = channels();
-    DotProdFunc func = getDotProdFunc(depth());
-    CV_Assert_N( mat.type() == type(), mat.size == size, func != 0 );
-
-    if( isContinuous() && mat.isContinuous() )
-    {
-        size_t len = total()*cn;
-        if( len == (size_t)(int)len )
-            return func(data, mat.data, (int)len);
-    }
-
-    const Mat* arrays[] = {this, &mat, 0};
-    uchar* ptrs[2] = {};
-    NAryMatIterator it(arrays, ptrs);
-    int len = (int)(it.size*cn);
-    double r = 0;
-
-    for( size_t i = 0; i < it.nplanes; i++, ++it )
-        r += func( ptrs[0], ptrs[1], len );
-
-    return r;
-}
-
-}
-
-/****************************************************************************************\
-*                                      Earlier API                                      *
-\****************************************************************************************/
-
-CV_IMPL void cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha,
-                     const CvArr* Carr, double beta, CvArr* Darr, int flags )
-{
-    cv::Mat A = cv::cvarrToMat(Aarr), B = cv::cvarrToMat(Barr);
-    cv::Mat C, D = cv::cvarrToMat(Darr);
-
-    if( Carr )
-        C = cv::cvarrToMat(Carr);
-
-    CV_Assert_N( (D.rows == ((flags & CV_GEMM_A_T) == 0 ? A.rows : A.cols)),
-                 (D.cols == ((flags & CV_GEMM_B_T) == 0 ? B.cols : B.rows)),
-                 D.type() == A.type() );
-
-    gemm( A, B, alpha, C, beta, D, flags );
-}
-
-
-CV_IMPL void
-cvTransform( const CvArr* srcarr, CvArr* dstarr,
-             const CvMat* transmat, const CvMat* shiftvec )
-{
-    cv::Mat m = cv::cvarrToMat(transmat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
-
-    if( shiftvec )
-    {
-        cv::Mat v = cv::cvarrToMat(shiftvec).reshape(1,m.rows),
-            _m(m.rows, m.cols + 1, m.type()), m1 = _m.colRange(0,m.cols), v1 = _m.col(m.cols);
-        m.convertTo(m1, m1.type());
-        v.convertTo(v1, v1.type());
-        m = _m;
-    }
-
-    CV_Assert_N( dst.depth() == src.depth(), dst.channels() == m.rows );
-    cv::transform( src, dst, m );
-}
-
-
-CV_IMPL void
-cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat )
-{
-    cv::Mat m = cv::cvarrToMat(mat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
-
-    CV_Assert_N( dst.type() == src.type(), dst.channels() == m.rows-1 );
-    cv::perspectiveTransform( src, dst, m );
-}
-
-
-CV_IMPL void cvScaleAdd( const CvArr* srcarr1, CvScalar scale,
-                         const CvArr* srcarr2, CvArr* dstarr )
-{
-    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
-
-    CV_Assert_N( src1.size == dst.size, src1.type() == dst.type() );
-    cv::scaleAdd( src1, scale.val[0], cv::cvarrToMat(srcarr2), dst );
-}
-
-
-CV_IMPL void
-cvCalcCovarMatrix( const CvArr** vecarr, int count,
-                   CvArr* covarr, CvArr* avgarr, int flags )
-{
-    cv::Mat cov0 = cv::cvarrToMat(covarr), cov = cov0, mean0, mean;
-    CV_Assert_N( vecarr != 0, count >= 1 );
-
-    if( avgarr )
-        mean = mean0 = cv::cvarrToMat(avgarr);
-
-    if( (flags & CV_COVAR_COLS) != 0 || (flags & CV_COVAR_ROWS) != 0 )
-    {
-
-        cv::Mat data = cv::cvarrToMat(vecarr[0]);
-        cv::calcCovarMatrix( data, cov, mean, flags, cov.type() );
-    }
-    else
-    {
-        std::vector<cv::Mat> data(count);
-        for( int i = 0; i < count; i++ )
-            data[i] = cv::cvarrToMat(vecarr[i]);
-        cv::calcCovarMatrix( &data[0], count, cov, mean, flags, cov.type() );
-    }
-
-    if( mean.data != mean0.data && mean0.data )
-        mean.convertTo(mean0, mean0.type());
-
-    if( cov.data != cov0.data )
-        cov.convertTo(cov0, cov0.type());
-}
-
-
-CV_IMPL double
-cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, const CvArr* matarr )
-{
-    return cv::Mahalanobis(cv::cvarrToMat(srcAarr),
-        cv::cvarrToMat(srcBarr), cv::cvarrToMat(matarr));
-}
-
-CV_IMPL void
-cvMulTransposed( const CvArr* srcarr, CvArr* dstarr,
-                 int order, const CvArr* deltaarr, double scale )
-{
-    cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0, delta;
-    if( deltaarr )
-        delta = cv::cvarrToMat(deltaarr);
-    cv::mulTransposed( src, dst, order != 0, delta, scale, dst.type());
-    if( dst.data != dst0.data )
-        dst.convertTo(dst0, dst0.type());
-}
-
-CV_IMPL double cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr )
-{
-    return cv::cvarrToMat(srcAarr).dot(cv::cvarrToMat(srcBarr));
-}
-
-
-CV_IMPL void
-cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigenvects, int flags )
-{
-    cv::Mat data = cv::cvarrToMat(data_arr), mean0 = cv::cvarrToMat(avg_arr);
-    cv::Mat evals0 = cv::cvarrToMat(eigenvals), evects0 = cv::cvarrToMat(eigenvects);
-    cv::Mat mean = mean0, evals = evals0, evects = evects0;
-
-    cv::PCA pca;
-    pca.mean = mean;
-    pca.eigenvalues = evals;
-    pca.eigenvectors = evects;
-
-    pca(data, (flags & CV_PCA_USE_AVG) ? mean : cv::Mat(),
-        flags, !evals.empty() ? evals.rows + evals.cols - 1 : 0);
-
-    if( pca.mean.size() == mean.size() )
-        pca.mean.convertTo( mean, mean.type() );
-    else
-    {
-        cv::Mat temp; pca.mean.convertTo( temp, mean.type() );
-        transpose( temp, mean );
-    }
-
-    evals = pca.eigenvalues;
-    evects = pca.eigenvectors;
-    int ecount0 = evals0.cols + evals0.rows - 1;
-    int ecount = evals.cols + evals.rows - 1;
-
-    CV_Assert_N( (evals0.cols == 1 || evals0.rows == 1),
-                 ecount0 <= ecount,
-                 evects0.cols == evects.cols,
-                 evects0.rows == ecount0 );
-
-    cv::Mat temp = evals0;
-    if( evals.rows == 1 )
-        evals.colRange(0, ecount0).convertTo(temp, evals0.type());
-    else
-        evals.rowRange(0, ecount0).convertTo(temp, evals0.type());
-    if( temp.data != evals0.data )
-        transpose(temp, evals0);
-    evects.rowRange(0, ecount0).convertTo( evects0, evects0.type() );
-
-    // otherwise some datatype's or size's were incorrect, so the output arrays have been reallocated
-    CV_Assert( mean0.data == mean.data );
-}
-
-
-CV_IMPL void
-cvProjectPCA( const CvArr* data_arr, const CvArr* avg_arr,
-              const CvArr* eigenvects, CvArr* result_arr )
-{
-    cv::Mat data = cv::cvarrToMat(data_arr), mean = cv::cvarrToMat(avg_arr);
-    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;
-
-    cv::PCA pca;
-    pca.mean = mean;
-    int n;
-    if( mean.rows == 1 )
-    {
-        CV_Assert_N(dst.cols <= evects.rows, dst.rows == data.rows);
-        n = dst.cols;
-    }
-    else
-    {
-        CV_Assert_N(dst.rows <= evects.rows, dst.cols == data.cols);
-        n = dst.rows;
-    }
-    pca.eigenvectors = evects.rowRange(0, n);
-
-    cv::Mat result = pca.project(data);
-    if( result.cols != dst.cols )
-        result = result.reshape(1, 1);
-    result.convertTo(dst, dst.type());
-
-    CV_Assert(dst0.data == dst.data);
-}
-
-
-CV_IMPL void
-cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr,
-                  const CvArr* eigenvects, CvArr* result_arr )
-{
-    cv::Mat data = cv::cvarrToMat(proj_arr), mean = cv::cvarrToMat(avg_arr);
-    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;
-
-    cv::PCA pca;
-    pca.mean = mean;
-    int n;
-    if( mean.rows == 1 )
-    {
-        CV_Assert_N(data.cols <= evects.rows, dst.rows == data.rows);
-        n = data.cols;
-    }
-    else
-    {
-        CV_Assert_N(data.rows <= evects.rows, dst.cols == data.cols);
-        n = data.rows;
-    }
-    pca.eigenvectors = evects.rowRange(0, n);
-
-    cv::Mat result = pca.backProject(data);
-    result.convertTo(dst, dst.type());
-
-    CV_Assert(dst0.data == dst.data);
-}
-
-/* End of file. */
+#endif
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+} // namespace
\ No newline at end of file
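
For reference, the dispatching counterparts of these kernels live on the matmul.dispatch.cpp side. A minimal sketch of one per-type wrapper, assuming the usual CV_CPU_DISPATCH forwarding pattern (the wrapper body below is illustrative and not copied from this hunk):

    // Sketch only: a thin wrapper in matmul.dispatch.cpp that selects the best
    // available build of dotProd_32f (opt_AVX2, ..., cpu_baseline) at runtime.
    static double dotProd_32f(const float* src1, const float* src2, int len)
    {
        CV_INSTRUMENT_REGION();
        CV_CPU_DISPATCH(dotProd_32f, (src1, src2, len),
            CV_CPU_DISPATCH_MODES_ALL);
    }

The same pattern applies to the other dotProd_* kernels exported above, which is why they lose the static qualifier in this file.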
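The public API is unchanged by this split; a small standalone usage sketch of the entry points whose internals are touched here (cv::Mat::dot and cv::mulTransposed):

    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat a = (cv::Mat_<float>(1, 3) << 1.f, 2.f, 3.f);
        cv::Mat b = (cv::Mat_<float>(1, 3) << 4.f, 5.f, 6.f);

        // Routed through the dispatched dotProd_32f kernel: 1*4 + 2*5 + 3*6 = 32
        std::cout << "a.dot(b) = " << a.dot(b) << std::endl;

        // aTa=true computes a^T * a (a 3x3 result); handled by the MulTransposedR/L kernels
        cv::Mat ata;
        cv::mulTransposed(a, ata, /*aTa=*/true);
        std::cout << "ata =" << std::endl << ata << std::endl;
        return 0;
    }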