2010-05-12 01:44:00 +08:00
|
|
|
/*M///////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
//
|
|
|
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
|
|
|
//
|
|
|
|
// By downloading, copying, installing or using the software you agree to this license.
|
|
|
|
// If you do not agree to this license, do not download, install,
|
|
|
|
// copy or use the software.
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// License Agreement
|
|
|
|
// For Open Source Computer Vision Library
|
|
|
|
//
|
|
|
|
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
2011-04-17 21:14:45 +08:00
|
|
|
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
2015-01-12 15:59:30 +08:00
|
|
|
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
2010-05-12 01:44:00 +08:00
|
|
|
// Third party copyrights are property of their respective owners.
|
|
|
|
//
|
|
|
|
// Redistribution and use in source and binary forms, with or without modification,
|
|
|
|
// are permitted provided that the following conditions are met:
|
|
|
|
//
|
|
|
|
// * Redistribution's of source code must retain the above copyright notice,
|
|
|
|
// this list of conditions and the following disclaimer.
|
|
|
|
//
|
|
|
|
// * Redistribution's in binary form must reproduce the above copyright notice,
|
|
|
|
// this list of conditions and the following disclaimer in the documentation
|
|
|
|
// and/or other materials provided with the distribution.
|
|
|
|
//
|
|
|
|
// * The name of the copyright holders may not be used to endorse or promote products
|
|
|
|
// derived from this software without specific prior written permission.
|
|
|
|
//
|
|
|
|
// This software is provided by the copyright holders and contributors "as is" and
|
|
|
|
// any express or implied warranties, including, but not limited to, the implied
|
|
|
|
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
|
|
|
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
|
|
|
// indirect, incidental, special, exemplary, or consequential damages
|
|
|
|
// (including, but not limited to, procurement of substitute goods or services;
|
|
|
|
// loss of use, data, or profits; or business interruption) however caused
|
|
|
|
// and on any theory of liability, whether in contract, strict liability,
|
|
|
|
// or tort (including negligence or otherwise) arising in any way out of
|
|
|
|
// the use of this software, even if advised of the possibility of such damage.
|
|
|
|
//
|
|
|
|
//M*/
|
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
#include <sstream>
|
2010-05-12 01:44:00 +08:00
|
|
|
#include "precomp.hpp"
|
2014-08-01 22:11:20 +08:00
|
|
|
#include "opencl_kernels_core.hpp"
|
2013-12-15 03:16:53 +08:00
|
|
|
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
|
|
|
|
#include "intel_gpu_gemm.inl.hpp"
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
namespace cv
|
|
|
|
{
|
|
|
|
|
|
|
|
/****************************************************************************************\
|
|
|
|
* GEMM *
|
|
|
|
\****************************************************************************************/
|
|
|
|
|
|
|
|
static void
|
|
|
|
GEMM_CopyBlock( const uchar* src, size_t src_step,
|
|
|
|
uchar* dst, size_t dst_step,
|
|
|
|
Size size, size_t pix_size )
|
|
|
|
{
|
|
|
|
int j;
|
|
|
|
size.width *= (int)(pix_size / sizeof(int));
|
|
|
|
|
|
|
|
for( ; size.height--; src += src_step, dst += dst_step )
|
|
|
|
{
|
2012-06-09 23:00:04 +08:00
|
|
|
j=0;
|
2012-02-10 14:05:04 +08:00
|
|
|
#if CV_ENABLE_UNROLLED
|
|
|
|
for( ; j <= size.width - 4; j += 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
int t0 = ((const int*)src)[j];
|
|
|
|
int t1 = ((const int*)src)[j+1];
|
|
|
|
((int*)dst)[j] = t0;
|
|
|
|
((int*)dst)[j+1] = t1;
|
|
|
|
t0 = ((const int*)src)[j+2];
|
|
|
|
t1 = ((const int*)src)[j+3];
|
|
|
|
((int*)dst)[j+2] = t0;
|
|
|
|
((int*)dst)[j+3] = t1;
|
|
|
|
}
|
2012-02-10 14:05:04 +08:00
|
|
|
#endif
|
2010-05-12 01:44:00 +08:00
|
|
|
for( ; j < size.width; j++ )
|
|
|
|
((int*)dst)[j] = ((const int*)src)[j];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
GEMM_TransposeBlock( const uchar* src, size_t src_step,
|
|
|
|
uchar* dst, size_t dst_step,
|
|
|
|
Size size, size_t pix_size )
|
|
|
|
{
|
|
|
|
int i, j;
|
|
|
|
for( i = 0; i < size.width; i++, dst += dst_step, src += pix_size )
|
|
|
|
{
|
|
|
|
const uchar* _src = src;
|
|
|
|
switch( pix_size )
|
|
|
|
{
|
|
|
|
case sizeof(int):
|
|
|
|
for( j = 0; j < size.height; j++, _src += src_step )
|
|
|
|
((int*)dst)[j] = ((int*)_src)[0];
|
|
|
|
break;
|
|
|
|
case sizeof(int)*2:
|
|
|
|
for( j = 0; j < size.height*2; j += 2, _src += src_step )
|
|
|
|
{
|
|
|
|
int t0 = ((int*)_src)[0];
|
|
|
|
int t1 = ((int*)_src)[1];
|
|
|
|
((int*)dst)[j] = t0;
|
|
|
|
((int*)dst)[j+1] = t1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case sizeof(int)*4:
|
|
|
|
for( j = 0; j < size.height*4; j += 4, _src += src_step )
|
|
|
|
{
|
|
|
|
int t0 = ((int*)_src)[0];
|
|
|
|
int t1 = ((int*)_src)[1];
|
|
|
|
((int*)dst)[j] = t0;
|
|
|
|
((int*)dst)[j+1] = t1;
|
|
|
|
t0 = ((int*)_src)[2];
|
|
|
|
t1 = ((int*)_src)[3];
|
|
|
|
((int*)dst)[j+2] = t0;
|
|
|
|
((int*)dst)[j+3] = t1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(0);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template<typename T, typename WT> static void
|
|
|
|
GEMMSingleMul( const T* a_data, size_t a_step,
|
|
|
|
const T* b_data, size_t b_step,
|
|
|
|
const T* c_data, size_t c_step,
|
|
|
|
T* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
int i, j, k, n = a_size.width, m = d_size.width, drows = d_size.height;
|
|
|
|
const T *_a_data = a_data, *_b_data = b_data, *_c_data = c_data;
|
2011-02-18 18:29:57 +08:00
|
|
|
cv::AutoBuffer<T> _a_buf;
|
2010-05-12 01:44:00 +08:00
|
|
|
T* a_buf = 0;
|
|
|
|
size_t a_step0, a_step1, c_step0, c_step1, t_step;
|
|
|
|
|
|
|
|
a_step /= sizeof(a_data[0]);
|
|
|
|
b_step /= sizeof(b_data[0]);
|
|
|
|
c_step /= sizeof(c_data[0]);
|
|
|
|
d_step /= sizeof(d_data[0]);
|
|
|
|
a_step0 = a_step;
|
|
|
|
a_step1 = 1;
|
|
|
|
|
|
|
|
if( !c_data )
|
|
|
|
c_step0 = c_step1 = 0;
|
|
|
|
else if( !(flags & GEMM_3_T) )
|
|
|
|
c_step0 = c_step, c_step1 = 1;
|
|
|
|
else
|
|
|
|
c_step0 = 1, c_step1 = c_step;
|
|
|
|
|
|
|
|
if( flags & GEMM_1_T )
|
|
|
|
{
|
|
|
|
CV_SWAP( a_step0, a_step1, t_step );
|
|
|
|
n = a_size.height;
|
|
|
|
if( a_step > 1 && n > 1 )
|
2011-02-18 18:29:57 +08:00
|
|
|
{
|
|
|
|
_a_buf.allocate(n);
|
|
|
|
a_buf = _a_buf;
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if( n == 1 ) /* external product */
|
|
|
|
{
|
2011-02-18 18:29:57 +08:00
|
|
|
cv::AutoBuffer<T> _b_buf;
|
2010-05-12 01:44:00 +08:00
|
|
|
T* b_buf = 0;
|
|
|
|
|
|
|
|
if( a_step > 1 && a_size.height > 1 )
|
|
|
|
{
|
2011-02-18 18:29:57 +08:00
|
|
|
_a_buf.allocate(drows);
|
|
|
|
a_buf = _a_buf;
|
2010-05-12 01:44:00 +08:00
|
|
|
for( k = 0; k < drows; k++ )
|
|
|
|
a_buf[k] = a_data[a_step*k];
|
|
|
|
a_data = a_buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( b_step > 1 )
|
|
|
|
{
|
2011-02-18 18:29:57 +08:00
|
|
|
_b_buf.allocate(d_size.width);
|
|
|
|
b_buf = _b_buf;
|
2010-05-12 01:44:00 +08:00
|
|
|
for( j = 0; j < d_size.width; j++ )
|
|
|
|
b_buf[j] = b_data[j*b_step];
|
|
|
|
b_data = b_buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( i = 0; i < drows; i++, _c_data += c_step0, d_data += d_step )
|
|
|
|
{
|
|
|
|
WT al = WT(a_data[i])*alpha;
|
|
|
|
c_data = _c_data;
|
|
|
|
for( j = 0; j <= d_size.width - 2; j += 2, c_data += 2*c_step1 )
|
|
|
|
{
|
|
|
|
WT s0 = al*WT(b_data[j]);
|
|
|
|
WT s1 = al*WT(b_data[j+1]);
|
|
|
|
if( !c_data )
|
|
|
|
{
|
|
|
|
d_data[j] = T(s0);
|
|
|
|
d_data[j+1] = T(s1);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
d_data[j] = T(s0 + WT(c_data[0])*beta);
|
|
|
|
d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; j < d_size.width; j++, c_data += c_step1 )
|
|
|
|
{
|
|
|
|
WT s0 = al*WT(b_data[j]);
|
|
|
|
if( !c_data )
|
|
|
|
d_data[j] = T(s0);
|
|
|
|
else
|
|
|
|
d_data[j] = T(s0 + WT(c_data[0])*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if( flags & GEMM_2_T ) /* A * Bt */
|
|
|
|
{
|
|
|
|
for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step )
|
|
|
|
{
|
|
|
|
a_data = _a_data;
|
|
|
|
b_data = _b_data;
|
|
|
|
c_data = _c_data;
|
|
|
|
|
|
|
|
if( a_buf )
|
|
|
|
{
|
|
|
|
for( k = 0; k < n; k++ )
|
|
|
|
a_buf[k] = a_data[a_step1*k];
|
|
|
|
a_data = a_buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( j = 0; j < d_size.width; j++, b_data += b_step,
|
|
|
|
c_data += c_step1 )
|
|
|
|
{
|
|
|
|
WT s0(0), s1(0), s2(0), s3(0);
|
2012-02-10 14:05:04 +08:00
|
|
|
k = 0;
|
|
|
|
#if CV_ENABLE_UNROLLED
|
|
|
|
for( ; k <= n - 4; k += 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
s0 += WT(a_data[k])*WT(b_data[k]);
|
|
|
|
s1 += WT(a_data[k+1])*WT(b_data[k+1]);
|
|
|
|
s2 += WT(a_data[k+2])*WT(b_data[k+2]);
|
|
|
|
s3 += WT(a_data[k+3])*WT(b_data[k+3]);
|
|
|
|
}
|
2012-02-10 14:05:04 +08:00
|
|
|
#endif
|
2010-05-12 01:44:00 +08:00
|
|
|
for( ; k < n; k++ )
|
|
|
|
s0 += WT(a_data[k])*WT(b_data[k]);
|
|
|
|
s0 = (s0+s1+s2+s3)*alpha;
|
|
|
|
|
|
|
|
if( !c_data )
|
|
|
|
d_data[j] = T(s0);
|
|
|
|
else
|
|
|
|
d_data[j] = T(s0 + WT(c_data[0])*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if( d_size.width*sizeof(d_data[0]) <= 1600 )
|
|
|
|
{
|
|
|
|
for( i = 0; i < drows; i++, _a_data += a_step0,
|
|
|
|
_c_data += c_step0,
|
|
|
|
d_data += d_step )
|
|
|
|
{
|
|
|
|
a_data = _a_data, c_data = _c_data;
|
|
|
|
|
|
|
|
if( a_buf )
|
|
|
|
{
|
|
|
|
for( k = 0; k < n; k++ )
|
|
|
|
a_buf[k] = a_data[a_step1*k];
|
|
|
|
a_data = a_buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 )
|
|
|
|
{
|
|
|
|
const T* b = _b_data + j;
|
|
|
|
WT s0(0), s1(0), s2(0), s3(0);
|
|
|
|
|
|
|
|
for( k = 0; k < n; k++, b += b_step )
|
|
|
|
{
|
|
|
|
WT a(a_data[k]);
|
|
|
|
s0 += a * WT(b[0]); s1 += a * WT(b[1]);
|
|
|
|
s2 += a * WT(b[2]); s3 += a * WT(b[3]);
|
|
|
|
}
|
|
|
|
|
|
|
|
if( !c_data )
|
|
|
|
{
|
|
|
|
d_data[j] = T(s0*alpha);
|
|
|
|
d_data[j+1] = T(s1*alpha);
|
|
|
|
d_data[j+2] = T(s2*alpha);
|
|
|
|
d_data[j+3] = T(s3*alpha);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
s0 = s0*alpha; s1 = s1*alpha;
|
|
|
|
s2 = s2*alpha; s3 = s3*alpha;
|
|
|
|
d_data[j] = T(s0 + WT(c_data[0])*beta);
|
|
|
|
d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta);
|
|
|
|
d_data[j+2] = T(s2 + WT(c_data[c_step1*2])*beta);
|
|
|
|
d_data[j+3] = T(s3 + WT(c_data[c_step1*3])*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; j < m; j++, c_data += c_step1 )
|
|
|
|
{
|
|
|
|
const T* b = _b_data + j;
|
|
|
|
WT s0(0);
|
|
|
|
|
|
|
|
for( k = 0; k < n; k++, b += b_step )
|
|
|
|
s0 += WT(a_data[k]) * WT(b[0]);
|
|
|
|
|
|
|
|
s0 = s0*alpha;
|
|
|
|
if( !c_data )
|
|
|
|
d_data[j] = T(s0);
|
|
|
|
else
|
|
|
|
d_data[j] = T(s0 + WT(c_data[0])*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2011-02-18 18:29:57 +08:00
|
|
|
cv::AutoBuffer<WT> _d_buf(m);
|
|
|
|
WT* d_buf = _d_buf;
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step )
|
|
|
|
{
|
|
|
|
a_data = _a_data;
|
|
|
|
b_data = _b_data;
|
|
|
|
c_data = _c_data;
|
|
|
|
|
|
|
|
if( a_buf )
|
|
|
|
{
|
|
|
|
for( k = 0; k < n; k++ )
|
|
|
|
a_buf[k] = _a_data[a_step1*k];
|
|
|
|
a_data = a_buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( j = 0; j < m; j++ )
|
|
|
|
d_buf[j] = WT(0);
|
|
|
|
|
|
|
|
for( k = 0; k < n; k++, b_data += b_step )
|
|
|
|
{
|
|
|
|
WT al(a_data[k]);
|
2012-06-09 23:00:04 +08:00
|
|
|
j=0;
|
2012-02-10 14:05:04 +08:00
|
|
|
#if CV_ENABLE_UNROLLED
|
|
|
|
for(; j <= m - 4; j += 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
WT t0 = d_buf[j] + WT(b_data[j])*al;
|
|
|
|
WT t1 = d_buf[j+1] + WT(b_data[j+1])*al;
|
|
|
|
d_buf[j] = t0;
|
|
|
|
d_buf[j+1] = t1;
|
|
|
|
t0 = d_buf[j+2] + WT(b_data[j+2])*al;
|
|
|
|
t1 = d_buf[j+3] + WT(b_data[j+3])*al;
|
|
|
|
d_buf[j+2] = t0;
|
|
|
|
d_buf[j+3] = t1;
|
|
|
|
}
|
2012-02-10 14:05:04 +08:00
|
|
|
#endif
|
2010-05-12 01:44:00 +08:00
|
|
|
for( ; j < m; j++ )
|
|
|
|
d_buf[j] += WT(b_data[j])*al;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( !c_data )
|
|
|
|
for( j = 0; j < m; j++ )
|
|
|
|
d_data[j] = T(d_buf[j]*alpha);
|
|
|
|
else
|
|
|
|
for( j = 0; j < m; j++, c_data += c_step1 )
|
|
|
|
{
|
|
|
|
WT t = d_buf[j]*alpha;
|
|
|
|
d_data[j] = T(t + WT(c_data[0])*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template<typename T, typename WT> static void
|
|
|
|
GEMMBlockMul( const T* a_data, size_t a_step,
|
|
|
|
const T* b_data, size_t b_step,
|
|
|
|
WT* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size, int flags )
|
|
|
|
{
|
|
|
|
int i, j, k, n = a_size.width, m = d_size.width;
|
|
|
|
const T *_a_data = a_data, *_b_data = b_data;
|
2011-02-18 18:29:57 +08:00
|
|
|
cv::AutoBuffer<T> _a_buf;
|
2010-05-12 01:44:00 +08:00
|
|
|
T* a_buf = 0;
|
|
|
|
size_t a_step0, a_step1, t_step;
|
|
|
|
int do_acc = flags & 16;
|
|
|
|
|
|
|
|
a_step /= sizeof(a_data[0]);
|
|
|
|
b_step /= sizeof(b_data[0]);
|
|
|
|
d_step /= sizeof(d_data[0]);
|
|
|
|
|
|
|
|
a_step0 = a_step;
|
|
|
|
a_step1 = 1;
|
|
|
|
|
|
|
|
if( flags & GEMM_1_T )
|
|
|
|
{
|
|
|
|
CV_SWAP( a_step0, a_step1, t_step );
|
|
|
|
n = a_size.height;
|
2011-02-18 18:29:57 +08:00
|
|
|
_a_buf.allocate(n);
|
|
|
|
a_buf = _a_buf;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if( flags & GEMM_2_T )
|
|
|
|
{
|
|
|
|
/* second operand is transposed */
|
|
|
|
for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step )
|
|
|
|
{
|
|
|
|
a_data = _a_data; b_data = _b_data;
|
|
|
|
|
|
|
|
if( a_buf )
|
|
|
|
{
|
|
|
|
for( k = 0; k < n; k++ )
|
|
|
|
a_buf[k] = a_data[a_step1*k];
|
|
|
|
a_data = a_buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( j = 0; j < d_size.width; j++, b_data += b_step )
|
|
|
|
{
|
|
|
|
WT s0 = do_acc ? d_data[j]:WT(0), s1(0);
|
|
|
|
for( k = 0; k <= n - 2; k += 2 )
|
|
|
|
{
|
|
|
|
s0 += WT(a_data[k])*WT(b_data[k]);
|
|
|
|
s1 += WT(a_data[k+1])*WT(b_data[k+1]);
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; k < n; k++ )
|
|
|
|
s0 += WT(a_data[k])*WT(b_data[k]);
|
|
|
|
|
|
|
|
d_data[j] = s0 + s1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step )
|
|
|
|
{
|
|
|
|
a_data = _a_data, b_data = _b_data;
|
|
|
|
|
|
|
|
if( a_buf )
|
|
|
|
{
|
|
|
|
for( k = 0; k < n; k++ )
|
|
|
|
a_buf[k] = a_data[a_step1*k];
|
|
|
|
a_data = a_buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( j = 0; j <= m - 4; j += 4 )
|
|
|
|
{
|
|
|
|
WT s0, s1, s2, s3;
|
|
|
|
const T* b = b_data + j;
|
|
|
|
|
|
|
|
if( do_acc )
|
|
|
|
{
|
|
|
|
s0 = d_data[j]; s1 = d_data[j+1];
|
|
|
|
s2 = d_data[j+2]; s3 = d_data[j+3];
|
|
|
|
}
|
|
|
|
else
|
|
|
|
s0 = s1 = s2 = s3 = WT(0);
|
|
|
|
|
|
|
|
for( k = 0; k < n; k++, b += b_step )
|
|
|
|
{
|
|
|
|
WT a(a_data[k]);
|
|
|
|
s0 += a * WT(b[0]); s1 += a * WT(b[1]);
|
|
|
|
s2 += a * WT(b[2]); s3 += a * WT(b[3]);
|
|
|
|
}
|
|
|
|
|
|
|
|
d_data[j] = s0; d_data[j+1] = s1;
|
|
|
|
d_data[j+2] = s2; d_data[j+3] = s3;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; j < m; j++ )
|
|
|
|
{
|
|
|
|
const T* b = b_data + j;
|
|
|
|
WT s0 = do_acc ? d_data[j] : WT(0);
|
|
|
|
|
|
|
|
for( k = 0; k < n; k++, b += b_step )
|
|
|
|
s0 += WT(a_data[k]) * WT(b[0]);
|
|
|
|
|
|
|
|
d_data[j] = s0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template<typename T, typename WT> static void
|
|
|
|
GEMMStore( const T* c_data, size_t c_step,
|
|
|
|
const WT* d_buf, size_t d_buf_step,
|
|
|
|
T* d_data, size_t d_step, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
const T* _c_data = c_data;
|
|
|
|
int j;
|
|
|
|
size_t c_step0, c_step1;
|
|
|
|
|
|
|
|
c_step /= sizeof(c_data[0]);
|
|
|
|
d_buf_step /= sizeof(d_buf[0]);
|
|
|
|
d_step /= sizeof(d_data[0]);
|
|
|
|
|
|
|
|
if( !c_data )
|
|
|
|
c_step0 = c_step1 = 0;
|
|
|
|
else if( !(flags & GEMM_3_T) )
|
|
|
|
c_step0 = c_step, c_step1 = 1;
|
|
|
|
else
|
|
|
|
c_step0 = 1, c_step1 = c_step;
|
|
|
|
|
|
|
|
for( ; d_size.height--; _c_data += c_step0, d_buf += d_buf_step, d_data += d_step )
|
|
|
|
{
|
|
|
|
if( _c_data )
|
|
|
|
{
|
|
|
|
c_data = _c_data;
|
2012-06-09 23:00:04 +08:00
|
|
|
j=0;
|
|
|
|
#if CV_ENABLE_UNROLLED
|
2012-02-10 14:05:04 +08:00
|
|
|
for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
WT t0 = alpha*d_buf[j];
|
|
|
|
WT t1 = alpha*d_buf[j+1];
|
|
|
|
t0 += beta*WT(c_data[0]);
|
|
|
|
t1 += beta*WT(c_data[c_step1]);
|
|
|
|
d_data[j] = T(t0);
|
|
|
|
d_data[j+1] = T(t1);
|
|
|
|
t0 = alpha*d_buf[j+2];
|
|
|
|
t1 = alpha*d_buf[j+3];
|
|
|
|
t0 += beta*WT(c_data[c_step1*2]);
|
|
|
|
t1 += beta*WT(c_data[c_step1*3]);
|
|
|
|
d_data[j+2] = T(t0);
|
|
|
|
d_data[j+3] = T(t1);
|
|
|
|
}
|
2012-02-10 14:05:04 +08:00
|
|
|
#endif
|
2010-05-12 01:44:00 +08:00
|
|
|
for( ; j < d_size.width; j++, c_data += c_step1 )
|
|
|
|
{
|
|
|
|
WT t0 = alpha*d_buf[j];
|
|
|
|
d_data[j] = T(t0 + WT(c_data[0])*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2012-06-09 23:00:04 +08:00
|
|
|
j = 0;
|
|
|
|
#if CV_ENABLE_UNROLLED
|
2012-02-10 14:05:04 +08:00
|
|
|
for( ; j <= d_size.width - 4; j += 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
WT t0 = alpha*d_buf[j];
|
|
|
|
WT t1 = alpha*d_buf[j+1];
|
|
|
|
d_data[j] = T(t0);
|
|
|
|
d_data[j+1] = T(t1);
|
|
|
|
t0 = alpha*d_buf[j+2];
|
|
|
|
t1 = alpha*d_buf[j+3];
|
|
|
|
d_data[j+2] = T(t0);
|
|
|
|
d_data[j+3] = T(t1);
|
|
|
|
}
|
2012-06-09 23:00:04 +08:00
|
|
|
#endif
|
2010-05-12 01:44:00 +08:00
|
|
|
for( ; j < d_size.width; j++ )
|
|
|
|
d_data[j] = T(alpha*d_buf[j]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
typedef void (*GEMMSingleMulFunc)( const void* src1, size_t step1,
|
|
|
|
const void* src2, size_t step2, const void* src3, size_t step3,
|
|
|
|
void* dst, size_t dststep, Size srcsize, Size dstsize,
|
|
|
|
double alpha, double beta, int flags );
|
|
|
|
|
|
|
|
typedef void (*GEMMBlockMulFunc)( const void* src1, size_t step1,
|
|
|
|
const void* src2, size_t step2, void* dst, size_t dststep,
|
|
|
|
Size srcsize, Size dstsize, int flags );
|
|
|
|
|
|
|
|
typedef void (*GEMMStoreFunc)( const void* src1, size_t step1,
|
|
|
|
const void* src2, size_t step2, void* dst, size_t dststep,
|
|
|
|
Size dstsize, double alpha, double beta, int flags );
|
|
|
|
|
|
|
|
static void GEMMSingleMul_32f( const float* a_data, size_t a_step,
|
|
|
|
const float* b_data, size_t b_step,
|
|
|
|
const float* c_data, size_t c_step,
|
|
|
|
float* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
GEMMSingleMul<float,double>(a_data, a_step, b_data, b_step, c_data,
|
|
|
|
c_step, d_data, d_step, a_size, d_size,
|
|
|
|
alpha, beta, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void GEMMSingleMul_64f( const double* a_data, size_t a_step,
|
|
|
|
const double* b_data, size_t b_step,
|
|
|
|
const double* c_data, size_t c_step,
|
|
|
|
double* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
GEMMSingleMul<double,double>(a_data, a_step, b_data, b_step, c_data,
|
|
|
|
c_step, d_data, d_step, a_size, d_size,
|
|
|
|
alpha, beta, flags);
|
|
|
|
}
|
|
|
|
|
2012-06-09 23:00:04 +08:00
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
static void GEMMSingleMul_32fc( const Complexf* a_data, size_t a_step,
|
|
|
|
const Complexf* b_data, size_t b_step,
|
|
|
|
const Complexf* c_data, size_t c_step,
|
|
|
|
Complexf* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
GEMMSingleMul<Complexf,Complexd>(a_data, a_step, b_data, b_step, c_data,
|
|
|
|
c_step, d_data, d_step, a_size, d_size,
|
|
|
|
alpha, beta, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void GEMMSingleMul_64fc( const Complexd* a_data, size_t a_step,
|
|
|
|
const Complexd* b_data, size_t b_step,
|
|
|
|
const Complexd* c_data, size_t c_step,
|
|
|
|
Complexd* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
GEMMSingleMul<Complexd,Complexd>(a_data, a_step, b_data, b_step, c_data,
|
|
|
|
c_step, d_data, d_step, a_size, d_size,
|
|
|
|
alpha, beta, flags);
|
2012-06-09 23:00:04 +08:00
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
static void GEMMBlockMul_32f( const float* a_data, size_t a_step,
|
|
|
|
const float* b_data, size_t b_step,
|
|
|
|
double* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size, int flags )
|
|
|
|
{
|
|
|
|
GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void GEMMBlockMul_64f( const double* a_data, size_t a_step,
|
|
|
|
const double* b_data, size_t b_step,
|
|
|
|
double* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size, int flags )
|
|
|
|
{
|
|
|
|
GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void GEMMBlockMul_32fc( const Complexf* a_data, size_t a_step,
|
|
|
|
const Complexf* b_data, size_t b_step,
|
|
|
|
Complexd* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size, int flags )
|
|
|
|
{
|
|
|
|
GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step,
|
|
|
|
const Complexd* b_data, size_t b_step,
|
|
|
|
Complexd* d_data, size_t d_step,
|
|
|
|
Size a_size, Size d_size, int flags )
|
|
|
|
{
|
|
|
|
GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
|
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
static void GEMMStore_32f( const float* c_data, size_t c_step,
|
|
|
|
const double* d_buf, size_t d_buf_step,
|
|
|
|
float* d_data, size_t d_step, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void GEMMStore_64f( const double* c_data, size_t c_step,
|
|
|
|
const double* d_buf, size_t d_buf_step,
|
|
|
|
double* d_data, size_t d_step, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
|
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
static void GEMMStore_32fc( const Complexf* c_data, size_t c_step,
|
|
|
|
const Complexd* d_buf, size_t d_buf_step,
|
|
|
|
Complexf* d_data, size_t d_step, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void GEMMStore_64fc( const Complexd* c_data, size_t c_step,
|
|
|
|
const Complexd* d_buf, size_t d_buf_step,
|
|
|
|
Complexd* d_data, size_t d_step, Size d_size,
|
|
|
|
double alpha, double beta, int flags )
|
|
|
|
{
|
|
|
|
GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
|
|
|
|
}
|
|
|
|
|
2013-12-15 03:16:53 +08:00
|
|
|
#ifdef HAVE_CLAMDBLAS
|
|
|
|
|
2014-08-22 19:05:29 +08:00
|
|
|
static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
|
2013-12-15 03:16:53 +08:00
|
|
|
InputArray matC, double beta, OutputArray matD, int flags )
|
|
|
|
{
|
|
|
|
int type = matA.type(), esz = CV_ELEM_SIZE(type);
|
|
|
|
bool haveC = matC.kind() != cv::_InputArray::NONE;
|
|
|
|
Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
|
|
|
|
bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
|
|
|
|
|
|
|
|
if (atrans)
|
|
|
|
sizeA = Size(sizeA.height, sizeA.width);
|
|
|
|
if (btrans)
|
|
|
|
sizeB = Size(sizeB.height, sizeB.width);
|
|
|
|
if (haveC && ctrans)
|
|
|
|
sizeC = Size(sizeC.height, sizeC.width);
|
|
|
|
|
|
|
|
Size sizeD(sizeB.width, sizeA.height);
|
|
|
|
|
|
|
|
CV_Assert( matB.type() == type && (!haveC || matC.type() == type) );
|
|
|
|
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
|
|
|
|
|
|
|
|
matD.create(sizeD, type);
|
|
|
|
if ( matA.offset() % esz != 0 || matA.step() % esz != 0 ||
|
|
|
|
matB.offset() % esz != 0 || matB.step() % esz != 0 ||
|
|
|
|
(haveC && (matC.offset() % esz != 0 || matC.step() % esz != 0)) )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
|
2015-01-02 08:33:40 +08:00
|
|
|
if (!ocl::internal::isCLBuffer(A) || !ocl::internal::isCLBuffer(B) || !ocl::internal::isCLBuffer(D))
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (haveC)
|
|
|
|
{
|
|
|
|
UMat C = matC.getUMat();
|
|
|
|
if (!ocl::internal::isCLBuffer(C))
|
|
|
|
return false;
|
|
|
|
}
|
2013-12-15 03:16:53 +08:00
|
|
|
if (haveC)
|
2013-12-26 21:13:26 +08:00
|
|
|
ctrans ? transpose(matC, D) : matC.copyTo(D);
|
2013-12-15 03:16:53 +08:00
|
|
|
else
|
|
|
|
D.setTo(Scalar::all(0));
|
|
|
|
|
|
|
|
int M = sizeD.height, N = sizeD.width, K = sizeA.width;
|
|
|
|
int lda = (int)A.step / esz, ldb = (int)B.step / esz, ldc = (int)D.step / esz;
|
|
|
|
int offa = (int)A.offset / esz, offb = (int)B.offset / esz, offc = (int)D.offset / esz;
|
|
|
|
|
|
|
|
cl_command_queue clq = (cl_command_queue)ocl::Queue::getDefault().ptr();
|
|
|
|
clAmdBlasTranspose transA = atrans ? clAmdBlasTrans : clAmdBlasNoTrans;
|
|
|
|
clAmdBlasTranspose transB = btrans ? clAmdBlasTrans : clAmdBlasNoTrans;
|
|
|
|
clAmdBlasOrder order = clAmdBlasRowMajor;
|
|
|
|
clAmdBlasStatus status = clAmdBlasSuccess;
|
|
|
|
|
|
|
|
if (type == CV_32FC1)
|
|
|
|
status = clAmdBlasSgemmEx(order, transA, transB, M, N, K,
|
|
|
|
(cl_float)alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
|
|
|
|
(const cl_mem)B.handle(ACCESS_READ), offb, ldb,
|
|
|
|
(cl_float)beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
|
|
|
|
1, &clq, 0, NULL, NULL);
|
|
|
|
else if (type == CV_64FC1)
|
|
|
|
status = clAmdBlasDgemmEx(order, transA, transB, M, N, K,
|
|
|
|
alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
|
|
|
|
(const cl_mem)B.handle(ACCESS_READ), offb, ldb,
|
|
|
|
beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
|
|
|
|
1, &clq, 0, NULL, NULL);
|
|
|
|
else if (type == CV_32FC2)
|
|
|
|
{
|
|
|
|
cl_float2 alpha_2 = { { (cl_float)alpha, 0 } };
|
|
|
|
cl_float2 beta_2 = { { (cl_float)beta, 0 } };
|
|
|
|
status = clAmdBlasCgemmEx(order, transA, transB, M, N, K,
|
|
|
|
alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
|
|
|
|
(const cl_mem)B.handle(ACCESS_READ), offb, ldb,
|
|
|
|
beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
|
|
|
|
1, &clq, 0, NULL, NULL);
|
|
|
|
}
|
|
|
|
else if (type == CV_64FC2)
|
|
|
|
{
|
|
|
|
cl_double2 alpha_2 = { { alpha, 0 } };
|
|
|
|
cl_double2 beta_2 = { { beta, 0 } };
|
|
|
|
status = clAmdBlasZgemmEx(order, transA, transB, M, N, K,
|
|
|
|
alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
|
|
|
|
(const cl_mem)B.handle(ACCESS_READ), offb, ldb,
|
|
|
|
beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
|
|
|
|
1, &clq, 0, NULL, NULL);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
CV_Error(Error::StsUnsupportedFormat, "");
|
|
|
|
|
|
|
|
return status == clAmdBlasSuccess;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2014-08-22 19:05:29 +08:00
|
|
|
#ifdef HAVE_OPENCL
|
|
|
|
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
|
|
|
|
InputArray matC, double beta, OutputArray matD, int flags )
|
|
|
|
{
|
|
|
|
int depth = matA.depth(), cn = matA.channels();
|
|
|
|
int type = CV_MAKETYPE(depth, cn);
|
|
|
|
|
|
|
|
CV_Assert( type == matB.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
|
|
|
|
|
|
|
|
const ocl::Device & dev = ocl::Device::getDefault();
|
|
|
|
bool doubleSupport = dev.doubleFPConfig() > 0;
|
|
|
|
|
2014-10-16 15:24:44 +08:00
|
|
|
if (!doubleSupport && depth == CV_64F)
|
2014-08-22 19:05:29 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
bool haveC = matC.kind() != cv::_InputArray::NONE;
|
|
|
|
Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
|
|
|
|
bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
|
|
|
|
|
2014-10-16 15:24:44 +08:00
|
|
|
CV_Assert( !haveC || matC.type() == type );
|
2014-08-22 19:05:29 +08:00
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
Size sizeD(((btrans)? sizeB.height : sizeB.width),
|
|
|
|
((atrans)? sizeA.width : sizeA.height));
|
2014-08-22 19:05:29 +08:00
|
|
|
matD.create(sizeD, type);
|
|
|
|
|
|
|
|
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
|
|
|
|
|
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
|
|
|
|
{
|
|
|
|
String opts;
|
2014-08-22 19:05:29 +08:00
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
if (atrans)
|
|
|
|
sizeA = Size(sizeA.height, sizeA.width);
|
|
|
|
if (btrans)
|
|
|
|
sizeB = Size(sizeB.height, sizeB.width);
|
|
|
|
if (haveC && ctrans)
|
|
|
|
sizeC = Size(sizeC.height, sizeC.width);
|
|
|
|
|
|
|
|
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
|
2014-08-22 19:05:29 +08:00
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
int max_wg_size = (int)dev.maxWorkGroupSize();
|
|
|
|
int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
|
2014-09-04 16:36:23 +08:00
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
if (atrans)
|
|
|
|
A = A.t();
|
|
|
|
|
|
|
|
if (btrans)
|
|
|
|
B = B.t();
|
|
|
|
|
|
|
|
if (haveC)
|
|
|
|
ctrans ? transpose(matC, D) : matC.copyTo(D);
|
|
|
|
|
|
|
|
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
|
|
|
|
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
|
|
|
|
|
|
|
|
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s",
|
2014-09-04 16:36:23 +08:00
|
|
|
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
|
|
|
|
cn, kercn, block_size,
|
|
|
|
(sizeA.width % block_size !=0) ? "-D NO_MULT" : "",
|
|
|
|
haveC ? "-D HAVE_C" : "",
|
|
|
|
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
|
|
|
|
if (k.empty())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (depth == CV_64F)
|
|
|
|
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
|
|
|
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
|
|
|
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
|
|
|
sizeA.width, alpha, beta);
|
|
|
|
else
|
|
|
|
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
|
|
|
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
|
|
|
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
|
|
|
sizeA.width, (float)alpha, (float)beta);
|
|
|
|
|
|
|
|
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
|
|
|
|
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
|
2014-09-04 16:36:23 +08:00
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
|
|
|
|
}
|
2014-08-22 19:05:29 +08:00
|
|
|
else
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
{
|
|
|
|
if (haveC && beta != 0.0)
|
|
|
|
{
|
|
|
|
ctrans ? transpose(matC, D) : matC.copyTo(D);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
beta = 0.0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return intel_gpu_gemm(A, sizeA,
|
|
|
|
B, sizeB,
|
|
|
|
D, sizeD,
|
|
|
|
alpha,
|
|
|
|
beta,
|
|
|
|
atrans, btrans);
|
|
|
|
}
|
2014-08-22 19:05:29 +08:00
|
|
|
}
|
|
|
|
#endif
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2016-06-07 15:58:12 +08:00
|
|
|
static void gemmImpl( Mat A, Mat B, double alpha,
|
2016-06-03 15:38:30 +08:00
|
|
|
Mat C, double beta, Mat D, int flags )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2016-08-18 14:53:00 +08:00
|
|
|
CV_INSTRUMENT_REGION()
|
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
const int block_lin_size = 128;
|
|
|
|
const int block_size = block_lin_size * block_lin_size;
|
|
|
|
|
|
|
|
static double zero[] = {0,0,0,0};
|
|
|
|
static float zerof[] = {0,0,0,0};
|
|
|
|
|
|
|
|
Size a_size = A.size(), d_size;
|
|
|
|
int i, len = 0, type = A.type();
|
|
|
|
|
|
|
|
switch( flags & (GEMM_1_T|GEMM_2_T) )
|
|
|
|
{
|
|
|
|
case 0:
|
|
|
|
d_size = Size( B.cols, a_size.height );
|
|
|
|
len = B.rows;
|
|
|
|
break;
|
|
|
|
case 1:
|
|
|
|
d_size = Size( B.cols, a_size.width );
|
|
|
|
len = B.rows;
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
d_size = Size( B.rows, a_size.height );
|
|
|
|
len = B.cols;
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
d_size = Size( B.rows, a_size.width );
|
|
|
|
len = B.cols;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) )
|
|
|
|
{
|
|
|
|
if( type == CV_32F )
|
|
|
|
{
|
2014-08-13 19:08:27 +08:00
|
|
|
float* d = D.ptr<float>();
|
|
|
|
const float *a = A.ptr<float>(),
|
|
|
|
*b = B.ptr<float>(),
|
2011-04-17 21:14:45 +08:00
|
|
|
*c = (const float*)C.data;
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t d_step = D.step/sizeof(d[0]),
|
|
|
|
a_step = A.step/sizeof(a[0]),
|
|
|
|
b_step = B.step/sizeof(b[0]),
|
2011-04-17 21:14:45 +08:00
|
|
|
c_step = C.data ? C.step/sizeof(c[0]) : 0;
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
if( !c )
|
|
|
|
c = zerof;
|
|
|
|
|
|
|
|
switch( len )
|
|
|
|
{
|
|
|
|
case 2:
|
|
|
|
if( len == d_size.width && b != d )
|
|
|
|
{
|
|
|
|
for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
|
|
|
|
{
|
|
|
|
float t0 = a[0]*b[0] + a[1]*b[b_step];
|
|
|
|
float t1 = a[0]*b[1] + a[1]*b[b_step+1];
|
|
|
|
d[0] = (float)(t0*alpha + c[0]*beta);
|
|
|
|
d[1] = (float)(t1*alpha + c[1]*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if( a != d )
|
|
|
|
{
|
|
|
|
int c_step0 = 1;
|
|
|
|
if( c == zerof )
|
|
|
|
{
|
|
|
|
c_step0 = 0;
|
|
|
|
c_step = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
|
|
|
|
{
|
|
|
|
float t0 = a[0]*b[0] + a[1]*b[b_step];
|
|
|
|
float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
|
|
|
|
d[0] = (float)(t0*alpha + c[0]*beta);
|
|
|
|
d[d_step] = (float)(t1*alpha + c[c_step]*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
return;
|
|
|
|
case 3:
|
|
|
|
if( len == d_size.width && b != d )
|
|
|
|
{
|
|
|
|
for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
|
|
|
|
{
|
|
|
|
float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
|
|
|
|
float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
|
|
|
|
float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
|
|
|
|
d[0] = (float)(t0*alpha + c[0]*beta);
|
|
|
|
d[1] = (float)(t1*alpha + c[1]*beta);
|
|
|
|
d[2] = (float)(t2*alpha + c[2]*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if( a != d )
|
|
|
|
{
|
|
|
|
int c_step0 = 1;
|
|
|
|
if( c == zerof )
|
|
|
|
{
|
|
|
|
c_step0 = 0;
|
|
|
|
c_step = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
|
|
|
|
{
|
|
|
|
float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
|
|
|
|
float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
|
|
|
|
float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];
|
|
|
|
|
|
|
|
d[0] = (float)(t0*alpha + c[0]*beta);
|
|
|
|
d[d_step] = (float)(t1*alpha + c[c_step]*beta);
|
|
|
|
d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
return;
|
|
|
|
case 4:
|
|
|
|
if( len == d_size.width && b != d )
|
|
|
|
{
|
|
|
|
for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
|
|
|
|
{
|
|
|
|
float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
|
|
|
|
float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
|
|
|
|
float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
|
|
|
|
float t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
|
|
|
|
d[0] = (float)(t0*alpha + c[0]*beta);
|
|
|
|
d[1] = (float)(t1*alpha + c[1]*beta);
|
|
|
|
d[2] = (float)(t2*alpha + c[2]*beta);
|
|
|
|
d[3] = (float)(t3*alpha + c[3]*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if( len <= 16 && a != d )
|
|
|
|
{
|
|
|
|
int c_step0 = 1;
|
|
|
|
if( c == zerof )
|
|
|
|
{
|
|
|
|
c_step0 = 0;
|
|
|
|
c_step = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
|
|
|
|
{
|
|
|
|
float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
|
|
|
|
float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
|
|
|
|
a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
|
|
|
|
float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
|
|
|
|
a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
|
|
|
|
float t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
|
|
|
|
a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
|
|
|
|
d[0] = (float)(t0*alpha + c[0]*beta);
|
|
|
|
d[d_step] = (float)(t1*alpha + c[c_step]*beta);
|
|
|
|
d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
|
|
|
|
d[d_step*3] = (float)(t3*alpha + c[c_step*3]*beta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if( type == CV_64F )
|
|
|
|
{
|
2014-08-13 19:08:27 +08:00
|
|
|
double* d = D.ptr<double>();
|
|
|
|
const double *a = A.ptr<double>(),
|
|
|
|
*b = B.ptr<double>(),
|
2011-04-17 21:14:45 +08:00
|
|
|
*c = (const double*)C.data;
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t d_step = D.step/sizeof(d[0]),
|
|
|
|
a_step = A.step/sizeof(a[0]),
|
|
|
|
b_step = B.step/sizeof(b[0]),
|
2011-04-17 21:14:45 +08:00
|
|
|
c_step = C.data ? C.step/sizeof(c[0]) : 0;
|
2010-05-12 01:44:00 +08:00
|
|
|
if( !c )
|
|
|
|
c = zero;
|
|
|
|
|
|
|
|
switch( len )
|
|
|
|
{
|
|
|
|
case 2:
|
|
|
|
if( len == d_size.width && b != d )
|
|
|
|
{
|
|
|
|
for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
|
|
|
|
{
|
|
|
|
double t0 = a[0]*b[0] + a[1]*b[b_step];
|
|
|
|
double t1 = a[0]*b[1] + a[1]*b[b_step+1];
|
|
|
|
d[0] = t0*alpha + c[0]*beta;
|
|
|
|
d[1] = t1*alpha + c[1]*beta;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if( a != d )
|
|
|
|
{
|
|
|
|
int c_step0 = 1;
|
|
|
|
if( c == zero )
|
|
|
|
{
|
|
|
|
c_step0 = 0;
|
|
|
|
c_step = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
|
|
|
|
{
|
|
|
|
double t0 = a[0]*b[0] + a[1]*b[b_step];
|
|
|
|
double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
|
|
|
|
d[0] = t0*alpha + c[0]*beta;
|
|
|
|
d[d_step] = t1*alpha + c[c_step]*beta;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
return;
|
|
|
|
case 3:
|
|
|
|
if( len == d_size.width && b != d )
|
|
|
|
{
|
|
|
|
for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
|
|
|
|
{
|
|
|
|
double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
|
|
|
|
double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
|
|
|
|
double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
|
|
|
|
d[0] = t0*alpha + c[0]*beta;
|
|
|
|
d[1] = t1*alpha + c[1]*beta;
|
|
|
|
d[2] = t2*alpha + c[2]*beta;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if( a != d )
|
|
|
|
{
|
|
|
|
int c_step0 = 1;
|
|
|
|
if( c == zero )
|
|
|
|
{
|
|
|
|
c_step0 = 0;
|
|
|
|
c_step = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
|
|
|
|
{
|
|
|
|
double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
|
|
|
|
double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
|
|
|
|
double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];
|
|
|
|
|
|
|
|
d[0] = t0*alpha + c[0]*beta;
|
|
|
|
d[d_step] = t1*alpha + c[c_step]*beta;
|
|
|
|
d[d_step*2] = t2*alpha + c[c_step*2]*beta;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
return;
|
|
|
|
case 4:
|
|
|
|
if( len == d_size.width && b != d )
|
|
|
|
{
|
|
|
|
for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
|
|
|
|
{
|
|
|
|
double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
|
|
|
|
double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
|
|
|
|
double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
|
|
|
|
double t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
|
|
|
|
d[0] = t0*alpha + c[0]*beta;
|
|
|
|
d[1] = t1*alpha + c[1]*beta;
|
|
|
|
d[2] = t2*alpha + c[2]*beta;
|
|
|
|
d[3] = t3*alpha + c[3]*beta;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if( d_size.width <= 16 && a != d )
|
|
|
|
{
|
|
|
|
int c_step0 = 1;
|
|
|
|
if( c == zero )
|
|
|
|
{
|
|
|
|
c_step0 = 0;
|
|
|
|
c_step = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
|
|
|
|
{
|
|
|
|
double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
|
|
|
|
double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
|
|
|
|
a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
|
|
|
|
double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
|
|
|
|
a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
|
|
|
|
double t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
|
|
|
|
a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
|
|
|
|
d[0] = t0*alpha + c[0]*beta;
|
|
|
|
d[d_step] = t1*alpha + c[c_step]*beta;
|
|
|
|
d[d_step*2] = t2*alpha + c[c_step*2]*beta;
|
|
|
|
d[d_step*3] = t3*alpha + c[c_step*3]*beta;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
size_t b_step = B.step;
|
|
|
|
GEMMSingleMulFunc singleMulFunc;
|
|
|
|
GEMMBlockMulFunc blockMulFunc;
|
|
|
|
GEMMStoreFunc storeFunc;
|
2016-06-03 15:38:30 +08:00
|
|
|
Mat *matD = &D;
|
2011-04-17 21:14:45 +08:00
|
|
|
const uchar* Cdata = C.data;
|
|
|
|
size_t Cstep = C.data ? (size_t)C.step : 0;
|
2010-05-12 01:44:00 +08:00
|
|
|
AutoBuffer<uchar> buf;
|
|
|
|
|
|
|
|
if( type == CV_32FC1 )
|
|
|
|
{
|
|
|
|
singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32f;
|
|
|
|
blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32f;
|
|
|
|
storeFunc = (GEMMStoreFunc)GEMMStore_32f;
|
|
|
|
}
|
|
|
|
else if( type == CV_64FC1 )
|
|
|
|
{
|
|
|
|
singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64f;
|
|
|
|
blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64f;
|
|
|
|
storeFunc = (GEMMStoreFunc)GEMMStore_64f;
|
|
|
|
}
|
|
|
|
else if( type == CV_32FC2 )
|
|
|
|
{
|
|
|
|
singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32fc;
|
|
|
|
blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32fc;
|
|
|
|
storeFunc = (GEMMStoreFunc)GEMMStore_32fc;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
CV_Assert( type == CV_64FC2 );
|
|
|
|
singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64fc;
|
|
|
|
blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64fc;
|
|
|
|
storeFunc = (GEMMStoreFunc)GEMMStore_64fc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( (d_size.width == 1 || len == 1) && !(flags & GEMM_2_T) && B.isContinuous() )
|
|
|
|
{
|
|
|
|
b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type);
|
|
|
|
flags |= GEMM_2_T;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 )
|
|
|
|
{
|
|
|
|
blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p :
|
|
|
|
type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p :
|
|
|
|
type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p :
|
|
|
|
type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( blas_func )
|
|
|
|
{
|
|
|
|
const char* transa = flags & GEMM_1_T ? "t" : "n";
|
|
|
|
const char* transb = flags & GEMM_2_T ? "t" : "n";
|
|
|
|
int lda, ldb, ldd;
|
|
|
|
|
|
|
|
if( C->data.ptr )
|
|
|
|
{
|
|
|
|
if( C->data.ptr != D->data.ptr )
|
|
|
|
{
|
|
|
|
if( !(flags & GEMM_3_T) )
|
|
|
|
cvCopy( C, D );
|
|
|
|
else
|
|
|
|
cvTranspose( C, D );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if( CV_MAT_DEPTH(type) == CV_32F )
|
|
|
|
{
|
|
|
|
Complex32f _alpha, _beta;
|
|
|
|
|
|
|
|
lda = A->step/sizeof(float);
|
|
|
|
ldb = b_step/sizeof(float);
|
|
|
|
ldd = D->step/sizeof(float);
|
|
|
|
_alpha.re = (float)alpha;
|
|
|
|
_alpha.im = 0;
|
|
|
|
_beta.re = C->data.ptr ? (float)beta : 0;
|
|
|
|
_beta.im = 0;
|
|
|
|
if( CV_MAT_CN(type) == 2 )
|
|
|
|
lda /= 2, ldb /= 2, ldd /= 2;
|
|
|
|
|
|
|
|
blas_func( transb, transa, &d_size.width, &d_size.height, &len,
|
|
|
|
&_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
|
|
|
|
&_beta, D->data.ptr, &ldd );
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
CvComplex64f _alpha, _beta;
|
|
|
|
|
|
|
|
lda = A->step/sizeof(double);
|
|
|
|
ldb = b_step/sizeof(double);
|
|
|
|
ldd = D->step/sizeof(double);
|
|
|
|
_alpha.re = alpha;
|
|
|
|
_alpha.im = 0;
|
|
|
|
_beta.re = C->data.ptr ? beta : 0;
|
|
|
|
_beta.im = 0;
|
|
|
|
if( CV_MAT_CN(type) == 2 )
|
|
|
|
lda /= 2, ldb /= 2, ldd /= 2;
|
|
|
|
|
|
|
|
blas_func( transb, transa, &d_size.width, &d_size.height, &len,
|
|
|
|
&_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
|
|
|
|
&_beta, D->data.ptr, &ldd );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else*/ if( ((d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2) &&
|
|
|
|
len <= 10000) || len <= 10 ||
|
|
|
|
(d_size.width <= block_lin_size &&
|
|
|
|
d_size.height <= block_lin_size && len <= block_lin_size) )
|
|
|
|
{
|
2014-08-13 19:08:27 +08:00
|
|
|
singleMulFunc( A.ptr(), A.step, B.ptr(), b_step, Cdata, Cstep,
|
|
|
|
matD->ptr(), matD->step, a_size, d_size, alpha, beta, flags );
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
int is_a_t = flags & GEMM_1_T;
|
|
|
|
int is_b_t = flags & GEMM_2_T;
|
|
|
|
int elem_size = CV_ELEM_SIZE(type);
|
|
|
|
int dk0_1, dk0_2;
|
2015-04-29 16:58:49 +08:00
|
|
|
size_t a_buf_size = 0, b_buf_size, d_buf_size;
|
2010-05-12 01:44:00 +08:00
|
|
|
uchar* a_buf = 0;
|
|
|
|
uchar* b_buf = 0;
|
|
|
|
uchar* d_buf = 0;
|
|
|
|
int j, k, di = 0, dj = 0, dk = 0;
|
|
|
|
int dm0, dn0, dk0;
|
|
|
|
size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1;
|
|
|
|
int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0);
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
if( !is_a_t )
|
|
|
|
a_step0 = A.step, a_step1 = elem_size;
|
|
|
|
else
|
|
|
|
a_step0 = elem_size, a_step1 = A.step;
|
|
|
|
|
|
|
|
if( !is_b_t )
|
|
|
|
b_step0 = b_step, b_step1 = elem_size;
|
|
|
|
else
|
|
|
|
b_step0 = elem_size, b_step1 = b_step;
|
|
|
|
|
2014-08-13 19:08:27 +08:00
|
|
|
if( C.empty() )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
c_step0 = c_step1 = 0;
|
|
|
|
flags &= ~GEMM_3_T;
|
|
|
|
}
|
|
|
|
else if( !(flags & GEMM_3_T) )
|
2011-04-17 21:14:45 +08:00
|
|
|
c_step0 = C.step, c_step1 = elem_size;
|
2010-05-12 01:44:00 +08:00
|
|
|
else
|
2011-04-17 21:14:45 +08:00
|
|
|
c_step0 = elem_size, c_step1 = C.step;
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
dm0 = std::min( block_lin_size, d_size.height );
|
|
|
|
dn0 = std::min( block_lin_size, d_size.width );
|
|
|
|
dk0_1 = block_size / dm0;
|
|
|
|
dk0_2 = block_size / dn0;
|
|
|
|
dk0 = std::min( dk0_1, dk0_2 );
|
|
|
|
dk0 = std::min( dk0, len );
|
|
|
|
if( dk0*dm0 > block_size )
|
|
|
|
dm0 = block_size / dk0;
|
|
|
|
if( dk0*dn0 > block_size )
|
|
|
|
dn0 = block_size / dk0;
|
|
|
|
|
|
|
|
dk0_1 = (dn0+dn0/8+2) & -2;
|
2015-04-29 16:58:49 +08:00
|
|
|
b_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*elem_size;
|
|
|
|
d_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*work_elem_size;
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
if( is_a_t )
|
|
|
|
{
|
2015-04-29 16:58:49 +08:00
|
|
|
a_buf_size = (size_t)(dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size;
|
2010-05-12 01:44:00 +08:00
|
|
|
flags &= ~GEMM_1_T;
|
|
|
|
}
|
|
|
|
|
2016-06-03 15:38:30 +08:00
|
|
|
buf.allocate(d_buf_size + b_buf_size + a_buf_size);
|
2010-05-12 01:44:00 +08:00
|
|
|
d_buf = (uchar*)buf;
|
|
|
|
b_buf = d_buf + d_buf_size;
|
|
|
|
|
|
|
|
if( is_a_t )
|
|
|
|
a_buf = b_buf + b_buf_size;
|
|
|
|
|
|
|
|
for( i = 0; i < d_size.height; i += di )
|
|
|
|
{
|
|
|
|
di = dm0;
|
|
|
|
if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height )
|
|
|
|
di = d_size.height - i;
|
|
|
|
|
|
|
|
for( j = 0; j < d_size.width; j += dj )
|
|
|
|
{
|
2014-08-13 19:08:27 +08:00
|
|
|
uchar* _d = matD->ptr() + i*matD->step + j*elem_size;
|
2010-05-12 01:44:00 +08:00
|
|
|
const uchar* _c = Cdata + i*c_step0 + j*c_step1;
|
|
|
|
size_t _d_step = matD->step;
|
|
|
|
dj = dn0;
|
|
|
|
|
|
|
|
if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width )
|
|
|
|
dj = d_size.width - j;
|
|
|
|
|
|
|
|
flags &= 15;
|
|
|
|
if( dk0 < len )
|
|
|
|
{
|
|
|
|
_d = d_buf;
|
|
|
|
_d_step = dj*work_elem_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( k = 0; k < len; k += dk )
|
|
|
|
{
|
2014-08-13 19:08:27 +08:00
|
|
|
const uchar* _a = A.ptr() + i*a_step0 + k*a_step1;
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t _a_step = A.step;
|
2014-08-13 19:08:27 +08:00
|
|
|
const uchar* _b = B.ptr() + k*b_step0 + j*b_step1;
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t _b_step = b_step;
|
|
|
|
Size a_bl_size;
|
|
|
|
|
|
|
|
dk = dk0;
|
|
|
|
if( k + dk >= len || 8*(k + dk) + dk > 8*len )
|
|
|
|
dk = len - k;
|
|
|
|
|
|
|
|
if( !is_a_t )
|
|
|
|
a_bl_size.width = dk, a_bl_size.height = di;
|
|
|
|
else
|
|
|
|
a_bl_size.width = di, a_bl_size.height = dk;
|
|
|
|
|
|
|
|
if( a_buf && is_a_t )
|
|
|
|
{
|
|
|
|
_a_step = dk*elem_size;
|
|
|
|
GEMM_TransposeBlock( _a, A.step, a_buf, _a_step, a_bl_size, elem_size );
|
|
|
|
std::swap( a_bl_size.width, a_bl_size.height );
|
|
|
|
_a = a_buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( dj < d_size.width )
|
|
|
|
{
|
|
|
|
Size b_size;
|
|
|
|
if( !is_b_t )
|
|
|
|
b_size.width = dj, b_size.height = dk;
|
|
|
|
else
|
|
|
|
b_size.width = dk, b_size.height = dj;
|
|
|
|
|
|
|
|
_b_step = b_size.width*elem_size;
|
|
|
|
GEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size );
|
|
|
|
_b = b_buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( dk0 < len )
|
|
|
|
blockMulFunc( _a, _a_step, _b, _b_step, _d, _d_step,
|
|
|
|
a_bl_size, Size(dj,di), flags );
|
|
|
|
else
|
|
|
|
singleMulFunc( _a, _a_step, _b, _b_step, _c, Cstep,
|
|
|
|
_d, _d_step, a_bl_size, Size(dj,di), alpha, beta, flags );
|
|
|
|
flags |= 16;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( dk0 < len )
|
|
|
|
storeFunc( _c, Cstep, _d, _d_step,
|
2014-08-13 19:08:27 +08:00
|
|
|
matD->ptr(i) + j*elem_size,
|
2010-05-12 01:44:00 +08:00
|
|
|
matD->step, Size(dj,di), alpha, beta, flags );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-06-03 15:38:30 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename fptype>inline static void
|
2016-06-07 15:58:12 +08:00
|
|
|
callGemmImpl(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
|
2016-06-03 15:38:30 +08:00
|
|
|
const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int m_a, int n_a, int n_d, int flags, int type)
|
|
|
|
{
|
|
|
|
CV_StaticAssert(GEMM_1_T == CV_HAL_GEMM_1_T, "Incompatible GEMM_1_T flag in HAL");
|
|
|
|
CV_StaticAssert(GEMM_2_T == CV_HAL_GEMM_2_T, "Incompatible GEMM_2_T flag in HAL");
|
|
|
|
CV_StaticAssert(GEMM_3_T == CV_HAL_GEMM_3_T, "Incompatible GEMM_3_T flag in HAL");
|
|
|
|
|
|
|
|
int b_m, b_n, c_m, c_n, m_d;
|
|
|
|
|
|
|
|
if(flags & GEMM_2_T)
|
|
|
|
{
|
|
|
|
b_m = n_d;
|
|
|
|
if(flags & GEMM_1_T )
|
|
|
|
{
|
|
|
|
b_n = m_a;
|
|
|
|
m_d = n_a;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
b_n = n_a;
|
|
|
|
m_d = m_a;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
b_n = n_d;
|
|
|
|
if(flags & GEMM_1_T )
|
|
|
|
{
|
|
|
|
b_m = m_a;
|
|
|
|
m_d = n_a;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
m_d = m_a;
|
|
|
|
b_m = n_a;
|
|
|
|
}
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2016-06-03 15:38:30 +08:00
|
|
|
if(flags & GEMM_3_T)
|
|
|
|
{
|
|
|
|
c_m = n_d;
|
|
|
|
c_n = m_d;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
c_m = m_d;
|
|
|
|
c_n = n_d;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
2016-06-03 15:38:30 +08:00
|
|
|
|
|
|
|
Mat A, B, C;
|
|
|
|
if(src1 != NULL)
|
|
|
|
A = Mat(m_a, n_a, type, (void*)src1, src1_step);
|
|
|
|
if(src2 != NULL)
|
|
|
|
B = Mat(b_m, b_n, type, (void*)src2, src2_step);
|
|
|
|
if(src3 != NULL && beta != 0.0)
|
|
|
|
C = Mat(c_m, c_n, type, (void*)src3, src3_step);
|
|
|
|
Mat D(m_d, n_d, type, (void*)dst, dst_step);
|
|
|
|
|
2016-06-07 15:58:12 +08:00
|
|
|
gemmImpl(A, B, alpha, C, beta, D, flags);
|
2016-06-03 15:38:30 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
void cv::hal::gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
|
|
|
|
float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
|
|
|
|
int m_a, int n_a, int n_d, int flags)
|
|
|
|
{
|
|
|
|
|
|
|
|
CALL_HAL(gemm32f, cv_hal_gemm32f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
|
2016-06-07 15:58:12 +08:00
|
|
|
callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32F);
|
2016-06-03 15:38:30 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void cv::hal::gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
|
|
|
|
double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
|
|
|
|
int m_a, int n_a, int n_d, int flags)
|
|
|
|
{
|
|
|
|
CALL_HAL(gemm64f, cv_hal_gemm64f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
|
2016-06-07 15:58:12 +08:00
|
|
|
callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64F);
|
2016-06-03 15:38:30 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
CV_EXPORTS void cv::hal::gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
|
|
|
|
float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
|
|
|
|
int m_a, int n_a, int n_d, int flags)
|
|
|
|
{
|
|
|
|
CALL_HAL(gemm32fc, cv_hal_gemm32fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
|
2016-06-07 15:58:12 +08:00
|
|
|
callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32FC2);
|
2016-06-03 15:38:30 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
CV_EXPORTS void cv::hal::gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
|
|
|
|
double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
|
|
|
|
int m_a, int n_a, int n_d, int flags)
|
|
|
|
{
|
|
|
|
CALL_HAL(gemm64fc, cv_hal_gemm64fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
|
2016-06-07 15:58:12 +08:00
|
|
|
callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64FC2);
|
2016-06-03 15:38:30 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void cv::gemm( InputArray matA, InputArray matB, double alpha,
|
|
|
|
InputArray matC, double beta, OutputArray _matD, int flags )
|
|
|
|
{
|
|
|
|
#ifdef HAVE_CLAMDBLAS
|
|
|
|
CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() &&
|
|
|
|
matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes
|
|
|
|
ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags))
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef HAVE_OPENCL
|
|
|
|
CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2,
|
|
|
|
ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags))
|
|
|
|
#endif
|
|
|
|
|
|
|
|
Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0.0 ? matC.getMat() : Mat();
|
|
|
|
Size a_size = A.size(), d_size;
|
|
|
|
int len = 0, type = A.type();
|
|
|
|
|
|
|
|
CV_Assert( type == B.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
|
|
|
|
|
|
|
|
switch( flags & (GEMM_1_T|GEMM_2_T) )
|
|
|
|
{
|
|
|
|
case 0:
|
|
|
|
d_size = Size( B.cols, a_size.height );
|
|
|
|
len = B.rows;
|
|
|
|
CV_Assert( a_size.width == len );
|
|
|
|
break;
|
|
|
|
case 1:
|
|
|
|
d_size = Size( B.cols, a_size.width );
|
|
|
|
len = B.rows;
|
|
|
|
CV_Assert( a_size.height == len );
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
d_size = Size( B.rows, a_size.height );
|
|
|
|
len = B.cols;
|
|
|
|
CV_Assert( a_size.width == len );
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
d_size = Size( B.rows, a_size.width );
|
|
|
|
len = B.cols;
|
|
|
|
CV_Assert( a_size.height == len );
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( !C.empty() )
|
|
|
|
{
|
|
|
|
CV_Assert( C.type() == type &&
|
|
|
|
(((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) ||
|
|
|
|
((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height)));
|
|
|
|
}
|
|
|
|
|
|
|
|
_matD.create( d_size.height, d_size.width, type );
|
|
|
|
Mat D = _matD.getMat();
|
|
|
|
if( (flags & GEMM_3_T) != 0 && C.data == D.data )
|
|
|
|
{
|
|
|
|
transpose( C, C );
|
|
|
|
flags &= ~GEMM_3_T;
|
|
|
|
}
|
|
|
|
|
|
|
|
Mat *DProxyPtr = &D, DProxy;
|
|
|
|
if( D.data == A.data || D.data == B.data )
|
|
|
|
{
|
|
|
|
DProxy = Mat(d_size.height, d_size.width, D.type());
|
|
|
|
DProxyPtr = &DProxy;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( type == CV_32FC1 )
|
|
|
|
hal::gemm32f(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
|
|
|
|
C.ptr<float>(), C.step, static_cast<float>(beta),
|
|
|
|
DProxyPtr->ptr<float>(), DProxyPtr->step,
|
|
|
|
a_size.height, a_size.width, DProxyPtr->cols, flags);
|
|
|
|
else if( type == CV_64FC1 )
|
|
|
|
hal::gemm64f(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
|
|
|
|
C.ptr<double>(), C.step, beta,
|
|
|
|
DProxyPtr->ptr<double>(), DProxyPtr->step,
|
|
|
|
a_size.height, a_size.width, DProxyPtr->cols, flags);
|
|
|
|
else if( type == CV_32FC2 )
|
|
|
|
hal::gemm32fc(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
|
|
|
|
C.ptr<float>(), C.step, static_cast<float>(beta),
|
|
|
|
DProxyPtr->ptr<float>(), DProxyPtr->step,
|
|
|
|
a_size.height, a_size.width, DProxyPtr->cols, flags);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
CV_Assert( type == CV_64FC2 );
|
|
|
|
hal::gemm64fc(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
|
|
|
|
C.ptr<double>(), C.step, beta,
|
|
|
|
D.ptr<double>(), D.step,
|
|
|
|
a_size.height, a_size.width, DProxyPtr->cols, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
if(DProxyPtr != &D)
|
|
|
|
DProxyPtr->copyTo(D);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/****************************************************************************************\
|
|
|
|
* Transform *
|
|
|
|
\****************************************************************************************/
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
namespace cv
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
|
|
|
|
template<typename T, typename WT> static void
|
2011-04-17 21:14:45 +08:00
|
|
|
transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
int x;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( scn == 2 && dcn == 2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
for( x = 0; x < len*2; x += 2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
WT v0 = src[x], v1 = src[x+1];
|
|
|
|
T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]);
|
|
|
|
T t1 = saturate_cast<T>(m[3]*v0 + m[4]*v1 + m[5]);
|
|
|
|
dst[x] = t0; dst[x+1] = t1;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else if( scn == 3 && dcn == 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
for( x = 0; x < len*3; x += 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
WT v0 = src[x], v1 = src[x+1], v2 = src[x+2];
|
|
|
|
T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
|
|
|
|
T t1 = saturate_cast<T>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
|
|
|
|
T t2 = saturate_cast<T>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
|
|
|
|
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else if( scn == 3 && dcn == 1 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
for( x = 0; x < len; x++, src += 3 )
|
|
|
|
dst[x] = saturate_cast<T>(m[0]*src[0] + m[1]*src[1] + m[2]*src[2] + m[3]);
|
|
|
|
}
|
|
|
|
else if( scn == 4 && dcn == 4 )
|
|
|
|
{
|
|
|
|
for( x = 0; x < len*4; x += 4 )
|
|
|
|
{
|
|
|
|
WT v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3];
|
|
|
|
T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]*v3 + m[4]);
|
|
|
|
T t1 = saturate_cast<T>(m[5]*v0 + m[6]*v1 + m[7]*v2 + m[8]*v3 + m[9]);
|
|
|
|
dst[x] = t0; dst[x+1] = t1;
|
|
|
|
t0 = saturate_cast<T>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]);
|
|
|
|
t1 = saturate_cast<T>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]);
|
|
|
|
dst[x+2] = t0; dst[x+3] = t1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
for( x = 0; x < len; x++, src += scn, dst += dcn )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
const WT* _m = m;
|
2011-04-17 21:14:45 +08:00
|
|
|
int j, k;
|
|
|
|
for( j = 0; j < dcn; j++, _m += scn + 1 )
|
|
|
|
{
|
|
|
|
WT s = _m[scn];
|
|
|
|
for( k = 0; k < scn; k++ )
|
|
|
|
s += _m[k]*src[k];
|
|
|
|
dst[j] = saturate_cast<T>(s);
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if CV_SSE2
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
load3x3Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3 )
|
|
|
|
{
|
|
|
|
m0 = _mm_setr_ps(m[0], m[4], m[8], 0);
|
|
|
|
m1 = _mm_setr_ps(m[1], m[5], m[9], 0);
|
|
|
|
m2 = _mm_setr_ps(m[2], m[6], m[10], 0);
|
|
|
|
m3 = _mm_setr_ps(m[3], m[7], m[11], 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
load4x4Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3, __m128& m4 )
|
|
|
|
{
|
|
|
|
m0 = _mm_setr_ps(m[0], m[5], m[10], m[15]);
|
|
|
|
m1 = _mm_setr_ps(m[1], m[6], m[11], m[16]);
|
|
|
|
m2 = _mm_setr_ps(m[2], m[7], m[12], m[17]);
|
|
|
|
m3 = _mm_setr_ps(m[3], m[8], m[13], m[18]);
|
|
|
|
m4 = _mm_setr_ps(m[4], m[9], m[14], m[19]);
|
|
|
|
}
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
#endif
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
#if CV_SSE2
|
2010-05-12 01:44:00 +08:00
|
|
|
const int BITS = 10, SCALE = 1 << BITS;
|
|
|
|
const float MAX_M = (float)(1 << (15 - BITS));
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( USE_SSE2 && scn == 3 && dcn == 3 &&
|
2010-05-12 01:44:00 +08:00
|
|
|
std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 &&
|
|
|
|
std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 &&
|
|
|
|
std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 )
|
|
|
|
{
|
|
|
|
// faster fixed-point transformation
|
|
|
|
short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE),
|
|
|
|
m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE),
|
|
|
|
m11 = saturate_cast<short>(m[5]*SCALE), m12 = saturate_cast<short>(m[6]*SCALE),
|
|
|
|
m20 = saturate_cast<short>(m[8]*SCALE), m21 = saturate_cast<short>(m[9]*SCALE),
|
|
|
|
m22 = saturate_cast<short>(m[10]*SCALE);
|
|
|
|
int m03 = saturate_cast<int>((m[3]+0.5f)*SCALE), m13 = saturate_cast<int>((m[7]+0.5f)*SCALE ),
|
|
|
|
m23 = saturate_cast<int>((m[11]+0.5f)*SCALE);
|
|
|
|
|
|
|
|
__m128i m0 = _mm_setr_epi16(0, m00, m01, m02, m00, m01, m02, 0);
|
|
|
|
__m128i m1 = _mm_setr_epi16(0, m10, m11, m12, m10, m11, m12, 0);
|
|
|
|
__m128i m2 = _mm_setr_epi16(0, m20, m21, m22, m20, m21, m22, 0);
|
|
|
|
__m128i m3 = _mm_setr_epi32(m03, m13, m23, 0);
|
2011-04-17 21:14:45 +08:00
|
|
|
int x = 0;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
for( ; x <= (len - 8)*3; x += 8*3 )
|
|
|
|
{
|
|
|
|
__m128i z = _mm_setzero_si128(), t0, t1, t2, r0, r1;
|
|
|
|
__m128i v0 = _mm_loadl_epi64((const __m128i*)(src + x));
|
|
|
|
__m128i v1 = _mm_loadl_epi64((const __m128i*)(src + x + 8));
|
|
|
|
__m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 16)), v3;
|
|
|
|
v0 = _mm_unpacklo_epi8(v0, z); // b0 g0 r0 b1 g1 r1 b2 g2
|
|
|
|
v1 = _mm_unpacklo_epi8(v1, z); // r2 b3 g3 r3 b4 g4 r4 b5
|
|
|
|
v2 = _mm_unpacklo_epi8(v2, z); // g5 r5 b6 g6 r6 b7 g7 r7
|
|
|
|
|
|
|
|
v3 = _mm_srli_si128(v2, 2); // ? b6 g6 r6 b7 g7 r7 0
|
|
|
|
v2 = _mm_or_si128(_mm_slli_si128(v2, 10), _mm_srli_si128(v1, 6)); // ? b4 g4 r4 b5 g5 r5 ?
|
|
|
|
v1 = _mm_or_si128(_mm_slli_si128(v1, 6), _mm_srli_si128(v0, 10)); // ? b2 g2 r2 b3 g3 r3 ?
|
|
|
|
v0 = _mm_slli_si128(v0, 2); // 0 b0 g0 r0 b1 g1 r1 ?
|
|
|
|
|
|
|
|
// process pixels 0 & 1
|
|
|
|
t0 = _mm_madd_epi16(v0, m0); // a0 b0 a1 b1
|
|
|
|
t1 = _mm_madd_epi16(v0, m1); // c0 d0 c1 d1
|
|
|
|
t2 = _mm_madd_epi16(v0, m2); // e0 f0 e1 f1
|
|
|
|
v0 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
|
|
|
|
t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
|
|
|
|
t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
|
|
|
|
t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
|
|
|
|
r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v0, t1), _mm_unpackhi_epi64(v0,t1)), m3); // B0 G0 R0 0
|
|
|
|
r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B1 G1 R1 0
|
|
|
|
r0 = _mm_srai_epi32(r0, BITS);
|
|
|
|
r1 = _mm_srai_epi32(r1, BITS);
|
|
|
|
v0 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B0 G0 R0 B1 G1 R1 0
|
|
|
|
|
|
|
|
// process pixels 2 & 3
|
|
|
|
t0 = _mm_madd_epi16(v1, m0); // a0 b0 a1 b1
|
|
|
|
t1 = _mm_madd_epi16(v1, m1); // c0 d0 c1 d1
|
|
|
|
t2 = _mm_madd_epi16(v1, m2); // e0 f0 e1 f1
|
|
|
|
v1 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
|
|
|
|
t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
|
|
|
|
t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
|
|
|
|
t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
|
|
|
|
r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v1, t1), _mm_unpackhi_epi64(v1,t1)), m3); // B2 G2 R2 0
|
|
|
|
r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B3 G3 R3 0
|
|
|
|
r0 = _mm_srai_epi32(r0, BITS);
|
|
|
|
r1 = _mm_srai_epi32(r1, BITS);
|
|
|
|
v1 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B2 G2 R2 B3 G3 R3 0
|
|
|
|
|
|
|
|
// process pixels 4 & 5
|
|
|
|
t0 = _mm_madd_epi16(v2, m0); // a0 b0 a1 b1
|
|
|
|
t1 = _mm_madd_epi16(v2, m1); // c0 d0 c1 d1
|
|
|
|
t2 = _mm_madd_epi16(v2, m2); // e0 f0 e1 f1
|
|
|
|
v2 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
|
|
|
|
t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
|
|
|
|
t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
|
|
|
|
t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
|
|
|
|
r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v2, t1), _mm_unpackhi_epi64(v2,t1)), m3); // B4 G4 R4 0
|
|
|
|
r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B5 G5 R5 0
|
|
|
|
r0 = _mm_srai_epi32(r0, BITS);
|
|
|
|
r1 = _mm_srai_epi32(r1, BITS);
|
|
|
|
v2 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B4 G4 R4 B5 G5 R5 0
|
|
|
|
|
|
|
|
// process pixels 6 & 7
|
|
|
|
t0 = _mm_madd_epi16(v3, m0); // a0 b0 a1 b1
|
|
|
|
t1 = _mm_madd_epi16(v3, m1); // c0 d0 c1 d1
|
|
|
|
t2 = _mm_madd_epi16(v3, m2); // e0 f0 e1 f1
|
|
|
|
v3 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
|
|
|
|
t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
|
|
|
|
t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
|
|
|
|
t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
|
|
|
|
r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v3, t1), _mm_unpackhi_epi64(v3,t1)), m3); // B6 G6 R6 0
|
|
|
|
r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B7 G7 R7 0
|
|
|
|
r0 = _mm_srai_epi32(r0, BITS);
|
|
|
|
r1 = _mm_srai_epi32(r1, BITS);
|
|
|
|
v3 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B6 G6 R6 B7 G7 R7 0
|
|
|
|
|
|
|
|
v0 = _mm_or_si128(_mm_srli_si128(v0, 1), _mm_slli_si128(v1, 5));
|
|
|
|
v1 = _mm_or_si128(_mm_srli_si128(v1, 3), _mm_slli_si128(v2, 3));
|
|
|
|
v2 = _mm_or_si128(_mm_srli_si128(v2, 5), _mm_slli_si128(v3, 1));
|
|
|
|
_mm_storel_epi64((__m128i*)(dst + x), v0);
|
|
|
|
_mm_storel_epi64((__m128i*)(dst + x + 8), v1);
|
|
|
|
_mm_storel_epi64((__m128i*)(dst + x + 16), v2);
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; x < len*3; x += 3 )
|
|
|
|
{
|
|
|
|
int v0 = src[x], v1 = src[x+1], v2 = src[x+2];
|
|
|
|
uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03)>>BITS);
|
|
|
|
uchar t1 = saturate_cast<uchar>((m10*v0 + m11*v1 + m12*v2 + m13)>>BITS);
|
|
|
|
uchar t2 = saturate_cast<uchar>((m20*v0 + m21*v1 + m22*v2 + m23)>>BITS);
|
|
|
|
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
#endif
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
transform_(src, dst, m, len, scn, dcn);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-19 05:24:57 +08:00
|
|
|
#if CV_SSE2
|
2011-04-17 21:14:45 +08:00
|
|
|
if( USE_SSE2 && scn == 3 && dcn == 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
__m128 m0, m1, m2, m3;
|
|
|
|
__m128i delta = _mm_setr_epi16(0,-32768,-32768,-32768,-32768,-32768,-32768,0);
|
|
|
|
load3x3Matrix(m, m0, m1, m2, m3);
|
|
|
|
m3 = _mm_sub_ps(m3, _mm_setr_ps(32768.f, 32768.f, 32768.f, 0.f));
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
int x = 0;
|
|
|
|
for( ; x <= (len - 4)*3; x += 4*3 )
|
|
|
|
{
|
|
|
|
__m128i z = _mm_setzero_si128();
|
|
|
|
__m128i v0 = _mm_loadu_si128((const __m128i*)(src + x)), v1;
|
|
|
|
__m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 8)), v3;
|
|
|
|
v1 = _mm_unpacklo_epi16(_mm_srli_si128(v0, 6), z); // b1 g1 r1
|
|
|
|
v3 = _mm_unpacklo_epi16(_mm_srli_si128(v2, 2), z); // b3 g3 r3
|
|
|
|
v2 = _mm_or_si128(_mm_srli_si128(v0, 12), _mm_slli_si128(v2, 4));
|
|
|
|
v0 = _mm_unpacklo_epi16(v0, z); // b0 g0 r0
|
|
|
|
v2 = _mm_unpacklo_epi16(v2, z); // b2 g2 r2
|
|
|
|
__m128 x0 = _mm_cvtepi32_ps(v0), x1 = _mm_cvtepi32_ps(v1);
|
|
|
|
__m128 x2 = _mm_cvtepi32_ps(v2), x3 = _mm_cvtepi32_ps(v3);
|
|
|
|
__m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
|
|
|
|
_mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))),
|
|
|
|
_mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))),
|
|
|
|
_mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), m3);
|
|
|
|
__m128 y1 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
|
|
|
|
_mm_mul_ps(m0, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(0,0,0,0))),
|
|
|
|
_mm_mul_ps(m1, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(1,1,1,1)))),
|
|
|
|
_mm_mul_ps(m2, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(2,2,2,2)))), m3);
|
|
|
|
__m128 y2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
|
|
|
|
_mm_mul_ps(m0, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(0,0,0,0))),
|
|
|
|
_mm_mul_ps(m1, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(1,1,1,1)))),
|
|
|
|
_mm_mul_ps(m2, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(2,2,2,2)))), m3);
|
|
|
|
__m128 y3 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
|
|
|
|
_mm_mul_ps(m0, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(0,0,0,0))),
|
|
|
|
_mm_mul_ps(m1, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(1,1,1,1)))),
|
|
|
|
_mm_mul_ps(m2, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(2,2,2,2)))), m3);
|
|
|
|
v0 = _mm_cvtps_epi32(y0); v1 = _mm_cvtps_epi32(y1);
|
|
|
|
v2 = _mm_cvtps_epi32(y2); v3 = _mm_cvtps_epi32(y3);
|
|
|
|
|
|
|
|
v0 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v0,4), v1), delta); // 0 b0 g0 r0 b1 g1 r1 0
|
|
|
|
v2 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v2,4), v3), delta); // 0 b2 g2 r2 b3 g3 r3 0
|
|
|
|
v1 = _mm_or_si128(_mm_srli_si128(v0,2), _mm_slli_si128(v2,10)); // b0 g0 r0 b1 g1 r1 b2 g2
|
|
|
|
v2 = _mm_srli_si128(v2, 6); // r2 b3 g3 r3 0 0 0 0
|
|
|
|
_mm_storeu_si128((__m128i*)(dst + x), v1);
|
|
|
|
_mm_storel_epi64((__m128i*)(dst + x + 8), v2);
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; x < len*3; x += 3 )
|
|
|
|
{
|
|
|
|
float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
|
|
|
|
ushort t0 = saturate_cast<ushort>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
|
|
|
|
ushort t1 = saturate_cast<ushort>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
|
|
|
|
ushort t2 = saturate_cast<ushort>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
|
|
|
|
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
#endif
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
transform_(src, dst, m, len, scn, dcn);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
#if CV_SSE2
|
|
|
|
if( USE_SSE2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
int x = 0;
|
|
|
|
if( scn == 3 && dcn == 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
__m128 m0, m1, m2, m3;
|
|
|
|
load3x3Matrix(m, m0, m1, m2, m3);
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
for( ; x < (len - 1)*3; x += 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
__m128 x0 = _mm_loadu_ps(src + x);
|
|
|
|
__m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
|
|
|
|
_mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))),
|
|
|
|
_mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))),
|
|
|
|
_mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), m3);
|
|
|
|
_mm_storel_pi((__m64*)(dst + x), y0);
|
|
|
|
_mm_store_ss(dst + x + 2, _mm_movehl_ps(y0,y0));
|
|
|
|
}
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
for( ; x < len*3; x += 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
|
|
|
|
float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
|
|
|
|
float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
|
|
|
|
float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
|
2010-05-12 01:44:00 +08:00
|
|
|
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
return;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( scn == 4 && dcn == 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
__m128 m0, m1, m2, m3, m4;
|
|
|
|
load4x4Matrix(m, m0, m1, m2, m3, m4);
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
for( ; x < len*4; x += 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
__m128 x0 = _mm_loadu_ps(src + x);
|
|
|
|
__m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(
|
2011-04-17 21:14:45 +08:00
|
|
|
_mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))),
|
|
|
|
_mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))),
|
|
|
|
_mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))),
|
|
|
|
_mm_mul_ps(m3, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(3,3,3,3)))), m4);
|
2010-05-12 01:44:00 +08:00
|
|
|
_mm_storeu_ps(dst + x, y0);
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
return;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
#endif
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
transform_(src, dst, m, len, scn, dcn);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
transform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
transform_(src, dst, m, len, scn, dcn);
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
transform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
transform_(src, dst, m, len, scn, dcn);
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
transform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
transform_(src, dst, m, len, scn, dcn);
|
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
transform_(src, dst, m, len, scn, dcn);
|
2011-04-19 05:24:57 +08:00
|
|
|
}
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
template<typename T, typename WT> static void
|
|
|
|
diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int )
|
|
|
|
{
|
|
|
|
int x;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( cn == 2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
for( x = 0; x < len*2; x += 2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
T t0 = saturate_cast<T>(m[0]*src[x] + m[2]);
|
|
|
|
T t1 = saturate_cast<T>(m[4]*src[x+1] + m[5]);
|
|
|
|
dst[x] = t0; dst[x+1] = t1;
|
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else if( cn == 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
for( x = 0; x < len*3; x += 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
T t0 = saturate_cast<T>(m[0]*src[x] + m[3]);
|
|
|
|
T t1 = saturate_cast<T>(m[5]*src[x+1] + m[7]);
|
|
|
|
T t2 = saturate_cast<T>(m[10]*src[x+2] + m[11]);
|
|
|
|
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
|
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else if( cn == 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
for( x = 0; x < len*4; x += 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
T t0 = saturate_cast<T>(m[0]*src[x] + m[4]);
|
|
|
|
T t1 = saturate_cast<T>(m[6]*src[x+1] + m[9]);
|
|
|
|
dst[x] = t0; dst[x+1] = t1;
|
|
|
|
t0 = saturate_cast<T>(m[12]*src[x+2] + m[14]);
|
|
|
|
t1 = saturate_cast<T>(m[18]*src[x+3] + m[19]);
|
|
|
|
dst[x+2] = t0; dst[x+3] = t1;
|
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else
|
|
|
|
{
|
|
|
|
for( x = 0; x < len; x++, src += cn, dst += cn )
|
|
|
|
{
|
|
|
|
const WT* _m = m;
|
|
|
|
for( int j = 0; j < cn; j++, _m += cn + 1 )
|
2011-04-18 23:14:32 +08:00
|
|
|
dst[j] = saturate_cast<T>(src[j]*_m[j] + _m[cn]);
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn)
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
diagtransform_(src, dst, m, len, scn, dcn);
|
2011-04-19 05:24:57 +08:00
|
|
|
}
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
diagtransform_(src, dst, m, len, scn, dcn);
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
diagtransform_(src, dst, m, len, scn, dcn);
|
2011-04-19 05:24:57 +08:00
|
|
|
}
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
diagtransform_(src, dst, m, len, scn, dcn);
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
diagtransform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
diagtransform_(src, dst, m, len, scn, dcn);
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
diagtransform_(src, dst, m, len, scn, dcn);
|
2011-04-19 05:24:57 +08:00
|
|
|
}
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
diagtransform_(src, dst, m, len, scn, dcn);
|
2011-04-19 05:24:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int );
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2013-08-15 15:01:40 +08:00
|
|
|
static TransformFunc getTransformFunc(int depth)
|
2011-04-17 21:14:45 +08:00
|
|
|
{
|
2013-08-15 15:01:40 +08:00
|
|
|
static TransformFunc transformTab[] =
|
|
|
|
{
|
|
|
|
(TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u,
|
|
|
|
(TransformFunc)transform_16s, (TransformFunc)transform_32s, (TransformFunc)transform_32f,
|
|
|
|
(TransformFunc)transform_64f, 0
|
|
|
|
};
|
|
|
|
|
|
|
|
return transformTab[depth];
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2013-08-15 15:01:40 +08:00
|
|
|
static TransformFunc getDiagTransformFunc(int depth)
|
2011-04-17 21:14:45 +08:00
|
|
|
{
|
2013-08-15 15:01:40 +08:00
|
|
|
static TransformFunc diagTransformTab[] =
|
|
|
|
{
|
|
|
|
(TransformFunc)diagtransform_8u, (TransformFunc)diagtransform_8s, (TransformFunc)diagtransform_16u,
|
|
|
|
(TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f,
|
|
|
|
(TransformFunc)diagtransform_64f, 0
|
|
|
|
};
|
|
|
|
|
|
|
|
return diagTransformTab[depth];
|
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-06-06 22:51:27 +08:00
|
|
|
void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx )
|
2011-04-17 21:14:45 +08:00
|
|
|
{
|
2016-08-18 14:53:00 +08:00
|
|
|
CV_INSTRUMENT_REGION()
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
Mat src = _src.getMat(), m = _mtx.getMat();
|
|
|
|
int depth = src.depth(), scn = src.channels(), dcn = m.rows;
|
|
|
|
CV_Assert( scn == m.cols || scn + 1 == m.cols );
|
|
|
|
bool isDiag = false;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
_dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
|
|
|
|
Mat dst = _dst.getMat();
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F;
|
2011-04-17 21:14:45 +08:00
|
|
|
AutoBuffer<double> _mbuf;
|
2011-06-18 00:14:47 +08:00
|
|
|
double* mbuf;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
_mbuf.allocate(dcn*(scn+1));
|
2011-06-18 00:14:47 +08:00
|
|
|
mbuf = (double*)_mbuf;
|
|
|
|
Mat tmp(dcn, scn+1, mtype, mbuf);
|
2014-08-13 19:08:27 +08:00
|
|
|
memset(tmp.ptr(), 0, tmp.total()*tmp.elemSize());
|
2011-04-17 21:14:45 +08:00
|
|
|
if( m.cols == scn+1 )
|
|
|
|
m.convertTo(tmp, mtype);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Mat tmppart = tmp.colRange(0, m.cols);
|
|
|
|
m.convertTo(tmppart, mtype);
|
|
|
|
}
|
|
|
|
m = tmp;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else
|
2014-08-13 19:08:27 +08:00
|
|
|
mbuf = m.ptr<double>();
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
if( scn == dcn )
|
|
|
|
{
|
|
|
|
int i, j;
|
|
|
|
double eps = mtype == CV_32F ? FLT_EPSILON : DBL_EPSILON;
|
|
|
|
|
|
|
|
if( scn == 1 )
|
|
|
|
{
|
|
|
|
double alpha, beta;
|
|
|
|
if( mtype == CV_32F )
|
2011-04-17 21:14:45 +08:00
|
|
|
alpha = m.at<float>(0), beta = m.at<float>(1);
|
2010-05-12 01:44:00 +08:00
|
|
|
else
|
2011-04-17 21:14:45 +08:00
|
|
|
alpha = m.at<double>(0), beta = m.at<double>(1);
|
|
|
|
src.convertTo(dst, dst.type(), alpha, beta);
|
2010-05-12 01:44:00 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
for( i = 0, isDiag = true; isDiag && i < scn; i++ )
|
|
|
|
{
|
|
|
|
for( j = 0; isDiag && j < scn; j++ )
|
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
double v = mtype == CV_32F ? m.at<float>(i, j) : m.at<double>(i, j);
|
2010-05-12 01:44:00 +08:00
|
|
|
if( i != j && fabs(v) > eps )
|
|
|
|
isDiag = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-08-15 15:01:40 +08:00
|
|
|
TransformFunc func = isDiag ? getDiagTransformFunc(depth): getTransformFunc(depth);
|
2010-05-12 01:44:00 +08:00
|
|
|
CV_Assert( func != 0 );
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
const Mat* arrays[] = {&src, &dst, 0};
|
|
|
|
uchar* ptrs[2];
|
|
|
|
NAryMatIterator it(arrays, ptrs);
|
|
|
|
size_t i, total = it.size;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
for( i = 0; i < it.nplanes; i++, ++it )
|
|
|
|
func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/****************************************************************************************\
|
|
|
|
* Perspective Transform *
|
|
|
|
\****************************************************************************************/
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
namespace cv
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
template<typename T> static void
|
|
|
|
perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, int dcn )
|
|
|
|
{
|
|
|
|
const double eps = FLT_EPSILON;
|
|
|
|
int i;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( scn == 2 && dcn == 2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
for( i = 0; i < len*2; i += 2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
T x = src[i], y = src[i + 1];
|
|
|
|
double w = x*m[6] + y*m[7] + m[8];
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( fabs(w) > eps )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
w = 1./w;
|
2011-04-17 21:14:45 +08:00
|
|
|
dst[i] = (T)((x*m[0] + y*m[1] + m[2])*w);
|
|
|
|
dst[i+1] = (T)((x*m[3] + y*m[4] + m[5])*w);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
else
|
2011-04-17 21:14:45 +08:00
|
|
|
dst[i] = dst[i+1] = (T)0;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else if( scn == 3 && dcn == 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
for( i = 0; i < len*3; i += 3 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
T x = src[i], y = src[i + 1], z = src[i + 2];
|
|
|
|
double w = x*m[12] + y*m[13] + z*m[14] + m[15];
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( fabs(w) > eps )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
w = 1./w;
|
2011-04-17 21:14:45 +08:00
|
|
|
dst[i] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3]) * w);
|
|
|
|
dst[i+1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7]) * w);
|
|
|
|
dst[i+2] = (T)((x*m[8] + y*m[9] + z*m[10] + m[11]) * w);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
else
|
2011-04-17 21:14:45 +08:00
|
|
|
dst[i] = dst[i+1] = dst[i+2] = (T)0;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else if( scn == 3 && dcn == 2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
for( i = 0; i < len; i++, src += 3, dst += 2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
T x = src[0], y = src[1], z = src[2];
|
2011-04-17 21:14:45 +08:00
|
|
|
double w = x*m[8] + y*m[9] + z*m[10] + m[11];
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( fabs(w) > eps )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
w = 1./w;
|
2011-04-17 21:14:45 +08:00
|
|
|
dst[0] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3])*w);
|
|
|
|
dst[1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7])*w);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
dst[0] = dst[1] = (T)0;
|
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else
|
|
|
|
{
|
|
|
|
for( i = 0; i < len; i++, src += scn, dst += dcn )
|
|
|
|
{
|
|
|
|
const double* _m = m + dcn*(scn + 1);
|
|
|
|
double w = _m[scn];
|
|
|
|
int j, k;
|
|
|
|
for( k = 0; k < scn; k++ )
|
|
|
|
w += _m[k]*src[k];
|
|
|
|
if( fabs(w) > eps )
|
|
|
|
{
|
|
|
|
_m = m;
|
|
|
|
for( j = 0; j < dcn; j++, _m += scn + 1 )
|
|
|
|
{
|
|
|
|
double s = _m[scn];
|
|
|
|
for( k = 0; k < scn; k++ )
|
|
|
|
s += _m[k]*src[k];
|
|
|
|
dst[j] = (T)(s*w);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
for( j = 0; j < dcn; j++ )
|
|
|
|
dst[j] = 0;
|
|
|
|
}
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn)
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
perspectiveTransform_(src, dst, m, len, scn, dcn);
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void
|
|
|
|
perspectiveTransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
|
|
|
|
{
|
|
|
|
perspectiveTransform_(src, dst, m, len, scn, dcn);
|
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-06-06 22:51:27 +08:00
|
|
|
void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mtx )
|
2011-04-17 21:14:45 +08:00
|
|
|
{
|
2016-08-18 14:53:00 +08:00
|
|
|
CV_INSTRUMENT_REGION()
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
Mat src = _src.getMat(), m = _mtx.getMat();
|
|
|
|
int depth = src.depth(), scn = src.channels(), dcn = m.rows-1;
|
2014-06-02 06:27:32 +08:00
|
|
|
CV_Assert( scn + 1 == m.cols );
|
|
|
|
CV_Assert( depth == CV_32F || depth == CV_64F );
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
_dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
|
|
|
|
Mat dst = _dst.getMat();
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
const int mtype = CV_64F;
|
|
|
|
AutoBuffer<double> _mbuf;
|
|
|
|
double* mbuf = _mbuf;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( !m.isContinuous() || m.type() != mtype )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
_mbuf.allocate((dcn+1)*(scn+1));
|
|
|
|
Mat tmp(dcn+1, scn+1, mtype, (double*)_mbuf);
|
|
|
|
m.convertTo(tmp, mtype);
|
|
|
|
m = tmp;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
else
|
2014-08-13 19:08:27 +08:00
|
|
|
mbuf = m.ptr<double>();
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
TransformFunc func = depth == CV_32F ?
|
|
|
|
(TransformFunc)perspectiveTransform_32f :
|
|
|
|
(TransformFunc)perspectiveTransform_64f;
|
|
|
|
CV_Assert( func != 0 );
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
const Mat* arrays[] = {&src, &dst, 0};
|
|
|
|
uchar* ptrs[2];
|
|
|
|
NAryMatIterator it(arrays, ptrs);
|
|
|
|
size_t i, total = it.size;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
for( i = 0; i < it.nplanes; i++, ++it )
|
|
|
|
func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
|
2011-04-19 05:24:57 +08:00
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
/****************************************************************************************\
|
|
|
|
* ScaleAdd *
|
|
|
|
\****************************************************************************************/
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
namespace cv
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
|
|
|
|
int len, float* _alpha)
|
|
|
|
{
|
|
|
|
float alpha = *_alpha;
|
|
|
|
int i = 0;
|
|
|
|
#if CV_SSE2
|
|
|
|
if( USE_SSE2 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
__m128 a4 = _mm_set1_ps(alpha);
|
|
|
|
if( (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 )
|
|
|
|
for( ; i <= len - 8; i += 8 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
__m128 x0, x1, y0, y1, t0, t1;
|
|
|
|
x0 = _mm_load_ps(src1 + i); x1 = _mm_load_ps(src1 + i + 4);
|
|
|
|
y0 = _mm_load_ps(src2 + i); y1 = _mm_load_ps(src2 + i + 4);
|
|
|
|
t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0);
|
|
|
|
t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1);
|
|
|
|
_mm_store_ps(dst + i, t0);
|
|
|
|
_mm_store_ps(dst + i + 4, t1);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
for( ; i <= len - 8; i += 8 )
|
|
|
|
{
|
|
|
|
__m128 x0, x1, y0, y1, t0, t1;
|
|
|
|
x0 = _mm_loadu_ps(src1 + i); x1 = _mm_loadu_ps(src1 + i + 4);
|
|
|
|
y0 = _mm_loadu_ps(src2 + i); y1 = _mm_loadu_ps(src2 + i + 4);
|
|
|
|
t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0);
|
|
|
|
t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1);
|
|
|
|
_mm_storeu_ps(dst + i, t0);
|
|
|
|
_mm_storeu_ps(dst + i + 4, t1);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
else
|
2014-09-24 01:14:42 +08:00
|
|
|
#elif CV_NEON
|
|
|
|
if (true)
|
|
|
|
{
|
|
|
|
for ( ; i <= len - 4; i += 4)
|
|
|
|
{
|
|
|
|
float32x4_t v_src1 = vld1q_f32(src1 + i), v_src2 = vld1q_f32(src2 + i);
|
|
|
|
vst1q_f32(dst + i, vaddq_f32(vmulq_n_f32(v_src1, alpha), v_src2));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
2011-04-17 21:14:45 +08:00
|
|
|
#endif
|
2012-02-10 14:05:04 +08:00
|
|
|
//vz why do we need unroll here?
|
2011-04-17 21:14:45 +08:00
|
|
|
for( ; i <= len - 4; i += 4 )
|
|
|
|
{
|
|
|
|
float t0, t1;
|
|
|
|
t0 = src1[i]*alpha + src2[i];
|
|
|
|
t1 = src1[i+1]*alpha + src2[i+1];
|
|
|
|
dst[i] = t0; dst[i+1] = t1;
|
|
|
|
t0 = src1[i+2]*alpha + src2[i+2];
|
|
|
|
t1 = src1[i+3]*alpha + src2[i+3];
|
|
|
|
dst[i+2] = t0; dst[i+3] = t1;
|
|
|
|
}
|
2012-06-09 23:00:04 +08:00
|
|
|
for(; i < len; i++ )
|
2011-04-17 21:14:45 +08:00
|
|
|
dst[i] = src1[i]*alpha + src2[i];
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
|
|
|
|
int len, double* _alpha)
|
|
|
|
{
|
|
|
|
double alpha = *_alpha;
|
|
|
|
int i = 0;
|
|
|
|
#if CV_SSE2
|
|
|
|
if( USE_SSE2 && (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 )
|
|
|
|
{
|
|
|
|
__m128d a2 = _mm_set1_pd(alpha);
|
|
|
|
for( ; i <= len - 4; i += 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
__m128d x0, x1, y0, y1, t0, t1;
|
|
|
|
x0 = _mm_load_pd(src1 + i); x1 = _mm_load_pd(src1 + i + 2);
|
|
|
|
y0 = _mm_load_pd(src2 + i); y1 = _mm_load_pd(src2 + i + 2);
|
|
|
|
t0 = _mm_add_pd(_mm_mul_pd(x0, a2), y0);
|
|
|
|
t1 = _mm_add_pd(_mm_mul_pd(x1, a2), y1);
|
|
|
|
_mm_store_pd(dst + i, t0);
|
|
|
|
_mm_store_pd(dst + i + 2, t1);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
2012-06-09 23:00:04 +08:00
|
|
|
//vz why do we need unroll here?
|
2011-04-17 21:14:45 +08:00
|
|
|
for( ; i <= len - 4; i += 4 )
|
|
|
|
{
|
|
|
|
double t0, t1;
|
|
|
|
t0 = src1[i]*alpha + src2[i];
|
|
|
|
t1 = src1[i+1]*alpha + src2[i+1];
|
|
|
|
dst[i] = t0; dst[i+1] = t1;
|
|
|
|
t0 = src1[i+2]*alpha + src2[i+2];
|
|
|
|
t1 = src1[i+3]*alpha + src2[i+3];
|
|
|
|
dst[i+2] = t0; dst[i+3] = t1;
|
|
|
|
}
|
2012-06-09 23:00:04 +08:00
|
|
|
for(; i < len; i++ )
|
2011-04-17 21:14:45 +08:00
|
|
|
dst[i] = src1[i]*alpha + src2[i];
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha);
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2014-01-25 01:03:31 +08:00
|
|
|
#ifdef HAVE_OPENCL
|
|
|
|
|
2013-12-29 22:46:25 +08:00
|
|
|
static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
|
|
|
|
{
|
2014-05-14 19:42:30 +08:00
|
|
|
const ocl::Device & d = ocl::Device::getDefault();
|
2014-10-24 18:05:39 +08:00
|
|
|
|
2014-05-14 19:42:30 +08:00
|
|
|
bool doubleSupport = d.doubleFPConfig() > 0;
|
2013-12-29 22:46:25 +08:00
|
|
|
Size size = _src1.size();
|
2014-10-24 18:05:39 +08:00
|
|
|
int depth = CV_MAT_DEPTH(type);
|
2013-12-29 22:46:25 +08:00
|
|
|
if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
|
|
|
|
return false;
|
|
|
|
|
2014-10-24 18:05:39 +08:00
|
|
|
_dst.create(size, type);
|
|
|
|
int cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F);
|
|
|
|
int kercn = ocl::predictOptimalVectorWidthMax(_src1, _src2, _dst),
|
|
|
|
rowsPerWI = d.isIntel() ? 4 : 1;
|
|
|
|
|
2013-12-29 22:46:25 +08:00
|
|
|
char cvt[2][50];
|
|
|
|
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
|
2014-03-08 05:29:27 +08:00
|
|
|
format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
|
2014-05-14 19:42:30 +08:00
|
|
|
" -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s"
|
|
|
|
" -D wdepth=%d%s -D rowsPerWI=%d",
|
2014-03-08 05:29:27 +08:00
|
|
|
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
|
|
|
|
ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
|
|
|
|
ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
|
|
|
|
ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]),
|
|
|
|
ocl::typeToStr(wdepth), wdepth,
|
2014-05-14 19:42:30 +08:00
|
|
|
doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI));
|
2013-12-29 22:46:25 +08:00
|
|
|
if (k.empty())
|
|
|
|
return false;
|
|
|
|
|
2014-10-24 18:05:39 +08:00
|
|
|
UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat();
|
2013-12-29 22:46:25 +08:00
|
|
|
|
|
|
|
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
|
|
|
|
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
|
2014-03-08 05:29:27 +08:00
|
|
|
dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
|
2013-12-29 22:46:25 +08:00
|
|
|
|
|
|
|
if (wdepth == CV_32F)
|
|
|
|
k.args(src1arg, src2arg, dstarg, (float)alpha);
|
|
|
|
else
|
|
|
|
k.args(src1arg, src2arg, dstarg, alpha);
|
|
|
|
|
2015-10-16 22:10:00 +08:00
|
|
|
size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
|
2013-12-29 22:46:25 +08:00
|
|
|
return k.run(2, globalsize, NULL, false);
|
|
|
|
}
|
|
|
|
|
2014-01-25 01:03:31 +08:00
|
|
|
#endif
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-06-06 22:51:27 +08:00
|
|
|
void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst )
|
2011-04-17 21:14:45 +08:00
|
|
|
{
|
2016-08-18 14:53:00 +08:00
|
|
|
CV_INSTRUMENT_REGION()
|
|
|
|
|
2013-12-29 22:46:25 +08:00
|
|
|
int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
|
|
|
CV_Assert( type == _src2.type() );
|
|
|
|
|
2014-01-25 01:03:31 +08:00
|
|
|
CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
|
2013-12-29 22:46:25 +08:00
|
|
|
ocl_scaleAdd(_src1, alpha, _src2, _dst, type))
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( depth < CV_32F )
|
|
|
|
{
|
|
|
|
addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
|
|
|
|
return;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2013-12-29 22:46:25 +08:00
|
|
|
Mat src1 = _src1.getMat(), src2 = _src2.getMat();
|
|
|
|
CV_Assert(src1.size == src2.size);
|
|
|
|
|
2014-04-06 20:09:38 +08:00
|
|
|
_dst.create(src1.dims, src1.size, type);
|
2011-04-17 21:14:45 +08:00
|
|
|
Mat dst = _dst.getMat();
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
float falpha = (float)alpha;
|
|
|
|
void* palpha = depth == CV_32F ? (void*)&falpha : (void*)α
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2012-06-09 23:00:04 +08:00
|
|
|
ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2014-04-07 19:35:51 +08:00
|
|
|
if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous())
|
2011-04-17 21:14:45 +08:00
|
|
|
{
|
|
|
|
size_t len = src1.total()*cn;
|
2014-08-13 19:08:27 +08:00
|
|
|
func(src1.ptr(), src2.ptr(), dst.ptr(), (int)len, palpha);
|
2011-04-17 21:14:45 +08:00
|
|
|
return;
|
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
const Mat* arrays[] = {&src1, &src2, &dst, 0};
|
|
|
|
uchar* ptrs[3];
|
|
|
|
NAryMatIterator it(arrays, ptrs);
|
|
|
|
size_t i, len = it.size*cn;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
for( i = 0; i < it.nplanes; i++, ++it )
|
|
|
|
func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha );
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/****************************************************************************************\
|
|
|
|
* Covariation Matrix *
|
|
|
|
\****************************************************************************************/
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
void cv::calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2016-08-18 14:53:00 +08:00
|
|
|
CV_INSTRUMENT_REGION()
|
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
CV_Assert( data && nsamples > 0 );
|
|
|
|
Size size = data[0].size();
|
2012-04-14 05:50:59 +08:00
|
|
|
int sz = size.width * size.height, esz = (int)data[0].elemSize();
|
2010-05-12 01:44:00 +08:00
|
|
|
int type = data[0].type();
|
|
|
|
Mat mean;
|
|
|
|
ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);
|
|
|
|
|
|
|
|
if( (flags & CV_COVAR_USE_AVG) != 0 )
|
|
|
|
{
|
|
|
|
CV_Assert( _mean.size() == size );
|
|
|
|
if( _mean.isContinuous() && _mean.type() == ctype )
|
|
|
|
mean = _mean.reshape(1, 1);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
_mean.convertTo(mean, ctype);
|
|
|
|
mean = mean.reshape(1, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Mat _data(nsamples, sz, type);
|
2012-04-14 05:50:59 +08:00
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
for( int i = 0; i < nsamples; i++ )
|
|
|
|
{
|
|
|
|
CV_Assert( data[i].size() == size && data[i].type() == type );
|
|
|
|
if( data[i].isContinuous() )
|
2014-08-13 19:08:27 +08:00
|
|
|
memcpy( _data.ptr(i), data[i].ptr(), sz*esz );
|
2010-05-12 01:44:00 +08:00
|
|
|
else
|
|
|
|
{
|
|
|
|
Mat dataRow(size.height, size.width, type, _data.ptr(i));
|
|
|
|
data[i].copyTo(dataRow);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
calcCovarMatrix( _data, covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );
|
|
|
|
if( (flags & CV_COVAR_USE_AVG) == 0 )
|
|
|
|
_mean = mean.reshape(1, size.height);
|
|
|
|
}
|
|
|
|
|
2012-06-09 23:00:04 +08:00
|
|
|
void cv::calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2016-08-18 14:53:00 +08:00
|
|
|
CV_INSTRUMENT_REGION()
|
|
|
|
|
2017-04-19 18:13:39 +08:00
|
|
|
if(_src.kind() == _InputArray::STD_VECTOR_MAT || _src.kind() == _InputArray::STD_ARRAY_MAT)
|
2012-04-14 05:50:59 +08:00
|
|
|
{
|
|
|
|
std::vector<cv::Mat> src;
|
2012-06-09 23:00:04 +08:00
|
|
|
_src.getMatVector(src);
|
2012-04-14 05:50:59 +08:00
|
|
|
|
|
|
|
CV_Assert( src.size() > 0 );
|
|
|
|
|
|
|
|
Size size = src[0].size();
|
|
|
|
int type = src[0].type();
|
|
|
|
|
|
|
|
ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);
|
|
|
|
|
|
|
|
Mat _data(static_cast<int>(src.size()), size.area(), type);
|
|
|
|
|
|
|
|
int i = 0;
|
2016-04-29 21:41:39 +08:00
|
|
|
for(std::vector<cv::Mat>::iterator each = src.begin(); each != src.end(); ++each, ++i )
|
2012-04-14 05:50:59 +08:00
|
|
|
{
|
|
|
|
CV_Assert( (*each).size() == size && (*each).type() == type );
|
|
|
|
Mat dataRow(size.height, size.width, type, _data.ptr(i));
|
|
|
|
(*each).copyTo(dataRow);
|
|
|
|
}
|
|
|
|
|
|
|
|
Mat mean;
|
|
|
|
if( (flags & CV_COVAR_USE_AVG) != 0 )
|
|
|
|
{
|
|
|
|
CV_Assert( _mean.size() == size );
|
|
|
|
|
|
|
|
if( mean.type() != ctype )
|
|
|
|
{
|
|
|
|
mean = _mean.getMat();
|
|
|
|
_mean.create(mean.size(), ctype);
|
|
|
|
Mat tmp = _mean.getMat();
|
|
|
|
mean.convertTo(tmp, ctype);
|
|
|
|
mean = tmp;
|
|
|
|
}
|
|
|
|
|
|
|
|
mean = _mean.getMat().reshape(1, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
calcCovarMatrix( _data, _covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );
|
|
|
|
|
|
|
|
if( (flags & CV_COVAR_USE_AVG) == 0 )
|
|
|
|
{
|
|
|
|
mean = mean.reshape(1, size.height);
|
|
|
|
mean.copyTo(_mean);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2012-06-09 23:00:04 +08:00
|
|
|
Mat data = _src.getMat(), mean;
|
2010-05-12 01:44:00 +08:00
|
|
|
CV_Assert( ((flags & CV_COVAR_ROWS) != 0) ^ ((flags & CV_COVAR_COLS) != 0) );
|
|
|
|
bool takeRows = (flags & CV_COVAR_ROWS) != 0;
|
|
|
|
int type = data.type();
|
|
|
|
int nsamples = takeRows ? data.rows : data.cols;
|
|
|
|
CV_Assert( nsamples > 0 );
|
|
|
|
Size size = takeRows ? Size(data.cols, 1) : Size(1, data.rows);
|
|
|
|
|
|
|
|
if( (flags & CV_COVAR_USE_AVG) != 0 )
|
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
mean = _mean.getMat();
|
|
|
|
ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), mean.depth()), CV_32F);
|
2010-05-12 01:44:00 +08:00
|
|
|
CV_Assert( mean.size() == size );
|
|
|
|
if( mean.type() != ctype )
|
2011-04-17 21:14:45 +08:00
|
|
|
{
|
|
|
|
_mean.create(mean.size(), ctype);
|
|
|
|
Mat tmp = _mean.getMat();
|
|
|
|
mean.convertTo(tmp, ctype);
|
|
|
|
mean = tmp;
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
ctype = std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), CV_32F);
|
2012-06-09 23:00:04 +08:00
|
|
|
reduce( _src, _mean, takeRows ? 0 : 1, CV_REDUCE_AVG, ctype );
|
2011-04-17 21:14:45 +08:00
|
|
|
mean = _mean.getMat();
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
mulTransposed( data, _covar, ((flags & CV_COVAR_NORMAL) == 0) ^ takeRows,
|
2010-05-12 01:44:00 +08:00
|
|
|
mean, (flags & CV_COVAR_SCALE) != 0 ? 1./nsamples : 1, ctype );
|
|
|
|
}
|
|
|
|
|
|
|
|
/****************************************************************************************\
|
|
|
|
* Mahalanobis *
|
|
|
|
\****************************************************************************************/
|
|
|
|
|
2011-06-06 22:51:27 +08:00
|
|
|
double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2016-08-18 14:53:00 +08:00
|
|
|
CV_INSTRUMENT_REGION()
|
|
|
|
|
2012-06-09 23:00:04 +08:00
|
|
|
Mat v1 = _v1.getMat(), v2 = _v2.getMat(), icovar = _icovar.getMat();
|
2010-05-12 01:44:00 +08:00
|
|
|
int type = v1.type(), depth = v1.depth();
|
|
|
|
Size sz = v1.size();
|
|
|
|
int i, j, len = sz.width*sz.height*v1.channels();
|
2010-11-06 22:56:01 +08:00
|
|
|
AutoBuffer<double> buf(len);
|
2010-05-12 01:44:00 +08:00
|
|
|
double result = 0;
|
|
|
|
|
|
|
|
CV_Assert( type == v2.type() && type == icovar.type() &&
|
|
|
|
sz == v2.size() && len == icovar.rows && len == icovar.cols );
|
|
|
|
|
2010-11-06 22:56:01 +08:00
|
|
|
sz.width *= v1.channels();
|
2010-05-12 01:44:00 +08:00
|
|
|
if( v1.isContinuous() && v2.isContinuous() )
|
|
|
|
{
|
|
|
|
sz.width *= sz.height;
|
|
|
|
sz.height = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( depth == CV_32F )
|
|
|
|
{
|
2014-08-13 19:08:27 +08:00
|
|
|
const float* src1 = v1.ptr<float>();
|
|
|
|
const float* src2 = v2.ptr<float>();
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t step1 = v1.step/sizeof(src1[0]);
|
|
|
|
size_t step2 = v2.step/sizeof(src2[0]);
|
2010-11-06 22:56:01 +08:00
|
|
|
double* diff = buf;
|
2014-08-13 19:08:27 +08:00
|
|
|
const float* mat = icovar.ptr<float>();
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t matstep = icovar.step/sizeof(mat[0]);
|
|
|
|
|
|
|
|
for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width )
|
|
|
|
{
|
|
|
|
for( i = 0; i < sz.width; i++ )
|
|
|
|
diff[i] = src1[i] - src2[i];
|
|
|
|
}
|
|
|
|
|
2010-11-06 22:56:01 +08:00
|
|
|
diff = buf;
|
2010-05-12 01:44:00 +08:00
|
|
|
for( i = 0; i < len; i++, mat += matstep )
|
|
|
|
{
|
|
|
|
double row_sum = 0;
|
2012-02-10 14:05:04 +08:00
|
|
|
j = 0;
|
2012-06-09 23:00:04 +08:00
|
|
|
#if CV_ENABLE_UNROLLED
|
2012-02-10 14:05:04 +08:00
|
|
|
for(; j <= len - 4; j += 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
|
|
|
|
diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
|
2012-02-10 14:05:04 +08:00
|
|
|
#endif
|
2010-05-12 01:44:00 +08:00
|
|
|
for( ; j < len; j++ )
|
|
|
|
row_sum += diff[j]*mat[j];
|
|
|
|
result += row_sum * diff[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if( depth == CV_64F )
|
|
|
|
{
|
2014-08-13 19:08:27 +08:00
|
|
|
const double* src1 = v1.ptr<double>();
|
|
|
|
const double* src2 = v2.ptr<double>();
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t step1 = v1.step/sizeof(src1[0]);
|
|
|
|
size_t step2 = v2.step/sizeof(src2[0]);
|
2010-11-06 22:56:01 +08:00
|
|
|
double* diff = buf;
|
2014-08-13 19:08:27 +08:00
|
|
|
const double* mat = icovar.ptr<double>();
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t matstep = icovar.step/sizeof(mat[0]);
|
|
|
|
|
|
|
|
for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width )
|
|
|
|
{
|
|
|
|
for( i = 0; i < sz.width; i++ )
|
|
|
|
diff[i] = src1[i] - src2[i];
|
|
|
|
}
|
|
|
|
|
2010-11-06 22:56:01 +08:00
|
|
|
diff = buf;
|
2010-05-12 01:44:00 +08:00
|
|
|
for( i = 0; i < len; i++, mat += matstep )
|
|
|
|
{
|
|
|
|
double row_sum = 0;
|
2012-02-10 14:05:04 +08:00
|
|
|
j = 0;
|
2012-06-09 23:00:04 +08:00
|
|
|
#if CV_ENABLE_UNROLLED
|
2012-02-10 14:05:04 +08:00
|
|
|
for(; j <= len - 4; j += 4 )
|
2010-05-12 01:44:00 +08:00
|
|
|
row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
|
|
|
|
diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
|
2012-02-10 14:05:04 +08:00
|
|
|
#endif
|
2010-05-12 01:44:00 +08:00
|
|
|
for( ; j < len; j++ )
|
|
|
|
row_sum += diff[j]*mat[j];
|
|
|
|
result += row_sum * diff[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
CV_Error( CV_StsUnsupportedFormat, "" );
|
|
|
|
|
|
|
|
return std::sqrt(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
/****************************************************************************************\
|
2011-04-17 21:14:45 +08:00
|
|
|
* MulTransposed *
|
2010-05-12 01:44:00 +08:00
|
|
|
\****************************************************************************************/
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
namespace cv
|
|
|
|
{
|
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
template<typename sT, typename dT> static void
|
|
|
|
MulTransposedR( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale )
|
|
|
|
{
|
|
|
|
int i, j, k;
|
2014-08-13 19:08:27 +08:00
|
|
|
const sT* src = srcmat.ptr<sT>();
|
|
|
|
dT* dst = dstmat.ptr<dT>();
|
|
|
|
const dT* delta = deltamat.ptr<dT>();
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t srcstep = srcmat.step/sizeof(src[0]);
|
|
|
|
size_t dststep = dstmat.step/sizeof(dst[0]);
|
|
|
|
size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0;
|
|
|
|
int delta_cols = deltamat.cols;
|
|
|
|
Size size = srcmat.size();
|
|
|
|
dT* tdst = dst;
|
|
|
|
dT* col_buf = 0;
|
|
|
|
dT* delta_buf = 0;
|
|
|
|
int buf_size = size.height*sizeof(dT);
|
|
|
|
AutoBuffer<uchar> buf;
|
|
|
|
|
|
|
|
if( delta && delta_cols < size.width )
|
|
|
|
{
|
|
|
|
assert( delta_cols == 1 );
|
|
|
|
buf_size *= 5;
|
|
|
|
}
|
|
|
|
buf.allocate(buf_size);
|
|
|
|
col_buf = (dT*)(uchar*)buf;
|
|
|
|
|
|
|
|
if( delta && delta_cols < size.width )
|
|
|
|
{
|
|
|
|
delta_buf = col_buf + size.height;
|
|
|
|
for( i = 0; i < size.height; i++ )
|
|
|
|
delta_buf[i*4] = delta_buf[i*4+1] =
|
|
|
|
delta_buf[i*4+2] = delta_buf[i*4+3] = delta[i*deltastep];
|
|
|
|
delta = delta_buf;
|
|
|
|
deltastep = deltastep ? 4 : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( !delta )
|
|
|
|
for( i = 0; i < size.width; i++, tdst += dststep )
|
|
|
|
{
|
|
|
|
for( k = 0; k < size.height; k++ )
|
|
|
|
col_buf[k] = src[k*srcstep+i];
|
|
|
|
|
|
|
|
for( j = i; j <= size.width - 4; j += 4 )
|
|
|
|
{
|
|
|
|
double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
|
|
|
|
const sT *tsrc = src + j;
|
|
|
|
|
|
|
|
for( k = 0; k < size.height; k++, tsrc += srcstep )
|
|
|
|
{
|
|
|
|
double a = col_buf[k];
|
|
|
|
s0 += a * tsrc[0];
|
|
|
|
s1 += a * tsrc[1];
|
|
|
|
s2 += a * tsrc[2];
|
|
|
|
s3 += a * tsrc[3];
|
|
|
|
}
|
|
|
|
|
|
|
|
tdst[j] = (dT)(s0*scale);
|
|
|
|
tdst[j+1] = (dT)(s1*scale);
|
|
|
|
tdst[j+2] = (dT)(s2*scale);
|
|
|
|
tdst[j+3] = (dT)(s3*scale);
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; j < size.width; j++ )
|
|
|
|
{
|
|
|
|
double s0 = 0;
|
|
|
|
const sT *tsrc = src + j;
|
|
|
|
|
|
|
|
for( k = 0; k < size.height; k++, tsrc += srcstep )
|
2010-11-06 22:56:01 +08:00
|
|
|
s0 += (double)col_buf[k] * tsrc[0];
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
tdst[j] = (dT)(s0*scale);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
for( i = 0; i < size.width; i++, tdst += dststep )
|
|
|
|
{
|
|
|
|
if( !delta_buf )
|
|
|
|
for( k = 0; k < size.height; k++ )
|
|
|
|
col_buf[k] = src[k*srcstep+i] - delta[k*deltastep+i];
|
|
|
|
else
|
|
|
|
for( k = 0; k < size.height; k++ )
|
|
|
|
col_buf[k] = src[k*srcstep+i] - delta_buf[k*deltastep];
|
|
|
|
|
|
|
|
for( j = i; j <= size.width - 4; j += 4 )
|
|
|
|
{
|
|
|
|
double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
|
|
|
|
const sT *tsrc = src + j;
|
|
|
|
const dT *d = delta_buf ? delta_buf : delta + j;
|
|
|
|
|
|
|
|
for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
|
|
|
|
{
|
|
|
|
double a = col_buf[k];
|
|
|
|
s0 += a * (tsrc[0] - d[0]);
|
|
|
|
s1 += a * (tsrc[1] - d[1]);
|
|
|
|
s2 += a * (tsrc[2] - d[2]);
|
|
|
|
s3 += a * (tsrc[3] - d[3]);
|
|
|
|
}
|
|
|
|
|
|
|
|
tdst[j] = (dT)(s0*scale);
|
|
|
|
tdst[j+1] = (dT)(s1*scale);
|
|
|
|
tdst[j+2] = (dT)(s2*scale);
|
|
|
|
tdst[j+3] = (dT)(s3*scale);
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; j < size.width; j++ )
|
|
|
|
{
|
|
|
|
double s0 = 0;
|
|
|
|
const sT *tsrc = src + j;
|
|
|
|
const dT *d = delta_buf ? delta_buf : delta + j;
|
|
|
|
|
|
|
|
for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
|
2010-11-06 22:56:01 +08:00
|
|
|
s0 += (double)col_buf[k] * (tsrc[0] - d[0]);
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
tdst[j] = (dT)(s0*scale);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template<typename sT, typename dT> static void
|
|
|
|
MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale )
|
|
|
|
{
|
|
|
|
int i, j, k;
|
2014-08-13 19:08:27 +08:00
|
|
|
const sT* src = srcmat.ptr<sT>();
|
|
|
|
dT* dst = dstmat.ptr<dT>();
|
|
|
|
const dT* delta = deltamat.ptr<dT>();
|
2010-05-12 01:44:00 +08:00
|
|
|
size_t srcstep = srcmat.step/sizeof(src[0]);
|
|
|
|
size_t dststep = dstmat.step/sizeof(dst[0]);
|
|
|
|
size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0;
|
|
|
|
int delta_cols = deltamat.cols;
|
|
|
|
Size size = srcmat.size();
|
|
|
|
dT* tdst = dst;
|
|
|
|
|
|
|
|
if( !delta )
|
|
|
|
for( i = 0; i < size.height; i++, tdst += dststep )
|
|
|
|
for( j = i; j < size.height; j++ )
|
|
|
|
{
|
|
|
|
double s = 0;
|
|
|
|
const sT *tsrc1 = src + i*srcstep;
|
|
|
|
const sT *tsrc2 = src + j*srcstep;
|
|
|
|
|
|
|
|
for( k = 0; k <= size.width - 4; k += 4 )
|
2010-11-06 22:56:01 +08:00
|
|
|
s += (double)tsrc1[k]*tsrc2[k] + (double)tsrc1[k+1]*tsrc2[k+1] +
|
|
|
|
(double)tsrc1[k+2]*tsrc2[k+2] + (double)tsrc1[k+3]*tsrc2[k+3];
|
2010-05-12 01:44:00 +08:00
|
|
|
for( ; k < size.width; k++ )
|
2010-11-06 22:56:01 +08:00
|
|
|
s += (double)tsrc1[k] * tsrc2[k];
|
2010-05-12 01:44:00 +08:00
|
|
|
tdst[j] = (dT)(s*scale);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
dT delta_buf[4];
|
|
|
|
int delta_shift = delta_cols == size.width ? 4 : 0;
|
|
|
|
AutoBuffer<uchar> buf(size.width*sizeof(dT));
|
|
|
|
dT* row_buf = (dT*)(uchar*)buf;
|
|
|
|
|
|
|
|
for( i = 0; i < size.height; i++, tdst += dststep )
|
|
|
|
{
|
|
|
|
const sT *tsrc1 = src + i*srcstep;
|
|
|
|
const dT *tdelta1 = delta + i*deltastep;
|
|
|
|
|
|
|
|
if( delta_cols < size.width )
|
|
|
|
for( k = 0; k < size.width; k++ )
|
|
|
|
row_buf[k] = tsrc1[k] - tdelta1[0];
|
|
|
|
else
|
|
|
|
for( k = 0; k < size.width; k++ )
|
|
|
|
row_buf[k] = tsrc1[k] - tdelta1[k];
|
|
|
|
|
|
|
|
for( j = i; j < size.height; j++ )
|
|
|
|
{
|
|
|
|
double s = 0;
|
|
|
|
const sT *tsrc2 = src + j*srcstep;
|
|
|
|
const dT *tdelta2 = delta + j*deltastep;
|
|
|
|
if( delta_cols < size.width )
|
|
|
|
{
|
|
|
|
delta_buf[0] = delta_buf[1] =
|
|
|
|
delta_buf[2] = delta_buf[3] = tdelta2[0];
|
|
|
|
tdelta2 = delta_buf;
|
|
|
|
}
|
|
|
|
for( k = 0; k <= size.width-4; k += 4, tdelta2 += delta_shift )
|
2010-11-06 22:56:01 +08:00
|
|
|
s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]) +
|
|
|
|
(double)row_buf[k+1]*(tsrc2[k+1] - tdelta2[1]) +
|
|
|
|
(double)row_buf[k+2]*(tsrc2[k+2] - tdelta2[2]) +
|
|
|
|
(double)row_buf[k+3]*(tsrc2[k+3] - tdelta2[3]);
|
2010-05-12 01:44:00 +08:00
|
|
|
for( ; k < size.width; k++, tdelta2++ )
|
2010-11-06 22:56:01 +08:00
|
|
|
s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]);
|
2010-05-12 01:44:00 +08:00
|
|
|
tdst[j] = (dT)(s*scale);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale);
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-06-06 22:51:27 +08:00
|
|
|
void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata,
|
|
|
|
InputArray _delta, double scale, int dtype )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2016-08-18 14:53:00 +08:00
|
|
|
CV_INSTRUMENT_REGION()
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
Mat src = _src.getMat(), delta = _delta.getMat();
|
2010-05-12 01:44:00 +08:00
|
|
|
const int gemm_level = 100; // boundary above which GEMM is faster.
|
|
|
|
int stype = src.type();
|
|
|
|
dtype = std::max(std::max(CV_MAT_DEPTH(dtype >= 0 ? dtype : stype), delta.depth()), CV_32F);
|
|
|
|
CV_Assert( src.channels() == 1 );
|
|
|
|
|
2014-08-13 19:08:27 +08:00
|
|
|
if( !delta.empty() )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
CV_Assert( delta.channels() == 1 &&
|
|
|
|
(delta.rows == src.rows || delta.rows == 1) &&
|
|
|
|
(delta.cols == src.cols || delta.cols == 1));
|
|
|
|
if( delta.type() != dtype )
|
2011-04-17 21:14:45 +08:00
|
|
|
delta.convertTo(delta, dtype);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int dsize = ata ? src.cols : src.rows;
|
2011-04-17 21:14:45 +08:00
|
|
|
_dst.create( dsize, dsize, dtype );
|
|
|
|
Mat dst = _dst.getMat();
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
if( src.data == dst.data || (stype == dtype &&
|
|
|
|
(dst.cols >= gemm_level && dst.rows >= gemm_level &&
|
|
|
|
src.cols >= gemm_level && src.rows >= gemm_level)))
|
|
|
|
{
|
|
|
|
Mat src2;
|
|
|
|
const Mat* tsrc = &src;
|
2014-08-13 19:08:27 +08:00
|
|
|
if( !delta.empty() )
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
|
|
|
if( delta.size() == src.size() )
|
|
|
|
subtract( src, delta, src2 );
|
|
|
|
else
|
|
|
|
{
|
|
|
|
repeat(delta, src.rows/delta.rows, src.cols/delta.cols, src2);
|
|
|
|
subtract( src, src2, src2 );
|
|
|
|
}
|
|
|
|
tsrc = &src2;
|
|
|
|
}
|
|
|
|
gemm( *tsrc, *tsrc, scale, Mat(), 0, dst, ata ? GEMM_1_T : GEMM_2_T );
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
MulTransposedFunc func = 0;
|
|
|
|
if(stype == CV_8U && dtype == CV_32F)
|
|
|
|
{
|
|
|
|
if(ata)
|
|
|
|
func = MulTransposedR<uchar,float>;
|
|
|
|
else
|
|
|
|
func = MulTransposedL<uchar,float>;
|
|
|
|
}
|
|
|
|
else if(stype == CV_8U && dtype == CV_64F)
|
|
|
|
{
|
|
|
|
if(ata)
|
|
|
|
func = MulTransposedR<uchar,double>;
|
|
|
|
else
|
|
|
|
func = MulTransposedL<uchar,double>;
|
|
|
|
}
|
|
|
|
else if(stype == CV_16U && dtype == CV_32F)
|
|
|
|
{
|
|
|
|
if(ata)
|
|
|
|
func = MulTransposedR<ushort,float>;
|
|
|
|
else
|
|
|
|
func = MulTransposedL<ushort,float>;
|
|
|
|
}
|
|
|
|
else if(stype == CV_16U && dtype == CV_64F)
|
|
|
|
{
|
|
|
|
if(ata)
|
|
|
|
func = MulTransposedR<ushort,double>;
|
|
|
|
else
|
|
|
|
func = MulTransposedL<ushort,double>;
|
|
|
|
}
|
|
|
|
else if(stype == CV_16S && dtype == CV_32F)
|
|
|
|
{
|
|
|
|
if(ata)
|
|
|
|
func = MulTransposedR<short,float>;
|
|
|
|
else
|
|
|
|
func = MulTransposedL<short,float>;
|
|
|
|
}
|
|
|
|
else if(stype == CV_16S && dtype == CV_64F)
|
|
|
|
{
|
|
|
|
if(ata)
|
|
|
|
func = MulTransposedR<short,double>;
|
|
|
|
else
|
|
|
|
func = MulTransposedL<short,double>;
|
|
|
|
}
|
|
|
|
else if(stype == CV_32F && dtype == CV_32F)
|
|
|
|
{
|
|
|
|
if(ata)
|
|
|
|
func = MulTransposedR<float,float>;
|
|
|
|
else
|
|
|
|
func = MulTransposedL<float,float>;
|
|
|
|
}
|
|
|
|
else if(stype == CV_32F && dtype == CV_64F)
|
|
|
|
{
|
|
|
|
if(ata)
|
|
|
|
func = MulTransposedR<float,double>;
|
|
|
|
else
|
|
|
|
func = MulTransposedL<float,double>;
|
|
|
|
}
|
|
|
|
else if(stype == CV_64F && dtype == CV_64F)
|
|
|
|
{
|
|
|
|
if(ata)
|
|
|
|
func = MulTransposedR<double,double>;
|
|
|
|
else
|
|
|
|
func = MulTransposedL<double,double>;
|
|
|
|
}
|
|
|
|
if( !func )
|
|
|
|
CV_Error( CV_StsUnsupportedFormat, "" );
|
|
|
|
|
|
|
|
func( src, dst, delta, scale );
|
|
|
|
completeSymm( dst, false );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/****************************************************************************************\
|
|
|
|
* Dot Product *
|
|
|
|
\****************************************************************************************/
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
namespace cv
|
2011-02-28 02:12:30 +08:00
|
|
|
{
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
template<typename T> double
|
|
|
|
dotProd_(const T* src1, const T* src2, int len)
|
2011-02-28 02:12:30 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
int i = 0;
|
|
|
|
double result = 0;
|
2014-09-28 19:35:33 +08:00
|
|
|
|
|
|
|
#if CV_ENABLE_UNROLLED
|
2011-04-17 21:14:45 +08:00
|
|
|
for( ; i <= len - 4; i += 4 )
|
|
|
|
result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] +
|
|
|
|
(double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
|
2012-02-10 14:05:04 +08:00
|
|
|
#endif
|
2011-04-17 21:14:45 +08:00
|
|
|
for( ; i < len; i++ )
|
|
|
|
result += (double)src1[i]*src2[i];
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
return result;
|
|
|
|
}
|
2011-02-28 02:12:30 +08:00
|
|
|
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
|
2011-02-28 02:12:30 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
double r = 0;
|
2017-04-21 19:52:45 +08:00
|
|
|
#if ARITHM_USE_IPP
|
|
|
|
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_8u64f_C1R, src1, len*sizeof(uchar), src2, len*sizeof(uchar), ippiSize(len, 1), &r) >= 0, r);
|
2014-04-07 18:17:22 +08:00
|
|
|
#endif
|
2011-04-17 21:14:45 +08:00
|
|
|
int i = 0;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
#if CV_SSE2
|
|
|
|
if( USE_SSE2 )
|
2011-02-28 02:12:30 +08:00
|
|
|
{
|
2011-12-06 06:58:27 +08:00
|
|
|
int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize;
|
2011-04-17 21:14:45 +08:00
|
|
|
__m128i z = _mm_setzero_si128();
|
2014-09-28 19:35:33 +08:00
|
|
|
CV_DECL_ALIGNED(16) int buf[4];
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
while( i < len0 )
|
|
|
|
{
|
2011-04-18 23:14:32 +08:00
|
|
|
blockSize = std::min(len0 - i, blockSize0);
|
2014-09-28 19:35:33 +08:00
|
|
|
__m128i s = z;
|
2012-06-09 23:00:04 +08:00
|
|
|
j = 0;
|
2011-12-06 06:58:27 +08:00
|
|
|
for( ; j <= blockSize - 16; j += 16 )
|
2011-04-17 21:14:45 +08:00
|
|
|
{
|
|
|
|
__m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j));
|
|
|
|
__m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j));
|
|
|
|
__m128i s0, s1, s2, s3;
|
|
|
|
s0 = _mm_unpacklo_epi8(b0, z);
|
|
|
|
s2 = _mm_unpackhi_epi8(b0, z);
|
|
|
|
s1 = _mm_unpacklo_epi8(b1, z);
|
|
|
|
s3 = _mm_unpackhi_epi8(b1, z);
|
|
|
|
s0 = _mm_madd_epi16(s0, s1);
|
2011-12-06 06:58:27 +08:00
|
|
|
s2 = _mm_madd_epi16(s2, s3);
|
2011-04-17 21:14:45 +08:00
|
|
|
s = _mm_add_epi32(s, s0);
|
|
|
|
s = _mm_add_epi32(s, s2);
|
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
for( ; j < blockSize; j += 4 )
|
|
|
|
{
|
|
|
|
__m128i s0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + j)), z);
|
|
|
|
__m128i s1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src2 + j)), z);
|
|
|
|
s0 = _mm_madd_epi16(s0, s1);
|
|
|
|
s = _mm_add_epi32(s, s0);
|
|
|
|
}
|
2014-09-28 19:35:33 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
_mm_store_si128((__m128i*)buf, s);
|
|
|
|
r += buf[0] + buf[1] + buf[2] + buf[3];
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
src1 += blockSize;
|
|
|
|
src2 += blockSize;
|
|
|
|
i += blockSize;
|
|
|
|
}
|
2011-02-28 02:12:30 +08:00
|
|
|
}
|
2014-09-28 19:35:33 +08:00
|
|
|
#elif CV_NEON
|
|
|
|
int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
|
|
|
|
uint32x4_t v_zero = vdupq_n_u32(0u);
|
|
|
|
CV_DECL_ALIGNED(16) uint buf[4];
|
|
|
|
|
|
|
|
while( i < len0 )
|
|
|
|
{
|
|
|
|
blockSize = std::min(len0 - i, blockSize0);
|
|
|
|
uint32x4_t v_sum = v_zero;
|
|
|
|
|
|
|
|
int j = 0;
|
|
|
|
for( ; j <= blockSize - 16; j += 16 )
|
|
|
|
{
|
|
|
|
uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j);
|
|
|
|
|
|
|
|
uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2));
|
|
|
|
v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
|
|
|
|
v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
|
|
|
|
|
|
|
|
v_src10 = vmovl_u8(vget_high_u8(v_src1));
|
|
|
|
v_src20 = vmovl_u8(vget_high_u8(v_src2));
|
|
|
|
v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
|
|
|
|
v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; j <= blockSize - 8; j += 8 )
|
|
|
|
{
|
|
|
|
uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j));
|
|
|
|
v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2));
|
|
|
|
v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2));
|
|
|
|
}
|
|
|
|
|
|
|
|
vst1q_u32(buf, v_sum);
|
|
|
|
r += buf[0] + buf[1] + buf[2] + buf[3];
|
|
|
|
|
|
|
|
src1 += blockSize;
|
|
|
|
src2 += blockSize;
|
|
|
|
i += blockSize;
|
|
|
|
}
|
2011-04-17 21:14:45 +08:00
|
|
|
#endif
|
|
|
|
return r + dotProd_(src1, src2, len - i);
|
|
|
|
}
|
2011-02-28 02:12:30 +08:00
|
|
|
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static double dotProd_8s(const schar* src1, const schar* src2, int len)
|
2011-02-28 02:12:30 +08:00
|
|
|
{
|
2014-09-28 19:35:33 +08:00
|
|
|
int i = 0;
|
|
|
|
double r = 0.0;
|
|
|
|
|
2014-12-30 05:34:09 +08:00
|
|
|
#if CV_SSE2
|
|
|
|
if( USE_SSE2 )
|
|
|
|
{
|
|
|
|
int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize;
|
|
|
|
__m128i z = _mm_setzero_si128();
|
|
|
|
CV_DECL_ALIGNED(16) int buf[4];
|
|
|
|
|
|
|
|
while( i < len0 )
|
|
|
|
{
|
|
|
|
blockSize = std::min(len0 - i, blockSize0);
|
|
|
|
__m128i s = z;
|
|
|
|
j = 0;
|
|
|
|
for( ; j <= blockSize - 16; j += 16 )
|
|
|
|
{
|
|
|
|
__m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j));
|
|
|
|
__m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j));
|
|
|
|
__m128i s0, s1, s2, s3;
|
|
|
|
s0 = _mm_srai_epi16(_mm_unpacklo_epi8(b0, b0), 8);
|
|
|
|
s2 = _mm_srai_epi16(_mm_unpackhi_epi8(b0, b0), 8);
|
|
|
|
s1 = _mm_srai_epi16(_mm_unpacklo_epi8(b1, b1), 8);
|
|
|
|
s3 = _mm_srai_epi16(_mm_unpackhi_epi8(b1, b1), 8);
|
|
|
|
s0 = _mm_madd_epi16(s0, s1);
|
|
|
|
s2 = _mm_madd_epi16(s2, s3);
|
|
|
|
s = _mm_add_epi32(s, s0);
|
|
|
|
s = _mm_add_epi32(s, s2);
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; j < blockSize; j += 4 )
|
|
|
|
{
|
|
|
|
__m128i s0 = _mm_cvtsi32_si128(*(const int*)(src1 + j));
|
|
|
|
__m128i s1 = _mm_cvtsi32_si128(*(const int*)(src2 + j));
|
|
|
|
s0 = _mm_srai_epi16(_mm_unpacklo_epi8(s0, s0), 8);
|
|
|
|
s1 = _mm_srai_epi16(_mm_unpacklo_epi8(s1, s1), 8);
|
|
|
|
s0 = _mm_madd_epi16(s0, s1);
|
|
|
|
s = _mm_add_epi32(s, s0);
|
|
|
|
}
|
|
|
|
|
|
|
|
_mm_store_si128((__m128i*)buf, s);
|
|
|
|
r += buf[0] + buf[1] + buf[2] + buf[3];
|
|
|
|
|
|
|
|
src1 += blockSize;
|
|
|
|
src2 += blockSize;
|
|
|
|
i += blockSize;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#elif CV_NEON
|
2014-09-28 19:35:33 +08:00
|
|
|
int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
|
|
|
|
int32x4_t v_zero = vdupq_n_s32(0);
|
|
|
|
CV_DECL_ALIGNED(16) int buf[4];
|
|
|
|
|
|
|
|
while( i < len0 )
|
|
|
|
{
|
|
|
|
blockSize = std::min(len0 - i, blockSize0);
|
|
|
|
int32x4_t v_sum = v_zero;
|
|
|
|
|
|
|
|
int j = 0;
|
|
|
|
for( ; j <= blockSize - 16; j += 16 )
|
|
|
|
{
|
|
|
|
int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j);
|
|
|
|
|
|
|
|
int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2));
|
|
|
|
v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
|
|
|
|
v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
|
|
|
|
|
|
|
|
v_src10 = vmovl_s8(vget_high_s8(v_src1));
|
|
|
|
v_src20 = vmovl_s8(vget_high_s8(v_src2));
|
|
|
|
v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
|
|
|
|
v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
|
|
|
|
}
|
|
|
|
|
|
|
|
for( ; j <= blockSize - 8; j += 8 )
|
|
|
|
{
|
|
|
|
int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j));
|
|
|
|
v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2));
|
|
|
|
v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2));
|
|
|
|
}
|
|
|
|
|
|
|
|
vst1q_s32(buf, v_sum);
|
|
|
|
r += buf[0] + buf[1] + buf[2] + buf[3];
|
|
|
|
|
|
|
|
src1 += blockSize;
|
|
|
|
src2 += blockSize;
|
|
|
|
i += blockSize;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return r + dotProd_(src1, src2, len - i);
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
2011-02-28 02:12:30 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static double dotProd_16u(const ushort* src1, const ushort* src2, int len)
|
2011-02-28 02:12:30 +08:00
|
|
|
{
|
2017-04-21 19:52:45 +08:00
|
|
|
#if ARITHM_USE_IPP
|
|
|
|
double r = 0;
|
|
|
|
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16u64f_C1R, src1, len*sizeof(ushort), src2, len*sizeof(ushort), ippiSize(len, 1), &r) >= 0, r);
|
2014-03-21 19:27:56 +08:00
|
|
|
#endif
|
|
|
|
return dotProd_(src1, src2, len);
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
2011-02-28 02:12:30 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static double dotProd_16s(const short* src1, const short* src2, int len)
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2017-04-21 19:52:45 +08:00
|
|
|
#if ARITHM_USE_IPP && (IPP_VERSION_X100 != 900) // bug in IPP 9.0.0
|
|
|
|
double r = 0;
|
|
|
|
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16s64f_C1R, src1, len*sizeof(short), src2, len*sizeof(short), ippiSize(len, 1), &r) >= 0, r);
|
2014-03-21 19:27:56 +08:00
|
|
|
#endif
|
|
|
|
return dotProd_(src1, src2, len);
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static double dotProd_32s(const int* src1, const int* src2, int len)
|
|
|
|
{
|
2017-04-21 19:52:45 +08:00
|
|
|
#if ARITHM_USE_IPP
|
|
|
|
double r = 0;
|
|
|
|
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32s64f_C1R, src1, len*sizeof(int), src2, len*sizeof(int), ippiSize(len, 1), &r) >= 0, r);
|
2014-03-21 19:27:56 +08:00
|
|
|
#endif
|
|
|
|
return dotProd_(src1, src2, len);
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
2011-02-28 02:12:30 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static double dotProd_32f(const float* src1, const float* src2, int len)
|
|
|
|
{
|
2014-09-28 19:35:33 +08:00
|
|
|
double r = 0.0;
|
2017-04-21 19:52:45 +08:00
|
|
|
|
|
|
|
#if ARITHM_USE_IPP
|
|
|
|
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32f64f_C1R, src1, len*sizeof(float), src2, len*sizeof(float), ippiSize(len, 1), &r, ippAlgHintFast) >= 0, r);
|
|
|
|
#endif
|
2014-09-28 19:35:33 +08:00
|
|
|
int i = 0;
|
|
|
|
|
2017-04-21 19:52:45 +08:00
|
|
|
#if CV_NEON
|
2014-10-09 20:38:14 +08:00
|
|
|
int len0 = len & -4, blockSize0 = (1 << 13), blockSize;
|
2014-09-28 19:35:33 +08:00
|
|
|
float32x4_t v_zero = vdupq_n_f32(0.0f);
|
|
|
|
CV_DECL_ALIGNED(16) float buf[4];
|
|
|
|
|
|
|
|
while( i < len0 )
|
|
|
|
{
|
|
|
|
blockSize = std::min(len0 - i, blockSize0);
|
|
|
|
float32x4_t v_sum = v_zero;
|
|
|
|
|
|
|
|
int j = 0;
|
|
|
|
for( ; j <= blockSize - 4; j += 4 )
|
|
|
|
v_sum = vmlaq_f32(v_sum, vld1q_f32(src1 + j), vld1q_f32(src2 + j));
|
|
|
|
|
|
|
|
vst1q_f32(buf, v_sum);
|
|
|
|
r += buf[0] + buf[1] + buf[2] + buf[3];
|
|
|
|
|
|
|
|
src1 += blockSize;
|
|
|
|
src2 += blockSize;
|
|
|
|
i += blockSize;
|
|
|
|
}
|
2014-03-21 19:27:56 +08:00
|
|
|
#endif
|
2014-09-28 19:35:33 +08:00
|
|
|
return r + dotProd_(src1, src2, len - i);
|
2011-04-17 21:14:45 +08:00
|
|
|
}
|
2011-02-28 02:12:30 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
static double dotProd_64f(const double* src1, const double* src2, int len)
|
|
|
|
{
|
2017-04-21 19:52:45 +08:00
|
|
|
#if ARITHM_USE_IPP
|
|
|
|
double r = 0;
|
|
|
|
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsDotProd_64f, src1, src2, len, &r) >= 0, r);
|
2014-03-21 19:27:56 +08:00
|
|
|
#endif
|
2017-04-21 19:52:45 +08:00
|
|
|
|
2014-03-21 19:27:56 +08:00
|
|
|
return dotProd_(src1, src2, len);
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2013-08-15 15:01:40 +08:00
|
|
|
static DotProdFunc getDotProdFunc(int depth)
|
2010-05-12 01:44:00 +08:00
|
|
|
{
|
2013-08-15 15:01:40 +08:00
|
|
|
static DotProdFunc dotProdTab[] =
|
|
|
|
{
|
|
|
|
(DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s),
|
|
|
|
(DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
|
|
|
|
(DotProdFunc)dotProd_32s, (DotProdFunc)GET_OPTIMIZED(dotProd_32f),
|
|
|
|
(DotProdFunc)dotProd_64f, 0
|
|
|
|
};
|
|
|
|
|
|
|
|
return dotProdTab[depth];
|
|
|
|
}
|
2010-05-12 01:44:00 +08:00
|
|
|
|
2011-06-06 22:51:27 +08:00
|
|
|
double Mat::dot(InputArray _mat) const
|
2011-04-17 21:14:45 +08:00
|
|
|
{
|
2016-08-18 14:53:00 +08:00
|
|
|
CV_INSTRUMENT_REGION()
|
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
Mat mat = _mat.getMat();
|
|
|
|
int cn = channels();
|
2013-08-15 15:01:40 +08:00
|
|
|
DotProdFunc func = getDotProdFunc(depth());
|
2011-04-17 21:14:45 +08:00
|
|
|
CV_Assert( mat.type() == type() && mat.size == size && func != 0 );
|
2011-02-28 02:12:30 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
if( isContinuous() && mat.isContinuous() )
|
2011-02-28 02:12:30 +08:00
|
|
|
{
|
2011-04-17 21:14:45 +08:00
|
|
|
size_t len = total()*cn;
|
|
|
|
if( len == (size_t)(int)len )
|
2011-07-19 20:27:07 +08:00
|
|
|
return func(data, mat.data, (int)len);
|
2011-02-28 02:12:30 +08:00
|
|
|
}
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
const Mat* arrays[] = {this, &mat, 0};
|
|
|
|
uchar* ptrs[2];
|
|
|
|
NAryMatIterator it(arrays, ptrs);
|
|
|
|
int len = (int)(it.size*cn);
|
|
|
|
double r = 0;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
for( size_t i = 0; i < it.nplanes; i++, ++it )
|
|
|
|
r += func( ptrs[0], ptrs[1], len );
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2011-04-17 21:14:45 +08:00
|
|
|
return r;
|
2010-05-12 01:44:00 +08:00
|
|
|
}
|
|
|
|
|
2011-07-19 00:31:30 +08:00
|
|
|
}
|
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
/****************************************************************************************\
|
|
|
|
* Earlier API *
|
|
|
|
\****************************************************************************************/
|
|
|
|
|
|
|
|
CV_IMPL void cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha,
|
|
|
|
const CvArr* Carr, double beta, CvArr* Darr, int flags )
|
|
|
|
{
|
|
|
|
cv::Mat A = cv::cvarrToMat(Aarr), B = cv::cvarrToMat(Barr);
|
|
|
|
cv::Mat C, D = cv::cvarrToMat(Darr);
|
|
|
|
|
|
|
|
if( Carr )
|
|
|
|
C = cv::cvarrToMat(Carr);
|
|
|
|
|
|
|
|
CV_Assert( (D.rows == ((flags & CV_GEMM_A_T) == 0 ? A.rows : A.cols)) &&
|
|
|
|
(D.cols == ((flags & CV_GEMM_B_T) == 0 ? B.cols : B.rows)) &&
|
|
|
|
D.type() == A.type() );
|
|
|
|
|
|
|
|
gemm( A, B, alpha, C, beta, D, flags );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
CV_IMPL void
|
|
|
|
cvTransform( const CvArr* srcarr, CvArr* dstarr,
|
|
|
|
const CvMat* transmat, const CvMat* shiftvec )
|
|
|
|
{
|
|
|
|
cv::Mat m = cv::cvarrToMat(transmat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
|
|
|
|
|
|
|
|
if( shiftvec )
|
|
|
|
{
|
|
|
|
cv::Mat v = cv::cvarrToMat(shiftvec).reshape(1,m.rows),
|
|
|
|
_m(m.rows, m.cols + 1, m.type()), m1 = _m.colRange(0,m.cols), v1 = _m.col(m.cols);
|
|
|
|
m.convertTo(m1, m1.type());
|
|
|
|
v.convertTo(v1, v1.type());
|
|
|
|
m = _m;
|
|
|
|
}
|
|
|
|
|
|
|
|
CV_Assert( dst.depth() == src.depth() && dst.channels() == m.rows );
|
|
|
|
cv::transform( src, dst, m );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
CV_IMPL void
|
|
|
|
cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat )
|
|
|
|
{
|
|
|
|
cv::Mat m = cv::cvarrToMat(mat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
|
|
|
|
|
|
|
|
CV_Assert( dst.type() == src.type() && dst.channels() == m.rows-1 );
|
|
|
|
cv::perspectiveTransform( src, dst, m );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
CV_IMPL void cvScaleAdd( const CvArr* srcarr1, CvScalar scale,
|
|
|
|
const CvArr* srcarr2, CvArr* dstarr )
|
|
|
|
{
|
|
|
|
cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
|
|
|
|
|
2010-10-20 20:33:57 +08:00
|
|
|
CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
|
2010-05-12 01:44:00 +08:00
|
|
|
cv::scaleAdd( src1, scale.val[0], cv::cvarrToMat(srcarr2), dst );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
CV_IMPL void
|
|
|
|
cvCalcCovarMatrix( const CvArr** vecarr, int count,
|
|
|
|
CvArr* covarr, CvArr* avgarr, int flags )
|
|
|
|
{
|
|
|
|
cv::Mat cov0 = cv::cvarrToMat(covarr), cov = cov0, mean0, mean;
|
|
|
|
CV_Assert( vecarr != 0 && count >= 1 );
|
|
|
|
|
|
|
|
if( avgarr )
|
|
|
|
mean = mean0 = cv::cvarrToMat(avgarr);
|
|
|
|
|
|
|
|
if( (flags & CV_COVAR_COLS) != 0 || (flags & CV_COVAR_ROWS) != 0 )
|
|
|
|
{
|
|
|
|
|
|
|
|
cv::Mat data = cv::cvarrToMat(vecarr[0]);
|
|
|
|
cv::calcCovarMatrix( data, cov, mean, flags, cov.type() );
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
std::vector<cv::Mat> data(count);
|
|
|
|
for( int i = 0; i < count; i++ )
|
|
|
|
data[i] = cv::cvarrToMat(vecarr[i]);
|
|
|
|
cv::calcCovarMatrix( &data[0], count, cov, mean, flags, cov.type() );
|
|
|
|
}
|
|
|
|
|
|
|
|
if( mean.data != mean0.data && mean0.data )
|
|
|
|
mean.convertTo(mean0, mean0.type());
|
|
|
|
|
|
|
|
if( cov.data != cov0.data )
|
|
|
|
cov.convertTo(cov0, cov0.type());
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
CV_IMPL double
|
|
|
|
cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, const CvArr* matarr )
|
|
|
|
{
|
|
|
|
return cv::Mahalanobis(cv::cvarrToMat(srcAarr),
|
|
|
|
cv::cvarrToMat(srcBarr), cv::cvarrToMat(matarr));
|
|
|
|
}
|
|
|
|
|
|
|
|
CV_IMPL void
|
|
|
|
cvMulTransposed( const CvArr* srcarr, CvArr* dstarr,
|
|
|
|
int order, const CvArr* deltaarr, double scale )
|
|
|
|
{
|
|
|
|
cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0, delta;
|
|
|
|
if( deltaarr )
|
|
|
|
delta = cv::cvarrToMat(deltaarr);
|
|
|
|
cv::mulTransposed( src, dst, order != 0, delta, scale, dst.type());
|
|
|
|
if( dst.data != dst0.data )
|
|
|
|
dst.convertTo(dst0, dst0.type());
|
|
|
|
}
|
|
|
|
|
|
|
|
CV_IMPL double cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr )
|
|
|
|
{
|
|
|
|
return cv::cvarrToMat(srcAarr).dot(cv::cvarrToMat(srcBarr));
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
CV_IMPL void
|
|
|
|
cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigenvects, int flags )
|
|
|
|
{
|
|
|
|
cv::Mat data = cv::cvarrToMat(data_arr), mean0 = cv::cvarrToMat(avg_arr);
|
|
|
|
cv::Mat evals0 = cv::cvarrToMat(eigenvals), evects0 = cv::cvarrToMat(eigenvects);
|
|
|
|
cv::Mat mean = mean0, evals = evals0, evects = evects0;
|
|
|
|
|
|
|
|
cv::PCA pca;
|
|
|
|
pca.mean = mean;
|
|
|
|
pca.eigenvalues = evals;
|
|
|
|
pca.eigenvectors = evects;
|
|
|
|
|
|
|
|
pca(data, (flags & CV_PCA_USE_AVG) ? mean : cv::Mat(),
|
2014-08-13 19:08:27 +08:00
|
|
|
flags, !evals.empty() ? evals.rows + evals.cols - 1 : 0);
|
2010-05-12 01:44:00 +08:00
|
|
|
|
|
|
|
if( pca.mean.size() == mean.size() )
|
|
|
|
pca.mean.convertTo( mean, mean.type() );
|
|
|
|
else
|
|
|
|
{
|
|
|
|
cv::Mat temp; pca.mean.convertTo( temp, mean.type() );
|
|
|
|
transpose( temp, mean );
|
|
|
|
}
|
|
|
|
|
|
|
|
evals = pca.eigenvalues;
|
|
|
|
evects = pca.eigenvectors;
|
|
|
|
int ecount0 = evals0.cols + evals0.rows - 1;
|
|
|
|
int ecount = evals.cols + evals.rows - 1;
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
CV_Assert( (evals0.cols == 1 || evals0.rows == 1) &&
|
|
|
|
ecount0 <= ecount &&
|
|
|
|
evects0.cols == evects.cols &&
|
|
|
|
evects0.rows == ecount0 );
|
2011-04-19 05:24:57 +08:00
|
|
|
|
2010-05-12 01:44:00 +08:00
|
|
|
cv::Mat temp = evals0;
|
|
|
|
if( evals.rows == 1 )
|
|
|
|
evals.colRange(0, ecount0).convertTo(temp, evals0.type());
|
|
|
|
else
|
|
|
|
evals.rowRange(0, ecount0).convertTo(temp, evals0.type());
|
|
|
|
if( temp.data != evals0.data )
|
|
|
|
transpose(temp, evals0);
|
|
|
|
evects.rowRange(0, ecount0).convertTo( evects0, evects0.type() );
|
|
|
|
|
|
|
|
// otherwise some datatype's or size's were incorrect, so the output arrays have been reallocated
|
|
|
|
CV_Assert( mean0.data == mean.data );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
CV_IMPL void
|
|
|
|
cvProjectPCA( const CvArr* data_arr, const CvArr* avg_arr,
|
|
|
|
const CvArr* eigenvects, CvArr* result_arr )
|
|
|
|
{
|
|
|
|
cv::Mat data = cv::cvarrToMat(data_arr), mean = cv::cvarrToMat(avg_arr);
|
|
|
|
cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;
|
|
|
|
|
|
|
|
cv::PCA pca;
|
|
|
|
pca.mean = mean;
|
|
|
|
int n;
|
|
|
|
if( mean.rows == 1 )
|
|
|
|
{
|
|
|
|
CV_Assert(dst.cols <= evects.rows && dst.rows == data.rows);
|
|
|
|
n = dst.cols;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
CV_Assert(dst.rows <= evects.rows && dst.cols == data.cols);
|
|
|
|
n = dst.rows;
|
|
|
|
}
|
|
|
|
pca.eigenvectors = evects.rowRange(0, n);
|
|
|
|
|
|
|
|
cv::Mat result = pca.project(data);
|
|
|
|
if( result.cols != dst.cols )
|
|
|
|
result = result.reshape(1, 1);
|
|
|
|
result.convertTo(dst, dst.type());
|
|
|
|
|
|
|
|
CV_Assert(dst0.data == dst.data);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
CV_IMPL void
|
|
|
|
cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr,
|
|
|
|
const CvArr* eigenvects, CvArr* result_arr )
|
|
|
|
{
|
|
|
|
cv::Mat data = cv::cvarrToMat(proj_arr), mean = cv::cvarrToMat(avg_arr);
|
|
|
|
cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;
|
|
|
|
|
|
|
|
cv::PCA pca;
|
|
|
|
pca.mean = mean;
|
|
|
|
int n;
|
|
|
|
if( mean.rows == 1 )
|
|
|
|
{
|
|
|
|
CV_Assert(data.cols <= evects.rows && dst.rows == data.rows);
|
|
|
|
n = data.cols;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
CV_Assert(data.rows <= evects.rows && dst.cols == data.cols);
|
|
|
|
n = data.rows;
|
|
|
|
}
|
|
|
|
pca.eigenvectors = evects.rowRange(0, n);
|
|
|
|
|
|
|
|
cv::Mat result = pca.backProject(data);
|
|
|
|
result.convertTo(dst, dst.type());
|
|
|
|
|
|
|
|
CV_Assert(dst0.data == dst.data);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* End of file. */
|