mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
Merge pull request #26821 from fengyuentau:core/transform_simd
Core: vectorize cv::transform in terms of all data types #26821 ## Performance ### i7-12700K ``` Geometric mean (ms) Name of Test base patch patch vs base (x-factor) Mat_Transform::Size_MatType::(127x61, 8SC3) 0.017 0.004 4.64 Mat_Transform::Size_MatType::(127x61, 16SC3) 0.015 0.004 3.78 Mat_Transform::Size_MatType::(127x61, 32SC3) 0.015 0.007 2.03 Mat_Transform::Size_MatType::(127x61, 64FC3) 0.007 0.004 1.78 Mat_Transform::Size_MatType::(640x480, 8SC3) 0.673 0.140 4.80 Mat_Transform::Size_MatType::(640x480, 16SC3) 0.618 0.158 3.90 Mat_Transform::Size_MatType::(640x480, 32SC3) 0.579 0.278 2.08 Mat_Transform::Size_MatType::(640x480, 64FC3) 0.290 0.266 1.09 Mat_Transform::Size_MatType::(1280x720, 8SC3) 1.919 0.414 4.63 Mat_Transform::Size_MatType::(1280x720, 16SC3) 1.811 0.488 3.71 Mat_Transform::Size_MatType::(1280x720, 32SC3) 1.736 0.917 1.89 Mat_Transform::Size_MatType::(1280x720, 64FC3) 2.310 2.030 1.14 Mat_Transform::Size_MatType::(1920x1080, 8SC3) 4.339 0.924 4.70 Mat_Transform::Size_MatType::(1920x1080, 16SC3) 4.095 1.288 3.18 Mat_Transform::Size_MatType::(1920x1080, 32SC3) 4.267 3.191 1.34 Mat_Transform::Size_MatType::(1920x1080, 64FC3) 6.641 5.481 1.21 Mat_Transform_Diagonal::Size_MatType::(640x480, 8SC3) 0.415 0.104 3.98 Mat_Transform_Diagonal::Size_MatType::(640x480, 16SC3) 0.385 0.128 3.00 Mat_Transform_Diagonal::Size_MatType::(640x480, 32SC3) 0.389 0.225 1.72 Mat_Transform_Diagonal::Size_MatType::(640x480, 64FC3) 0.279 0.275 1.01 Mat_Transform_Diagonal::Size_MatType::(1280x720, 8SC3) 1.223 0.313 3.91 Mat_Transform_Diagonal::Size_MatType::(1280x720, 16SC3) 1.118 0.387 2.89 Mat_Transform_Diagonal::Size_MatType::(1280x720, 32SC3) 1.215 0.801 1.52 Mat_Transform_Diagonal::Size_MatType::(1280x720, 64FC3) 2.198 1.900 1.16 Mat_Transform_Diagonal::Size_MatType::(1920x1080, 8SC3) 2.772 0.705 3.93 Mat_Transform_Diagonal::Size_MatType::(1920x1080, 16SC3) 2.572 1.134 2.27 Mat_Transform_Diagonal::Size_MatType::(1920x1080, 32SC3) 3.477 3.276 1.06 
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 64FC3) 5.984 5.186 1.15 ``` ### A311D ``` Geometric mean (ms) Name of Test base patch patch vs base (x-factor) Mat_Transform::Size_MatType::(127x61, 8SC3) 0.143 0.035 4.05 Mat_Transform::Size_MatType::(127x61, 16SC3) 0.135 0.037 3.67 Mat_Transform::Size_MatType::(127x61, 32SC3) 0.110 0.062 1.77 Mat_Transform::Size_MatType::(127x61, 64FC3) 0.034 0.039 0.89 Mat_Transform::Size_MatType::(640x480, 8SC3) 5.673 1.395 4.07 Mat_Transform::Size_MatType::(640x480, 16SC3) 5.331 1.439 3.70 Mat_Transform::Size_MatType::(640x480, 32SC3) 4.329 2.472 1.75 Mat_Transform::Size_MatType::(640x480, 64FC3) 1.560 2.316 0.67 Mat_Transform::Size_MatType::(1280x720, 8SC3) 17.002 4.139 4.11 Mat_Transform::Size_MatType::(1280x720, 16SC3) 15.996 4.308 3.71 Mat_Transform::Size_MatType::(1280x720, 32SC3) 12.948 7.241 1.79 Mat_Transform::Size_MatType::(1280x720, 64FC3) 4.742 7.376 0.64 Mat_Transform::Size_MatType::(1920x1080, 8SC3) 38.253 9.384 4.08 Mat_Transform::Size_MatType::(1920x1080, 16SC3) 35.913 9.750 3.68 Mat_Transform::Size_MatType::(1920x1080, 32SC3) 29.145 16.528 1.76 Mat_Transform::Size_MatType::(1920x1080, 64FC3) 10.606 20.968 0.51 Mat_Transform_Diagonal::Size_MatType::(640x480, 8SC3) 4.439 1.086 4.09 Mat_Transform_Diagonal::Size_MatType::(640x480, 16SC3) 4.251 1.136 3.74 Mat_Transform_Diagonal::Size_MatType::(640x480, 32SC3) 3.786 1.999 1.89 Mat_Transform_Diagonal::Size_MatType::(640x480, 64FC3) 1.555 1.551 1.00 Mat_Transform_Diagonal::Size_MatType::(1280x720, 8SC3) 13.319 3.243 4.11 Mat_Transform_Diagonal::Size_MatType::(1280x720, 16SC3) 12.828 3.398 3.78 Mat_Transform_Diagonal::Size_MatType::(1280x720, 32SC3) 11.336 5.989 1.89 Mat_Transform_Diagonal::Size_MatType::(1280x720, 64FC3) 4.707 4.690 1.00 Mat_Transform_Diagonal::Size_MatType::(1920x1080, 8SC3) 29.952 7.293 4.11 Mat_Transform_Diagonal::Size_MatType::(1920x1080, 16SC3) 28.817 7.656 3.76 Mat_Transform_Diagonal::Size_MatType::(1920x1080, 32SC3) 25.476 13.388 1.90 
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 64FC3) 10.533 10.509 1.00 ``` ### M2 ``` Geometric mean (ms) Name of Test base patch patch vs base (x-factor) Mat_Transform::Size_MatType::(127x61, 8SC3) 0.020 0.004 4.45 Mat_Transform::Size_MatType::(127x61, 16SC3) 0.016 0.004 4.48 Mat_Transform::Size_MatType::(127x61, 32SC3) 0.016 0.007 2.23 Mat_Transform::Size_MatType::(127x61, 64FC3) 0.007 0.006 1.20 Mat_Transform::Size_MatType::(640x480, 8SC3) 0.793 0.197 4.03 Mat_Transform::Size_MatType::(640x480, 16SC3) 0.626 0.154 4.08 Mat_Transform::Size_MatType::(640x480, 32SC3) 0.627 0.306 2.05 Mat_Transform::Size_MatType::(640x480, 64FC3) 0.273 0.253 1.08 Mat_Transform::Size_MatType::(1280x720, 8SC3) 2.350 0.540 4.35 Mat_Transform::Size_MatType::(1280x720, 16SC3) 1.875 0.415 4.52 Mat_Transform::Size_MatType::(1280x720, 32SC3) 1.893 0.844 2.24 Mat_Transform::Size_MatType::(1280x720, 64FC3) 0.830 0.808 1.03 Mat_Transform::Size_MatType::(1920x1080, 8SC3) 5.302 1.178 4.50 Mat_Transform::Size_MatType::(1920x1080, 16SC3) 4.475 0.946 4.73 Mat_Transform::Size_MatType::(1920x1080, 32SC3) 4.409 1.864 2.37 Mat_Transform::Size_MatType::(1920x1080, 64FC3) 1.853 1.512 1.23 Mat_Transform_Diagonal::Size_MatType::(640x480, 8SC3) 0.586 0.110 5.32 Mat_Transform_Diagonal::Size_MatType::(640x480, 16SC3) 0.518 0.110 4.69 Mat_Transform_Diagonal::Size_MatType::(640x480, 32SC3) 0.430 0.220 1.95 Mat_Transform_Diagonal::Size_MatType::(640x480, 64FC3) 0.228 0.178 1.28 Mat_Transform_Diagonal::Size_MatType::(1280x720, 8SC3) 1.768 0.336 5.26 Mat_Transform_Diagonal::Size_MatType::(1280x720, 16SC3) 1.514 0.335 4.52 Mat_Transform_Diagonal::Size_MatType::(1280x720, 32SC3) 1.292 0.670 1.93 Mat_Transform_Diagonal::Size_MatType::(1280x720, 64FC3) 0.681 0.579 1.18 Mat_Transform_Diagonal::Size_MatType::(1920x1080, 8SC3) 3.998 0.747 5.35 Mat_Transform_Diagonal::Size_MatType::(1920x1080, 16SC3) 3.392 0.757 4.48 Mat_Transform_Diagonal::Size_MatType::(1920x1080, 32SC3) 2.956 1.491 1.98 
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 64FC3) 1.546 1.476 1.05 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
28deafcbd4
commit
603b1cafdf
@ -166,4 +166,30 @@ PERF_TEST_P(Size_MatType, Mat_Transform,
|
||||
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
|
||||
}
|
||||
|
||||
// Benchmarks cv::transform on 3-channel inputs with a 3x4 matrix whose 3x3 part is
// diagonal, i.e. every output channel is "scale * same input channel + offset".
// NOTE(review): presumably this is what routes cv::transform to its diagonal kernels;
// the dispatch itself is not visible from this test.
PERF_TEST_P(Size_MatType, Mat_Transform_Diagonal,
            testing::Combine(testing::Values(szVGA, sz720p, sz1080p),
                             testing::Values(CV_8UC3, CV_8SC3, CV_16UC3, CV_16SC3, CV_32SC3, CV_32FC3, CV_64FC3))
            )
{
    const Size_MatType_t testArgs = GetParam();
    const Size frameSize = get<0>(testArgs);
    const int matType = get<1>(testArgs);

    // Every pixel of the frame is placed into a single column (width == 1).
    const Size columnSize(1, frameSize.width * frameSize.height);

    // Row-major 3x4 matrix: per-channel scale on the diagonal, offset in the last column.
    const float coeffs[] = { 0.5f, 0.f, 0.f, 128,
                             0.f, 0.86602540378f, 0.f, -64,
                             0.f, 0.f, 0.4330127019f, 32, };
    Mat mtx(Size(4, 3), CV_32FC1, (void*)coeffs);

    Mat src(columnSize, matType);
    Mat dst(columnSize, matType);
    randu(src, 0, 30);
    declare.in(src).out(dst);

    TEST_CYCLE()
    {
        cv::transform(src, dst, mtx);
    }

    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
|
||||
|
||||
} // namespace
|
||||
|
@ -1682,24 +1682,232 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
|
||||
static void
|
||||
transform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
const int BITS = 10, SCALE = 1 << BITS;
|
||||
const float MAX_M = (float)(1 << (15 - BITS));
|
||||
|
||||
if( scn == 3 && dcn == 3 &&
|
||||
std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[ 2]) < MAX_M*256 && std::abs(m[ 3]) < MAX_M*256 &&
|
||||
std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[ 6]) < MAX_M*256 && std::abs(m[ 7]) < MAX_M*256 &&
|
||||
std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M*256 && std::abs(m[11]) < MAX_M*256 )
|
||||
{
|
||||
const int nChannels = 3;
|
||||
|
||||
union {
|
||||
short s[6];
|
||||
int p[3];
|
||||
} m16;
|
||||
m16.s[0] = saturate_cast<short>(m[0] * SCALE); m16.s[1] = saturate_cast<short>(m[1] * SCALE);
|
||||
m16.s[2] = saturate_cast<short>(m[4] * SCALE); m16.s[3] = saturate_cast<short>(m[5] * SCALE);
|
||||
m16.s[4] = saturate_cast<short>(m[8] * SCALE); m16.s[5] = saturate_cast<short>(m[9] * SCALE);
|
||||
int m32[] = {saturate_cast<int>(m[ 2] * SCALE), saturate_cast<int>(m[ 3] * SCALE),
|
||||
saturate_cast<int>(m[ 6] * SCALE), saturate_cast<int>(m[ 7] * SCALE),
|
||||
saturate_cast<int>(m[10] * SCALE), saturate_cast<int>(m[11] * SCALE)};
|
||||
v_int16 m01 = v_reinterpret_as_s16(vx_setall_s32(m16.p[0]));
|
||||
v_int32 m2 = vx_setall_s32(m32[0]);
|
||||
v_int32 m3 = vx_setall_s32(m32[1]);
|
||||
v_int16 m45 = v_reinterpret_as_s16(vx_setall_s32(m16.p[1]));
|
||||
v_int32 m6 = vx_setall_s32(m32[2]);
|
||||
v_int32 m7 = vx_setall_s32(m32[3]);
|
||||
v_int16 m89 = v_reinterpret_as_s16(vx_setall_s32(m16.p[2]));
|
||||
v_int32 m10 = vx_setall_s32(m32[4]);
|
||||
v_int32 m11 = vx_setall_s32(m32[5]);
|
||||
int x = 0;
|
||||
for (; x <= (len - VTraits<v_int8>::vlanes()) * nChannels; x += VTraits<v_int8>::vlanes() * nChannels)
|
||||
{
|
||||
v_int8 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
v_int8 bgl, bgh;
|
||||
v_zip(b, g, bgl, bgh);
|
||||
v_int16 rl, rh;
|
||||
v_expand(r, rl, rh);
|
||||
|
||||
v_int16 dbl, dbh, dgl, dgh, drl, drh;
|
||||
v_int16 p0, p2;
|
||||
v_int32 p1, p3;
|
||||
v_expand(bgl, p0, p2);
|
||||
v_expand(v_reinterpret_as_s16(rl), p1, p3);
|
||||
dbl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m01), v_mul(p1, m2)), m3),
|
||||
v_add(v_add(v_dotprod(p2, m01), v_mul(p3, m2)), m3));
|
||||
dgl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m45), v_mul(p1, m6)), m7),
|
||||
v_add(v_add(v_dotprod(p2, m45), v_mul(p3, m6)), m7));
|
||||
drl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m89), v_mul(p1, m10)), m11),
|
||||
v_add(v_add(v_dotprod(p2, m89), v_mul(p3, m10)), m11));
|
||||
v_expand(bgh, p0, p2);
|
||||
v_expand(v_reinterpret_as_s16(rh), p1, p3);
|
||||
dbh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m01), v_mul(p1, m2)), m3),
|
||||
v_add(v_add(v_dotprod(p2, m01), v_mul(p3, m2)), m3));
|
||||
dgh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m45), v_mul(p1, m6)), m7),
|
||||
v_add(v_add(v_dotprod(p2, m45), v_mul(p3, m6)), m7));
|
||||
drh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m89), v_mul(p1, m10)), m11),
|
||||
v_add(v_add(v_dotprod(p2, m89), v_mul(p3, m10)), m11));
|
||||
v_store_interleave(dst + x, v_pack(dbl, dbh), v_pack(dgl, dgh), v_pack(drl, drh));
|
||||
}
|
||||
m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE);
|
||||
m32[3] = saturate_cast<int>((m[7] + 0.5f)*SCALE);
|
||||
m32[5] = saturate_cast<int>((m[11] + 0.5f)*SCALE);
|
||||
for( ; x < len * nChannels; x += nChannels )
|
||||
{
|
||||
int v0 = src[x], v1 = src[x+1], v2 = src[x+2];
|
||||
schar t0 = saturate_cast<schar>((m16.s[0] * v0 + m16.s[1] * v1 + m32[0] * v2 + m32[1]) >> BITS);
|
||||
schar t1 = saturate_cast<schar>((m16.s[2] * v0 + m16.s[3] * v1 + m32[2] * v2 + m32[3]) >> BITS);
|
||||
schar t2 = saturate_cast<schar>((m16.s[4] * v0 + m16.s[5] * v1 + m32[4] * v2 + m32[5]) >> BITS);
|
||||
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
transform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
static void
|
||||
transform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if (scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float32 m0 = vx_setall_f32(m[ 0]);
|
||||
v_float32 m1 = vx_setall_f32(m[ 1]);
|
||||
v_float32 m2 = vx_setall_f32(m[ 2]);
|
||||
v_float32 m3 = vx_setall_f32(m[ 3]);
|
||||
v_float32 m4 = vx_setall_f32(m[ 4]);
|
||||
v_float32 m5 = vx_setall_f32(m[ 5]);
|
||||
v_float32 m6 = vx_setall_f32(m[ 6]);
|
||||
v_float32 m7 = vx_setall_f32(m[ 7]);
|
||||
v_float32 m8 = vx_setall_f32(m[ 8]);
|
||||
v_float32 m9 = vx_setall_f32(m[ 9]);
|
||||
v_float32 m10 = vx_setall_f32(m[10]);
|
||||
v_float32 m11 = vx_setall_f32(m[11]);
|
||||
for (; x <= (len - VTraits<v_int16>::vlanes())*3; x += VTraits<v_int16>::vlanes()*3)
|
||||
{
|
||||
v_int16 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
v_int32 bl, bh, gl, gh, rl, rh;
|
||||
v_expand(b, bl, bh);
|
||||
v_expand(g, gl, gh);
|
||||
v_expand(r, rl, rh);
|
||||
|
||||
v_int16 db, dg, dr;
|
||||
db = v_pack(v_round(v_muladd(v_cvt_f32(bl), m0, v_muladd(v_cvt_f32(gl), m1, v_muladd(v_cvt_f32(rl), m2, m3)))),
|
||||
v_round(v_muladd(v_cvt_f32(bh), m0, v_muladd(v_cvt_f32(gh), m1, v_muladd(v_cvt_f32(rh), m2, m3)))));
|
||||
dg = v_pack(v_round(v_muladd(v_cvt_f32(bl), m4, v_muladd(v_cvt_f32(gl), m5, v_muladd(v_cvt_f32(rl), m6, m7)))),
|
||||
v_round(v_muladd(v_cvt_f32(bh), m4, v_muladd(v_cvt_f32(gh), m5, v_muladd(v_cvt_f32(rh), m6, m7)))));
|
||||
dr = v_pack(v_round(v_muladd(v_cvt_f32(bl), m8, v_muladd(v_cvt_f32(gl), m9, v_muladd(v_cvt_f32(rl), m10, m11)))),
|
||||
v_round(v_muladd(v_cvt_f32(bh), m8, v_muladd(v_cvt_f32(gh), m9, v_muladd(v_cvt_f32(rh), m10, m11)))));
|
||||
v_store_interleave(dst + x, db, dg, dr);
|
||||
}
|
||||
for( ; x < len * 3; x += 3 )
|
||||
{
|
||||
float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
|
||||
short t0 = saturate_cast<short>(m[0] * v0 + m[1] * v1 + m[ 2] * v2 + m[ 3]);
|
||||
short t1 = saturate_cast<short>(m[4] * v0 + m[5] * v1 + m[ 6] * v2 + m[ 7]);
|
||||
short t2 = saturate_cast<short>(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]);
|
||||
dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
transform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
static void
|
||||
transform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
if( scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float64 m0 = vx_setall_f64(m[ 0]);
|
||||
v_float64 m1 = vx_setall_f64(m[ 1]);
|
||||
v_float64 m2 = vx_setall_f64(m[ 2]);
|
||||
v_float64 m3 = vx_setall_f64(m[ 3]);
|
||||
v_float64 m4 = vx_setall_f64(m[ 4]);
|
||||
v_float64 m5 = vx_setall_f64(m[ 5]);
|
||||
v_float64 m6 = vx_setall_f64(m[ 6]);
|
||||
v_float64 m7 = vx_setall_f64(m[ 7]);
|
||||
v_float64 m8 = vx_setall_f64(m[ 8]);
|
||||
v_float64 m9 = vx_setall_f64(m[ 9]);
|
||||
v_float64 m10 = vx_setall_f64(m[10]);
|
||||
v_float64 m11 = vx_setall_f64(m[11]);
|
||||
for (; x <= (len - VTraits<v_int32>::vlanes()) * 3; x += VTraits<v_int32>::vlanes() * 3) {
|
||||
v_int32 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
v_float64 bh = v_cvt_f64_high(b), bl = v_cvt_f64(b);
|
||||
v_float64 gh = v_cvt_f64_high(g), gl = v_cvt_f64(g);
|
||||
v_float64 rh = v_cvt_f64_high(r), rl = v_cvt_f64(r);
|
||||
|
||||
v_int32 db, dg, dr;
|
||||
db = v_round(v_fma(bl, m0, v_fma(gl, m1, v_fma(rl, m2, m3))),
|
||||
v_fma(bh, m0, v_fma(gh, m1, v_fma(rh, m2, m3))));
|
||||
dg = v_round(v_fma(bl, m4, v_fma(gl, m5, v_fma(rl, m6, m7))),
|
||||
v_fma(bh, m4, v_fma(gh, m5, v_fma(rh, m6, m7))));
|
||||
dr = v_round(v_fma(bl, m8, v_fma(gl, m9, v_fma(rl, m10, m11))),
|
||||
v_fma(bh, m8, v_fma(gh, m9, v_fma(rh, m10, m11))));
|
||||
|
||||
v_store_interleave(dst + x, db, dg, dr);
|
||||
}
|
||||
for (; x < len * 3; x += 3) {
|
||||
double b = src[x], g = src[x + 1], r = src[x + 2];
|
||||
int db = saturate_cast<int>(m[0] * b + m[1] * g + m[ 2] * r + m[ 3]);
|
||||
int dg = saturate_cast<int>(m[4] * b + m[5] * g + m[ 6] * r + m[ 7]);
|
||||
int dr = saturate_cast<int>(m[8] * b + m[9] * g + m[10] * r + m[11]);
|
||||
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
transform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
static void
|
||||
transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
if( scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float64 m0 = vx_setall_f64(m[ 0]);
|
||||
v_float64 m1 = vx_setall_f64(m[ 1]);
|
||||
v_float64 m2 = vx_setall_f64(m[ 2]);
|
||||
v_float64 m3 = vx_setall_f64(m[ 3]);
|
||||
v_float64 m4 = vx_setall_f64(m[ 4]);
|
||||
v_float64 m5 = vx_setall_f64(m[ 5]);
|
||||
v_float64 m6 = vx_setall_f64(m[ 6]);
|
||||
v_float64 m7 = vx_setall_f64(m[ 7]);
|
||||
v_float64 m8 = vx_setall_f64(m[ 8]);
|
||||
v_float64 m9 = vx_setall_f64(m[ 9]);
|
||||
v_float64 m10 = vx_setall_f64(m[10]);
|
||||
v_float64 m11 = vx_setall_f64(m[11]);
|
||||
for (; x <= (len - VTraits<v_float64>::vlanes()) * 3; x += VTraits<v_float64>::vlanes() * 3) {
|
||||
v_float64 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
|
||||
v_float64 db, dg, dr;
|
||||
db = v_fma(b, m0, v_fma(g, m1, v_fma(r, m2, m3)));
|
||||
dg = v_fma(b, m4, v_fma(g, m5, v_fma(r, m6, m7)));
|
||||
dr = v_fma(b, m8, v_fma(g, m9, v_fma(r, m10, m11)));
|
||||
|
||||
v_store_interleave(dst + x, db, dg, dr);
|
||||
}
|
||||
for (; x < len * 3; x += 3) {
|
||||
double b = src[x], g = src[x + 1], r = src[x + 2];
|
||||
double db = saturate_cast<double>(m[0] * b + m[1] * g + m[ 2] * r + m[ 3]);
|
||||
double dg = saturate_cast<double>(m[4] * b + m[5] * g + m[ 6] * r + m[ 7]);
|
||||
double dr = saturate_cast<double>(m[8] * b + m[9] * g + m[10] * r + m[11]);
|
||||
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
transform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
@ -1753,42 +1961,358 @@ diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int )
|
||||
static void
|
||||
diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if (scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float32 m0 = vx_setall_f32(m[ 0]);
|
||||
v_float32 m3 = vx_setall_f32(m[ 3]);
|
||||
v_float32 m5 = vx_setall_f32(m[ 5]);
|
||||
v_float32 m7 = vx_setall_f32(m[ 7]);
|
||||
v_float32 m10 = vx_setall_f32(m[10]);
|
||||
v_float32 m11 = vx_setall_f32(m[11]);
|
||||
for (; x <= (len - VTraits<v_uint8>::vlanes()) * 3; x += VTraits<v_uint8>::vlanes() * 3) {
|
||||
v_uint8 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
v_uint16 b_u16_l, g_u16_l, r_u16_l;
|
||||
v_uint16 b_u16_h, g_u16_h, r_u16_h;
|
||||
v_expand(b, b_u16_l, b_u16_h);
|
||||
v_expand(g, g_u16_l, g_u16_h);
|
||||
v_expand(r, r_u16_l, r_u16_h);
|
||||
|
||||
v_uint32 b_u32_l0, g_u32_l0, r_u32_l0;
|
||||
v_uint32 b_u32_l1, g_u32_l1, r_u32_l1;
|
||||
v_uint32 b_u32_h0, g_u32_h0, r_u32_h0;
|
||||
v_uint32 b_u32_h1, g_u32_h1, r_u32_h1;
|
||||
v_expand(b_u16_l, b_u32_l0, b_u32_l1);
|
||||
v_expand(b_u16_h, b_u32_h0, b_u32_h1);
|
||||
v_expand(g_u16_l, g_u32_l0, g_u32_l1);
|
||||
v_expand(g_u16_h, g_u32_h0, g_u32_h1);
|
||||
v_expand(r_u16_l, r_u32_l0, r_u32_l1);
|
||||
v_expand(r_u16_h, r_u32_h0, r_u32_h1);
|
||||
|
||||
v_float32 db_f32_l0 = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_l0)), m3);
|
||||
v_float32 db_f32_l1 = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_l1)), m3);
|
||||
v_float32 db_f32_h0 = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_h0)), m3);
|
||||
v_float32 db_f32_h1 = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_h1)), m3);
|
||||
|
||||
v_float32 dg_f32_l0 = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_l0)), m7);
|
||||
v_float32 dg_f32_l1 = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_l1)), m7);
|
||||
v_float32 dg_f32_h0 = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_h0)), m7);
|
||||
v_float32 dg_f32_h1 = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_h1)), m7);
|
||||
|
||||
v_float32 dr_f32_l0 = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_l0)), m11);
|
||||
v_float32 dr_f32_l1 = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_l1)), m11);
|
||||
v_float32 dr_f32_h0 = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_h0)), m11);
|
||||
v_float32 dr_f32_h1 = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_h1)), m11);
|
||||
|
||||
v_store_interleave(dst + x,
|
||||
v_pack(v_pack_u(v_round(db_f32_l0), v_round(db_f32_l1)), v_pack_u(v_round(db_f32_h0), v_round(db_f32_h1))),
|
||||
v_pack(v_pack_u(v_round(dg_f32_l0), v_round(dg_f32_l1)), v_pack_u(v_round(dg_f32_h0), v_round(dg_f32_h1))),
|
||||
v_pack(v_pack_u(v_round(dr_f32_l0), v_round(dr_f32_l1)), v_pack_u(v_round(dr_f32_h0), v_round(dr_f32_h1))));
|
||||
}
|
||||
for (; x < len * 3; x += 3) {
|
||||
int b = src[x], g = src[x + 1], r = src[x + 2];
|
||||
uchar db = saturate_cast<uchar>(m[ 0] * b + m[ 3]);
|
||||
uchar dg = saturate_cast<uchar>(m[ 5] * g + m[ 7]);
|
||||
uchar dr = saturate_cast<uchar>(m[10] * r + m[11]);
|
||||
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
diagtransform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
static void
|
||||
diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if (scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float32 m0 = vx_setall_f32(m[ 0]);
|
||||
v_float32 m3 = vx_setall_f32(m[ 3]);
|
||||
v_float32 m5 = vx_setall_f32(m[ 5]);
|
||||
v_float32 m7 = vx_setall_f32(m[ 7]);
|
||||
v_float32 m10 = vx_setall_f32(m[10]);
|
||||
v_float32 m11 = vx_setall_f32(m[11]);
|
||||
for (; x <= (len - VTraits<v_int8>::vlanes()) * 3; x += VTraits<v_int8>::vlanes() * 3) {
|
||||
v_int8 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
v_int16 b_u16_l, g_u16_l, r_u16_l;
|
||||
v_int16 b_u16_h, g_u16_h, r_u16_h;
|
||||
v_expand(b, b_u16_l, b_u16_h);
|
||||
v_expand(g, g_u16_l, g_u16_h);
|
||||
v_expand(r, r_u16_l, r_u16_h);
|
||||
|
||||
v_int32 b_u32_l0, g_u32_l0, r_u32_l0;
|
||||
v_int32 b_u32_l1, g_u32_l1, r_u32_l1;
|
||||
v_int32 b_u32_h0, g_u32_h0, r_u32_h0;
|
||||
v_int32 b_u32_h1, g_u32_h1, r_u32_h1;
|
||||
v_expand(b_u16_l, b_u32_l0, b_u32_l1);
|
||||
v_expand(b_u16_h, b_u32_h0, b_u32_h1);
|
||||
v_expand(g_u16_l, g_u32_l0, g_u32_l1);
|
||||
v_expand(g_u16_h, g_u32_h0, g_u32_h1);
|
||||
v_expand(r_u16_l, r_u32_l0, r_u32_l1);
|
||||
v_expand(r_u16_h, r_u32_h0, r_u32_h1);
|
||||
|
||||
v_float32 db_f32_l0 = v_fma( m0, v_cvt_f32(b_u32_l0), m3);
|
||||
v_float32 db_f32_l1 = v_fma( m0, v_cvt_f32(b_u32_l1), m3);
|
||||
v_float32 db_f32_h0 = v_fma( m0, v_cvt_f32(b_u32_h0), m3);
|
||||
v_float32 db_f32_h1 = v_fma( m0, v_cvt_f32(b_u32_h1), m3);
|
||||
|
||||
v_float32 dg_f32_l0 = v_fma( m5, v_cvt_f32(g_u32_l0), m7);
|
||||
v_float32 dg_f32_l1 = v_fma( m5, v_cvt_f32(g_u32_l1), m7);
|
||||
v_float32 dg_f32_h0 = v_fma( m5, v_cvt_f32(g_u32_h0), m7);
|
||||
v_float32 dg_f32_h1 = v_fma( m5, v_cvt_f32(g_u32_h1), m7);
|
||||
|
||||
v_float32 dr_f32_l0 = v_fma(m10, v_cvt_f32(r_u32_l0), m11);
|
||||
v_float32 dr_f32_l1 = v_fma(m10, v_cvt_f32(r_u32_l1), m11);
|
||||
v_float32 dr_f32_h0 = v_fma(m10, v_cvt_f32(r_u32_h0), m11);
|
||||
v_float32 dr_f32_h1 = v_fma(m10, v_cvt_f32(r_u32_h1), m11);
|
||||
|
||||
v_store_interleave(dst + x,
|
||||
v_pack(v_pack(v_round(db_f32_l0), v_round(db_f32_l1)), v_pack(v_round(db_f32_h0), v_round(db_f32_h1))),
|
||||
v_pack(v_pack(v_round(dg_f32_l0), v_round(dg_f32_l1)), v_pack(v_round(dg_f32_h0), v_round(dg_f32_h1))),
|
||||
v_pack(v_pack(v_round(dr_f32_l0), v_round(dr_f32_l1)), v_pack(v_round(dr_f32_h0), v_round(dr_f32_h1))));
|
||||
}
|
||||
for (; x < len * 3; x += 3) {
|
||||
int b = src[x], g = src[x + 1], r = src[x + 2];
|
||||
schar db = saturate_cast<schar>(m[ 0] * b + m[ 3]);
|
||||
schar dg = saturate_cast<schar>(m[ 5] * g + m[ 7]);
|
||||
schar dr = saturate_cast<schar>(m[10] * r + m[11]);
|
||||
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
diagtransform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
static void
|
||||
diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if (scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float32 m0 = vx_setall_f32(m[ 0]);
|
||||
v_float32 m3 = vx_setall_f32(m[ 3]);
|
||||
v_float32 m5 = vx_setall_f32(m[ 5]);
|
||||
v_float32 m7 = vx_setall_f32(m[ 7]);
|
||||
v_float32 m10 = vx_setall_f32(m[10]);
|
||||
v_float32 m11 = vx_setall_f32(m[11]);
|
||||
for (; x <= (len - VTraits<v_uint16>::vlanes()) * 3; x += VTraits<v_uint16>::vlanes() * 3) {
|
||||
v_uint16 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
v_uint32 b_u32_l, g_u32_l, r_u32_l;
|
||||
v_uint32 b_u32_h, g_u32_h, r_u32_h;
|
||||
v_expand(b, b_u32_l, b_u32_h);
|
||||
v_expand(g, g_u32_l, g_u32_h);
|
||||
v_expand(r, r_u32_l, r_u32_h);
|
||||
|
||||
v_float32 db_f32_l = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_l)), m3);
|
||||
v_float32 db_f32_h = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_h)), m3);
|
||||
|
||||
v_float32 dg_f32_l = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_l)), m7);
|
||||
v_float32 dg_f32_h = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_h)), m7);
|
||||
|
||||
v_float32 dr_f32_l = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_l)), m11);
|
||||
v_float32 dr_f32_h = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_h)), m11);
|
||||
|
||||
v_store_interleave(dst + x,
|
||||
v_pack_u(v_round(db_f32_l), v_round(db_f32_h)),
|
||||
v_pack_u(v_round(dg_f32_l), v_round(dg_f32_h)),
|
||||
v_pack_u(v_round(dr_f32_l), v_round(dr_f32_h)));
|
||||
}
|
||||
for (; x < len * 3; x += 3) {
|
||||
int b = src[x], g = src[x + 1], r = src[x + 2];
|
||||
ushort db = saturate_cast<ushort>(m[ 0] * b + m[ 3]);
|
||||
ushort dg = saturate_cast<ushort>(m[ 5] * g + m[ 7]);
|
||||
ushort dr = saturate_cast<ushort>(m[10] * r + m[11]);
|
||||
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
diagtransform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
static void
|
||||
diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if (scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float32 m0 = vx_setall_f32(m[ 0]);
|
||||
v_float32 m3 = vx_setall_f32(m[ 3]);
|
||||
v_float32 m5 = vx_setall_f32(m[ 5]);
|
||||
v_float32 m7 = vx_setall_f32(m[ 7]);
|
||||
v_float32 m10 = vx_setall_f32(m[10]);
|
||||
v_float32 m11 = vx_setall_f32(m[11]);
|
||||
for (; x <= (len - VTraits<v_int16>::vlanes()) * 3; x += VTraits<v_int16>::vlanes() * 3) {
|
||||
v_int16 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
v_int32 b_u32_l, g_u32_l, r_u32_l;
|
||||
v_int32 b_u32_h, g_u32_h, r_u32_h;
|
||||
v_expand(b, b_u32_l, b_u32_h);
|
||||
v_expand(g, g_u32_l, g_u32_h);
|
||||
v_expand(r, r_u32_l, r_u32_h);
|
||||
|
||||
v_float32 db_f32_l = v_fma( m0, v_cvt_f32(b_u32_l), m3);
|
||||
v_float32 db_f32_h = v_fma( m0, v_cvt_f32(b_u32_h), m3);
|
||||
|
||||
v_float32 dg_f32_l = v_fma( m5, v_cvt_f32(g_u32_l), m7);
|
||||
v_float32 dg_f32_h = v_fma( m5, v_cvt_f32(g_u32_h), m7);
|
||||
|
||||
v_float32 dr_f32_l = v_fma(m10, v_cvt_f32(r_u32_l), m11);
|
||||
v_float32 dr_f32_h = v_fma(m10, v_cvt_f32(r_u32_h), m11);
|
||||
|
||||
v_store_interleave(dst + x,
|
||||
v_pack(v_round(db_f32_l), v_round(db_f32_h)),
|
||||
v_pack(v_round(dg_f32_l), v_round(dg_f32_h)),
|
||||
v_pack(v_round(dr_f32_l), v_round(dr_f32_h)));
|
||||
}
|
||||
for (; x < len * 3; x += 3) {
|
||||
int b = src[x], g = src[x + 1], r = src[x + 2];
|
||||
short db = saturate_cast<short>(m[ 0] * b + m[ 3]);
|
||||
short dg = saturate_cast<short>(m[ 5] * g + m[ 7]);
|
||||
short dr = saturate_cast<short>(m[10] * r + m[11]);
|
||||
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
diagtransform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
static void
|
||||
diagtransform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
if (scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float64 m0 = vx_setall_f64(m[ 0]);
|
||||
v_float64 m3 = vx_setall_f64(m[ 3]);
|
||||
v_float64 m5 = vx_setall_f64(m[ 5]);
|
||||
v_float64 m7 = vx_setall_f64(m[ 7]);
|
||||
v_float64 m10 = vx_setall_f64(m[10]);
|
||||
v_float64 m11 = vx_setall_f64(m[11]);
|
||||
for (; x <= (len - VTraits<v_float32>::vlanes()) * 3; x += VTraits<v_float32>::vlanes() * 3) {
|
||||
v_int32 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
v_float64 bh = v_cvt_f64_high(b), bl = v_cvt_f64(b);
|
||||
v_float64 gh = v_cvt_f64_high(g), gl = v_cvt_f64(g);
|
||||
v_float64 rh = v_cvt_f64_high(r), rl = v_cvt_f64(r);
|
||||
|
||||
v_int32 db = v_round(v_fma( m0, bl, m3),
|
||||
v_fma( m0, bh, m3));
|
||||
v_int32 dg = v_round(v_fma( m5, gl, m7),
|
||||
v_fma( m5, gh, m7));
|
||||
v_int32 dr = v_round(v_fma(m10, rl, m11),
|
||||
v_fma(m10, rh, m11));
|
||||
|
||||
v_store_interleave(dst + x, db, dg, dr);
|
||||
}
|
||||
for (; x < len * 3; x += 3) {
|
||||
int b = src[x], g = src[x + 1], r = src[x + 2];
|
||||
int db = saturate_cast<int>(m[ 0] * b + m[ 3]);
|
||||
int dg = saturate_cast<int>(m[ 5] * g + m[ 7]);
|
||||
int dr = saturate_cast<int>(m[10] * r + m[11]);
|
||||
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
diagtransform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
static void
|
||||
diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if (scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float32 m0 = vx_setall_f32(m[ 0]);
|
||||
v_float32 m3 = vx_setall_f32(m[ 3]);
|
||||
v_float32 m5 = vx_setall_f32(m[ 5]);
|
||||
v_float32 m7 = vx_setall_f32(m[ 7]);
|
||||
v_float32 m10 = vx_setall_f32(m[10]);
|
||||
v_float32 m11 = vx_setall_f32(m[11]);
|
||||
for (; x <= (len - VTraits<v_float32>::vlanes()) * 3; x += VTraits<v_float32>::vlanes() * 3) {
|
||||
v_float32 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
|
||||
v_float32 db = v_fma( m0, b, m3);
|
||||
v_float32 dg = v_fma( m5, g, m7);
|
||||
v_float32 dr = v_fma(m10, r, m11);
|
||||
|
||||
v_store_interleave(dst + x, db, dg, dr);
|
||||
}
|
||||
for (; x < len * 3; x += 3) {
|
||||
float b = src[x], g = src[x + 1], r = src[x + 2];
|
||||
float db = saturate_cast<float>(m[ 0] * b + m[ 3]);
|
||||
float dg = saturate_cast<float>(m[ 5] * g + m[ 7]);
|
||||
float dr = saturate_cast<float>(m[10] * r + m[11]);
|
||||
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
diagtransform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
static void
|
||||
diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
|
||||
{
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
if (scn == 3 && dcn == 3) {
|
||||
int x = 0;
|
||||
|
||||
v_float64 m0 = vx_setall_f64(m[ 0]);
|
||||
v_float64 m3 = vx_setall_f64(m[ 3]);
|
||||
v_float64 m5 = vx_setall_f64(m[ 5]);
|
||||
v_float64 m7 = vx_setall_f64(m[ 7]);
|
||||
v_float64 m10 = vx_setall_f64(m[10]);
|
||||
v_float64 m11 = vx_setall_f64(m[11]);
|
||||
for (; x <= (len - VTraits<v_float64>::vlanes()) * 3; x += VTraits<v_float64>::vlanes() * 3) {
|
||||
v_float64 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
|
||||
v_float64 db = v_fma( m0, b, m3);
|
||||
v_float64 dg = v_fma( m5, g, m7);
|
||||
v_float64 dr = v_fma(m10, r, m11);
|
||||
|
||||
v_store_interleave(dst + x, db, dg, dr);
|
||||
}
|
||||
for (; x < len * 3; x += 3) {
|
||||
double b = src[x], g = src[x + 1], r = src[x + 2];
|
||||
double db = saturate_cast<double>(m[ 0] * b + m[ 3]);
|
||||
double dg = saturate_cast<double>(m[ 5] * g + m[ 7]);
|
||||
double dr = saturate_cast<double>(m[10] * r + m[11]);
|
||||
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
|
||||
}
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
diagtransform_(src, dst, m, len, scn, dcn);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user