Merge pull request #26821 from fengyuentau:core/transform_simd

Core: vectorize cv::transform in terms of all data types #26821

## Performance

### i7-12700K

```
Geometric mean (ms)

                      Name of Test                       base  patch   patch
                                                                         vs
                                                                        base
                                                                     (x-factor)
Mat_Transform::Size_MatType::(127x61, 8SC3)              0.017 0.004    4.64
Mat_Transform::Size_MatType::(127x61, 16SC3)             0.015 0.004    3.78
Mat_Transform::Size_MatType::(127x61, 32SC3)             0.015 0.007    2.03
Mat_Transform::Size_MatType::(127x61, 64FC3)             0.007 0.004    1.78
Mat_Transform::Size_MatType::(640x480, 8SC3)             0.673 0.140    4.80
Mat_Transform::Size_MatType::(640x480, 16SC3)            0.618 0.158    3.90
Mat_Transform::Size_MatType::(640x480, 32SC3)            0.579 0.278    2.08
Mat_Transform::Size_MatType::(640x480, 64FC3)            0.290 0.266    1.09
Mat_Transform::Size_MatType::(1280x720, 8SC3)            1.919 0.414    4.63
Mat_Transform::Size_MatType::(1280x720, 16SC3)           1.811 0.488    3.71
Mat_Transform::Size_MatType::(1280x720, 32SC3)           1.736 0.917    1.89
Mat_Transform::Size_MatType::(1280x720, 64FC3)           2.310 2.030    1.14
Mat_Transform::Size_MatType::(1920x1080, 8SC3)           4.339 0.924    4.70
Mat_Transform::Size_MatType::(1920x1080, 16SC3)          4.095 1.288    3.18
Mat_Transform::Size_MatType::(1920x1080, 32SC3)          4.267 3.191    1.34
Mat_Transform::Size_MatType::(1920x1080, 64FC3)          6.641 5.481    1.21
Mat_Transform_Diagonal::Size_MatType::(640x480, 8SC3)    0.415 0.104    3.98
Mat_Transform_Diagonal::Size_MatType::(640x480, 16SC3)   0.385 0.128    3.00
Mat_Transform_Diagonal::Size_MatType::(640x480, 32SC3)   0.389 0.225    1.72
Mat_Transform_Diagonal::Size_MatType::(640x480, 64FC3)   0.279 0.275    1.01
Mat_Transform_Diagonal::Size_MatType::(1280x720, 8SC3)   1.223 0.313    3.91
Mat_Transform_Diagonal::Size_MatType::(1280x720, 16SC3)  1.118 0.387    2.89
Mat_Transform_Diagonal::Size_MatType::(1280x720, 32SC3)  1.215 0.801    1.52
Mat_Transform_Diagonal::Size_MatType::(1280x720, 64FC3)  2.198 1.900    1.16
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 8SC3)  2.772 0.705    3.93
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 16SC3) 2.572 1.134    2.27
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 32SC3) 3.477 3.276    1.06
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 64FC3) 5.984 5.186    1.15
```

### A311D

```
Geometric mean (ms)

                      Name of Test                        base  patch    patch
                                                                           vs
                                                                          base
                                                                       (x-factor)
Mat_Transform::Size_MatType::(127x61, 8SC3)              0.143  0.035     4.05
Mat_Transform::Size_MatType::(127x61, 16SC3)             0.135  0.037     3.67
Mat_Transform::Size_MatType::(127x61, 32SC3)             0.110  0.062     1.77
Mat_Transform::Size_MatType::(127x61, 64FC3)             0.034  0.039     0.89
Mat_Transform::Size_MatType::(640x480, 8SC3)             5.673  1.395     4.07
Mat_Transform::Size_MatType::(640x480, 16SC3)            5.331  1.439     3.70
Mat_Transform::Size_MatType::(640x480, 32SC3)            4.329  2.472     1.75
Mat_Transform::Size_MatType::(640x480, 64FC3)            1.560  2.316     0.67
Mat_Transform::Size_MatType::(1280x720, 8SC3)            17.002 4.139     4.11
Mat_Transform::Size_MatType::(1280x720, 16SC3)           15.996 4.308     3.71
Mat_Transform::Size_MatType::(1280x720, 32SC3)           12.948 7.241     1.79
Mat_Transform::Size_MatType::(1280x720, 64FC3)           4.742  7.376     0.64
Mat_Transform::Size_MatType::(1920x1080, 8SC3)           38.253 9.384     4.08
Mat_Transform::Size_MatType::(1920x1080, 16SC3)          35.913 9.750     3.68
Mat_Transform::Size_MatType::(1920x1080, 32SC3)          29.145 16.528    1.76
Mat_Transform::Size_MatType::(1920x1080, 64FC3)          10.606 20.968    0.51
Mat_Transform_Diagonal::Size_MatType::(640x480, 8SC3)    4.439  1.086     4.09
Mat_Transform_Diagonal::Size_MatType::(640x480, 16SC3)   4.251  1.136     3.74
Mat_Transform_Diagonal::Size_MatType::(640x480, 32SC3)   3.786  1.999     1.89
Mat_Transform_Diagonal::Size_MatType::(640x480, 64FC3)   1.555  1.551     1.00
Mat_Transform_Diagonal::Size_MatType::(1280x720, 8SC3)   13.319 3.243     4.11
Mat_Transform_Diagonal::Size_MatType::(1280x720, 16SC3)  12.828 3.398     3.78
Mat_Transform_Diagonal::Size_MatType::(1280x720, 32SC3)  11.336 5.989     1.89
Mat_Transform_Diagonal::Size_MatType::(1280x720, 64FC3)  4.707  4.690     1.00
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 8SC3)  29.952 7.293     4.11
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 16SC3) 28.817 7.656     3.76
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 32SC3) 25.476 13.388    1.90
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 64FC3) 10.533 10.509    1.00
```

### M2

```
Geometric mean (ms)

                      Name of Test                       base  patch   patch
                                                                         vs
                                                                        base
                                                                     (x-factor)
Mat_Transform::Size_MatType::(127x61, 8SC3)              0.020 0.004    4.45
Mat_Transform::Size_MatType::(127x61, 16SC3)             0.016 0.004    4.48
Mat_Transform::Size_MatType::(127x61, 32SC3)             0.016 0.007    2.23
Mat_Transform::Size_MatType::(127x61, 64FC3)             0.007 0.006    1.20
Mat_Transform::Size_MatType::(640x480, 8SC3)             0.793 0.197    4.03
Mat_Transform::Size_MatType::(640x480, 16SC3)            0.626 0.154    4.08
Mat_Transform::Size_MatType::(640x480, 32SC3)            0.627 0.306    2.05
Mat_Transform::Size_MatType::(640x480, 64FC3)            0.273 0.253    1.08
Mat_Transform::Size_MatType::(1280x720, 8SC3)            2.350 0.540    4.35
Mat_Transform::Size_MatType::(1280x720, 16SC3)           1.875 0.415    4.52
Mat_Transform::Size_MatType::(1280x720, 32SC3)           1.893 0.844    2.24
Mat_Transform::Size_MatType::(1280x720, 64FC3)           0.830 0.808    1.03
Mat_Transform::Size_MatType::(1920x1080, 8SC3)           5.302 1.178    4.50
Mat_Transform::Size_MatType::(1920x1080, 16SC3)          4.475 0.946    4.73
Mat_Transform::Size_MatType::(1920x1080, 32SC3)          4.409 1.864    2.37
Mat_Transform::Size_MatType::(1920x1080, 64FC3)          1.853 1.512    1.23
Mat_Transform_Diagonal::Size_MatType::(640x480, 8SC3)    0.586 0.110    5.32
Mat_Transform_Diagonal::Size_MatType::(640x480, 16SC3)   0.518 0.110    4.69
Mat_Transform_Diagonal::Size_MatType::(640x480, 32SC3)   0.430 0.220    1.95
Mat_Transform_Diagonal::Size_MatType::(640x480, 64FC3)   0.228 0.178    1.28
Mat_Transform_Diagonal::Size_MatType::(1280x720, 8SC3)   1.768 0.336    5.26
Mat_Transform_Diagonal::Size_MatType::(1280x720, 16SC3)  1.514 0.335    4.52
Mat_Transform_Diagonal::Size_MatType::(1280x720, 32SC3)  1.292 0.670    1.93
Mat_Transform_Diagonal::Size_MatType::(1280x720, 64FC3)  0.681 0.579    1.18
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 8SC3)  3.998 0.747    5.35
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 16SC3) 3.392 0.757    4.48
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 32SC3) 2.956 1.491    1.98
Mat_Transform_Diagonal::Size_MatType::(1920x1080, 64FC3) 1.546 1.476    1.05
```


### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Yuantao Feng 2025-02-06 18:38:16 +08:00 committed by GitHub
parent 28deafcbd4
commit 603b1cafdf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 550 additions and 0 deletions

View File

@ -166,4 +166,30 @@ PERF_TEST_P(Size_MatType, Mat_Transform,
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
PERF_TEST_P(Size_MatType, Mat_Transform_Diagonal,
testing::Combine(testing::Values(szVGA, sz720p, sz1080p),
testing::Values(CV_8UC3, CV_8SC3, CV_16UC3, CV_16SC3, CV_32SC3, CV_32FC3, CV_64FC3))
)
{
const Size_MatType_t params = GetParam();
const Size srcSize0 = get<0>(params);
const Size srcSize = Size(1, srcSize0.width*srcSize0.height);
const int type = get<1>(params);
const float transform[] = { 0.5f, 0.f, 0.f, 128,
0.f, 0.86602540378f, 0.f, -64,
0.f, 0.f, 0.4330127019f, 32, };
Mat mtx(Size(4, 3), CV_32FC1, (void*)transform);
Mat src(srcSize, type), dst(srcSize, type);
randu(src, 0, 30);
declare.in(src).out(dst);
TEST_CYCLE()
{
cv::transform(src, dst, mtx);
}
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
} // namespace

View File

@ -1682,24 +1682,232 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
static void
transform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int BITS = 10, SCALE = 1 << BITS;
const float MAX_M = (float)(1 << (15 - BITS));
if( scn == 3 && dcn == 3 &&
std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[ 2]) < MAX_M*256 && std::abs(m[ 3]) < MAX_M*256 &&
std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[ 6]) < MAX_M*256 && std::abs(m[ 7]) < MAX_M*256 &&
std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M*256 && std::abs(m[11]) < MAX_M*256 )
{
const int nChannels = 3;
union {
short s[6];
int p[3];
} m16;
m16.s[0] = saturate_cast<short>(m[0] * SCALE); m16.s[1] = saturate_cast<short>(m[1] * SCALE);
m16.s[2] = saturate_cast<short>(m[4] * SCALE); m16.s[3] = saturate_cast<short>(m[5] * SCALE);
m16.s[4] = saturate_cast<short>(m[8] * SCALE); m16.s[5] = saturate_cast<short>(m[9] * SCALE);
int m32[] = {saturate_cast<int>(m[ 2] * SCALE), saturate_cast<int>(m[ 3] * SCALE),
saturate_cast<int>(m[ 6] * SCALE), saturate_cast<int>(m[ 7] * SCALE),
saturate_cast<int>(m[10] * SCALE), saturate_cast<int>(m[11] * SCALE)};
v_int16 m01 = v_reinterpret_as_s16(vx_setall_s32(m16.p[0]));
v_int32 m2 = vx_setall_s32(m32[0]);
v_int32 m3 = vx_setall_s32(m32[1]);
v_int16 m45 = v_reinterpret_as_s16(vx_setall_s32(m16.p[1]));
v_int32 m6 = vx_setall_s32(m32[2]);
v_int32 m7 = vx_setall_s32(m32[3]);
v_int16 m89 = v_reinterpret_as_s16(vx_setall_s32(m16.p[2]));
v_int32 m10 = vx_setall_s32(m32[4]);
v_int32 m11 = vx_setall_s32(m32[5]);
int x = 0;
for (; x <= (len - VTraits<v_int8>::vlanes()) * nChannels; x += VTraits<v_int8>::vlanes() * nChannels)
{
v_int8 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_int8 bgl, bgh;
v_zip(b, g, bgl, bgh);
v_int16 rl, rh;
v_expand(r, rl, rh);
v_int16 dbl, dbh, dgl, dgh, drl, drh;
v_int16 p0, p2;
v_int32 p1, p3;
v_expand(bgl, p0, p2);
v_expand(v_reinterpret_as_s16(rl), p1, p3);
dbl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m01), v_mul(p1, m2)), m3),
v_add(v_add(v_dotprod(p2, m01), v_mul(p3, m2)), m3));
dgl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m45), v_mul(p1, m6)), m7),
v_add(v_add(v_dotprod(p2, m45), v_mul(p3, m6)), m7));
drl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m89), v_mul(p1, m10)), m11),
v_add(v_add(v_dotprod(p2, m89), v_mul(p3, m10)), m11));
v_expand(bgh, p0, p2);
v_expand(v_reinterpret_as_s16(rh), p1, p3);
dbh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m01), v_mul(p1, m2)), m3),
v_add(v_add(v_dotprod(p2, m01), v_mul(p3, m2)), m3));
dgh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m45), v_mul(p1, m6)), m7),
v_add(v_add(v_dotprod(p2, m45), v_mul(p3, m6)), m7));
drh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(p0, m89), v_mul(p1, m10)), m11),
v_add(v_add(v_dotprod(p2, m89), v_mul(p3, m10)), m11));
v_store_interleave(dst + x, v_pack(dbl, dbh), v_pack(dgl, dgh), v_pack(drl, drh));
}
m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE);
m32[3] = saturate_cast<int>((m[7] + 0.5f)*SCALE);
m32[5] = saturate_cast<int>((m[11] + 0.5f)*SCALE);
for( ; x < len * nChannels; x += nChannels )
{
int v0 = src[x], v1 = src[x+1], v2 = src[x+2];
schar t0 = saturate_cast<schar>((m16.s[0] * v0 + m16.s[1] * v1 + m32[0] * v2 + m32[1]) >> BITS);
schar t1 = saturate_cast<schar>((m16.s[2] * v0 + m16.s[3] * v1 + m32[2] * v2 + m32[3]) >> BITS);
schar t2 = saturate_cast<schar>((m16.s[4] * v0 + m16.s[5] * v1 + m32[4] * v2 + m32[5]) >> BITS);
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
}
vx_cleanup();
return;
}
#endif
transform_(src, dst, m, len, scn, dcn);
}
static void
transform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (scn == 3 && dcn == 3) {
int x = 0;
v_float32 m0 = vx_setall_f32(m[ 0]);
v_float32 m1 = vx_setall_f32(m[ 1]);
v_float32 m2 = vx_setall_f32(m[ 2]);
v_float32 m3 = vx_setall_f32(m[ 3]);
v_float32 m4 = vx_setall_f32(m[ 4]);
v_float32 m5 = vx_setall_f32(m[ 5]);
v_float32 m6 = vx_setall_f32(m[ 6]);
v_float32 m7 = vx_setall_f32(m[ 7]);
v_float32 m8 = vx_setall_f32(m[ 8]);
v_float32 m9 = vx_setall_f32(m[ 9]);
v_float32 m10 = vx_setall_f32(m[10]);
v_float32 m11 = vx_setall_f32(m[11]);
for (; x <= (len - VTraits<v_int16>::vlanes())*3; x += VTraits<v_int16>::vlanes()*3)
{
v_int16 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_int32 bl, bh, gl, gh, rl, rh;
v_expand(b, bl, bh);
v_expand(g, gl, gh);
v_expand(r, rl, rh);
v_int16 db, dg, dr;
db = v_pack(v_round(v_muladd(v_cvt_f32(bl), m0, v_muladd(v_cvt_f32(gl), m1, v_muladd(v_cvt_f32(rl), m2, m3)))),
v_round(v_muladd(v_cvt_f32(bh), m0, v_muladd(v_cvt_f32(gh), m1, v_muladd(v_cvt_f32(rh), m2, m3)))));
dg = v_pack(v_round(v_muladd(v_cvt_f32(bl), m4, v_muladd(v_cvt_f32(gl), m5, v_muladd(v_cvt_f32(rl), m6, m7)))),
v_round(v_muladd(v_cvt_f32(bh), m4, v_muladd(v_cvt_f32(gh), m5, v_muladd(v_cvt_f32(rh), m6, m7)))));
dr = v_pack(v_round(v_muladd(v_cvt_f32(bl), m8, v_muladd(v_cvt_f32(gl), m9, v_muladd(v_cvt_f32(rl), m10, m11)))),
v_round(v_muladd(v_cvt_f32(bh), m8, v_muladd(v_cvt_f32(gh), m9, v_muladd(v_cvt_f32(rh), m10, m11)))));
v_store_interleave(dst + x, db, dg, dr);
}
for( ; x < len * 3; x += 3 )
{
float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
short t0 = saturate_cast<short>(m[0] * v0 + m[1] * v1 + m[ 2] * v2 + m[ 3]);
short t1 = saturate_cast<short>(m[4] * v0 + m[5] * v1 + m[ 6] * v2 + m[ 7]);
short t2 = saturate_cast<short>(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]);
dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2;
}
vx_cleanup();
return;
}
#endif
transform_(src, dst, m, len, scn, dcn);
}
static void
transform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
{
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
if( scn == 3 && dcn == 3) {
int x = 0;
v_float64 m0 = vx_setall_f64(m[ 0]);
v_float64 m1 = vx_setall_f64(m[ 1]);
v_float64 m2 = vx_setall_f64(m[ 2]);
v_float64 m3 = vx_setall_f64(m[ 3]);
v_float64 m4 = vx_setall_f64(m[ 4]);
v_float64 m5 = vx_setall_f64(m[ 5]);
v_float64 m6 = vx_setall_f64(m[ 6]);
v_float64 m7 = vx_setall_f64(m[ 7]);
v_float64 m8 = vx_setall_f64(m[ 8]);
v_float64 m9 = vx_setall_f64(m[ 9]);
v_float64 m10 = vx_setall_f64(m[10]);
v_float64 m11 = vx_setall_f64(m[11]);
for (; x <= (len - VTraits<v_int32>::vlanes()) * 3; x += VTraits<v_int32>::vlanes() * 3) {
v_int32 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_float64 bh = v_cvt_f64_high(b), bl = v_cvt_f64(b);
v_float64 gh = v_cvt_f64_high(g), gl = v_cvt_f64(g);
v_float64 rh = v_cvt_f64_high(r), rl = v_cvt_f64(r);
v_int32 db, dg, dr;
db = v_round(v_fma(bl, m0, v_fma(gl, m1, v_fma(rl, m2, m3))),
v_fma(bh, m0, v_fma(gh, m1, v_fma(rh, m2, m3))));
dg = v_round(v_fma(bl, m4, v_fma(gl, m5, v_fma(rl, m6, m7))),
v_fma(bh, m4, v_fma(gh, m5, v_fma(rh, m6, m7))));
dr = v_round(v_fma(bl, m8, v_fma(gl, m9, v_fma(rl, m10, m11))),
v_fma(bh, m8, v_fma(gh, m9, v_fma(rh, m10, m11))));
v_store_interleave(dst + x, db, dg, dr);
}
for (; x < len * 3; x += 3) {
double b = src[x], g = src[x + 1], r = src[x + 2];
int db = saturate_cast<int>(m[0] * b + m[1] * g + m[ 2] * r + m[ 3]);
int dg = saturate_cast<int>(m[4] * b + m[5] * g + m[ 6] * r + m[ 7]);
int dr = saturate_cast<int>(m[8] * b + m[9] * g + m[10] * r + m[11]);
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
}
vx_cleanup();
return;
}
#endif
transform_(src, dst, m, len, scn, dcn);
}
static void
transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
if( scn == 3 && dcn == 3) {
int x = 0;
v_float64 m0 = vx_setall_f64(m[ 0]);
v_float64 m1 = vx_setall_f64(m[ 1]);
v_float64 m2 = vx_setall_f64(m[ 2]);
v_float64 m3 = vx_setall_f64(m[ 3]);
v_float64 m4 = vx_setall_f64(m[ 4]);
v_float64 m5 = vx_setall_f64(m[ 5]);
v_float64 m6 = vx_setall_f64(m[ 6]);
v_float64 m7 = vx_setall_f64(m[ 7]);
v_float64 m8 = vx_setall_f64(m[ 8]);
v_float64 m9 = vx_setall_f64(m[ 9]);
v_float64 m10 = vx_setall_f64(m[10]);
v_float64 m11 = vx_setall_f64(m[11]);
for (; x <= (len - VTraits<v_float64>::vlanes()) * 3; x += VTraits<v_float64>::vlanes() * 3) {
v_float64 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_float64 db, dg, dr;
db = v_fma(b, m0, v_fma(g, m1, v_fma(r, m2, m3)));
dg = v_fma(b, m4, v_fma(g, m5, v_fma(r, m6, m7)));
dr = v_fma(b, m8, v_fma(g, m9, v_fma(r, m10, m11)));
v_store_interleave(dst + x, db, dg, dr);
}
for (; x < len * 3; x += 3) {
double b = src[x], g = src[x + 1], r = src[x + 2];
double db = saturate_cast<double>(m[0] * b + m[1] * g + m[ 2] * r + m[ 3]);
double dg = saturate_cast<double>(m[4] * b + m[5] * g + m[ 6] * r + m[ 7]);
double dr = saturate_cast<double>(m[8] * b + m[9] * g + m[10] * r + m[11]);
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
}
vx_cleanup();
return;
}
#endif
transform_(src, dst, m, len, scn, dcn);
}
@ -1753,42 +1961,358 @@ diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int )
static void
diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (scn == 3 && dcn == 3) {
int x = 0;
v_float32 m0 = vx_setall_f32(m[ 0]);
v_float32 m3 = vx_setall_f32(m[ 3]);
v_float32 m5 = vx_setall_f32(m[ 5]);
v_float32 m7 = vx_setall_f32(m[ 7]);
v_float32 m10 = vx_setall_f32(m[10]);
v_float32 m11 = vx_setall_f32(m[11]);
for (; x <= (len - VTraits<v_uint8>::vlanes()) * 3; x += VTraits<v_uint8>::vlanes() * 3) {
v_uint8 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_uint16 b_u16_l, g_u16_l, r_u16_l;
v_uint16 b_u16_h, g_u16_h, r_u16_h;
v_expand(b, b_u16_l, b_u16_h);
v_expand(g, g_u16_l, g_u16_h);
v_expand(r, r_u16_l, r_u16_h);
v_uint32 b_u32_l0, g_u32_l0, r_u32_l0;
v_uint32 b_u32_l1, g_u32_l1, r_u32_l1;
v_uint32 b_u32_h0, g_u32_h0, r_u32_h0;
v_uint32 b_u32_h1, g_u32_h1, r_u32_h1;
v_expand(b_u16_l, b_u32_l0, b_u32_l1);
v_expand(b_u16_h, b_u32_h0, b_u32_h1);
v_expand(g_u16_l, g_u32_l0, g_u32_l1);
v_expand(g_u16_h, g_u32_h0, g_u32_h1);
v_expand(r_u16_l, r_u32_l0, r_u32_l1);
v_expand(r_u16_h, r_u32_h0, r_u32_h1);
v_float32 db_f32_l0 = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_l0)), m3);
v_float32 db_f32_l1 = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_l1)), m3);
v_float32 db_f32_h0 = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_h0)), m3);
v_float32 db_f32_h1 = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_h1)), m3);
v_float32 dg_f32_l0 = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_l0)), m7);
v_float32 dg_f32_l1 = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_l1)), m7);
v_float32 dg_f32_h0 = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_h0)), m7);
v_float32 dg_f32_h1 = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_h1)), m7);
v_float32 dr_f32_l0 = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_l0)), m11);
v_float32 dr_f32_l1 = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_l1)), m11);
v_float32 dr_f32_h0 = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_h0)), m11);
v_float32 dr_f32_h1 = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_h1)), m11);
v_store_interleave(dst + x,
v_pack(v_pack_u(v_round(db_f32_l0), v_round(db_f32_l1)), v_pack_u(v_round(db_f32_h0), v_round(db_f32_h1))),
v_pack(v_pack_u(v_round(dg_f32_l0), v_round(dg_f32_l1)), v_pack_u(v_round(dg_f32_h0), v_round(dg_f32_h1))),
v_pack(v_pack_u(v_round(dr_f32_l0), v_round(dr_f32_l1)), v_pack_u(v_round(dr_f32_h0), v_round(dr_f32_h1))));
}
for (; x < len * 3; x += 3) {
int b = src[x], g = src[x + 1], r = src[x + 2];
uchar db = saturate_cast<uchar>(m[ 0] * b + m[ 3]);
uchar dg = saturate_cast<uchar>(m[ 5] * g + m[ 7]);
uchar dr = saturate_cast<uchar>(m[10] * r + m[11]);
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
}
vx_cleanup();
return;
}
#endif
diagtransform_(src, dst, m, len, scn, dcn);
}
static void
diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (scn == 3 && dcn == 3) {
int x = 0;
v_float32 m0 = vx_setall_f32(m[ 0]);
v_float32 m3 = vx_setall_f32(m[ 3]);
v_float32 m5 = vx_setall_f32(m[ 5]);
v_float32 m7 = vx_setall_f32(m[ 7]);
v_float32 m10 = vx_setall_f32(m[10]);
v_float32 m11 = vx_setall_f32(m[11]);
for (; x <= (len - VTraits<v_int8>::vlanes()) * 3; x += VTraits<v_int8>::vlanes() * 3) {
v_int8 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_int16 b_u16_l, g_u16_l, r_u16_l;
v_int16 b_u16_h, g_u16_h, r_u16_h;
v_expand(b, b_u16_l, b_u16_h);
v_expand(g, g_u16_l, g_u16_h);
v_expand(r, r_u16_l, r_u16_h);
v_int32 b_u32_l0, g_u32_l0, r_u32_l0;
v_int32 b_u32_l1, g_u32_l1, r_u32_l1;
v_int32 b_u32_h0, g_u32_h0, r_u32_h0;
v_int32 b_u32_h1, g_u32_h1, r_u32_h1;
v_expand(b_u16_l, b_u32_l0, b_u32_l1);
v_expand(b_u16_h, b_u32_h0, b_u32_h1);
v_expand(g_u16_l, g_u32_l0, g_u32_l1);
v_expand(g_u16_h, g_u32_h0, g_u32_h1);
v_expand(r_u16_l, r_u32_l0, r_u32_l1);
v_expand(r_u16_h, r_u32_h0, r_u32_h1);
v_float32 db_f32_l0 = v_fma( m0, v_cvt_f32(b_u32_l0), m3);
v_float32 db_f32_l1 = v_fma( m0, v_cvt_f32(b_u32_l1), m3);
v_float32 db_f32_h0 = v_fma( m0, v_cvt_f32(b_u32_h0), m3);
v_float32 db_f32_h1 = v_fma( m0, v_cvt_f32(b_u32_h1), m3);
v_float32 dg_f32_l0 = v_fma( m5, v_cvt_f32(g_u32_l0), m7);
v_float32 dg_f32_l1 = v_fma( m5, v_cvt_f32(g_u32_l1), m7);
v_float32 dg_f32_h0 = v_fma( m5, v_cvt_f32(g_u32_h0), m7);
v_float32 dg_f32_h1 = v_fma( m5, v_cvt_f32(g_u32_h1), m7);
v_float32 dr_f32_l0 = v_fma(m10, v_cvt_f32(r_u32_l0), m11);
v_float32 dr_f32_l1 = v_fma(m10, v_cvt_f32(r_u32_l1), m11);
v_float32 dr_f32_h0 = v_fma(m10, v_cvt_f32(r_u32_h0), m11);
v_float32 dr_f32_h1 = v_fma(m10, v_cvt_f32(r_u32_h1), m11);
v_store_interleave(dst + x,
v_pack(v_pack(v_round(db_f32_l0), v_round(db_f32_l1)), v_pack(v_round(db_f32_h0), v_round(db_f32_h1))),
v_pack(v_pack(v_round(dg_f32_l0), v_round(dg_f32_l1)), v_pack(v_round(dg_f32_h0), v_round(dg_f32_h1))),
v_pack(v_pack(v_round(dr_f32_l0), v_round(dr_f32_l1)), v_pack(v_round(dr_f32_h0), v_round(dr_f32_h1))));
}
for (; x < len * 3; x += 3) {
int b = src[x], g = src[x + 1], r = src[x + 2];
schar db = saturate_cast<schar>(m[ 0] * b + m[ 3]);
schar dg = saturate_cast<schar>(m[ 5] * g + m[ 7]);
schar dr = saturate_cast<schar>(m[10] * r + m[11]);
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
}
vx_cleanup();
return;
}
#endif
diagtransform_(src, dst, m, len, scn, dcn);
}
static void
diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (scn == 3 && dcn == 3) {
int x = 0;
v_float32 m0 = vx_setall_f32(m[ 0]);
v_float32 m3 = vx_setall_f32(m[ 3]);
v_float32 m5 = vx_setall_f32(m[ 5]);
v_float32 m7 = vx_setall_f32(m[ 7]);
v_float32 m10 = vx_setall_f32(m[10]);
v_float32 m11 = vx_setall_f32(m[11]);
for (; x <= (len - VTraits<v_uint16>::vlanes()) * 3; x += VTraits<v_uint16>::vlanes() * 3) {
v_uint16 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_uint32 b_u32_l, g_u32_l, r_u32_l;
v_uint32 b_u32_h, g_u32_h, r_u32_h;
v_expand(b, b_u32_l, b_u32_h);
v_expand(g, g_u32_l, g_u32_h);
v_expand(r, r_u32_l, r_u32_h);
v_float32 db_f32_l = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_l)), m3);
v_float32 db_f32_h = v_fma( m0, v_cvt_f32(v_reinterpret_as_s32(b_u32_h)), m3);
v_float32 dg_f32_l = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_l)), m7);
v_float32 dg_f32_h = v_fma( m5, v_cvt_f32(v_reinterpret_as_s32(g_u32_h)), m7);
v_float32 dr_f32_l = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_l)), m11);
v_float32 dr_f32_h = v_fma(m10, v_cvt_f32(v_reinterpret_as_s32(r_u32_h)), m11);
v_store_interleave(dst + x,
v_pack_u(v_round(db_f32_l), v_round(db_f32_h)),
v_pack_u(v_round(dg_f32_l), v_round(dg_f32_h)),
v_pack_u(v_round(dr_f32_l), v_round(dr_f32_h)));
}
for (; x < len * 3; x += 3) {
int b = src[x], g = src[x + 1], r = src[x + 2];
ushort db = saturate_cast<ushort>(m[ 0] * b + m[ 3]);
ushort dg = saturate_cast<ushort>(m[ 5] * g + m[ 7]);
ushort dr = saturate_cast<ushort>(m[10] * r + m[11]);
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
}
vx_cleanup();
return;
}
#endif
diagtransform_(src, dst, m, len, scn, dcn);
}
static void
diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (scn == 3 && dcn == 3) {
int x = 0;
v_float32 m0 = vx_setall_f32(m[ 0]);
v_float32 m3 = vx_setall_f32(m[ 3]);
v_float32 m5 = vx_setall_f32(m[ 5]);
v_float32 m7 = vx_setall_f32(m[ 7]);
v_float32 m10 = vx_setall_f32(m[10]);
v_float32 m11 = vx_setall_f32(m[11]);
for (; x <= (len - VTraits<v_int16>::vlanes()) * 3; x += VTraits<v_int16>::vlanes() * 3) {
v_int16 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_int32 b_u32_l, g_u32_l, r_u32_l;
v_int32 b_u32_h, g_u32_h, r_u32_h;
v_expand(b, b_u32_l, b_u32_h);
v_expand(g, g_u32_l, g_u32_h);
v_expand(r, r_u32_l, r_u32_h);
v_float32 db_f32_l = v_fma( m0, v_cvt_f32(b_u32_l), m3);
v_float32 db_f32_h = v_fma( m0, v_cvt_f32(b_u32_h), m3);
v_float32 dg_f32_l = v_fma( m5, v_cvt_f32(g_u32_l), m7);
v_float32 dg_f32_h = v_fma( m5, v_cvt_f32(g_u32_h), m7);
v_float32 dr_f32_l = v_fma(m10, v_cvt_f32(r_u32_l), m11);
v_float32 dr_f32_h = v_fma(m10, v_cvt_f32(r_u32_h), m11);
v_store_interleave(dst + x,
v_pack(v_round(db_f32_l), v_round(db_f32_h)),
v_pack(v_round(dg_f32_l), v_round(dg_f32_h)),
v_pack(v_round(dr_f32_l), v_round(dr_f32_h)));
}
for (; x < len * 3; x += 3) {
int b = src[x], g = src[x + 1], r = src[x + 2];
short db = saturate_cast<short>(m[ 0] * b + m[ 3]);
short dg = saturate_cast<short>(m[ 5] * g + m[ 7]);
short dr = saturate_cast<short>(m[10] * r + m[11]);
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
}
vx_cleanup();
return;
}
#endif
diagtransform_(src, dst, m, len, scn, dcn);
}
static void
diagtransform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
{
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
if (scn == 3 && dcn == 3) {
int x = 0;
v_float64 m0 = vx_setall_f64(m[ 0]);
v_float64 m3 = vx_setall_f64(m[ 3]);
v_float64 m5 = vx_setall_f64(m[ 5]);
v_float64 m7 = vx_setall_f64(m[ 7]);
v_float64 m10 = vx_setall_f64(m[10]);
v_float64 m11 = vx_setall_f64(m[11]);
for (; x <= (len - VTraits<v_float32>::vlanes()) * 3; x += VTraits<v_float32>::vlanes() * 3) {
v_int32 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_float64 bh = v_cvt_f64_high(b), bl = v_cvt_f64(b);
v_float64 gh = v_cvt_f64_high(g), gl = v_cvt_f64(g);
v_float64 rh = v_cvt_f64_high(r), rl = v_cvt_f64(r);
v_int32 db = v_round(v_fma( m0, bl, m3),
v_fma( m0, bh, m3));
v_int32 dg = v_round(v_fma( m5, gl, m7),
v_fma( m5, gh, m7));
v_int32 dr = v_round(v_fma(m10, rl, m11),
v_fma(m10, rh, m11));
v_store_interleave(dst + x, db, dg, dr);
}
for (; x < len * 3; x += 3) {
int b = src[x], g = src[x + 1], r = src[x + 2];
int db = saturate_cast<int>(m[ 0] * b + m[ 3]);
int dg = saturate_cast<int>(m[ 5] * g + m[ 7]);
int dr = saturate_cast<int>(m[10] * r + m[11]);
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
}
vx_cleanup();
return;
}
#endif
diagtransform_(src, dst, m, len, scn, dcn);
}
static void
diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (scn == 3 && dcn == 3) {
int x = 0;
v_float32 m0 = vx_setall_f32(m[ 0]);
v_float32 m3 = vx_setall_f32(m[ 3]);
v_float32 m5 = vx_setall_f32(m[ 5]);
v_float32 m7 = vx_setall_f32(m[ 7]);
v_float32 m10 = vx_setall_f32(m[10]);
v_float32 m11 = vx_setall_f32(m[11]);
for (; x <= (len - VTraits<v_float32>::vlanes()) * 3; x += VTraits<v_float32>::vlanes() * 3) {
v_float32 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_float32 db = v_fma( m0, b, m3);
v_float32 dg = v_fma( m5, g, m7);
v_float32 dr = v_fma(m10, r, m11);
v_store_interleave(dst + x, db, dg, dr);
}
for (; x < len * 3; x += 3) {
float b = src[x], g = src[x + 1], r = src[x + 2];
float db = saturate_cast<float>(m[ 0] * b + m[ 3]);
float dg = saturate_cast<float>(m[ 5] * g + m[ 7]);
float dr = saturate_cast<float>(m[10] * r + m[11]);
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
}
vx_cleanup();
return;
}
#endif
diagtransform_(src, dst, m, len, scn, dcn);
}
static void
diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
if (scn == 3 && dcn == 3) {
int x = 0;
v_float64 m0 = vx_setall_f64(m[ 0]);
v_float64 m3 = vx_setall_f64(m[ 3]);
v_float64 m5 = vx_setall_f64(m[ 5]);
v_float64 m7 = vx_setall_f64(m[ 7]);
v_float64 m10 = vx_setall_f64(m[10]);
v_float64 m11 = vx_setall_f64(m[11]);
for (; x <= (len - VTraits<v_float64>::vlanes()) * 3; x += VTraits<v_float64>::vlanes() * 3) {
v_float64 b, g, r;
v_load_deinterleave(src + x, b, g, r);
v_float64 db = v_fma( m0, b, m3);
v_float64 dg = v_fma( m5, g, m7);
v_float64 dr = v_fma(m10, r, m11);
v_store_interleave(dst + x, db, dg, dr);
}
for (; x < len * 3; x += 3) {
double b = src[x], g = src[x + 1], r = src[x + 2];
double db = saturate_cast<double>(m[ 0] * b + m[ 3]);
double dg = saturate_cast<double>(m[ 5] * g + m[ 7]);
double dr = saturate_cast<double>(m[10] * r + m[11]);
dst[x] = db; dst[x + 1] = dg; dst[x + 2] = dr;
}
vx_cleanup();
return;
}
#endif
diagtransform_(src, dst, m, len, scn, dcn);
}