// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "opencv2/core/hal/intrin.hpp"

namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

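// fastDepthwiseConv computes one channel (out_d) of a 3x3 depthwise convolution.
// `weights` points at the 9 filter taps for this channel, `inptr` at the matching
// input plane (height x width) and `outptr` at the output plane (outH x outW).
// Conceptually each output element is
//
//     out(i, j) = bias[out_d] + sum over (di, dj) in [0,3)x[0,3) of
//                 w(di, dj) * in(i*stride_h - pad_t + di*dilation_h,
//                                j*stride_w - pad_l + dj*dilation_w)
//
// with taps that fall outside the input treated as zero. If `relu` is non-NULL,
// the result is additionally passed through a leaky ReLU with per-channel
// negative slope relu[out_d]. A call site looks roughly like this (sketch only;
// names and the surrounding buffer layout are illustrative, not part of this file):
//
//     fastDepthwiseConv(weights + out_d*9, 3, 3, stride_h, stride_w,
//                       dilation_h, dilation_w, pad_t, pad_l,
//                       bias, reluSlopes, inputPlane, H, W,
//                       outputPlane, out_d, outH, outW);
//
// The ISA-specific definitions below (AVX, RVV, LASX) all implement this same contract.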
void fastDepthwiseConv(const float* weights,
                       int kernel_h, int kernel_w,
                       int stride_h, int stride_w,
                       int dilation_h, int dilation_w,
                       int pad_t, int pad_l,
                       const float* bias, const float* relu,
                       const float* inptr,
                       int height, int width,
                       float* outptr,
                       int out_d, int outH, int outW);

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX

#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif

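// Loads 16 consecutive floats starting at `ptr` and splits them into the
// even-indexed elements (returned in `a`) and the odd-indexed elements
// (returned in `b`), i.e. a = {p0, p2, ..., p14}, b = {p1, p3, ..., p15}.
// The stride_w == 2 path below uses this to gather every other input column
// with plain unaligned loads.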
static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
    __m256 t0 = _mm256_loadu_ps(ptr);
    __m256 t1 = _mm256_loadu_ps(ptr + 8);

    __m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16);
    __m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16);
    a = _mm256_shuffle_ps(lo, hi, 0x88);
    b = _mm256_shuffle_ps(lo, hi, 0xdd);
}

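// AVX version of the 3x3 depthwise convolution. Each iteration of the outer
// loop produces one output row in four stages:
//   1) if pad_l > 0, the first output column is computed in scalar code with
//      the leftmost kernel column dropped;
//   2) the fully-in-bounds columns [out_j, outW1) are processed 8 at a time
//      (only for stride_w == 1, or stride_w == 2 with dilation_w == 1);
//   3) a scalar loop finishes whatever part of [0, outW1) the vector loop
//      did not cover;
//   4) a final scalar loop handles columns whose right-hand taps fall outside
//      the input, masking those taps via the s0/s1/s2 factors.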
void fastDepthwiseConv( const float* wptr,
                        int kernel_h, int kernel_w,
                        int stride_h, int stride_w,
                        int dilation_h, int dilation_w,
                        int pad_t, int pad_l,
                        const float* biasptr, const float* relu,
                        const float* inptr_,
                        int height, int width,
                        float* outptr_,
                        int out_d, int outH, int outW )
{
    const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
                w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
                w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
    int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
    float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];

    for (int out_i = 0; out_i < outH; out_i++)
    {
        int in_i = out_i * stride_h - pad_t, out_j = 0;
        const float* imgptr0 = inptr_ + in_i*width;
        const float* imgptr1 = imgptr0 + dilation_h*width;
        const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
        float out, w00 = w00_, w01 = w01_, w02 = w02_;
        float w20 = w20_, w21 = w21_, w22 = w22_;
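        // Vertical border handling: if the top or bottom kernel row falls
        // outside the image, zero that row's weights and alias its row
        // pointer to a valid row so the loads below stay in bounds.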
        if (in_i < 0)
        {
            w00 = w01 = w02 = 0.f;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h*(kernel_h-1) >= height)
        {
            w20 = w21 = w22 = 0.f;
            imgptr2 = imgptr1;
        }
        float* outptr = outptr_ + out_i*outW;
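        // Left border: with left padding the very first output column would
        // read to the left of the image, so compute it here without the
        // leftmost kernel column and start the main loops at out_j = 1.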
        if (pad_l > 0)
        {
            out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
                  imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
                  imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[0] = out;
            out_j = 1;
        }

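        // Vectorized main part of the row: 8 outputs per iteration. When the
        // last full vector would run past outW1, out_j is pulled back to
        // outW1 - VECSZ, recomputing a few already-stored outputs so that the
        // final 8-wide loads and store stay within the valid range.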
        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
        {
            const int VECSZ = 8;
            __m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02),
                   vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12),
                   vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22);
            __m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff);

            if( stride_w == 1 )
                for( ; out_j < outW1; out_j += VECSZ )
                {
                    if (out_j + VECSZ > outW1 && out_j > pad_l)
                        out_j = outW1 - VECSZ;
                    int in_j = out_j * stride_w - pad_l;
                    __m256 v00 = _mm256_loadu_ps(imgptr0 + in_j),
                           v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w),
                           v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2),
                           v10 = _mm256_loadu_ps(imgptr1 + in_j),
                           v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w),
                           v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2),
                           v20 = _mm256_loadu_ps(imgptr2 + in_j),
                           v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w),
                           v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2);

                    __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
                    __m256 vout1 = _mm256_mul_ps(v01, vw01);
                    __m256 vout2 = _mm256_mul_ps(v02, vw02);

                    vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
                    vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
                    vout2 = _mm256_fmadd_ps(v12, vw12, vout2);

                    vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
                    vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
                    vout2 = _mm256_fmadd_ps(v22, vw22, vout2);

                    vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
                    if (relu)
                    {
                        __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
                        vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
                    }
                    _mm256_storeu_ps(outptr + out_j, vout0);
                }
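            // stride_w == 2 && dilation_w == 1: the deinterleaving loads above
            // gather the even-indexed input columns needed for each of the
            // three horizontal taps.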
            else
                for( ; out_j < outW1; out_j += VECSZ )
                {
                    if (out_j + VECSZ > outW1 && out_j > pad_l)
                        out_j = outW1 - VECSZ;
                    int in_j = out_j * stride_w - pad_l;
                    __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
                    _mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
                    _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
                    _mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
                    _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
                    _mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
                    _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);

                    __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
                    __m256 vout1 = _mm256_mul_ps(v01, vw01);
                    __m256 vout2 = _mm256_mul_ps(v02, vw02);

                    vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
                    vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
                    vout2 = _mm256_fmadd_ps(v12, vw12, vout2);

                    vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
                    vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
                    vout2 = _mm256_fmadd_ps(v22, vw22, vout2);

                    vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
                    if (relu)
                    {
                        __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
                        vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
                    }
                    _mm256_storeu_ps(outptr + out_j, vout0);
                }
        }

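        // Scalar tail over the remaining fully-in-bounds columns (this also
        // covers all of [out_j, outW1) when the vector path above was skipped).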
        for (; out_j < outW1; out_j++)
        {
            int in_j = out_j * stride_w - pad_l;
            out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
                  imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
                  imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }

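        // Right border: for the remaining columns some taps read past the end
        // of the row; s0/s1/s2 zero out those contributions while the indices
        // are clamped to 0 to keep the loads in bounds.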
        for (; out_j < outW; out_j++ )
        {
            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
            float s0 = 1.f, s1 = 1.f, s2 = 1.f;
            if (in_j0 >= width)
            {
                in_j0 = 0;
                s0 = 0.f;
            }
            if (in_j1 >= width)
            {
                in_j1 = 0;
                s1 = 0.f;
            }
            if (in_j2 >= width)
            {
                in_j2 = 0;
                s2 = 0.f;
            }
            out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
                  imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
                  imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }
    }
    _mm256_zeroupper();
}

#endif // CV_AVX

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV

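// RVV version of the same kernel. Instead of a fixed 8-wide vector with an
// overlapping tail, the loops are strip-mined with vsetvl: LMUL=8 unit-stride
// loads for stride_w == 1, and LMUL=2 strided loads (vlse32 with a byte stride
// of 8, i.e. every other float) for the stride_w == 2, dilation_w == 1 case.
// Row and border handling mirrors the AVX implementation above.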
void fastDepthwiseConv( const float* wptr,
                        int kernel_h, int kernel_w,
                        int stride_h, int stride_w,
                        int dilation_h, int dilation_w,
                        int pad_t, int pad_l,
                        const float* biasptr, const float* relu,
                        const float* inptr_,
                        int height, int width,
                        float* outptr_,
                        int out_d, int outH, int outW )
{
    int vl;
    const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
                w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
                w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
    int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
    float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];

    for (int out_i = 0; out_i < outH; out_i++)
    {
        int in_i = out_i * stride_h - pad_t, out_j = 0;
        const float* imgptr0 = inptr_ + in_i*width;
        const float* imgptr1 = imgptr0 + dilation_h*width;
        const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
        float out, w00 = w00_, w01 = w01_, w02 = w02_;
        float w20 = w20_, w21 = w21_, w22 = w22_;
        if (in_i < 0)
        {
            w00 = w01 = w02 = 0.f;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h*(kernel_h-1) >= height)
        {
            w20 = w21 = w22 = 0.f;
            imgptr2 = imgptr1;
        }
        float* outptr = outptr_ + out_i*outW;
        if (pad_l > 0)
        {
            out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
                  imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
                  imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[0] = out;
            out_j = 1;
        }

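        // Strip-mined vector loops: avl counts the outputs still to produce
        // and vsetvl picks the largest legal vl each pass, so the vector code
        // covers [out_j, outW1) exactly, with no overlap and no scalar remainder.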
        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
        {
            int avl = outW1 - out_j;
            if( stride_w == 1 )
                for( ; out_j < outW1; out_j += vl, avl -= vl)
                {
                    vl = vsetvl_e32m8(avl);
                    int in_j = out_j * stride_w - pad_l;
                    vfloat32m8_t vout0 = vfmacc_vf_f32m8(vfmv_v_f_f32m8(bias, vl), w00, vle32_v_f32m8(imgptr0 + in_j, vl), vl);
                    vout0 = vfmacc_vf_f32m8(vout0, w01, vle32_v_f32m8(imgptr0 + in_j + dilation_w, vl), vl);
                    vout0 = vfmacc_vf_f32m8(vout0, w02, vle32_v_f32m8(imgptr0 + in_j + dilation_w*2, vl), vl);
                    vout0 = vfmacc_vf_f32m8(vout0, w10, vle32_v_f32m8(imgptr1 + in_j, vl), vl);
                    vout0 = vfmacc_vf_f32m8(vout0, w11, vle32_v_f32m8(imgptr1 + in_j + dilation_w, vl), vl);
                    vout0 = vfmacc_vf_f32m8(vout0, w12, vle32_v_f32m8(imgptr1 + in_j + dilation_w*2, vl), vl);
                    vout0 = vfmacc_vf_f32m8(vout0, w20, vle32_v_f32m8(imgptr2 + in_j, vl), vl);
                    vout0 = vfmacc_vf_f32m8(vout0, w21, vle32_v_f32m8(imgptr2 + in_j + dilation_w, vl), vl);
                    vout0 = vfmacc_vf_f32m8(vout0, w22, vle32_v_f32m8(imgptr2 + in_j + dilation_w*2, vl), vl);
                    if (relu)
                    {
                        vbool4_t m = vmfgt_vf_f32m8_b4(vout0, 0, vl);
                        vout0 = vmerge_vvm_f32m8(m, vfmul_vf_f32m8(vout0, relu_coeff, vl), vout0, vl);
                    }
                    vse32_v_f32m8(outptr + out_j, vout0, vl);
                }
            else //stride_w == 2 && dilation_w == 1
                for( ; out_j < outW1; out_j += vl, avl -= vl)
                {
                    vl = vsetvl_e32m2(avl);
                    int in_j = out_j * stride_w - pad_l;
                    vfloat32m2_t vout0 = vfmacc_vf_f32m2(vfmv_v_f_f32m2(bias, vl), w00, vlse32_v_f32m2(imgptr0+in_j  , 8, vl), vl);
                    vfloat32m2_t vout1 = vfmul_vf_f32m2(vlse32_v_f32m2(imgptr0+in_j+1, 8, vl), w01, vl);
                    vfloat32m2_t vout2 = vfmul_vf_f32m2(vlse32_v_f32m2(imgptr0+in_j+2, 8, vl), w02, vl);

                    vout0 = vfmacc_vf_f32m2(vout0, w10, vlse32_v_f32m2(imgptr1+in_j  , 8, vl), vl);
                    vout1 = vfmacc_vf_f32m2(vout1, w11, vlse32_v_f32m2(imgptr1+in_j+1, 8, vl), vl);
                    vout2 = vfmacc_vf_f32m2(vout2, w12, vlse32_v_f32m2(imgptr1+in_j+2, 8, vl), vl);

                    vout0 = vfmacc_vf_f32m2(vout0, w20, vlse32_v_f32m2(imgptr2+in_j  , 8, vl), vl);
                    vout1 = vfmacc_vf_f32m2(vout1, w21, vlse32_v_f32m2(imgptr2+in_j+1, 8, vl), vl);
                    vout2 = vfmacc_vf_f32m2(vout2, w22, vlse32_v_f32m2(imgptr2+in_j+2, 8, vl), vl);

                    vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
                    if (relu)
                    {
                        vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
                        vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
                    }
                    vse32_v_f32m2(outptr + out_j, vout0, vl);
                }
        }

        for (; out_j < outW1; out_j++)
        {
            int in_j = out_j * stride_w - pad_l;
            out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
                  imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
                  imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }

        for (; out_j < outW; out_j++ )
        {
            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
            float s0 = 1.f, s1 = 1.f, s2 = 1.f;
            if (in_j0 >= width)
            {
                in_j0 = 0;
                s0 = 0.f;
            }
            if (in_j1 >= width)
            {
                in_j1 = 0;
                s1 = 0.f;
            }
            if (in_j2 >= width)
            {
                in_j2 = 0;
                s2 = 0.f;
            }
            out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
                  imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
                  imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }
    }
}

#endif // CV_RVV

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX

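// LASX counterpart of _mm256_load_deinterleave above: reads 16 consecutive
// floats and returns the even-indexed elements in `a` and the odd-indexed
// ones in `b`, using xvpermi.q/xvpermi.w in place of the x86 shuffles.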
static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
    __m256 t0 = (__m256)__lasx_xvld(ptr, 0);
    __m256 t1 = (__m256)__lasx_xvld(ptr, 8*4);

    __m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16);
    __m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16);

    a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88);
    b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd);
}

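// LASX version of the kernel, a direct port of the AVX implementation above:
// the same row/border handling and the same four-stage processing of each
// output row, with __lasx_xvfmadd_s/__lasx_xvfadd_s replacing the AVX FMA and
// add, and __lasx_xvbitsel_v used for the leaky-ReLU select. The zero vector
// `z` is produced by XOR-ing a register with itself.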
void fastDepthwiseConv( const float* wptr,
                        int kernel_h, int kernel_w,
                        int stride_h, int stride_w,
                        int dilation_h, int dilation_w,
                        int pad_t, int pad_l,
                        const float* biasptr, const float* relu,
                        const float* inptr_,
                        int height, int width,
                        float* outptr_,
                        int out_d, int outH, int outW )
{
    const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
                w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
                w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
    int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
    float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];

    for (int out_i = 0; out_i < outH; out_i++)
    {
        int in_i = out_i * stride_h - pad_t, out_j = 0;
        const float* imgptr0 = inptr_ + in_i*width;
        const float* imgptr1 = imgptr0 + dilation_h*width;
        const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
        float out, w00 = w00_, w01 = w01_, w02 = w02_;
        float w20 = w20_, w21 = w21_, w22 = w22_;
        if (in_i < 0)
        {
            w00 = w01 = w02 = 0.f;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h*(kernel_h-1) >= height)
        {
            w20 = w21 = w22 = 0.f;
            imgptr2 = imgptr1;
        }
        float* outptr = outptr_ + out_i*outW;
        if (pad_l > 0)
        {
            out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
                  imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
                  imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[0] = out;
            out_j = 1;
        }

        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
        {
            const int VECSZ = 8;
            __m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02),
                   vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12),
                   vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22);
            __m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00),
                   vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff);

            if( stride_w == 1 )
                for( ; out_j < outW1; out_j += VECSZ )
                {
                    if (out_j + VECSZ > outW1 && out_j > pad_l)
                        out_j = outW1 - VECSZ;
                    int in_j = out_j * stride_w - pad_l;
                    __m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0),
                           v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0),
                           v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0),
                           v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0),
                           v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0),
                           v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0),
                           v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0),
                           v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0),
                           v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0);

                    __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
                    __m256 vout1 = __lasx_xvfmul_s(v01, vw01);
                    __m256 vout2 = __lasx_xvfmul_s(v02, vw02);

                    vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
                    vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
                    vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);

                    vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
                    vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
                    vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);

                    vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
                    if (relu)
                    {
                        __m256i m = __lasx_xvfcmp_clt_s(z, vout0);
                        vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
                    }
                    __lasx_xvst(vout0, outptr + out_j, 0);
                }
            else
                for( ; out_j < outW1; out_j += VECSZ )
                {
                    if (out_j + VECSZ > outW1 && out_j > pad_l)
                        out_j = outW1 - VECSZ;
                    int in_j = out_j * stride_w - pad_l;
                    __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
                    _v256_load_deinterleave(imgptr0 + in_j, v00, v01);
                    _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
                    _v256_load_deinterleave(imgptr1 + in_j, v10, v11);
                    _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
                    _v256_load_deinterleave(imgptr2 + in_j, v20, v21);
                    _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);

                    __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
                    __m256 vout1 = __lasx_xvfmul_s(v01, vw01);
                    __m256 vout2 = __lasx_xvfmul_s(v02, vw02);

                    vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
                    vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
                    vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);

                    vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
                    vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
                    vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);

                    vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
                    if (relu)
                    {
                        __m256i m = __lasx_xvfcmp_clt_s(z, vout0);
                        vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
                    }
                    __lasx_xvst(vout0, outptr + out_j, 0);
                }
        }

        for (; out_j < outW1; out_j++)
        {
            int in_j = out_j * stride_w - pad_l;
            out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
                  imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
                  imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }

        for (; out_j < outW; out_j++ )
        {
            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
            float s0 = 1.f, s1 = 1.f, s2 = 1.f;
            if (in_j0 >= width)
            {
                in_j0 = 0;
                s0 = 0.f;
            }
            if (in_j1 >= width)
            {
                in_j1 = 0;
                s1 = 0.f;
            }
            if (in_j2 >= width)
            {
                in_j2 = 0;
                s2 = 0.f;
            }
            out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
                  imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
                  imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
            if (relu)
                out = out > 0.f ? out : out*relu_coeff;
            outptr[out_j] = out;
        }
    }
}

#endif // CV_LASX

CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace