opencv/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp
2024-04-07 11:34:41 +08:00

540 lines
23 KiB
C++

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "opencv2/core/hal/intrin.hpp"
namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void fastDepthwiseConv(const float* weights,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* bias, const float* relu,
const float* inptr,
int height, int width,
float* outptr,
int out_d, int outH, int outW);
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
__m256 t0 = _mm256_loadu_ps(ptr);
__m256 t1 = _mm256_loadu_ps(ptr + 8);
__m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16);
__m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16);
a = _mm256_shuffle_ps(lo, hi, 0x88);
b = _mm256_shuffle_ps(lo, hi, 0xdd);
}
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 8;
__m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02),
vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12),
vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22);
__m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff);
if( stride_w == 1 )
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00 = _mm256_loadu_ps(imgptr0 + in_j),
v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w),
v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2),
v10 = _mm256_loadu_ps(imgptr1 + in_j),
v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w),
v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2),
v20 = _mm256_loadu_ps(imgptr2 + in_j),
v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w),
v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2);
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
__m256 vout1 = _mm256_mul_ps(v01, vw01);
__m256 vout2 = _mm256_mul_ps(v02, vw02);
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
if (relu)
{
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
}
_mm256_storeu_ps(outptr + out_j, vout0);
}
else
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
_mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
_mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
_mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
__m256 vout1 = _mm256_mul_ps(v01, vw01);
__m256 vout2 = _mm256_mul_ps(v02, vw02);
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
if (relu)
{
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
}
_mm256_storeu_ps(outptr + out_j, vout0);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
_mm256_zeroupper();
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
int vl;
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
int avl = outW1 - out_j;
if( stride_w == 1 )
for( ; out_j < outW1; out_j += vl, avl -= vl)
{
vl = vsetvl_e32m8(avl);
int in_j = out_j * stride_w - pad_l;
vfloat32m8_t vout0 = vfmacc_vf_f32m8(vfmv_v_f_f32m8(bias, vl), w00, vle32_v_f32m8(imgptr0 + in_j, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w01, vle32_v_f32m8(imgptr0 + in_j + dilation_w, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w02, vle32_v_f32m8(imgptr0 + in_j + dilation_w*2, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w10, vle32_v_f32m8(imgptr1 + in_j, vl),vl);
vout0 = vfmacc_vf_f32m8(vout0, w11, vle32_v_f32m8(imgptr1 + in_j + dilation_w, vl),vl);
vout0 = vfmacc_vf_f32m8(vout0, w12, vle32_v_f32m8(imgptr1 + in_j + dilation_w*2, vl),vl);
vout0 = vfmacc_vf_f32m8(vout0, w20, vle32_v_f32m8(imgptr2 + in_j, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w21, vle32_v_f32m8(imgptr2 + in_j + dilation_w, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w22, vle32_v_f32m8(imgptr2 + in_j + dilation_w*2, vl), vl);
if (relu)
{
vbool4_t m = vmfgt_vf_f32m8_b4(vout0, 0, vl);
vout0 = vmerge_vvm_f32m8(m, vfmul_vf_f32m8(vout0, relu_coeff, vl), vout0, vl);
}
vse32_v_f32m8(outptr + out_j, vout0, vl);
}
else //stride_w == 2 && dilation_w == 1
for( ; out_j < outW1; out_j += vl, avl -= vl)
{
vl = vsetvl_e32m2(avl);
int in_j = out_j * stride_w - pad_l;
vfloat32m2_t vout0 = vfmacc_vf_f32m2(vfmv_v_f_f32m2(bias, vl), w00, vlse32_v_f32m2(imgptr0+in_j , 8, vl), vl);
vfloat32m2_t vout1 = vfmul_vf_f32m2(vlse32_v_f32m2(imgptr0+in_j+1, 8, vl), w01, vl);
vfloat32m2_t vout2 = vfmul_vf_f32m2(vlse32_v_f32m2(imgptr0+in_j+2, 8, vl), w02, vl);
vout0 = vfmacc_vf_f32m2(vout0, w10, vlse32_v_f32m2(imgptr1+in_j , 8, vl), vl);
vout1 = vfmacc_vf_f32m2(vout1, w11, vlse32_v_f32m2(imgptr1+in_j+1, 8, vl), vl);
vout2 = vfmacc_vf_f32m2(vout2, w12, vlse32_v_f32m2(imgptr1+in_j+2, 8, vl), vl);
vout0 = vfmacc_vf_f32m2(vout0, w20, vlse32_v_f32m2(imgptr2+in_j , 8, vl), vl);
vout1 = vfmacc_vf_f32m2(vout1, w21, vlse32_v_f32m2(imgptr2+in_j+1, 8, vl), vl);
vout2 = vfmacc_vf_f32m2(vout2, w22, vlse32_v_f32m2(imgptr2+in_j+2, 8, vl), vl);
vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
if (relu)
{
vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
}
vse32_v_f32m2(outptr + out_j, vout0, vl);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
#endif // CV_RVV
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
__m256 t0 = (__m256)__lasx_xvld(ptr, 0);
__m256 t1 = (__m256)__lasx_xvld(ptr, 8*4);
__m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16);
__m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16);
a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88);
b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd);
}
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 8;
__m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02),
vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12),
vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22);
__m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00),
vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff);
if( stride_w == 1 )
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0),
v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0),
v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0),
v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0),
v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0),
v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0),
v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0),
v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0),
v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
else
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_v256_load_deinterleave(imgptr0 + in_j, v00, v01);
_v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_v256_load_deinterleave(imgptr1 + in_j, v10, v11);
_v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_v256_load_deinterleave(imgptr2 + in_j, v20, v21);
_v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
if (relu)
{
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
}
__lasx_xvst(vout0, outptr + out_j, 0);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
#endif // CV_LASX
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace