Merge pull request #25230 from hanliutong/rvv-conv
Optimize int8 layers in DNN modules by using RISC-V Vector intrinsic. #25230

This patch optimizes 3 functions in the int8 layers by using RVV native intrinsics. It was tested on QEMU with VLEN=128 and VLEN=256 via `./bin/opencv_test_dnn --gtest_filter="*Int8*"`. On a real device (k230, VLEN=128), `EfficientDet_int8` in `opencv_perf_dnn` showed a performance improvement of 1.46x.

| Name of Test                               | Original | Optimized | Speed-up |
| ------------------------------------------ | -------- | --------- | -------- |
| EfficientDet_int8::DNNTestNetwork::OCV/CPU | 2843.467 | 1947.013  | 1.46     |

An illustrative sketch of the RVV dot-product idiom these kernels build on is given after the checklist below.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
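For context (not part of the diff): the new kernels all follow the same RVV idiom of strip-mining the reduction dimension with `vsetvl`, widening int8 products into int32 accumulators, and reducing each accumulator once at the end. A minimal, hypothetical sketch of that pattern, using only the versioned intrinsic API the patch itself requires (`__riscv_v_intrinsic >= 11000`); the function name and structure are illustrative only:

```cpp
#include <riscv_vector.h>
#include <cstdint>

// Illustrative standalone helper, not part of the patch: int8 dot product with a
// strip-mined RVV loop (widen-multiply, widen-accumulate, reduce once at the end).
static int dot_product_i8(const int8_t* a, const int8_t* b, int n)
{
    size_t vlmax32 = __riscv_vsetvlmax_e32m4();
    vint32m4_t acc = __riscv_vmv_v_x_i32m4(0, vlmax32);       // per-lane partial sums
    for (int k = 0; k < n; )
    {
        size_t vl = __riscv_vsetvl_e8m1((size_t)(n - k));      // strip-mining
        vint8m1_t va = __riscv_vle8_v_i8m1(a + k, vl);
        vint8m1_t vb = __riscv_vle8_v_i8m1(b + k, vl);
        vint16m2_t prod = __riscv_vwmul_vv_i16m2(va, vb, vl);  // int8 x int8 -> int16
        // int16 -> int32 while accumulating; the _tu form keeps lanes past vl untouched
        acc = __riscv_vwadd_wv_i32m4_tu(acc, acc, prod, vl);
        k += (int)vl;
    }
    vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, 1);
    return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1(acc, zero, vlmax32));
}
```

The patch applies the same idea with many accumulators live at once (3 output channels x 4 pixels in `fastConv`, 15 output neurons in `fastGEMM1T`) so that each loaded vector is reused several times.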
This commit is contained in:
parent 8c540a56af, commit eba158fb0c
```diff
@@ -5,7 +5,7 @@ endif()
 set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")

 ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
-ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)
+ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON_FP16)
```
```diff
@@ -702,13 +702,14 @@
     bool useAVX2;
     bool useAVX512;
     bool useLASX;
+    bool useRVV;
     int blk_size_cn;
     int inpZp, outZp;
     const std::vector<float>* multiplier;

     ParallelConv()
         : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
-          biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
+          biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false)
           , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
     {}

@@ -765,6 +766,7 @@
         p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;

         p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;
+        p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D;

         int kernel_d = isConv3D? kernel_size[0] : 1;
         int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@@ -970,6 +972,13 @@
                                   biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
             else
         #endif
+        #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+            if(useRVV)
+                opt_RVV::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                    biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
+            else
+        #endif
         #if CV_RVP052
             if(isConv2D)
                 opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w,
@@ -1356,6 +1365,12 @@
                                   outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
             else
         #endif
+        #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+            if(useRVV)
+                opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                                  outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
+            else
+        #endif
         #if CV_RVP052
             if(isConv2D)
                 opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
```
```diff
@@ -228,7 +228,7 @@
     {
     public:
         FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
-                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}
+                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false) {}

         static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
                         const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
@@ -253,6 +253,7 @@
             p.useAVX2 = checkHardwareSupport(CPU_AVX2);
             p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
             p.useLASX = checkHardwareSupport(CPU_LASX);
+            p.useRVV = checkHardwareSupport(CPU_RVV);

             parallel_for_(Range(0, nstripes), p, nstripes);
         }
@@ -303,6 +304,11 @@
                     opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                 else
             #endif
+            #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                if( useRVV)
+                    opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
+                else
+            #endif
             #if CV_RVP052
                 if( 1 )
                     opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
@@ -363,6 +369,7 @@
         bool useAVX2;
         bool useAVX512;
         bool useLASX;
+        bool useRVV;
     };

     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
```
@@ -1257,5 +1257,440 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights,
```cpp
}
#endif // CV_LASX

#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000

static const size_t __cv_rvv_e8m1_max = __riscv_vsetvlmax_e8m1();
static const size_t __cv_rvv_e16m1_max = __riscv_vsetvlmax_e16m1();
static const size_t __cv_rvv_e32m2_max = __riscv_vsetvlmax_e32m2();

inline vint32m2_t __riscv_vwmacc_vv_i32m2(vint32m2_t& dst, const vint8m1_t& a, const vint8m1_t& b, size_t vl) {
    vint16m2_t tmp = __riscv_vwmul(a, b, vl);
    dst = __riscv_vwadd_wv_i32m2_tu(dst, dst, __riscv_vget_i16m1(tmp, 0), vl);
    dst = __riscv_vwadd_wv_i32m2_tu(dst, dst, __riscv_vget_i16m1(tmp, 1), vl > __cv_rvv_e16m1_max ? vl - __cv_rvv_e16m1_max : 0);
    return dst;
}
```
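A note on the helper above: the standard `__riscv_vwmacc_vv_i32m2` intrinsic takes int16 operands (integer widening instructions widen only one step), so the patch adds an int8 overload that widens twice, `vwmul` to int16 followed by two `vwadd.wv` steps into the int32 accumulator, folding the upper half of the products onto the lower accumulator lanes; the tail-undisturbed (`_tu`) form keeps lanes beyond `vl` intact on the final, shorter iteration. Its net contribution to the later reduction can be modelled by this hypothetical scalar reference (an assumption for illustration: the lane layout differs, but the total does not):

```cpp
#include <cstdint>
#include <cstddef>

// Hypothetical reference, not part of the patch: after loading vl int8 values from
// pa/pb into vectors a/b and calling __riscv_vwmacc_vv_i32m2(dst, a, b, vl), the
// sum over all lanes of dst grows by exactly the value returned here.
static inline long long wmacc_step_ref(const int8_t* pa, const int8_t* pb, size_t vl)
{
    long long total = 0;
    for (size_t i = 0; i < vl; ++i)
        total += (int32_t)pa[i] * (int32_t)pb[i];
    return total;
}
```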
```cpp
enum { FASCONV_BASE_VECSZ = 4 };
void fastConv( const int8_t* weights, size_t wstep, const int* bias,
               const int8_t* rowbuf, int* output, const int* outShape,
               int blockSize, int vecsize, int vecsize_aligned, int outZp,
               const float* multiplier, bool initOutput, bool finalOutput )
{
    const size_t e8m1 = __cv_rvv_e8m1_max;
    int outCn = outShape[1];
    size_t outPlaneSize = outShape[2]*outShape[3];
    // now compute dot product of the weights
    // and im2row-transformed part of the tensor
    for( int i = 0; i < outCn; i += 3 )
    {
        int unroll_tail = FASCONV_BASE_VECSZ;
        const int8_t* wptr0 = weights + i*wstep;
        const int8_t* wptr1 = wptr0 + wstep;
        const int8_t* wptr2 = wptr1 + wstep;
        int* outptr0 = output + i*outPlaneSize;
        int* outptr1 = outptr0 + outPlaneSize;
        int* outptr2 = outptr1 + outPlaneSize;
        int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
        float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2];

        if( i+2 >= outCn )
        {
            wptr2 = wptr1;
            outptr2 = outptr1;
            bias2 = bias1;
            mult2 = mult1;
            if( i+1 >= outCn )
            {
                wptr2 = wptr1 = wptr0;
                outptr2 = outptr1 = outptr0;
                bias2 = bias1 = bias0;
                mult2 = mult1 = mult0;
            }
        }

        int j = 0;
        for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
        {
            const int8_t* rptr = rowbuf + j*vecsize_aligned;
            const int8_t *rptr1 = rptr + vecsize_aligned*1,
                         *rptr2 = rptr + vecsize_aligned*2,
                         *rptr3 = rptr + vecsize_aligned*3;

            if (j + FASCONV_BASE_VECSZ > blockSize)
            {
                unroll_tail = blockSize - j;
                rptr1 = rptr + vecsize_aligned*std::min(1, unroll_tail-1);
                rptr2 = rptr + vecsize_aligned*std::min(2, unroll_tail-1);
                rptr3 = rptr + vecsize_aligned*std::min(3, unroll_tail-1);
            }

            int vl, avl = vecsize;

            vint32m2_t
                vs00 = __riscv_vmv_v_x_i32m2(0, e8m1), vs10 = __riscv_vmv_v_x_i32m2(0, e8m1), vs20 = __riscv_vmv_v_x_i32m2(0, e8m1),
                vs01 = __riscv_vmv_v_x_i32m2(0, e8m1), vs11 = __riscv_vmv_v_x_i32m2(0, e8m1), vs21 = __riscv_vmv_v_x_i32m2(0, e8m1),
                vs02 = __riscv_vmv_v_x_i32m2(0, e8m1), vs12 = __riscv_vmv_v_x_i32m2(0, e8m1), vs22 = __riscv_vmv_v_x_i32m2(0, e8m1),
                vs03 = __riscv_vmv_v_x_i32m2(0, e8m1), vs13 = __riscv_vmv_v_x_i32m2(0, e8m1), vs23 = __riscv_vmv_v_x_i32m2(0, e8m1);
            for (int k = 0; k < vecsize; k += vl, avl -= vl)
            {
                vl = __riscv_vsetvl_e8m1(avl);

                vint8m1_t w0 = (__riscv_vle8_v_i8m1(wptr0 + k, vl));
                vint8m1_t w1 = (__riscv_vle8_v_i8m1(wptr1 + k, vl));
                vint8m1_t w2 = (__riscv_vle8_v_i8m1(wptr2 + k, vl));
                vint8m1_t r0 = (__riscv_vle8_v_i8m1(rptr, vl));

                vs00 = __riscv_vwmacc_vv_i32m2(vs00, w0, r0, vl);
                vs10 = __riscv_vwmacc_vv_i32m2(vs10, w1, r0, vl);
                vs20 = __riscv_vwmacc_vv_i32m2(vs20, w2, r0, vl);

                r0 = (__riscv_vle8_v_i8m1(rptr1, vl));
                vs01 = __riscv_vwmacc_vv_i32m2(vs01, w0, r0, vl);
                vs11 = __riscv_vwmacc_vv_i32m2(vs11, w1, r0, vl);
                vs21 = __riscv_vwmacc_vv_i32m2(vs21, w2, r0, vl);

                r0 = (__riscv_vle8_v_i8m1(rptr2, vl));
                vs02 = __riscv_vwmacc_vv_i32m2(vs02, w0, r0, vl);
                vs12 = __riscv_vwmacc_vv_i32m2(vs12, w1, r0, vl);
                vs22 = __riscv_vwmacc_vv_i32m2(vs22, w2, r0, vl);

                r0 = (__riscv_vle8_v_i8m1(rptr3, vl));
                vs03 = __riscv_vwmacc_vv_i32m2(vs03, w0, r0, vl);
                vs13 = __riscv_vwmacc_vv_i32m2(vs13, w1, r0, vl);
                vs23 = __riscv_vwmacc_vv_i32m2(vs23, w2, r0, vl);

                rptr += vl; rptr1 += vl; rptr2 += vl; rptr3 += vl;
            }

            // compute sum of each vs
            vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, e8m1);
            int sum0[FASCONV_BASE_VECSZ], sum1[FASCONV_BASE_VECSZ], sum2[FASCONV_BASE_VECSZ];

            sum0[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs00, zero, e8m1));
            sum0[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs01, zero, e8m1));
            sum0[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs02, zero, e8m1));
            sum0[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs03, zero, e8m1));

            sum1[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs10, zero, e8m1));
            sum1[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs11, zero, e8m1));
            sum1[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs12, zero, e8m1));
            sum1[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs13, zero, e8m1));

            sum2[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs20, zero, e8m1));
            sum2[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs21, zero, e8m1));
            sum2[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs22, zero, e8m1));
            sum2[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs23, zero, e8m1));

            vint32m1_t s0, s1, s2;
            if( initOutput )
            {
                s0 = __riscv_vmv_v_x_i32m1(bias0, unroll_tail);
                s1 = __riscv_vmv_v_x_i32m1(bias1, unroll_tail);
                s2 = __riscv_vmv_v_x_i32m1(bias2, unroll_tail);
            }
            else
            {
                s0 = __riscv_vle32_v_i32m1(outptr0 + j, unroll_tail);
                s1 = __riscv_vle32_v_i32m1(outptr1 + j, unroll_tail);
                s2 = __riscv_vle32_v_i32m1(outptr2 + j, unroll_tail);
            }
            s0 = __riscv_vadd(__riscv_vle32_v_i32m1(sum0, unroll_tail), s0, unroll_tail);
            s1 = __riscv_vadd(__riscv_vle32_v_i32m1(sum1, unroll_tail), s1, unroll_tail);
            s2 = __riscv_vadd(__riscv_vle32_v_i32m1(sum2, unroll_tail), s2, unroll_tail);

            if( finalOutput )
            {
                s0 = __riscv_vadd(__riscv_vfcvt_x_f_v_i32m1(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m1(s0, unroll_tail), mult0, unroll_tail), unroll_tail), outZp, unroll_tail);
                s1 = __riscv_vadd(__riscv_vfcvt_x_f_v_i32m1(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m1(s1, unroll_tail), mult1, unroll_tail), unroll_tail), outZp, unroll_tail);
                s2 = __riscv_vadd(__riscv_vfcvt_x_f_v_i32m1(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m1(s2, unroll_tail), mult2, unroll_tail), unroll_tail), outZp, unroll_tail);

                s0 = __riscv_vmin(__riscv_vmax(s0, -128, unroll_tail), 127, unroll_tail);
                s1 = __riscv_vmin(__riscv_vmax(s1, -128, unroll_tail), 127, unroll_tail);
                s2 = __riscv_vmin(__riscv_vmax(s2, -128, unroll_tail), 127, unroll_tail);
            }

            __riscv_vse32(outptr0 + j, s0, unroll_tail);
            __riscv_vse32(outptr1 + j, s1, unroll_tail);
            __riscv_vse32(outptr2 + j, s2, unroll_tail);
        }
    }
}
```
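When `finalOutput` is set, the int32 accumulators above are requantized: add the bias (or the previously stored partial sum), scale by the per-channel float multiplier, round back to int32, add the output zero point, and saturate to the int8 range (values are kept in int32 buffers here and narrowed later in the layer). A hedged scalar model of the single-pass case (`initOutput && finalOutput`); note the vector path rounds via `vfcvt.x` under the CPU's dynamic rounding mode, so exact .5 ties may differ from `lround`:

```cpp
#include <algorithm>
#include <cmath>

// Illustrative scalar equivalent of the requantization step in fastConv
// (and of the scalar tail of fastDepthwiseConv below); not part of the patch.
static inline int requantize_ref(int acc, int bias, float multiplier, int outZp)
{
    int v = (int)std::lround((acc + bias) * multiplier) + outZp;
    return std::min(std::max(v, -128), 127);   // saturate to the int8 range
}
```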
```cpp
void fastDepthwiseConv( const int8_t* wptr,
                        int kernel_h, int kernel_w,
                        int stride_h, int stride_w,
                        int dilation_h, int dilation_w,
                        int pad_t, int pad_l,
                        const int* biasptr, const float* multptr,
                        const int8_t* inptr_,
                        int height, int width,
                        int* outptr_,
                        int out_d, int outH, int outW,
                        int inpZp, int outZp)
{
    int vl;
    const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
                 w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
                 w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
    int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
    float mult = multptr[out_d];
    int bias = biasptr[out_d];
    int biasCopy;

    for (int out_i = 0; out_i < outH; out_i++)
    {
        int in_i = out_i * stride_h - pad_t, out_j = 0;
        const int8_t* imgptr0 = inptr_ + in_i*width;
        const int8_t* imgptr1 = imgptr0 + dilation_h*width;
        const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
        int8_t w00 = w00_, w01 = w01_, w02 = w02_;
        int8_t w20 = w20_, w21 = w21_, w22 = w22_;
        int out, out1;
        biasCopy = bias;
        if (in_i < 0)
        {
            biasCopy += inpZp * (w00 + w01 + w02);
            w00 = w01 = w02 = 0;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h*(kernel_h-1) >= height)
        {
            biasCopy += inpZp * (w20 + w21 + w22);
            w20 = w21 = w22 = 0;
            imgptr2 = imgptr1;
        }
        int* outptr = outptr_ + out_i*outW;
        if (pad_l > 0)
        {
            out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
                  (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
                  (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
                  biasCopy + inpZp*(w00 + w10 + w20);
            out1 = outZp + (int)std::round(out*mult);
            outptr[0] = std::min(std::max(out1, -128), 127);
            out_j = 1;
        }
        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
        {
            int avl = outW1 - out_j;
            if( stride_w == 1 )
                for( ; out_j < outW1; out_j += vl, avl -= vl)
                {
                    vl = __riscv_vsetvl_e8m2(avl);
                    int in_j = out_j * stride_w - pad_l;

                    vint32m8_t vout = __riscv_vmv_v_x_i32m8(biasCopy, vl);
                    vout = __riscv_vwmacc(vout, w00, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr0 + in_j             , vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w01, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr0 + in_j + dilation_w , vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w02, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr0 + in_j + dilation_w*2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w10, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr1 + in_j             , vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w11, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr1 + in_j + dilation_w , vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w12, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr1 + in_j + dilation_w*2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w20, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr2 + in_j             , vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w21, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr2 + in_j + dilation_w , vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w22, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr2 + in_j + dilation_w*2, vl), vl), vl);

                    vout = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m8(vout, vl), mult, vl), vl);
                    vout = __riscv_vadd(vout, outZp, vl);
                    vout = __riscv_vmin(__riscv_vmax(vout, -128, vl), 127, vl);

                    __riscv_vse32_v_i32m8(outptr + out_j, vout, vl);
                }
            else //stride_w == 2 && dilation_w == 1;
            {
                for( ; out_j < outW1; out_j += vl, avl -= vl)
                {
                    vl = __riscv_vsetvl_e8m2(avl);
                    int in_j = out_j * stride_w - pad_l;

                    vint32m8_t vout = __riscv_vmv_v_x_i32m8(biasCopy, vl);

                    vout = __riscv_vwmacc(vout, w00, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr0+in_j  , 2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w01, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr0+in_j+1, 2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w02, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr0+in_j+2, 2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w10, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr1+in_j  , 2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w11, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr1+in_j+1, 2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w12, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr1+in_j+2, 2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w20, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr2+in_j  , 2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w21, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr2+in_j+1, 2, vl), vl), vl);
                    vout = __riscv_vwmacc(vout, w22, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr2+in_j+2, 2, vl), vl), vl);

                    vout = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m8(vout, vl), mult, vl), vl);
                    vout = __riscv_vadd(vout, outZp, vl);
                    vout = __riscv_vmin(__riscv_vmax(vout, -128, vl), 127, vl);

                    __riscv_vse32_v_i32m8(outptr + out_j, vout, vl);
                }
            }
        }

        for (; out_j < outW1; out_j++)
        {
            int in_j = out_j * stride_w - pad_l;
            out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
                  (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
                  (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
            outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
        }

        for (; out_j < outW; out_j++ )
        {
            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
            int s0 = 1, s1 = 1, s2 = 1;
            if (in_j0 >= width)
            {
                in_j0 = 0;
                s0 = 0;
                biasCopy += inpZp*(w00 + w10 + w20);
            }
            if (in_j1 >= width)
            {
                in_j1 = 0;
                s1 = 0;
                biasCopy += inpZp*(w01 + w11 + w21);
            }
            if (in_j2 >= width)
            {
                in_j2 = 0;
                s2 = 0;
                biasCopy += inpZp*(w02 + w12 + w22);
            }
            out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
                  (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
                  (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
            outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
        }
    }
}
```
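In the `stride_w == 2 && dilation_w == 1` branch above, the unit-stride loads are replaced with strided loads (`vlse8`), so a single vector load gathers the input sample needed by every second output column. An illustrative, hypothetical helper showing just that access pattern:

```cpp
#include <riscv_vector.h>
#include <cstdint>

// Illustrative only, not part of the patch: with a byte stride of 2, lane i of the
// result holds base[2*i], i.e. the pixel under output column out_j + i when stride_w == 2.
static inline vint8m2_t load_every_other_i8(const int8_t* base, size_t vl)
{
    return __riscv_vlse8_v_i8m2(base, /* byte stride */ 2, vl);
}
```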
```cpp
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
                 size_t wstep, const int* bias, const float* multiplier,
                 int* dst, int nvecs, int vecsize, int outZp )
{
    int i = 0;
    for( ; i <= nvecs - 15; i += 15 )
    {
        const int8_t* wptr = weights + i*wstep;
        vint32m2_t
            vs0 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs1 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs2 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs3 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs4 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs5 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs6 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs7 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs8 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs9 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs10 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs11 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs12 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs13 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs14 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max);
        int avl = vecsize, vl;
        for(int k = 0 ; k < vecsize; k += vl, wptr += vl, avl -= vl)
        {
            vl = __riscv_vsetvl_e8m1(avl);
            vint8m1_t v = __riscv_vle8_v_i8m1(vec + k, vl);

            vs0 = __riscv_vwmacc_vv_i32m2(vs0, __riscv_vle8_v_i8m1(wptr, vl), v, vl);
            vs1 = __riscv_vwmacc_vv_i32m2(vs1, __riscv_vle8_v_i8m1(wptr + wstep, vl), v, vl);
            vs2 = __riscv_vwmacc_vv_i32m2(vs2, __riscv_vle8_v_i8m1(wptr + wstep*2, vl), v, vl);
            vs3 = __riscv_vwmacc_vv_i32m2(vs3, __riscv_vle8_v_i8m1(wptr + wstep*3, vl), v, vl);
            vs4 = __riscv_vwmacc_vv_i32m2(vs4, __riscv_vle8_v_i8m1(wptr + wstep*4, vl), v, vl);
            vs5 = __riscv_vwmacc_vv_i32m2(vs5, __riscv_vle8_v_i8m1(wptr + wstep*5, vl), v, vl);
            vs6 = __riscv_vwmacc_vv_i32m2(vs6, __riscv_vle8_v_i8m1(wptr + wstep*6, vl), v, vl);
            vs7 = __riscv_vwmacc_vv_i32m2(vs7, __riscv_vle8_v_i8m1(wptr + wstep*7, vl), v, vl);
            vs8 = __riscv_vwmacc_vv_i32m2(vs8, __riscv_vle8_v_i8m1(wptr + wstep*8, vl), v, vl);
            vs9 = __riscv_vwmacc_vv_i32m2(vs9, __riscv_vle8_v_i8m1(wptr + wstep*9, vl), v, vl);
            vs10 = __riscv_vwmacc_vv_i32m2(vs10, __riscv_vle8_v_i8m1(wptr + wstep*10, vl), v, vl);
            vs11 = __riscv_vwmacc_vv_i32m2(vs11, __riscv_vle8_v_i8m1(wptr + wstep*11, vl), v, vl);
            vs12 = __riscv_vwmacc_vv_i32m2(vs12, __riscv_vle8_v_i8m1(wptr + wstep*12, vl), v, vl);
            vs13 = __riscv_vwmacc_vv_i32m2(vs13, __riscv_vle8_v_i8m1(wptr + wstep*13, vl), v, vl);
            vs14 = __riscv_vwmacc_vv_i32m2(vs14, __riscv_vle8_v_i8m1(wptr + wstep*14, vl), v, vl);
        }

        int sum[15];
        vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, __cv_rvv_e32m2_max);
        sum[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs0, zero, __cv_rvv_e32m2_max));
        sum[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs1, zero, __cv_rvv_e32m2_max));
        sum[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs2, zero, __cv_rvv_e32m2_max));
        sum[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs3, zero, __cv_rvv_e32m2_max));
        sum[4] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs4, zero, __cv_rvv_e32m2_max));
        sum[5] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs5, zero, __cv_rvv_e32m2_max));
        sum[6] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs6, zero, __cv_rvv_e32m2_max));
        sum[7] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs7, zero, __cv_rvv_e32m2_max));
        sum[8] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs8, zero, __cv_rvv_e32m2_max));
        sum[9] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs9, zero, __cv_rvv_e32m2_max));
        sum[10] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs10, zero, __cv_rvv_e32m2_max));
        sum[11] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs11, zero, __cv_rvv_e32m2_max));
        sum[12] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs12, zero, __cv_rvv_e32m2_max));
        sum[13] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs13, zero, __cv_rvv_e32m2_max));
        sum[14] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs14, zero, __cv_rvv_e32m2_max));

        vint32m4_t s0 = __riscv_vadd(__riscv_vle32_v_i32m4(sum, 15), __riscv_vle32_v_i32m4(bias + i, 15), 15);

        s0 = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m4(s0, 15), __riscv_vle32_v_f32m4(multiplier + i, 15), 15), 15);
        s0 = __riscv_vadd(s0, outZp, 15);
        s0 = __riscv_vmin(__riscv_vmax(s0, -128, 15), 127, 15);
        __riscv_vse32_v_i32m4(dst + i, s0, 15);
    }
    int unroll_tail = nvecs - i;
    if (unroll_tail > 0)
    {
        const int8_t* wptr = weights + i*wstep;
        vint32m2_t
            vs0 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs1 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs2 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs3 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs4 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs5 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs6 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs7 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs8 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs9 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs10 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs11 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
            vs12 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs13 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max);
        int avl = vecsize, vl;
        for(int k = 0 ; k < vecsize; k += vl, wptr += vl, avl -= vl)
        {
            vl = __riscv_vsetvl_e8m1(avl);
            vint8m1_t v = __riscv_vle8_v_i8m1(vec + k, vl);

            vs0 = __riscv_vwmacc_vv_i32m2(vs0, __riscv_vle8_v_i8m1(wptr, vl), v, vl);
            vs1 = __riscv_vwmacc_vv_i32m2(vs1, __riscv_vle8_v_i8m1(wptr + wstep*std::min(1, unroll_tail-1), vl), v, vl);
            vs2 = __riscv_vwmacc_vv_i32m2(vs2, __riscv_vle8_v_i8m1(wptr + wstep*std::min(2, unroll_tail-1), vl), v, vl);
            vs3 = __riscv_vwmacc_vv_i32m2(vs3, __riscv_vle8_v_i8m1(wptr + wstep*std::min(3, unroll_tail-1), vl), v, vl);
            vs4 = __riscv_vwmacc_vv_i32m2(vs4, __riscv_vle8_v_i8m1(wptr + wstep*std::min(4, unroll_tail-1), vl), v, vl);
            vs5 = __riscv_vwmacc_vv_i32m2(vs5, __riscv_vle8_v_i8m1(wptr + wstep*std::min(5, unroll_tail-1), vl), v, vl);
            vs6 = __riscv_vwmacc_vv_i32m2(vs6, __riscv_vle8_v_i8m1(wptr + wstep*std::min(6, unroll_tail-1), vl), v, vl);
            vs7 = __riscv_vwmacc_vv_i32m2(vs7, __riscv_vle8_v_i8m1(wptr + wstep*std::min(7, unroll_tail-1), vl), v, vl);
            vs8 = __riscv_vwmacc_vv_i32m2(vs8, __riscv_vle8_v_i8m1(wptr + wstep*std::min(8, unroll_tail-1), vl), v, vl);
            vs9 = __riscv_vwmacc_vv_i32m2(vs9, __riscv_vle8_v_i8m1(wptr + wstep*std::min(9, unroll_tail-1), vl), v, vl);
            vs10 = __riscv_vwmacc_vv_i32m2(vs10, __riscv_vle8_v_i8m1(wptr + wstep*std::min(10, unroll_tail-1), vl), v, vl);
            vs11 = __riscv_vwmacc_vv_i32m2(vs11, __riscv_vle8_v_i8m1(wptr + wstep*std::min(11, unroll_tail-1), vl), v, vl);
            vs13 = __riscv_vwmacc_vv_i32m2(vs13, __riscv_vle8_v_i8m1(wptr + wstep*std::min(12, unroll_tail-1), vl), v, vl);
            vs12 = __riscv_vwmacc_vv_i32m2(vs12, __riscv_vle8_v_i8m1(wptr + wstep*std::min(13, unroll_tail-1), vl), v, vl);
        }

        int sum[14];
        vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, __cv_rvv_e32m2_max);
        sum[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs0, zero, __cv_rvv_e32m2_max));
        sum[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs1, zero, __cv_rvv_e32m2_max));
        sum[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs2, zero, __cv_rvv_e32m2_max));
        sum[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs3, zero, __cv_rvv_e32m2_max));
        sum[4] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs4, zero, __cv_rvv_e32m2_max));
        sum[5] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs5, zero, __cv_rvv_e32m2_max));
        sum[6] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs6, zero, __cv_rvv_e32m2_max));
        sum[7] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs7, zero, __cv_rvv_e32m2_max));
        sum[8] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs8, zero, __cv_rvv_e32m2_max));
        sum[9] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs9, zero, __cv_rvv_e32m2_max));
        sum[10] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs10, zero, __cv_rvv_e32m2_max));
        sum[11] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs11, zero, __cv_rvv_e32m2_max));
        sum[12] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs12, zero, __cv_rvv_e32m2_max));
        sum[13] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs13, zero, __cv_rvv_e32m2_max));

        vint32m4_t s0 = __riscv_vadd(__riscv_vle32_v_i32m4(sum, unroll_tail), __riscv_vle32_v_i32m4(bias + i, unroll_tail), unroll_tail);

        s0 = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m4(s0, unroll_tail), __riscv_vle32_v_f32m4(multiplier + i, unroll_tail), unroll_tail), unroll_tail);
        s0 = __riscv_vadd(s0, outZp, unroll_tail);
        s0 = __riscv_vmin(__riscv_vmax(s0, -128, unroll_tail), 127, unroll_tail);
        __riscv_vse32_v_i32m4(dst + i, s0, unroll_tail);
    }
}

#endif // CV_RVV

CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace
```
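A closing observation on `fastGEMM1T` above: the 15-way unroll looks like a register-budget choice rather than an arbitrary constant, since each LMUL=2 accumulator occupies two of RVV's 32 architectural vector registers. This rationale is our reading, not something stated in the patch:

```cpp
// Register-budget arithmetic behind the 15-way unroll (assumption, not stated in the patch):
// 15 accumulators of type vint32m2_t occupy 30 vector registers, and the shared input
// vector plus the currently loaded weight row (both vint8m1_t) take the remaining 2.
static_assert(15 * 2 + 1 + 1 == 32,
              "15 LMUL=2 accumulators plus 2 LMUL=1 temporaries fill the 32-entry vector register file");
```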