Merge pull request #24556 from plctlab:rvp

Optimization based on RISC-V P Packed SIMD Extension v0.5.2
commit 99c86bb40c: Alexander Smorkalov, 2024-01-16 11:36:31 +03:00 (committed by GitHub)
6 changed files with 287 additions and 0 deletions
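
The new kernels below lean heavily on the RISC-V P-extension SMAQA operation ("signed multiply-accumulate, quad 8-bit"), reached through the Andes __nds__v_smaqa intrinsic. As a reading aid only, the following sketch models the intrinsic's behaviour as inferred from its use in the new layers_rvp052.cpp; it is not part of the PR:

// Hypothetical scalar model of __nds__v_smaqa (illustration, not PR code):
// each 32-bit lane of the accumulator gains the sum of four int8*int8 products.
#include <cstdint>

struct int32x2_ref { int32_t v[2]; };

static int32x2_ref smaqa_ref(int32x2_ref acc, const int8_t a[8], const int8_t b[8])
{
    for (int lane = 0; lane < 2; lane++)
        for (int k = 0; k < 4; k++)
            acc.v[lane] += (int32_t)a[lane * 4 + k] * (int32_t)b[lane * 4 + k];
    return acc;
}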


@@ -969,6 +969,13 @@ public:
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
#if CV_RVP052
if(isConv2D)
opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
{
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -1348,6 +1355,12 @@ public:
opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
#if CV_RVP052
if(isConv2D)
opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
for( int i = 0; i < outCn; i += 2 )
{


@@ -302,6 +302,11 @@ public:
if( useLASX )
opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
#if CV_RVP052
if( 1 )
opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
{
int i = 0;


@@ -13,6 +13,8 @@
#include "int8layers/layers_common.simd_declarations.hpp"
#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#include "./layers_rvp052.hpp"
#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/ocl4dnn.hpp"
#endif


@@ -0,0 +1,221 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "./layers_rvp052.hpp"
#if CV_RVP052
namespace cv {
namespace dnn {
namespace opt_RVP052 {
void fastConv(const int8_t *weights, size_t wstep, const int *bias,
const int8_t *rowbuf, int *output, const int *outShape,
int blockSize, int vecsize, int vecsize_aligned, int outZp,
const float *multiplier, bool initOutput, bool finalOutput)
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2] * outShape[3];
for (int i = 0; i < outCn; i += 2)
{
const int8_t *wptr0 = weights + i * wstep;
const int8_t *wptr1 = wptr0 + wstep;
int *outptr0 = output + i * outPlaneSize;
int *outptr1 = outptr0 + outPlaneSize;
int bias0 = bias[i], bias1 = bias[i + 1];
float mult0 = multiplier[i], mult1 = multiplier[i + 1];
if (i + 1 >= outCn)
{
wptr1 = wptr0;
outptr1 = outptr0;
bias1 = bias0;
mult1 = mult0;
}
int j = 0;
for (; j < blockSize; j++)
{
const int8_t *rptr = rowbuf + j * vecsize_aligned;
int s00 = initOutput ? bias0 : outptr0[j];
int s10 = initOutput ? bias1 : outptr1[j];
int32x2_t vsx0 = {s00, s10};
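// lane 0 of vsx0 accumulates output channel i, lane 1 channel i+1; each step below adds four int8 products per lane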
for (int k = 0; k < vecsize; k += 4)
{
int8x4_t vrptr[2] = {*(int8x4_t*)(rptr + k), *(int8x4_t*)(rptr + k)};
int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
vsx0 = __nds__v_smaqa(vsx0, *(int8x8_t*)vwptr, *(int8x8_t*)vrptr);
}
if (finalOutput)
{
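// requantize: scale the int32 accumulators, add the output zero point, then saturate to the int8 range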
vsx0[0] = outZp + (int)std::round(vsx0[0] * mult0);
vsx0[1] = outZp + (int)std::round(vsx0[1] * mult1);
vsx0 = __nds__v_sclip32(vsx0, 7);
}
outptr0[j] = vsx0[0];
outptr1[j] = vsx0[1];
}
}
}
void fastDepthwiseConv(const int8_t *wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const int *biasptr, const float *multptr,
const int8_t *inptr_,
int height, int width,
int *outptr_,
int out_d, int outH, int outW,
int inpZp, int outZp)
{
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l) / stride_w);
int bias = biasptr[out_d], biasCopy;
float mult = multptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const int8_t *imgptr0 = inptr_ + in_i * width;
const int8_t *imgptr1 = imgptr0 + dilation_h * width;
const int8_t *imgptr2 = imgptr0 + (dilation_h * 2) * width;
int8_t w00 = w00_, w01 = w01_, w02 = w02_;
int8_t w20 = w20_, w21 = w21_, w22 = w22_;
int out;
biasCopy = bias;
if (in_i < 0)
{
biasCopy += inpZp * (w00 + w01 + w02);
w00 = w01 = w02 = 0;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h * (kernel_h - 1) >= height)
{
biasCopy += inpZp * (w20 + w21 + w22);
w20 = w21 = w22 = 0;
imgptr2 = imgptr1;
}
int *outptr = outptr_ + out_i * outW;
if (pad_l > 0)
{
out = (int)imgptr0[0] * w01 + (int)imgptr0[dilation_w] * w02 +
(int)imgptr1[0] * w11 + (int)imgptr1[dilation_w] * w12 +
(int)imgptr2[0] * w21 + (int)imgptr2[dilation_w] * w22 +
biasCopy + inpZp * (w00 + w10 + w20);
outptr[0] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
out_j = 1;
}
int8x8_t vwx0 = (int8x8_t){w00, w10, w20, 0, w00, w10, w20, 0};
int8x8_t vwx1 = (int8x8_t){w01, w11, w21, 0, w01, w11, w21, 0};
int8x8_t vwx2 = (int8x8_t){w02, w12, w22, 0, w02, w12, w22, 0};
int8x8_t vimgx0, vimgx1, vimgx2;
int32x2_t vout = {0, 0};
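// two output columns per iteration: each 32-bit lane of vout gathers the nine 3x3 taps for one column (the fourth byte of every quad is zero padding)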
for (; out_j < outW1; out_j+=2)
{
int in_j = out_j * stride_w - pad_l;
vimgx0 = (int8x8_t){imgptr0[in_j], imgptr1[in_j], imgptr2[in_j], 0,
imgptr0[in_j + stride_w], imgptr1[in_j + stride_w], imgptr2[in_j + stride_w], 0};
vimgx1 = (int8x8_t){imgptr0[in_j + dilation_w], imgptr1[in_j + dilation_w], imgptr2[in_j + dilation_w], 0,
imgptr0[in_j + dilation_w + stride_w], imgptr1[in_j + dilation_w + stride_w], imgptr2[in_j + dilation_w + stride_w], 0};
vimgx2 = (int8x8_t){imgptr0[in_j + dilation_w * 2], imgptr1[in_j + dilation_w * 2], imgptr2[in_j + dilation_w * 2], 0,
imgptr0[in_j + dilation_w * 2 + stride_w], imgptr1[in_j + dilation_w * 2 + stride_w], imgptr2[in_j + dilation_w * 2 + stride_w], 0};
vout = (int32x2_t){biasCopy, biasCopy};
vout = __nds__v_smaqa(vout, vwx0, vimgx0);
vout = __nds__v_smaqa(vout, vwx1, vimgx1);
vout = __nds__v_smaqa(vout, vwx2, vimgx2);
outptr[out_j] = __nds__sclip32(outZp + (int)std::round(vout[0] * mult), 7);
outptr[out_j + 1] = __nds__sclip32(outZp + (int)std::round(vout[1] * mult), 7);
}
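// the paired loop can overshoot outW1 by one column; step back so the scalar tail below redoes the border columns with explicit bounds checks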
while (out_j > outW1) out_j--;
for (; out_j < outW; out_j++)
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w * 2;
int s0 = 1, s1 = 1, s2 = 1;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0;
biasCopy += inpZp * (w00 + w10 + w20);
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0;
biasCopy += inpZp * (w01 + w11 + w21);
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0;
biasCopy += inpZp * (w02 + w12 + w22);
}
out = (int)imgptr0[in_j0] * w00 * s0 + (int)imgptr0[in_j1] * w01 * s1 + (int)imgptr0[in_j2] * w02 * s2 +
(int)imgptr1[in_j0] * w10 * s0 + (int)imgptr1[in_j1] * w11 * s1 + (int)imgptr1[in_j2] * w12 * s2 +
(int)imgptr2[in_j0] * w20 * s0 + (int)imgptr2[in_j1] * w21 * s1 + (int)imgptr2[in_j2] * w22 * s2 + biasCopy;
outptr[out_j] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
}
}
}
// dst = vec * weights^t + bias
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
size_t wstep, const int* bias, const float* multiplier,
int* dst, int nvecs, int vecsize, int outZp )
{
int i = 0;
for( ; i <= nvecs - 2; i += 2 )
{
const int8_t* wptr0 = weights + i * wstep;
const int8_t* wptr1 = weights + (i + 1) * wstep;
int32x2_t vs0 = *(int32x2_t*)(bias + i);
for( int k = 0; k < vecsize; k += 4 )
{
int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), *(int8x4_t*)(vec + k)};
int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
vs0 = __nds__v_smaqa(vs0, *(int8x8_t*)vwptr, *(int8x8_t*)vvec);
}
int32x2_t vdst = {(int)std::round(vs0[0] * multiplier[i]), (int)std::round(vs0[1] * multiplier[i + 1])};
vdst = __nds__v_sclip32(vdst + outZp, 7);
*(int32x2_t*)(dst + i) = vdst;
}
for( ; i < nvecs; i++ )
{
const int8_t* wptr = weights + i * wstep;
int s0 = bias[i];
for( int k = 0; k < vecsize; k += 4 )
{
int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), 0};
int8x4_t vwptr[2] = {*(int8x4_t*)(wptr + k), 0};
s0 = __nds__smaqa(s0, *(unsigned long*)vwptr, *(unsigned long*)vvec);
}
dst[i] = __nds__sclip32(outZp + (int)std::round(s0 * multiplier[i]), 7);
}
}
}}} // namespace
#endif
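
To make the quantization math used throughout these kernels explicit, here is a minimal scalar reference of the contract fastGEMM1T implements (the same requantize-and-saturate step also ends fastConv and fastDepthwiseConv). It is an illustrative sketch, not code from this PR, and assumes vecsize covers the whole row with no alignment padding:

// Reference semantics (illustration only):
// dst[i] = sclip32(outZp + round((bias[i] + dot(vec, row_i(weights))) * multiplier[i]), 7)
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

static void fastGEMM1T_ref(const int8_t* vec, const int8_t* weights,
                           size_t wstep, const int* bias, const float* multiplier,
                           int* dst, int nvecs, int vecsize, int outZp)
{
    for (int i = 0; i < nvecs; i++)
    {
        const int8_t* wrow = weights + i * wstep;
        int acc = bias[i];
        for (int k = 0; k < vecsize; k++)
            acc += (int)vec[k] * (int)wrow[k];                  // int8 dot product
        int q = outZp + (int)std::round(acc * multiplier[i]);   // requantize to the output scale
        dst[i] = std::min(std::max(q, -128), 127);              // __nds__sclip32(q, 7): saturate to int8 range
    }
}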


@@ -0,0 +1,36 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
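// CV_RVP052 enables the hand-written int8 kernels only for Andes RISC-V builds that provide the DSP (P extension) intrinsics via <nds_intrinsic.h>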
#if defined(__riscv) && defined(__riscv_dsp) && defined(__ANDES)
# include <nds_intrinsic.h>
# define CV_RVP052 1
namespace cv {
namespace dnn {
namespace opt_RVP052 {
void fastConv( const int8_t* weights, size_t wstep, const int* bias,
const int8_t* rowbuf, int* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, int outZp,
const float* multiplier, bool initOutput, bool finalOutput );
void fastDepthwiseConv( const int8_t* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const int* biasptr, const float* multptr,
const int8_t* inptr_,
int height, int width,
int* outptr_,
int out_d, int outH, int outW,
int inpZp, int outZp );
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
size_t wstep, const int* bias, const float* multiplier,
int* dst, int nvecs, int vecsize, int outZp );
}}}
#else
# define CV_RVP052 0
#endif


@@ -0,0 +1,10 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR riscv64)
set(RISCV_GCC_INSTALL_ROOT $ENV{RISCV} CACHE PATH "Path to GCC for RISC-V cross compiler installation directory")
set(CMAKE_C_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-gcc)
set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-g++)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gc -mext-dsp")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc -mext-dsp")