Add Loongson Advanced SIMD Extension support: -DCPU_BASELINE=LASX

* Add Loongson Advanced SIMD Extension support: -DCPU_BASELINE=LASX
* Add resize.lasx.cpp for Loongson SIMD acceleration
* Add imgwarp.lasx.cpp for Loongson SIMD acceleration
* Add LASX acceleration support for dnn/conv
* Add CV_PAUSE(v) for LoongArch
* Set LASX by default on LoongArch64
* LoongArch: tune test threshold for Core/HAL.mat_decomp/15

Co-authored-by: shengwenxue <shengwenxue@loongson.cn>

parent 866191478f
commit 4154bd0667
@@ -50,6 +50,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)

ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@@ -380,6 +381,12 @@ elseif(RISCV)
  set(CPU_DISPATCH "RVV" CACHE STRING "${HELP_CPU_DISPATCH}")
  set(CPU_BASELINE "RVV" CACHE STRING "${HELP_CPU_BASELINE}")

elseif(LOONGARCH64)
  ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp")
  ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX")
  ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
  set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")

endif()

# Helper values for cmake-gui
@@ -100,6 +100,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
  set(MIPS 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv.*|RISCV.*)")
  set(RISCV 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64.*|LOONGARCH64.*)")
  set(LOONGARCH64 1)
else()
  if(NOT OPENCV_SUPPRESS_MESSAGE_UNRECOGNIZED_SYSTEM_PROCESSOR)
    message(WARNING "OpenCV: unrecognized target processor configuration")
cmake/checks/cpu_lasx.cpp (new file, 23 lines)
@@ -0,0 +1,23 @@
#include <stdio.h>

#if defined(__loongarch_asx)
#  include <lasxintrin.h>
#  define CV_LASX 1
#endif

#if defined CV_LASX
int test()
{
    const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
    v8f32 val = (v8f32)__lasx_xvld((const float*)(src), 0);
    return __lasx_xvpickve2gr_w(__lasx_xvftint_w_s(val), 7);
}
#else
#error "LASX is not supported"
#endif

int main()
{
    printf("%d\n", test());
    return 0;
}
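For readers unfamiliar with the LASX intrinsics, the configure-time check above loads eight floats into a 256-bit register, converts the lanes to 32-bit integers, and returns lane 7, so a successful compile-and-run prints 7. The scalar sketch below models that behaviour; it is illustrative only and assumes the usual round-to-integer semantics of __lasx_xvftint_w_s and per-lane extraction by __lasx_xvpickve2gr_w.

// Scalar model of the cpu_lasx.cpp check (sketch, not part of the patch).
#include <cstdio>

static int test_scalar()
{
    const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
    int lanes[8];
    for (int i = 0; i < 8; i++)
        lanes[i] = (int)(src[i] + 0.5f);   // round each float lane to int32
    return lanes[7];                       // pick lane 7 -> expected output: 7
}

int main() { std::printf("%d\n", test_scalar()); return 0; }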
@@ -172,6 +172,11 @@
#  define CV_MSA 1
#endif

#ifdef CV_CPU_COMPILE_LASX
#  include <lasxintrin.h>
#  define CV_LASX 1
#endif

#ifdef __EMSCRIPTEN__
#  define CV_WASM_SIMD 1
#  include <wasm_simd128.h>
@@ -370,3 +375,7 @@ struct VZeroUpperGuard {
#ifndef CV_RVV
#  define CV_RVV 0
#endif

#ifndef CV_LASX
#  define CV_LASX 0
#endif
@@ -525,5 +525,26 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...)  CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))

#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX
#  define CV_TRY_LASX 1
#  define CV_CPU_FORCE_LASX 1
#  define CV_CPU_HAS_SUPPORT_LASX 1
#  define CV_CPU_CALL_LASX(fn, args) return (cpu_baseline::fn args)
#  define CV_CPU_CALL_LASX_(fn, args) return (opt_LASX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LASX
#  define CV_TRY_LASX 1
#  define CV_CPU_FORCE_LASX 0
#  define CV_CPU_HAS_SUPPORT_LASX (cv::checkHardwareSupport(CV_CPU_LASX))
#  define CV_CPU_CALL_LASX(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
#  define CV_CPU_CALL_LASX_(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
#else
#  define CV_TRY_LASX 0
#  define CV_CPU_FORCE_LASX 0
#  define CV_CPU_HAS_SUPPORT_LASX 0
#  define CV_CPU_CALL_LASX(fn, args)
#  define CV_CPU_CALL_LASX_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_LASX(fn, args, mode, ...)  CV_CPU_CALL_LASX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))

#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
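These macros feed OpenCV's per-ISA dispatch chain: when LASX is the baseline, CV_CPU_CALL_LASX resolves straight to the cpu_baseline:: variant; when it is a dispatched target, the call is guarded by a runtime checkHardwareSupport test. A hedged sketch of how a dispatched function is typically wired up follows (the function name my_filter and file layout are illustrative; the generated *.simd_declarations.hpp include boilerplate is omitted):

// my_filter.simd.hpp -- compiled once per requested ISA (baseline, LASX, ...)
namespace cv { namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void my_filter(const float* src, float* dst, int n);   // declaration seen by all variants
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
void my_filter(const float* src, float* dst, int n)
{
    // wide (universal-intrinsics) implementation goes here
}
#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
}}

// my_filter.dispatch.cpp -- expands into the CV_CPU_CALL_* chain defined above,
// so an LASX-capable CPU ends up in opt_LASX::my_filter().
void my_filter(const float* src, float* dst, int n)
{
    CV_CPU_DISPATCH(my_filter, (src, dst, n), CV_CPU_DISPATCH_MODES_ALL);
}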
@@ -279,6 +279,8 @@ namespace cv {

#define CV_CPU_RVV              210

#define CV_CPU_LASX             230

// CPU features groups
#define CV_CPU_AVX512_SKX       256
#define CV_CPU_AVX512_COMMON    257
@@ -336,6 +338,8 @@ enum CpuFeatures {

    CPU_RVV             = 210,

    CPU_LASX            = 230,

    CPU_AVX512_SKX      = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
    CPU_AVX512_COMMON   = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
    CPU_AVX512_KNL      = 258, //!< Knights Landing with AVX-512F/CD/ER/PF
@@ -231,8 +231,16 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;

#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE)
#include "opencv2/core/hal/intrin_rvv.hpp"

#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"

#elif CV_LASX
#if !defined(CV_FORCE_SIMD128_CPP)
#define CV_FORCE_SIMD128_CPP 1
#endif
#include "opencv2/core/hal/intrin_cpp.hpp"

#else

#include "opencv2/core/hal/intrin_cpp.hpp"
@@ -267,6 +275,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;

#endif

#if CV_LASX

#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_lasx.hpp"

#endif

//! @cond IGNORED

namespace cv {
modules/core/include/opencv2/core/hal/intrin_lasx.hpp (new file, 3236 lines; diff suppressed because it is too large)
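intrin_lasx.hpp implements OpenCV's 256-bit universal intrinsics on top of LASX, so existing wide-vector code paths pick the new backend up without modification. A minimal sketch of the kind of code it accelerates is shown below; it is illustrative only and uses the classic fixed-size universal-intrinsic API (v_float32x8, v256_load) as available when CV_SIMD256 is enabled.

#include "opencv2/core/hal/intrin.hpp"

// Adds two float arrays; on a LoongArch64 build with -DCPU_BASELINE=LASX the
// v_float32x8 path maps onto 256-bit LASX registers.
void add_arrays(const float* a, const float* b, float* c, int n)
{
    int i = 0;
#if CV_SIMD256
    for (; i <= n - 8; i += 8)
    {
        cv::v_float32x8 va = cv::v256_load(a + i);
        cv::v_float32x8 vb = cv::v256_load(b + i);
        cv::v_store(c + i, va + vb);
    }
#endif
    for (; i < n; i++)      // scalar tail
        c[i] = a[i] + b[i];
}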
@@ -59,6 +59,8 @@ DECLARE_CV_PAUSE
//      https://github.com/riscv/riscv-isa-manual/issues/43
// #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause"); } } while (0)
#   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0)
# elif defined __GNUC__ && defined __loongarch__
#   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("nop"); } } while (0)
# else
#   warning "Can't detect 'pause' (CPU-yield) instruction on the target platform. Specify CV_PAUSE() definition via compiler flags."
#   define CV_PAUSE(...) do { /* no-op: works, but not effective */ } while (0)
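CV_PAUSE(v) is used by OpenCV's thread pool to back off inside busy-wait loops; LoongArch has no dedicated pause instruction here, so the patch falls back to a counted nop loop. A hedged sketch of the usage pattern (the flag and back-off values are illustrative, not taken from parallel_impl.cpp):

#include <atomic>

// Typical spin-wait: poll a flag, yielding the core a little more each round.
void wait_for_work(const std::atomic<bool>& has_work)
{
    int delay = 16;                 // illustrative starting back-off
    while (!has_work.load(std::memory_order_acquire))
    {
        CV_PAUSE(delay);            // expands to a counted "nop" loop on LoongArch
        if (delay < 1024)
            delay *= 2;             // exponential back-off, capped
    }
}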
@@ -434,6 +434,8 @@ struct HWFeatures
        g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL";

        g_hwFeatureNames[CPU_RVV] = "RVV";

        g_hwFeatureNames[CPU_LASX] = "LASX";
    }

    void initialize(void)
@@ -689,6 +691,10 @@ struct HWFeatures
        have[CV_CPU_RVV] = true;
    #endif

    #if defined __loongarch_asx
        have[CV_CPU_LASX] = true;
    #endif

        bool skip_baseline_check = false;
    #ifndef NO_GETENV
        if (getenv("OPENCV_SKIP_CPU_BASELINE_CHECK"))
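With the feature registered here (detected at build time via __loongarch_asx), applications can query it at run time through the public API. A small sketch:

#include <opencv2/core/utility.hpp>
#include <cstdio>

int main()
{
    // CV_CPU_LASX is the constant added to cvdef.h above (value 230).
    if (cv::checkHardwareSupport(CV_CPU_LASX))
        std::printf("LASX is available: %s\n",
                    cv::getHardwareFeatureName(CV_CPU_LASX).c_str());
    else
        std::printf("LASX not reported by this CPU/build\n");
    return 0;
}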
@@ -136,7 +136,11 @@ TEST_P(HAL, mat_decomp)
    int size = (hcase / 2) % 4;
    size = size == 0 ? 3 : size == 1 ? 4 : size == 2 ? 6 : 15;
    int nfunc = (hcase / 8);
#if CV_LASX
    double eps = depth == CV_32F ? 1e-5 : 2e-10;
#else
    double eps = depth == CV_32F ? 1e-5 : 1e-10;
#endif

    if( size == 3 )
        return; // TODO ???
@@ -8,8 +8,8 @@ endif()

set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")

-ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV)
-ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
+ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)

ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)
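ocv_add_dispatched_file_force_all compiles layers_common.simd.hpp once per listed ISA, producing opt_AVX2::, opt_LASX::, etc. namespaces that the layer code selects between at run time, as the hunks below show. A condensed sketch of that selection pattern (the guard and function names follow the real code below; the bodies and argument lists are abbreviated):

// Inside a dnn layer's parallel body (condensed from the conv hunks below).
#if CV_TRY_AVX2
    if (useAVX2)
        opt_AVX2::fastConv(/* weights, row buffer, output, ... */);
    else
#endif
#if CV_TRY_LASX
    if (useLASX)                       // set via checkHardwareSupport(CPU_LASX)
        opt_LASX::fastConv(/* same arguments */);
    else
#endif
    {
        // portable scalar fallback
    }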
@@ -579,13 +579,14 @@ public:
    bool is1x1_;
    bool useAVX2;
    bool useAVX512;
    bool useLASX;
    int blk_size_cn;
    int inpZp, outZp;
    const std::vector<float>* multiplier;

    ParallelConv()
        : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
-         biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false)
+         biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
          , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
    {}

@@ -641,6 +642,8 @@ public:
        p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
        p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;

        p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;

        int kernel_d = isConv3D? kernel_size[0] : 1;
        int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
        int kernel_w = kernel_size.back();
@@ -837,6 +840,13 @@ public:
                        stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
                        biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
                else
            #endif
            #if CV_TRY_LASX
                if(useLASX)
                    opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
                        stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
                        biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
                else
            #endif
                {
                    const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -1210,6 +1220,12 @@ public:
                        opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                                           outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
                    else
                #endif
                #if CV_TRY_LASX
                    if(useLASX)
                        opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                                           outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
                    else
                #endif
                    for( int i = 0; i < outCn; i += 2 )
                    {
@@ -226,7 +226,7 @@ public:
    {
    public:
        FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
-                          dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false) {}
+                          dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}

        static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
                        const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
@@ -250,6 +250,7 @@ public:
            p.activ = !activationLUT.empty() ? activ : 0;
            p.useAVX2 = checkHardwareSupport(CPU_AVX2);
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
            p.useLASX = checkHardwareSupport(CPU_LASX);

            parallel_for_(Range(0, nstripes), p, nstripes);
        }
@@ -294,6 +295,11 @@ public:
                if( useAVX2 )
                    opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                else
            #endif
            #if CV_TRY_LASX
                if( useLASX )
                    opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                else
            #endif
                {
                    int i = 0;
@@ -349,6 +355,7 @@ public:
        int nstripes, outZp;
        bool useAVX2;
        bool useAVX512;
        bool useLASX;
    };

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
@@ -633,5 +633,629 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights,
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY


#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX

inline __m256i _v256_fmadds8_s32(const __m256i& a, const __m256i& b, const __m256i& c)
{
    __m256i vzero = __lasx_xvreplgr2vr_d(0);
    __m256i even_ab = __lasx_xvmaddwev_h_b(vzero, a, b);
    __m256i madd_ab = __lasx_xvmaddwod_h_b(even_ab, a, b);

    __m256i even_madd_ab = __lasx_xvsrai_w(__lasx_xvslli_w(madd_ab, 16), 16);
    __m256i odd_madd_ab  = __lasx_xvsrai_w(madd_ab, 16);

    return __lasx_xvadd_w(__lasx_xvadd_w(even_madd_ab, odd_madd_ab), c);
}
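_v256_fmadds8_s32 is the int8 dot-product building block used throughout this file: it multiplies 32 signed-byte pairs, sums them in groups of four, and accumulates into eight int32 lanes. A scalar sketch of the intended semantics follows (illustrative, derived by reading the intrinsic sequence above rather than vendor documentation, and ignoring that the 16-bit intermediate accumulation in the vector code can wrap for extreme inputs):

// Scalar model of _v256_fmadds8_s32: each of the 8 int32 output lanes
// accumulates the dot product of 4 adjacent int8 pairs, plus the
// corresponding lane of c.
static void fmadds8_s32_scalar(const int8_t a[32], const int8_t b[32],
                               const int32_t c[8], int32_t out[8])
{
    for (int lane = 0; lane < 8; lane++)
    {
        int32_t acc = c[lane];
        for (int k = 0; k < 4; k++)
            acc += (int32_t)a[lane*4 + k] * (int32_t)b[lane*4 + k];
        out[lane] = acc;
    }
}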
enum { FASCONV_BASE_VECSZ = 4 };
|
||||
|
||||
void fastConv( const int8_t* weights, size_t wstep, const int* bias,
|
||||
const int8_t* rowbuf, int* output, const int* outShape,
|
||||
int blockSize, int vecsize, int vecsize_aligned, int outZp,
|
||||
const float* multiplier, bool initOutput, bool finalOutput )
|
||||
{
|
||||
int outCn = outShape[1];
|
||||
size_t outPlaneSize = outShape[2]*outShape[3];
|
||||
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
|
||||
int rsz = blockSize % FASCONV_BASE_VECSZ;
|
||||
for( int i = 0; i < rsz; i++ )
|
||||
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
|
||||
__m128i mask = __lsx_vld((const float*)maskbuf, 0);
|
||||
|
||||
// now compute dot product of the weights
|
||||
// and im2row-transformed part of the tensor
|
||||
for( int i = 0; i < outCn; i += 3 )
|
||||
{
|
||||
const int8_t* wptr0 = weights + i*wstep;
|
||||
const int8_t* wptr1 = wptr0 + wstep;
|
||||
const int8_t* wptr2 = wptr1 + wstep;
|
||||
int* outptr0 = output + i*outPlaneSize;
|
||||
int* outptr1 = outptr0 + outPlaneSize;
|
||||
int* outptr2 = outptr1 + outPlaneSize;
|
||||
int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
|
||||
float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2];
|
||||
|
||||
if( i+2 >= outCn )
|
||||
{
|
||||
wptr2 = wptr1;
|
||||
outptr2 = outptr1;
|
||||
bias2 = bias1;
|
||||
mult2 = mult1;
|
||||
|
||||
if( i+1 >= outCn )
|
||||
{
|
||||
wptr2 = wptr1 = wptr0;
|
||||
outptr2 = outptr1 = outptr0;
|
||||
bias2 = bias1 = bias0;
|
||||
mult2 = mult1 = mult0;
|
||||
}
|
||||
}
|
||||
int j = 0;
|
||||
for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
|
||||
{
|
||||
bool tail = false;
|
||||
if (j + FASCONV_BASE_VECSZ > blockSize)
|
||||
{
|
||||
if (j == 0)
|
||||
break;
|
||||
j = blockSize - FASCONV_BASE_VECSZ;
|
||||
tail = true;
|
||||
}
|
||||
int k = 0;
|
||||
const int8_t* rptr = rowbuf + j*vecsize_aligned;
|
||||
|
||||
__m256i vs00 = __lasx_xvreplgr2vr_d(0), vs01 = __lasx_xvreplgr2vr_d(0),
|
||||
vs02 = __lasx_xvreplgr2vr_d(0), vs03 = __lasx_xvreplgr2vr_d(0),
|
||||
vs10 = __lasx_xvreplgr2vr_d(0), vs11 = __lasx_xvreplgr2vr_d(0),
|
||||
vs12 = __lasx_xvreplgr2vr_d(0), vs13 = __lasx_xvreplgr2vr_d(0),
|
||||
vs20 = __lasx_xvreplgr2vr_d(0), vs21 = __lasx_xvreplgr2vr_d(0),
|
||||
vs22 = __lasx_xvreplgr2vr_d(0), vs23 = __lasx_xvreplgr2vr_d(0);
|
||||
|
||||
for (; k < vecsize; k += 32, rptr += 32 )
|
||||
{
|
||||
__m256i w0 = __lasx_xvld((const __m256i*)(wptr0 + k), 0);
|
||||
__m256i w1 = __lasx_xvld((const __m256i*)(wptr1 + k), 0);
|
||||
__m256i w2 = __lasx_xvld((const __m256i*)(wptr2 + k), 0);
|
||||
__m256i r0 = __lasx_xvld((const __m256i*)(rptr), 0);
|
||||
|
||||
vs00 = _v256_fmadds8_s32(w0, r0, vs00);
|
||||
vs10 = _v256_fmadds8_s32(w1, r0, vs10);
|
||||
vs20 = _v256_fmadds8_s32(w2, r0, vs20);
|
||||
|
||||
r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned), 0);
|
||||
vs01 = _v256_fmadds8_s32(w0, r0, vs01);
|
||||
vs11 = _v256_fmadds8_s32(w1, r0, vs11);
|
||||
vs21 = _v256_fmadds8_s32(w2, r0, vs21);
|
||||
|
||||
r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*2), 0);
|
||||
vs02 = _v256_fmadds8_s32(w0, r0, vs02);
|
||||
vs12 = _v256_fmadds8_s32(w1, r0, vs12);
|
||||
vs22 = _v256_fmadds8_s32(w2, r0, vs22);
|
||||
|
||||
r0 = __lasx_xvld((const __m256i*)(rptr + vecsize_aligned*3), 0);
|
||||
vs03 = _v256_fmadds8_s32(w0, r0, vs03);
|
||||
vs13 = _v256_fmadds8_s32(w1, r0, vs13);
|
||||
vs23 = _v256_fmadds8_s32(w2, r0, vs23);
|
||||
}
|
||||
|
||||
/*t0*/
|
||||
__m256i vs00_hadd_w = __lasx_xvhaddw_d_w(vs00, vs00);
|
||||
__m256i vs00_hadd_d = __lasx_xvhaddw_q_d(vs00_hadd_w, vs00_hadd_w);
|
||||
|
||||
__m256i vs01_hadd_w = __lasx_xvhaddw_d_w(vs01, vs01);
|
||||
__m256i vs01_hadd_d = __lasx_xvhaddw_q_d(vs01_hadd_w, vs01_hadd_w);
|
||||
|
||||
__m256i vs02_hadd_w = __lasx_xvhaddw_d_w(vs02, vs02);
|
||||
__m256i vs02_hadd_d = __lasx_xvhaddw_q_d(vs02_hadd_w, vs02_hadd_w);
|
||||
|
||||
__m256i vs03_hadd_w = __lasx_xvhaddw_d_w(vs03, vs03);
|
||||
__m256i vs03_hadd_d = __lasx_xvhaddw_q_d(vs03_hadd_w, vs03_hadd_w);
|
||||
|
||||
__m256i vs01_vs00 = __lasx_xvpackev_w(vs01_hadd_d, vs00_hadd_d);
|
||||
__m256i vs03_vs02 = __lasx_xvpackev_w(vs03_hadd_d, vs02_hadd_d);
|
||||
__m256i t0 = __lasx_xvpackev_d(vs03_vs02, vs01_vs00);
|
||||
|
||||
/*t1*/
|
||||
__m256i vs10_hadd_w = __lasx_xvhaddw_d_w(vs10, vs10);
|
||||
__m256i vs10_hadd_d = __lasx_xvhaddw_q_d(vs10_hadd_w, vs10_hadd_w);
|
||||
|
||||
__m256i vs11_hadd_w = __lasx_xvhaddw_d_w(vs11, vs11);
|
||||
__m256i vs11_hadd_d = __lasx_xvhaddw_q_d(vs11_hadd_w, vs11_hadd_w);
|
||||
|
||||
__m256i vs12_hadd_w = __lasx_xvhaddw_d_w(vs12, vs12);
|
||||
__m256i vs12_hadd_d = __lasx_xvhaddw_q_d(vs12_hadd_w, vs12_hadd_w);
|
||||
|
||||
__m256i vs13_hadd_w = __lasx_xvhaddw_d_w(vs13, vs13);
|
||||
__m256i vs13_hadd_d = __lasx_xvhaddw_q_d(vs13_hadd_w, vs13_hadd_w);
|
||||
|
||||
__m256i vs11_vs10 = __lasx_xvpackev_w(vs11_hadd_d, vs10_hadd_d);
|
||||
__m256i vs13_vs12 = __lasx_xvpackev_w(vs13_hadd_d, vs12_hadd_d);
|
||||
__m256i t1 = __lasx_xvpackev_d(vs13_vs12, vs11_vs10);
|
||||
|
||||
/*t2*/
|
||||
__m256i vs20_hadd_w = __lasx_xvhaddw_d_w(vs20, vs20);
|
||||
__m256i vs20_hadd_d = __lasx_xvhaddw_q_d(vs20_hadd_w, vs20_hadd_w);
|
||||
|
||||
__m256i vs21_hadd_w = __lasx_xvhaddw_d_w(vs21, vs21);
|
||||
__m256i vs21_hadd_d = __lasx_xvhaddw_q_d(vs21_hadd_w, vs21_hadd_w);
|
||||
|
||||
__m256i vs22_hadd_w = __lasx_xvhaddw_d_w(vs22, vs22);
|
||||
__m256i vs22_hadd_d = __lasx_xvhaddw_q_d(vs22_hadd_w, vs22_hadd_w);
|
||||
|
||||
__m256i vs23_hadd_w = __lasx_xvhaddw_d_w(vs23, vs23);
|
||||
__m256i vs23_hadd_d = __lasx_xvhaddw_q_d(vs23_hadd_w, vs23_hadd_w);
|
||||
|
||||
__m256i vs21_vs20 = __lasx_xvpackev_w(vs21_hadd_d, vs20_hadd_d);
|
||||
__m256i vs23_vs22 = __lasx_xvpackev_w(vs23_hadd_d, vs22_hadd_d);
|
||||
__m256i t2 = __lasx_xvpackev_d(vs23_vs22, vs21_vs20);
|
||||
|
||||
t0 = __lasx_xvadd_w(t0, __lasx_xvpermi_q(t0, t0, 1));
|
||||
t1 = __lasx_xvadd_w(t1, __lasx_xvpermi_q(t1, t1, 1));
|
||||
t2 = __lasx_xvadd_w(t2, __lasx_xvpermi_q(t2, t2, 1));
|
||||
|
||||
__m128i s0, s1, s2;
|
||||
|
||||
if( initOutput )
|
||||
{
|
||||
s0 = __lsx_vreplgr2vr_w(bias0);
|
||||
s1 = __lsx_vreplgr2vr_w(bias1);
|
||||
s2 = __lsx_vreplgr2vr_w(bias2);
|
||||
}
|
||||
else
|
||||
{
|
||||
s0 = __lsx_vld((__m128i*)(outptr0 + j), 0);
|
||||
s1 = __lsx_vld((__m128i*)(outptr1 + j), 0);
|
||||
s2 = __lsx_vld((__m128i*)(outptr2 + j), 0);
|
||||
}
|
||||
|
||||
s0 = __lsx_vadd_w(s0, *(__m128i*)&t0);
|
||||
s1 = __lsx_vadd_w(s1, *(__m128i*)&t1);
|
||||
s2 = __lsx_vadd_w(s2, *(__m128i*)&t2);
|
||||
|
||||
if( finalOutput )
|
||||
{
|
||||
__m128i voutzp = __lsx_vreplgr2vr_w(outZp);
|
||||
__m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127);
|
||||
__m256 v_mult0 = _v256_setall_ps(mult0);
|
||||
__m256 v_mult1 = _v256_setall_ps(mult1);
|
||||
__m256 v_mult2 = _v256_setall_ps(mult2);
|
||||
|
||||
s0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s0), *(__m128*)&v_mult0)));
|
||||
s1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s1), *(__m128*)&v_mult1)));
|
||||
s2 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(s2), *(__m128*)&v_mult2)));
|
||||
|
||||
s0 = __lsx_vmin_w(__lsx_vmax_w(s0, outmin), outmax);
|
||||
s1 = __lsx_vmin_w(__lsx_vmax_w(s1, outmin), outmax);
|
||||
s2 = __lsx_vmin_w(__lsx_vmax_w(s2, outmin), outmax);
|
||||
}
|
||||
if( tail )
|
||||
{
|
||||
s0 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr0 + j, 0), s0, mask);
|
||||
s1 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr1 + j, 0), s1, mask);
|
||||
s2 = __lsx_vbitsel_v(__lsx_vld((const float*)outptr2 + j, 0), s2, mask);
|
||||
}
|
||||
__lsx_vst(s0, (__m128i*)(outptr0 + j), 0);
|
||||
__lsx_vst(s1, (__m128i*)(outptr1 + j), 0);
|
||||
__lsx_vst(s2, (__m128i*)(outptr2 + j), 0);
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 2; j += 2 )
|
||||
{
|
||||
const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
|
||||
const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned;
|
||||
int s00, s01, s10, s11, s20, s21;
|
||||
|
||||
if( initOutput )
|
||||
{
|
||||
s00 = s01 = bias0;
|
||||
s10 = s11 = bias1;
|
||||
s20 = s21 = bias2;
|
||||
}
|
||||
else
|
||||
{
|
||||
s00 = outptr0[j]; s01 = outptr0[j+1];
|
||||
s10 = outptr1[j]; s11 = outptr1[j+1];
|
||||
s20 = outptr2[j]; s21 = outptr2[j+1];
|
||||
}
|
||||
|
||||
for( int k = 0; k < vecsize; k++ )
|
||||
{
|
||||
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
|
||||
int8_t r = rptr0[k];
|
||||
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
|
||||
r = rptr1[k];
|
||||
s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r;
|
||||
}
|
||||
|
||||
if( finalOutput )
|
||||
{
|
||||
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
|
||||
s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127);
|
||||
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
|
||||
s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127);
|
||||
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
|
||||
s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127);
|
||||
}
|
||||
outptr0[j] = s00;
|
||||
outptr0[j+1] = s01;
|
||||
outptr1[j] = s10;
|
||||
outptr1[j+1] = s11;
|
||||
outptr2[j] = s20;
|
||||
outptr2[j+1] = s21;
|
||||
}
|
||||
|
||||
for( ; j < blockSize; j++ )
|
||||
{
|
||||
const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
|
||||
int s00, s10, s20;
|
||||
|
||||
if( initOutput )
|
||||
{
|
||||
s00 = bias0;
|
||||
s10 = bias1;
|
||||
s20 = bias2;
|
||||
}
|
||||
else
|
||||
{
|
||||
s00 = outptr0[j];
|
||||
s10 = outptr1[j];
|
||||
s20 = outptr2[j];
|
||||
}
|
||||
|
||||
for( int k = 0; k < vecsize; k++ )
|
||||
{
|
||||
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
|
||||
int8_t r = rptr0[k];
|
||||
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
|
||||
}
|
||||
|
||||
if( finalOutput )
|
||||
{
|
||||
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
|
||||
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
|
||||
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
|
||||
}
|
||||
outptr0[j] = s00;
|
||||
outptr1[j] = s10;
|
||||
outptr2[j] = s20;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void _v256_expand_mul_add(const __m256i& a, const __m256i& b,
|
||||
__m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3)
|
||||
{
|
||||
__m256i a0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x10), 0);
|
||||
__m256i a1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(a, 0x32), 0);
|
||||
|
||||
__m256i b0 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x10), 0);
|
||||
__m256i b1 = __lasx_xvsllwil_h_b(__lasx_xvpermi_d(b, 0x32), 0);
|
||||
|
||||
__m256i a0b0 = __lasx_xvmul_h(a0, b0);
|
||||
__m256i a1b1 = __lasx_xvmul_h(a1, b1);
|
||||
|
||||
out0 = __lasx_xvadd_w(out0, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x10), 0));
|
||||
out1 = __lasx_xvadd_w(out1, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a0b0, 0x32), 0));
|
||||
out2 = __lasx_xvadd_w(out2, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x10), 0));
|
||||
out3 = __lasx_xvadd_w(out3, __lasx_xvsllwil_w_h(__lasx_xvpermi_d(a1b1, 0x32), 0));
|
||||
}
|
||||
|
||||
static inline void _v256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b)
|
||||
{
|
||||
__m256i t0 = __lasx_xvld((const __m256i*)ptr, 0);
|
||||
__m256i t1 = __lasx_xvld((const __m256i*)ptr, 32*1);
|
||||
|
||||
const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
|
||||
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
|
||||
__m256i p0 = __lasx_xvshuf_b(t0, t0, sh);
|
||||
__m256i p1 = __lasx_xvshuf_b(t1, t1, sh);
|
||||
__m256i lo = __lasx_xvpermi_q(p0, p1, 0x02);
|
||||
__m256i hi = __lasx_xvpermi_q(p0, p1, 0x13);
|
||||
|
||||
a = __lasx_xvilvl_d(hi, lo);
|
||||
b = __lasx_xvilvh_d(hi, lo);
|
||||
}
|
||||
|
||||
void fastDepthwiseConv( const int8_t* wptr,
|
||||
int kernel_h, int kernel_w,
|
||||
int stride_h, int stride_w,
|
||||
int dilation_h, int dilation_w,
|
||||
int pad_t, int pad_l,
|
||||
const int* biasptr, const float* multptr,
|
||||
const int8_t* inptr_,
|
||||
int height, int width,
|
||||
int* outptr_,
|
||||
int out_d, int outH, int outW,
|
||||
int inpZp, int outZp)
|
||||
{
|
||||
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
|
||||
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
|
||||
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
|
||||
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
|
||||
float mult = multptr[out_d];
|
||||
int bias = biasptr[out_d];
|
||||
int biasCopy;
|
||||
|
||||
for (int out_i = 0; out_i < outH; out_i++)
|
||||
{
|
||||
int in_i = out_i * stride_h - pad_t, out_j = 0;
|
||||
const int8_t* imgptr0 = inptr_ + in_i*width;
|
||||
const int8_t* imgptr1 = imgptr0 + dilation_h*width;
|
||||
const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
|
||||
int8_t w00 = w00_, w01 = w01_, w02 = w02_;
|
||||
int8_t w20 = w20_, w21 = w21_, w22 = w22_;
|
||||
int out;
|
||||
biasCopy = bias;
|
||||
if (in_i < 0)
|
||||
{
|
||||
biasCopy += inpZp * (w00 + w01 + w02);
|
||||
w00 = w01 = w02 = 0;
|
||||
imgptr0 = imgptr1;
|
||||
}
|
||||
else if (in_i + dilation_h*(kernel_h-1) >= height)
|
||||
{
|
||||
biasCopy += inpZp * (w20 + w21 + w22);
|
||||
w20 = w21 = w22 = 0;
|
||||
imgptr2 = imgptr1;
|
||||
}
|
||||
int* outptr = outptr_ + out_i*outW;
|
||||
if (pad_l > 0)
|
||||
{
|
||||
out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
|
||||
(int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
|
||||
(int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
|
||||
biasCopy + inpZp*(w00 + w10 + w20);
|
||||
outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
|
||||
out_j = 1;
|
||||
}
|
||||
|
||||
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
|
||||
{
|
||||
const int VECSZ = 32;
|
||||
__m256i vw00 = __lasx_xvreplgr2vr_b(w00), vw01 = __lasx_xvreplgr2vr_b(w01), vw02 = __lasx_xvreplgr2vr_b(w02),
|
||||
vw10 = __lasx_xvreplgr2vr_b(w10), vw11 = __lasx_xvreplgr2vr_b(w11), vw12 = __lasx_xvreplgr2vr_b(w12),
|
||||
vw20 = __lasx_xvreplgr2vr_b(w20), vw21 = __lasx_xvreplgr2vr_b(w21), vw22 = __lasx_xvreplgr2vr_b(w22);
|
||||
__m256i vbias = __lasx_xvreplgr2vr_w(biasCopy), voutzp = __lasx_xvreplgr2vr_w(outZp),
|
||||
outmin = __lasx_xvreplgr2vr_w(-128), outmax = __lasx_xvreplgr2vr_w(127);
|
||||
__m256 vmult = _v256_setall_ps(mult);
|
||||
__m256i vout0, vout1, vout2, vout3;
|
||||
|
||||
if( stride_w == 1 )
|
||||
{
|
||||
for( ; out_j < outW1; out_j += VECSZ )
|
||||
{
|
||||
if (out_j + VECSZ > outW1)
|
||||
{
|
||||
if (out_j <= pad_l)
|
||||
break;
|
||||
out_j = outW1 - VECSZ;
|
||||
}
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
__m256i v00 = __lasx_xvld((const __m256i*)(imgptr0 + in_j), 0),
|
||||
v01 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w), 0),
|
||||
v02 = __lasx_xvld((const __m256i*)(imgptr0 + in_j + dilation_w*2), 0),
|
||||
v10 = __lasx_xvld((const __m256i*)(imgptr1 + in_j), 0),
|
||||
v11 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w), 0),
|
||||
v12 = __lasx_xvld((const __m256i*)(imgptr1 + in_j + dilation_w*2), 0),
|
||||
v20 = __lasx_xvld((const __m256i*)(imgptr2 + in_j), 0),
|
||||
v21 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w), 0),
|
||||
v22 = __lasx_xvld((const __m256i*)(imgptr2 + in_j + dilation_w*2), 0);
|
||||
|
||||
vout0 = vout1 = vout2 = vout3 = vbias;
|
||||
_v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
|
||||
|
||||
vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult)));
|
||||
vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult)));
|
||||
vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult)));
|
||||
vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult)));
|
||||
|
||||
vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax);
|
||||
vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax);
|
||||
vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax);
|
||||
vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax);
|
||||
|
||||
__lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0);
|
||||
__lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4);
|
||||
__lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4);
|
||||
__lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; out_j < outW1; out_j += VECSZ )
|
||||
{
|
||||
if (out_j + VECSZ > outW1)
|
||||
{
|
||||
if (out_j <= pad_l)
|
||||
break;
|
||||
out_j = outW1 - VECSZ;
|
||||
}
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
__m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
|
||||
_v256_load_deinterleave(imgptr0 + in_j, v00, v01);
|
||||
_v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
|
||||
_v256_load_deinterleave(imgptr1 + in_j, v10, v11);
|
||||
_v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
|
||||
_v256_load_deinterleave(imgptr2 + in_j, v20, v21);
|
||||
_v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
|
||||
|
||||
vout0 = vout1 = vout2 = vout3 = vbias;
|
||||
_v256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
|
||||
_v256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
|
||||
|
||||
vout0 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout0), vmult)));
|
||||
vout1 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout1), vmult)));
|
||||
vout2 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout2), vmult)));
|
||||
vout3 = __lasx_xvadd_w(voutzp, __lasx_xvftint_w_s(__lasx_xvfmul_s(__lasx_xvffint_s_w(vout3), vmult)));
|
||||
|
||||
vout0 = __lasx_xvmin_w(__lasx_xvmax_w(vout0, outmin), outmax);
|
||||
vout1 = __lasx_xvmin_w(__lasx_xvmax_w(vout1, outmin), outmax);
|
||||
vout2 = __lasx_xvmin_w(__lasx_xvmax_w(vout2, outmin), outmax);
|
||||
vout3 = __lasx_xvmin_w(__lasx_xvmax_w(vout3, outmin), outmax);
|
||||
|
||||
__lasx_xvst(vout0, (__m256i*)(outptr + out_j), 0);
|
||||
__lasx_xvst(vout1, (__m256i*)(outptr + out_j), 8*4);
|
||||
__lasx_xvst(vout2, (__m256i*)(outptr + out_j), 16*4);
|
||||
__lasx_xvst(vout3, (__m256i*)(outptr + out_j), 24*4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (; out_j < outW1; out_j++)
|
||||
{
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
|
||||
(int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
|
||||
(int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
|
||||
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
|
||||
}
|
||||
|
||||
for (; out_j < outW; out_j++ )
|
||||
{
|
||||
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
|
||||
int s0 = 1, s1 = 1, s2 = 1;
|
||||
if (in_j0 >= width)
|
||||
{
|
||||
in_j0 = 0;
|
||||
s0 = 0;
|
||||
biasCopy += inpZp*(w00 + w10 + w20);
|
||||
}
|
||||
if (in_j1 >= width)
|
||||
{
|
||||
in_j1 = 0;
|
||||
s1 = 0;
|
||||
biasCopy += inpZp*(w01 + w11 + w21);
|
||||
}
|
||||
if (in_j2 >= width)
|
||||
{
|
||||
in_j2 = 0;
|
||||
s2 = 0;
|
||||
biasCopy += inpZp*(w02 + w12 + w22);
|
||||
}
|
||||
out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
|
||||
(int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
|
||||
(int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
|
||||
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dst = vec * weights^t + bias
|
||||
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
|
||||
size_t wstep, const int* bias, const float* multiplier,
|
||||
int* dst, int nvecs, int vecsize, int outZp )
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
for( ; i <= nvecs - 8; i += 8 )
|
||||
{
|
||||
const int8_t* wptr = weights + i*wstep;
|
||||
__m256i vs0 = __lasx_xvreplgr2vr_d(0), vs1 = __lasx_xvreplgr2vr_d(0),
|
||||
vs2 = __lasx_xvreplgr2vr_d(0), vs3 = __lasx_xvreplgr2vr_d(0),
|
||||
vs4 = __lasx_xvreplgr2vr_d(0), vs5 = __lasx_xvreplgr2vr_d(0),
|
||||
vs6 = __lasx_xvreplgr2vr_d(0), vs7 = __lasx_xvreplgr2vr_d(0);
|
||||
|
||||
__m128i voutzp = __lsx_vreplgr2vr_w(outZp);
|
||||
__m128i outmin = __lsx_vreplgr2vr_w(-128), outmax = __lsx_vreplgr2vr_w(127);
|
||||
|
||||
for( int k = 0; k < vecsize; k += 32, wptr += 32 )
|
||||
{
|
||||
__m256i v = __lasx_xvld((const __m256i*)(vec + k), 0);
|
||||
|
||||
vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0);
|
||||
vs1 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep), 0), v, vs1);
|
||||
vs2 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*2), 0), v, vs2);
|
||||
vs3 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*3), 0), v, vs3);
|
||||
vs4 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*4), 0), v, vs4);
|
||||
vs5 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*5), 0), v, vs5);
|
||||
vs6 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*6), 0), v, vs6);
|
||||
vs7 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)(wptr + wstep*7), 0), v, vs7);
|
||||
}
|
||||
|
||||
/*s0*/
|
||||
__m256i vs0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0);
|
||||
__m256i vs0_hadd_d = __lasx_xvhaddw_q_d(vs0_hadd_w, vs0_hadd_w);
|
||||
|
||||
__m256i vs1_hadd_w = __lasx_xvhaddw_d_w(vs1, vs1);
|
||||
__m256i vs1_hadd_d = __lasx_xvhaddw_q_d(vs1_hadd_w, vs1_hadd_w);
|
||||
|
||||
__m256i vs2_hadd_w = __lasx_xvhaddw_d_w(vs2, vs2);
|
||||
__m256i vs2_hadd_d = __lasx_xvhaddw_q_d(vs2_hadd_w, vs2_hadd_w);
|
||||
|
||||
__m256i vs3_hadd_w = __lasx_xvhaddw_d_w(vs3, vs3);
|
||||
__m256i vs3_hadd_d = __lasx_xvhaddw_q_d(vs3_hadd_w, vs3_hadd_w);
|
||||
|
||||
__m256i vs1_vs0 = __lasx_xvpackev_w(vs1_hadd_d, vs0_hadd_d);
|
||||
__m256i vs3_vs2 = __lasx_xvpackev_w(vs3_hadd_d, vs2_hadd_d);
|
||||
__m256i s0 = __lasx_xvpackev_d(vs3_vs2, vs1_vs0);
|
||||
|
||||
/*s1*/
|
||||
__m256i vs4_hadd_w = __lasx_xvhaddw_d_w(vs4, vs4);
|
||||
__m256i vs4_hadd_d = __lasx_xvhaddw_q_d(vs4_hadd_w, vs4_hadd_w);
|
||||
|
||||
__m256i vs5_hadd_w = __lasx_xvhaddw_d_w(vs5, vs5);
|
||||
__m256i vs5_hadd_d = __lasx_xvhaddw_q_d(vs5_hadd_w, vs5_hadd_w);
|
||||
|
||||
__m256i vs6_hadd_w = __lasx_xvhaddw_d_w(vs6, vs6);
|
||||
__m256i vs6_hadd_d = __lasx_xvhaddw_q_d(vs6_hadd_w, vs6_hadd_w);
|
||||
|
||||
__m256i vs7_hadd_w = __lasx_xvhaddw_d_w(vs7, vs7);
|
||||
__m256i vs7_hadd_d = __lasx_xvhaddw_q_d(vs7_hadd_w, vs7_hadd_w);
|
||||
|
||||
__m256i vs5_vs4 = __lasx_xvpackev_w(vs5_hadd_d, vs4_hadd_d);
|
||||
__m256i vs7_vs6 = __lasx_xvpackev_w(vs7_hadd_d, vs6_hadd_d);
|
||||
__m256i s1 = __lasx_xvpackev_d(vs7_vs6, vs5_vs4);
|
||||
|
||||
s0 = __lasx_xvadd_w(s0, __lasx_xvpermi_q(s0, s0, 1));
|
||||
s1 = __lasx_xvadd_w(s1, __lasx_xvpermi_q(s1, s1, 1));
|
||||
|
||||
__m128i t0 = __lsx_vadd_w(*(__m128i*)(&s0), __lsx_vld((__m128i*)(bias + i), 0));
|
||||
__m128i t1 = __lsx_vadd_w(*(__m128i*)(&s1), __lsx_vld((__m128i*)(bias + i), 4*4));
|
||||
|
||||
t0 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t0), (__m128)__lsx_vld(multiplier + i, 0))));
|
||||
t1 = __lsx_vadd_w(voutzp, __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(t1), (__m128)__lsx_vld(multiplier + i, 4*4))));
|
||||
|
||||
t0 = __lsx_vmin_w(__lsx_vmax_w(t0, outmin), outmax);
|
||||
t1 = __lsx_vmin_w(__lsx_vmax_w(t1, outmin), outmax);
|
||||
|
||||
__lsx_vst(t0, (__m128i*)(dst + i), 0);
|
||||
__lsx_vst(t1, (__m128i*)(dst + i), 4*4);
|
||||
}
|
||||
|
||||
for( ; i < nvecs; i++ )
|
||||
{
|
||||
const int8_t* wptr = weights + i*wstep;
|
||||
__m256i vs0 = __lasx_xvreplgr2vr_d(0);
|
||||
|
||||
for( int k = 0; k < vecsize; k += 32, wptr += 32 )
|
||||
{
|
||||
__m256i v = __lasx_xvld((const __m256i*)(vec + k), 0);
|
||||
vs0 = _v256_fmadds8_s32(__lasx_xvld((const __m256i*)wptr, 0), v, vs0);
|
||||
}
|
||||
|
||||
__m256i s0_hadd_w = __lasx_xvhaddw_d_w(vs0, vs0);
|
||||
int temp = ((v4i64)s0_hadd_w)[0] + ((v4i64)s0_hadd_w)[1] + ((v4i64)s0_hadd_w)[2] + ((v4i64)s0_hadd_w)[3];
|
||||
dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]);
|
||||
}
|
||||
|
||||
}
|
||||
#endif // CV_LASX
|
||||
|
||||
CV_CPU_OPTIMIZATION_NAMESPACE_END
|
||||
}} // namespace
|
||||
|
@@ -986,12 +986,13 @@ public:
    bool useAVX2;
    bool useAVX512;
    bool useRVV;
    bool useLASX;
    int blk_size_cn;

    ParallelConv()
        : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
          biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false)
-         , blk_size_cn(0)
+         , useLASX(false), blk_size_cn(0)
    {}

    static void run( const Mat& input, Mat& output, const Mat& weights,
@@ -1049,6 +1050,7 @@ public:
        p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
        p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
        p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D;
        p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;

        int kernel_d = isConv3D? kernel_size[0] : 1;
        int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@@ -1256,6 +1258,13 @@ public:
                        stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
                        biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
                else
            #endif
            #if CV_TRY_LASX
                if(useLASX)
                    opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
                        stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
                        biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
                else
            #endif
                {
                    const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -1631,6 +1640,12 @@ public:
                        opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                                          outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
                    else
                #endif
                #if CV_TRY_LASX
                    if(useLASX)
                        opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                                           outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
                    else
                #endif
                    for( int i = 0; i < outCn; i += 2 )
                    {
@@ -2437,6 +2452,7 @@ public:
            useAVX2 = checkHardwareSupport(CPU_AVX2);
            useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
            useRVV = checkHardwareSupport(CPU_RVV);
            useLASX = checkHardwareSupport(CPU_LASX);
        }

        void operator()(const Range& range_) const CV_OVERRIDE
@@ -2474,6 +2490,11 @@ public:
                    opt_RVV::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
                }
                else
            #endif
            #if CV_TRY_LASX
                if( useLASX )
                    opt_LASX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
                else
            #endif
                for( m = 0; m < mmax; m += 2 )
                {
@@ -2574,6 +2595,7 @@ public:
        bool useAVX2;
        bool useAVX512;
        bool useRVV;
        bool useLASX;
    };

    class Col2ImInvoker : public cv::ParallelLoopBody
@@ -173,7 +173,7 @@ public:
    class FullyConnected : public ParallelLoopBody
    {
    public:
-       FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false) {}
+       FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false), useLASX(false) {}

        static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
                        Mat& dstMat, const ActivationLayer* activ, int nstripes)
@@ -197,6 +197,7 @@ public:
            p.useAVX2 = checkHardwareSupport(CPU_AVX2);
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
            p.useRVV = checkHardwareSupport(CPU_RVV);
            p.useLASX = checkHardwareSupport(CPU_LASX);

            parallel_for_(Range(0, nstripes), p, nstripes);
        }
@@ -250,6 +251,11 @@ public:
                if( useRVV )
                    opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
            #if CV_TRY_LASX
                if( useLASX )
                    opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
                {
                    int i = 0;
@@ -305,6 +311,7 @@ public:
        bool useAVX2;
        bool useAVX512;
        bool useRVV;
        bool useLASX;
    };

#ifdef HAVE_OPENCL
@ -1343,5 +1343,684 @@ void fastDepthwiseConv( const float* wptr,
|
||||
|
||||
#endif // CV_RVV
|
||||
|
||||
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
|
||||
|
||||
enum { FASCONV_BASE_VECSZ = 4 };
|
||||
|
||||
void fastConv( const float* weights, size_t wstep, const float* bias,
|
||||
const float* rowbuf, float* output, const int* outShape,
|
||||
int blockSize, int vecsize, int vecsize_aligned,
|
||||
const float* relu, bool initOutput )
|
||||
{
|
||||
int outCn = outShape[1];
|
||||
size_t outPlaneSize = outShape[2]*outShape[3];
|
||||
float r0 = 1.f, r1 = 1.f, r2 = 1.f;
|
||||
__m256 t1 = _v256_setall_ps(1.f), t2 = _v256_setall_ps(0.f);
|
||||
__m128 vr0 = *(__m128*)&t1, vr1 = vr0, vr2 = vr0, z = *(__m128*)&t2;
|
||||
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
|
||||
int rsz = blockSize % FASCONV_BASE_VECSZ;
|
||||
for( int i = 0; i < rsz; i++ )
|
||||
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
|
||||
__m128i mask = __lsx_vld((const float*)maskbuf, 0);
|
||||
|
||||
// now compute dot product of the weights
|
||||
// and im2row-transformed part of the tensor
|
||||
for( int i = 0; i < outCn; i += 3 )
|
||||
{
|
||||
const float* wptr0 = weights + i*wstep;
|
||||
const float* wptr1 = wptr0 + wstep;
|
||||
const float* wptr2 = wptr1 + wstep;
|
||||
float* outptr0 = output + i*outPlaneSize;
|
||||
float* outptr1 = outptr0 + outPlaneSize;
|
||||
float* outptr2 = outptr1 + outPlaneSize;
|
||||
float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
|
||||
|
||||
if( i+2 >= outCn )
|
||||
{
|
||||
wptr2 = wptr1;
|
||||
outptr2 = outptr1;
|
||||
bias2 = bias1;
|
||||
if( i+1 >= outCn )
|
||||
{
|
||||
wptr2 = wptr1 = wptr0;
|
||||
outptr2 = outptr1 = outptr0;
|
||||
bias2 = bias1 = bias0;
|
||||
}
|
||||
}
|
||||
|
||||
if( relu )
|
||||
{
|
||||
r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2];
|
||||
if( i+2 >= outCn )
|
||||
{
|
||||
r2 = r1;
|
||||
if( i+1 >= outCn )
|
||||
r2 = r1 = r0;
|
||||
}
|
||||
vr0 = _v256_extract_low(_v256_setall_ps(r0));
|
||||
vr1 = _v256_extract_low(_v256_setall_ps(r1));
|
||||
vr2 = _v256_extract_low(_v256_setall_ps(r2));
|
||||
}
|
||||
|
||||
int j = 0;
|
||||
for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
|
||||
{
|
||||
bool tail = false;
|
||||
if (j + FASCONV_BASE_VECSZ > blockSize)
|
||||
{
|
||||
if (j == 0)
|
||||
break;
|
||||
j = blockSize - FASCONV_BASE_VECSZ;
|
||||
tail = true;
|
||||
}
|
||||
int k = 0;
|
||||
const float* rptr = rowbuf + j*vecsize_aligned;
|
||||
|
||||
__m256i tmp;
|
||||
__m256 vs00 = (__m256)__lasx_xvxor_v(tmp, tmp), vs01 = (__m256)__lasx_xvxor_v(tmp, tmp),
|
||||
vs02 = (__m256)__lasx_xvxor_v(tmp, tmp), vs03 = (__m256)__lasx_xvxor_v(tmp, tmp),
|
||||
vs10 = (__m256)__lasx_xvxor_v(tmp, tmp), vs11 = (__m256)__lasx_xvxor_v(tmp, tmp),
|
||||
vs12 = (__m256)__lasx_xvxor_v(tmp, tmp), vs13 = (__m256)__lasx_xvxor_v(tmp, tmp),
|
||||
vs20 = (__m256)__lasx_xvxor_v(tmp, tmp), vs21 = (__m256)__lasx_xvxor_v(tmp, tmp),
|
||||
vs22 = (__m256)__lasx_xvxor_v(tmp, tmp), vs23 = (__m256)__lasx_xvxor_v(tmp, tmp);
|
||||
|
||||
for (; k < vecsize; k += 8, rptr += 8 )
|
||||
{
|
||||
__m256 w0 = (__m256)__lasx_xvld(wptr0 + k, 0);
|
||||
__m256 w1 = (__m256)__lasx_xvld(wptr1 + k, 0);
|
||||
__m256 w2 = (__m256)__lasx_xvld(wptr2 + k, 0);
|
||||
__m256 r0 = (__m256)__lasx_xvld(rptr, 0);
|
||||
|
||||
vs00 = __lasx_xvfmadd_s(w0, r0, vs00);
|
||||
vs10 = __lasx_xvfmadd_s(w1, r0, vs10);
|
||||
vs20 = __lasx_xvfmadd_s(w2, r0, vs20);
|
||||
|
||||
r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned, 0);
|
||||
vs01 = __lasx_xvfmadd_s(w0, r0, vs01);
|
||||
vs11 = __lasx_xvfmadd_s(w1, r0, vs11);
|
||||
vs21 = __lasx_xvfmadd_s(w2, r0, vs21);
|
||||
|
||||
r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*2, 0);
|
||||
vs02 = __lasx_xvfmadd_s(w0, r0, vs02);
|
||||
vs12 = __lasx_xvfmadd_s(w1, r0, vs12);
|
||||
vs22 = __lasx_xvfmadd_s(w2, r0, vs22);
|
||||
|
||||
r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*3, 0);
|
||||
vs03 = __lasx_xvfmadd_s(w0, r0, vs03);
|
||||
vs13 = __lasx_xvfmadd_s(w1, r0, vs13);
|
||||
vs23 = __lasx_xvfmadd_s(w2, r0, vs23);
|
||||
}
|
||||
|
||||
/*t0*/
|
||||
__m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs00, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs00_add_2w = __lasx_xvfadd_s(vs00, vs00_perm);
|
||||
__m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32);
|
||||
__m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl);
|
||||
|
||||
__m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs01, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs01_add_2w = __lasx_xvfadd_s(vs01, vs01_perm);
|
||||
__m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32);
|
||||
__m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl);
|
||||
|
||||
__m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs02, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs02_add_2w = __lasx_xvfadd_s(vs02, vs02_perm);
|
||||
__m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32);
|
||||
__m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl);
|
||||
|
||||
__m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs03, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs03_add_2w = __lasx_xvfadd_s(vs03, vs03_perm);
|
||||
__m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32);
|
||||
__m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl);
|
||||
|
||||
__m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w);
|
||||
__m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w);
|
||||
__m256 t0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00);
|
||||
|
||||
/*t1*/
|
||||
__m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs10, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs10_add_2w = __lasx_xvfadd_s(vs10, vs10_perm);
|
||||
__m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32);
|
||||
__m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl);
|
||||
|
||||
__m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs11, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs11_add_2w = __lasx_xvfadd_s(vs11, vs11_perm);
|
||||
__m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32);
|
||||
__m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl);
|
||||
|
||||
__m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs12, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs12_add_2w = __lasx_xvfadd_s(vs12, vs12_perm);
|
||||
__m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32);
|
||||
__m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl);
|
||||
|
||||
__m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs13, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs13_add_2w = __lasx_xvfadd_s(vs13, vs13_perm);
|
||||
__m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32);
|
||||
__m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl);
|
||||
|
||||
__m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w);
|
||||
__m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w);
|
||||
__m256 t1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10);
|
||||
|
||||
/*t2*/
|
||||
__m256 vs20_perm = (__m256)__lasx_xvpermi_d(vs20, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs20_add_2w = __lasx_xvfadd_s(vs20, vs20_perm);
|
||||
__m256 tmp20_srl = (__m256)__lasx_xvsrli_d(vs20_add_2w, 32);
|
||||
__m256 vs20_add_4w = __lasx_xvfadd_s(vs20_add_2w, tmp20_srl);
|
||||
|
||||
__m256 vs21_perm = (__m256)__lasx_xvpermi_d(vs21, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs21_add_2w = __lasx_xvfadd_s(vs21, vs21_perm);
|
||||
__m256 tmp21_srl = (__m256)__lasx_xvsrli_d(vs21_add_2w, 32);
|
||||
__m256 vs21_add_4w = __lasx_xvfadd_s(vs21_add_2w, tmp21_srl);
|
||||
|
||||
__m256 vs22_perm = (__m256)__lasx_xvpermi_d(vs22, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs22_add_2w = __lasx_xvfadd_s(vs22, vs22_perm);
|
||||
__m256 tmp22_srl = (__m256)__lasx_xvsrli_d(vs22_add_2w, 32);
|
||||
__m256 vs22_add_4w = __lasx_xvfadd_s(vs22_add_2w, tmp22_srl);
|
||||
|
||||
__m256 vs23_perm = (__m256)__lasx_xvpermi_d(vs23, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs23_add_2w = __lasx_xvfadd_s(vs23, vs23_perm);
|
||||
__m256 tmp23_srl = (__m256)__lasx_xvsrli_d(vs23_add_2w, 32);
|
||||
__m256 vs23_add_4w = __lasx_xvfadd_s(vs23_add_2w, tmp23_srl);
|
||||
|
||||
__m256i vs21_vs20 = __lasx_xvpackev_w((__m256i)vs21_add_4w, (__m256i)vs20_add_4w);
|
||||
__m256i vs23_vs22 = __lasx_xvpackev_w((__m256i)vs23_add_4w, (__m256i)vs22_add_4w);
|
||||
__m256 t2 = (__m256)__lasx_xvpackev_d(vs23_vs22, vs21_vs20);
|
||||
|
||||
t0 = __lasx_xvfadd_s(t0, (__m256)__lasx_xvpermi_q(t0, t0, 1));
|
||||
t1 = __lasx_xvfadd_s(t1, (__m256)__lasx_xvpermi_q(t1, t1, 1));
|
||||
t2 = __lasx_xvfadd_s(t2, (__m256)__lasx_xvpermi_q(t2, t2, 1));
|
||||
|
||||
__m128 s0, s1, s2;
|
||||
|
||||
if( initOutput )
|
||||
{
|
||||
s0 = _v256_extract_low(_v256_setall_ps(bias0));
|
||||
s1 = _v256_extract_low(_v256_setall_ps(bias1));
|
||||
s2 = _v256_extract_low(_v256_setall_ps(bias2));
|
||||
}
|
||||
else
|
||||
{
|
||||
s0 = (__m128)__lsx_vld(outptr0 + j, 0);
|
||||
s1 = (__m128)__lsx_vld(outptr1 + j, 0);
|
||||
s2 = (__m128)__lsx_vld(outptr2 + j, 0);
|
||||
}
|
||||
|
||||
s0 = __lsx_vfadd_s(s0, *(__m128*)&t0);
|
||||
s1 = __lsx_vfadd_s(s1, *(__m128*)&t1);
|
||||
s2 = __lsx_vfadd_s(s2, *(__m128*)&t2);
|
||||
|
||||
if( relu )
|
||||
{
|
||||
__m128i m0 = __lsx_vfcmp_clt_s(z, s0);
|
||||
__m128i m1 = __lsx_vfcmp_clt_s(z, s1);
|
||||
__m128i m2 = __lsx_vfcmp_clt_s(z, s2);
|
||||
s0 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s0, vr0), (__m128i)s0, m0);
|
||||
s1 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s1, vr1), (__m128i)s1, m1);
|
||||
s2 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s2, vr2), (__m128i)s2, m2);
|
||||
}
|
||||
|
||||
if( tail )
|
||||
{
|
||||
s0 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr0 + j, 0), (__m128i)s0, mask);
|
||||
s1 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr1 + j, 0), (__m128i)s1, mask);
|
||||
s2 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr2 + j, 0), (__m128i)s2, mask);
|
||||
}
|
||||
|
||||
__lsx_vst(s0, outptr0 + j, 0);
|
||||
__lsx_vst(s1, outptr1 + j, 0);
|
||||
__lsx_vst(s2, outptr2 + j, 0);
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 2; j += 2 )
|
||||
{
|
||||
const float* rptr0 = rowbuf + j*vecsize_aligned;
|
||||
const float* rptr1 = rowbuf + (j+1)*vecsize_aligned;
|
||||
float s00, s01, s10, s11, s20, s21;
|
||||
|
||||
if( initOutput )
|
||||
{
|
||||
s00 = s01 = bias0;
|
||||
s10 = s11 = bias1;
|
||||
s20 = s21 = bias2;
|
||||
}
|
||||
else
|
||||
{
|
||||
s00 = outptr0[j]; s01 = outptr0[j+1];
|
||||
s10 = outptr1[j]; s11 = outptr1[j+1];
|
||||
s20 = outptr2[j]; s21 = outptr2[j+1];
|
||||
}
|
||||
|
||||
for( int k = 0; k < vecsize; k++ )
|
||||
{
|
||||
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
|
||||
float r = rptr0[k];
|
||||
s00 += w0*r; s10 += w1*r; s20 += w2*r;
|
||||
r = rptr1[k];
|
||||
s01 += w0*r; s11 += w1*r; s21 += w2*r;
|
||||
}
|
||||
|
||||
if( relu )
|
||||
{
|
||||
s00 = s00 > 0.f ? s00 : s00*r0;
|
||||
s01 = s01 > 0.f ? s01 : s01*r0;
|
||||
s10 = s10 > 0.f ? s10 : s10*r1;
|
||||
s11 = s11 > 0.f ? s11 : s11*r1;
|
||||
s20 = s20 > 0.f ? s20 : s20*r2;
|
||||
s21 = s21 > 0.f ? s21 : s21*r2;
|
||||
}
|
||||
|
||||
outptr0[j] = s00;
|
||||
outptr0[j+1] = s01;
|
||||
outptr1[j] = s10;
|
||||
outptr1[j+1] = s11;
|
||||
outptr2[j] = s20;
|
||||
outptr2[j+1] = s21;
|
||||
}
|
||||
|
||||
for( ; j < blockSize; j++ )
|
||||
{
|
||||
const float* rptr0 = rowbuf + j*vecsize_aligned;
|
||||
float s00, s10, s20;
|
||||
|
||||
if( initOutput )
|
||||
{
|
||||
s00 = bias0;
|
||||
s10 = bias1;
|
||||
s20 = bias2;
|
||||
}
|
||||
else
|
||||
{
|
||||
s00 = outptr0[j];
|
||||
s10 = outptr1[j];
|
||||
s20 = outptr2[j];
|
||||
}
|
||||
|
||||
for( int k = 0; k < vecsize; k++ )
|
||||
{
|
||||
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
|
||||
float r = rptr0[k];
|
||||
s00 += w0*r; s10 += w1*r; s20 += w2*r;
|
||||
}
|
||||
|
||||
if( relu )
|
||||
{
|
||||
s00 = s00 > 0.f ? s00 : s00*r0;
|
||||
s10 = s10 > 0.f ? s10 : s10*r1;
|
||||
s20 = s20 > 0.f ? s20 : s20*r2;
|
||||
}
|
||||
|
||||
outptr0[j] = s00;
|
||||
outptr1[j] = s10;
|
||||
outptr2[j] = s20;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
|
||||
{
|
||||
__m256 t0 = (__m256)__lasx_xvld(ptr, 0);
|
||||
__m256 t1 = (__m256)__lasx_xvld(ptr, 8*4);
|
||||
|
||||
__m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16);
|
||||
__m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16);
|
||||
|
||||
a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88);
|
||||
b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd);
|
||||
}
|
||||
|
||||
void fastDepthwiseConv( const float* wptr,
|
||||
int kernel_h, int kernel_w,
|
||||
int stride_h, int stride_w,
|
||||
int dilation_h, int dilation_w,
|
||||
int pad_t, int pad_l,
|
||||
const float* biasptr, const float* relu,
|
||||
const float* inptr_,
|
||||
int height, int width,
|
||||
float* outptr_,
|
||||
int out_d, int outH, int outW )
|
||||
{
|
||||
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
|
||||
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
|
||||
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
|
||||
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
|
||||
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
|
||||
|
||||
for (int out_i = 0; out_i < outH; out_i++)
|
||||
{
|
||||
int in_i = out_i * stride_h - pad_t, out_j = 0;
|
||||
const float* imgptr0 = inptr_ + in_i*width;
|
||||
const float* imgptr1 = imgptr0 + dilation_h*width;
|
||||
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
|
||||
float out, w00 = w00_, w01 = w01_, w02 = w02_;
|
||||
float w20 = w20_, w21 = w21_, w22 = w22_;
|
||||
if (in_i < 0)
|
||||
{
|
||||
w00 = w01 = w02 = 0.f;
|
||||
imgptr0 = imgptr1;
|
||||
}
|
||||
else if (in_i + dilation_h*(kernel_h-1) >= height)
|
||||
{
|
||||
w20 = w21 = w22 = 0.f;
|
||||
imgptr2 = imgptr1;
|
||||
}
|
||||
float* outptr = outptr_ + out_i*outW;
|
||||
if (pad_l > 0)
|
||||
{
|
||||
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
|
||||
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
|
||||
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
|
||||
if (relu)
|
||||
out = out > 0.f ? out : out*relu_coeff;
|
||||
outptr[0] = out;
|
||||
out_j = 1;
|
||||
}
|
||||
|
||||
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
|
||||
{
|
||||
const int VECSZ = 8;
|
||||
__m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02),
|
||||
vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12),
|
||||
vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22);
|
||||
__m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00),
|
||||
vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff);
|
||||
|
||||
if( stride_w == 1 )
|
||||
for( ; out_j < outW1; out_j += VECSZ )
|
||||
{
|
||||
if (out_j + VECSZ > outW1 && out_j > pad_l)
|
||||
out_j = outW1 - VECSZ;
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
__m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0),
|
||||
v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0),
|
||||
v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0),
|
||||
v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0),
|
||||
v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0),
|
||||
v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0),
|
||||
v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0),
|
||||
v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0),
|
||||
v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0);
|
||||
|
||||
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
|
||||
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
|
||||
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
|
||||
|
||||
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
|
||||
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
|
||||
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
|
||||
|
||||
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
|
||||
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
|
||||
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
|
||||
|
||||
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
|
||||
if (relu)
|
||||
{
|
||||
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
|
||||
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
|
||||
}
|
||||
__lasx_xvst(vout0, outptr + out_j, 0);
|
||||
}
|
||||
else
|
||||
for( ; out_j < outW1; out_j += VECSZ )
|
||||
{
|
||||
if (out_j + VECSZ > outW1 && out_j > pad_l)
|
||||
out_j = outW1 - VECSZ;
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
|
||||
_v256_load_deinterleave(imgptr0 + in_j, v00, v01);
|
||||
_v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
|
||||
_v256_load_deinterleave(imgptr1 + in_j, v10, v11);
|
||||
_v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
|
||||
_v256_load_deinterleave(imgptr2 + in_j, v20, v21);
|
||||
_v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
|
||||
|
||||
__m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias);
|
||||
__m256 vout1 = __lasx_xvfmul_s(v01, vw01);
|
||||
__m256 vout2 = __lasx_xvfmul_s(v02, vw02);
|
||||
|
||||
vout0 = __lasx_xvfmadd_s(v10, vw10, vout0);
|
||||
vout1 = __lasx_xvfmadd_s(v11, vw11, vout1);
|
||||
vout2 = __lasx_xvfmadd_s(v12, vw12, vout2);
|
||||
|
||||
vout0 = __lasx_xvfmadd_s(v20, vw20, vout0);
|
||||
vout1 = __lasx_xvfmadd_s(v21, vw21, vout1);
|
||||
vout2 = __lasx_xvfmadd_s(v22, vw22, vout2);
|
||||
|
||||
vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2);
|
||||
if (relu)
|
||||
{
|
||||
__m256i m = __lasx_xvfcmp_clt_s(z, vout0);
|
||||
vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m);
|
||||
}
|
||||
__lasx_xvst(vout0, outptr + out_j, 0);
|
||||
}
|
||||
}
|
||||
|
||||
for (; out_j < outW1; out_j++)
|
||||
{
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
|
||||
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
|
||||
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
|
||||
if (relu)
|
||||
out = out > 0.f ? out : out*relu_coeff;
|
||||
outptr[out_j] = out;
|
||||
}
|
||||
|
||||
for (; out_j < outW; out_j++ )
|
||||
{
|
||||
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
|
||||
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
|
||||
if (in_j0 >= width)
|
||||
{
|
||||
in_j0 = 0;
|
||||
s0 = 0.f;
|
||||
}
|
||||
if (in_j1 >= width)
|
||||
{
|
||||
in_j1 = 0;
|
||||
s1 = 0.f;
|
||||
}
|
||||
if (in_j2 >= width)
|
||||
{
|
||||
in_j2 = 0;
|
||||
s2 = 0.f;
|
||||
}
|
||||
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
|
||||
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
|
||||
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
|
||||
if (relu)
|
||||
out = out > 0.f ? out : out*relu_coeff;
|
||||
outptr[out_j] = out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dst = vec * weights^t + bias
|
||||
void fastGEMM1T( const float* vec, const float* weights,
|
||||
size_t wstep, const float* bias,
|
||||
float* dst, int nvecs, int vecsize )
|
||||
{
|
||||
int i = 0;
|
||||
__m256i v256_tmp;
|
||||
|
||||
for( ; i <= nvecs - 8; i += 8 )
|
||||
{
|
||||
const float* wptr = weights + i*wstep;
|
||||
__m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs1 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
|
||||
vs2 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs3 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
|
||||
vs4 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs5 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp),
|
||||
vs6 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), vs7 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
|
||||
|
||||
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
|
||||
{
|
||||
__m256 v = (__m256)__lasx_xvld(vec + k, 0);
|
||||
|
||||
vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0);
|
||||
vs1 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep, 0), v, vs1);
|
||||
vs2 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*2, 0), v, vs2);
|
||||
vs3 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*3, 0), v, vs3);
|
||||
vs4 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*4, 0), v, vs4);
|
||||
vs5 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*5, 0), v, vs5);
|
||||
vs6 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*6, 0), v, vs6);
|
||||
vs7 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr + wstep*7, 0), v, vs7);
|
||||
}
|
||||
|
||||
/*s0*/
|
||||
__m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs00_add_2w = __lasx_xvfadd_s(vs0, vs00_perm);
|
||||
__m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32);
|
||||
__m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl);
|
||||
|
||||
__m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs1, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs01_add_2w = __lasx_xvfadd_s(vs1, vs01_perm);
|
||||
__m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32);
|
||||
__m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl);
|
||||
|
||||
__m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs2, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs02_add_2w = __lasx_xvfadd_s(vs2, vs02_perm);
|
||||
__m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32);
|
||||
__m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl);
|
||||
|
||||
__m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs3, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs03_add_2w = __lasx_xvfadd_s(vs3, vs03_perm);
|
||||
__m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32);
|
||||
__m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl);
|
||||
|
||||
__m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w);
|
||||
__m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w);
|
||||
__m256 s0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00);
|
||||
|
||||
/*s1*/
|
||||
__m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs4, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs10_add_2w = __lasx_xvfadd_s(vs4, vs10_perm);
|
||||
__m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32);
|
||||
__m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl);
|
||||
|
||||
__m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs5, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs11_add_2w = __lasx_xvfadd_s(vs5, vs11_perm);
|
||||
__m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32);
|
||||
__m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl);
|
||||
|
||||
__m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs6, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs12_add_2w = __lasx_xvfadd_s(vs6, vs12_perm);
|
||||
__m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32);
|
||||
__m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl);
|
||||
|
||||
__m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs7, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs13_add_2w = __lasx_xvfadd_s(vs7, vs13_perm);
|
||||
__m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32);
|
||||
__m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl);
|
||||
|
||||
__m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w);
|
||||
__m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w);
|
||||
__m256 s1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10);
|
||||
|
||||
s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvpermi_q(s0, s0, 1));
|
||||
s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvpermi_q(s1, s1, 1));
|
||||
|
||||
s0 = __lasx_xvfadd_s(s0, (__m256)__lasx_xvld(bias + i, 0));
|
||||
s1 = __lasx_xvfadd_s(s1, (__m256)__lasx_xvld(bias + i, 4*4));
|
||||
|
||||
__lsx_vst(*(__m128*)&s0, dst + i, 0);
|
||||
__lsx_vst(*(__m128*)&s1, dst + i, 4*4);
|
||||
}
|
||||
|
||||
float temp = 0.f;
|
||||
for( ; i < nvecs; i++ )
|
||||
{
|
||||
const float* wptr = weights + i*wstep;
|
||||
__m256 vs0 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
|
||||
|
||||
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
|
||||
{
|
||||
__m256 v = (__m256)__lasx_xvld(vec + k, 0);
|
||||
vs0 = __lasx_xvfmadd_s((__m256)__lasx_xvld(wptr, 0), v, vs0);
|
||||
}
|
||||
|
||||
__m256i vs0_perm = __lasx_xvpermi_d(vs0, (2<<6) + (3<<4) + (0<<2) + 1);
|
||||
__m256 vs0_add_2w = __lasx_xvfadd_s(vs0, (__m256)vs0_perm);
|
||||
__m256i tmp_srl = __lasx_xvsrli_d(vs0_add_2w, 32);
|
||||
__m256 vs0_add_4w = __lasx_xvfadd_s(vs0_add_2w, (__m256)tmp_srl);
|
||||
temp = ((v8f32)vs0_add_4w)[0] + ((v8f32)vs0_add_4w)[4];
|
||||
dst[i] = temp + bias[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void fastGEMM( const float* aptr, size_t astep, const float* bptr,
|
||||
size_t bstep, float* cptr, size_t cstep,
|
||||
int ma, int na, int nb )
|
||||
{
|
||||
int n = 0;
|
||||
|
||||
for( ; n <= nb - 16; n += 16 )
|
||||
{
|
||||
for( int m = 0; m < ma; m += 4 )
|
||||
{
|
||||
const float* aptr0 = aptr + astep*m;
|
||||
const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
|
||||
const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
|
||||
const float* aptr3 = aptr + astep*std::min(m+3, ma-1);
|
||||
|
||||
float* cptr0 = cptr + cstep*m;
|
||||
float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
|
||||
float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
|
||||
float* cptr3 = cptr + cstep*std::min(m+3, ma-1);
|
||||
|
||||
__m256i v256_tmp;
|
||||
__m256 d00 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d01 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
|
||||
__m256 d10 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d11 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
|
||||
__m256 d20 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d21 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
|
||||
__m256 d30 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp), d31 = (__m256)__lasx_xvxor_v(v256_tmp, v256_tmp);
|
||||
|
||||
for( int k = 0; k < na; k++ )
|
||||
{
|
||||
__m256 a0 = _v256_setall_ps(aptr0[k]);
|
||||
__m256 a1 = _v256_setall_ps(aptr1[k]);
|
||||
__m256 a2 = _v256_setall_ps(aptr2[k]);
|
||||
__m256 a3 = _v256_setall_ps(aptr3[k]);
|
||||
|
||||
__m256 b0 = (__m256)__lasx_xvld(bptr + k*bstep + n, 0);
|
||||
__m256 b1 = (__m256)__lasx_xvld(bptr + k*bstep + n + 8, 0);
|
||||
d00 = __lasx_xvfmadd_s(a0, b0, d00);
|
||||
d01 = __lasx_xvfmadd_s(a0, b1, d01);
|
||||
d10 = __lasx_xvfmadd_s(a1, b0, d10);
|
||||
d11 = __lasx_xvfmadd_s(a1, b1, d11);
|
||||
d20 = __lasx_xvfmadd_s(a2, b0, d20);
|
||||
d21 = __lasx_xvfmadd_s(a2, b1, d21);
|
||||
d30 = __lasx_xvfmadd_s(a3, b0, d30);
|
||||
d31 = __lasx_xvfmadd_s(a3, b1, d31);
|
||||
}
|
||||
|
||||
__lasx_xvst(d00, cptr0 + n, 0);
|
||||
__lasx_xvst(d01, cptr0 + n, 8*4);
|
||||
__lasx_xvst(d10, cptr1 + n, 0);
|
||||
__lasx_xvst(d11, cptr1 + n, 8*4);
|
||||
__lasx_xvst(d20, cptr2 + n, 0);
|
||||
__lasx_xvst(d21, cptr2 + n, 8*4);
|
||||
__lasx_xvst(d30, cptr3 + n, 0);
|
||||
__lasx_xvst(d31, cptr3 + n, 8*4);
|
||||
}
|
||||
}
|
||||
|
||||
for( ; n < nb; n++ )
|
||||
{
|
||||
for( int m = 0; m < ma; m++ )
|
||||
{
|
||||
const float* aptr0 = aptr + astep*m;
|
||||
float* cptr0 = cptr + cstep*m;
|
||||
float d0 = 0.f;
|
||||
|
||||
for( int k = 0; k < na; k++ )
|
||||
d0 += aptr0[k]*bptr[k*bstep + n];
|
||||
|
||||
cptr0[n] = d0;
|
||||
}
|
||||
}
|
||||
}

#endif // CV_LASX

CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

@ -2178,6 +2178,9 @@ public:
#if CV_TRY_SSE4_1
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif
#if CV_TRY_LASX
bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
#endif

int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
@ -2241,6 +2244,10 @@ public:
if ( useAVX2 )
x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
#if CV_TRY_LASX
if ( useLASX )
x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
#if CV_SIMD128
{
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
@ -61,6 +61,13 @@ int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X
#endif
}

namespace opt_LASX
{
#if CV_TRY_LASX
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
#endif
}

namespace opt_SSE4_1
{
#if CV_TRY_SSE4_1

98
modules/imgproc/src/imgwarp.lasx.cpp
Normal file
@ -0,0 +1,98 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/

/* ////////////////////////////////////////////////////////////////////
//
// Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "imgwarp.hpp"
#include "opencv2/core/hal/intrin.hpp"

namespace cv
{
namespace opt_LASX
{

int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
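    // Vectorized counterpart of the scalar warpAffine block loop: for 16 output
    // pixels at a time it adds the precomputed adelta/bdelta terms to X0/Y0,
    // stores the integer map coordinates interleaved in xy[] and the packed
    // fractional parts in alpha[].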
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;
    __m256i fxy_mask = _v256_setall_w(INTER_TAB_SIZE - 1);
    __m256i XX = _v256_setall_w(X0), YY = _v256_setall_w(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m256i tx0, tx1, ty0, ty1;
        tx0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 0), XX);
        ty0 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 0), YY);
        tx1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(adelta + x1), 8*4), XX);
        ty1 = __lasx_xvadd_w(__lasx_xvld((const __m256i*)(bdelta + x1), 8*4), YY);

        tx0 = __lasx_xvsrai_w(tx0, AB_BITS - INTER_BITS);
        ty0 = __lasx_xvsrai_w(ty0, AB_BITS - INTER_BITS);
        tx1 = __lasx_xvsrai_w(tx1, AB_BITS - INTER_BITS);
        ty1 = __lasx_xvsrai_w(ty1, AB_BITS - INTER_BITS);
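
        // Split each coordinate into its integer part (>> INTER_BITS) and its
        // fractional table index (& (INTER_TAB_SIZE-1)), narrowing both to int16;
        // alpha receives fy*INTER_TAB_SIZE + fx, matching the scalar code path.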
        __m256i fx_ = _lasx_packs_w(__lasx_xvand_v(tx0, fxy_mask),
                                    __lasx_xvand_v(tx1, fxy_mask));
        __m256i fy_ = _lasx_packs_w(__lasx_xvand_v(ty0, fxy_mask),
                                    __lasx_xvand_v(ty1, fxy_mask));
        tx0 = _lasx_packs_w(__lasx_xvsrai_w(tx0, INTER_BITS),
                            __lasx_xvsrai_w(tx1, INTER_BITS));
        ty0 = _lasx_packs_w(__lasx_xvsrai_w(ty0, INTER_BITS),
                            __lasx_xvsrai_w(ty1, INTER_BITS));
        fx_ = __lasx_xvsadd_h(fx_, __lasx_xvslli_h(fy_, INTER_BITS));
        fx_ = __lasx_xvpermi_d(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);
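        // The 32->16-bit pack works per 128-bit half, so the doubleword permute
        // above (pattern 3,1,2,0) appears to restore the natural element order of
        // alpha before it is stored.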

        __lasx_xvst(__lasx_xvilvl_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 0);
        __lasx_xvst(__lasx_xvilvh_h(ty0, tx0), (__m256i*)(xy + x1 * 2), 16*2);
        __lasx_xvst(fx_, (__m256i*)(alpha + x1), 0);
    }
    return x1;
}

}
}
/* End of file. */

@ -1098,6 +1098,16 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, ify);
}
else
#endif
#if CV_TRY_LASX
if(CV_CPU_HAS_SUPPORT_LASX && ((pix_size == 2) || (pix_size == 4)))
{
if(pix_size == 2)
opt_LASX::resizeNN2_LASX(range, src, dst, x_ofs, ify);
else
opt_LASX::resizeNN4_LASX(range, src, dst, x_ofs, ify);
}
else
#endif
{
resizeNNInvoker invoker(src, dst, x_ofs, ify);
@ -70,6 +70,15 @@ void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, double);
int VResizeLanczos4Vec_32f16u_SSE41(const float** src, ushort* dst, const float* beta, int width);
#endif
}

namespace opt_LASX
{
#if CV_TRY_LASX
void resizeNN2_LASX(const Range&, const Mat&, Mat&, int*, double);
void resizeNN4_LASX(const Range&, const Mat&, Mat&, int*, double);
#endif
}

}
#endif
/* End of file. */

249
modules/imgproc/src/resize.lasx.cpp
Normal file
@ -0,0 +1,249 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
/* ////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Geometrical transforms on images and matrices: rotation, zoom etc.
|
||||
//
|
||||
// */
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "resize.hpp"
|
||||
#include "opencv2/core/hal/intrin.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace opt_LASX
|
||||
{
|
||||
|
||||
class resizeNNInvokerLASX4 CV_FINAL :
|
||||
public ParallelLoopBody
|
||||
{
|
||||
public:
|
||||
resizeNNInvokerLASX4(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
|
||||
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
|
||||
ify(_ify)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void operator() (const Range& range) const CV_OVERRIDE
|
||||
{
|
||||
Size ssize = src.size(), dsize = dst.size();
|
||||
int y, x;
|
||||
int width = dsize.width;
|
||||
int avxWidth = width - (width & 0x7);
|
||||
if(((int64)(dst.data + dst.step) & 0x1f) == 0)
|
||||
{
|
||||
for(y = range.start; y < range.end; y++)
|
||||
{
|
||||
uchar* D = dst.data + dst.step*y;
|
||||
uchar* Dstart = D;
|
||||
int sy = std::min(cvFloor(y*ify), ssize.height-1);
|
||||
const uchar* S = src.data + sy*src.step;
|
||||
#ifdef CV_ICC
|
||||
#pragma unroll(4)
|
||||
#endif
|
||||
for(x = 0; x < avxWidth; x += 8)
|
||||
{
|
||||
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
|
||||
__m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val;
|
||||
__lasx_xvst(pixels, (int*)D, 0);
|
||||
D += 32;
|
||||
}
|
||||
for(; x < width; x++)
|
||||
{
|
||||
*(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(y = range.start; y < range.end; y++)
|
||||
{
|
||||
uchar* D = dst.data + dst.step*y;
|
||||
uchar* Dstart = D;
|
||||
int sy = std::min(cvFloor(y*ify), ssize.height-1);
|
||||
const uchar* S = src.data + sy*src.step;
|
||||
#ifdef CV_ICC
|
||||
#pragma unroll(4)
|
||||
#endif
|
||||
for(x = 0; x < avxWidth; x += 8)
|
||||
{
|
||||
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
|
||||
__m256i CV_DECL_ALIGNED(64) pixels = v256_lut_quads((schar *)S, (int *)addr).val;
|
||||
__lasx_xvst(pixels, (int*)D, 0);
|
||||
D += 32;
|
||||
}
|
||||
for(; x < width; x++)
|
||||
{
|
||||
*(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const Mat& src;
|
||||
Mat& dst;
|
||||
int* x_ofs;
|
||||
double ify;
|
||||
|
||||
resizeNNInvokerLASX4(const resizeNNInvokerLASX4&);
|
||||
resizeNNInvokerLASX4& operator=(const resizeNNInvokerLASX4&);
|
||||
};
|
||||
|
||||
class resizeNNInvokerLASX2 CV_FINAL :
|
||||
public ParallelLoopBody
|
||||
{
|
||||
public:
|
||||
resizeNNInvokerLASX2(const Mat& _src, Mat &_dst, int *_x_ofs, double _ify) :
|
||||
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
|
||||
ify(_ify)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void operator() (const Range& range) const CV_OVERRIDE
|
||||
{
|
||||
Size ssize = src.size(), dsize = dst.size();
|
||||
int y, x;
|
||||
int width = dsize.width;
|
||||
int avxWidth = width - (width & 0xf);
|
||||
const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _v256_set_b(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
|
||||
15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
|
||||
const __m256i CV_DECL_ALIGNED(64) permute_mask = _v256_set_w(7, 5, 3, 1, 6, 4, 2, 0);
|
||||
if(((int64)(dst.data + dst.step) & 0x1f) == 0)
|
||||
{
|
||||
for(y = range.start; y < range.end; y++)
|
||||
{
|
||||
uchar* D = dst.data + dst.step*y;
|
||||
uchar* Dstart = D;
|
||||
int sy = std::min(cvFloor(y*ify), ssize.height-1);
|
||||
const uchar* S = src.data + sy*src.step;
|
||||
const uchar* S2 = S - 2;
|
||||
#ifdef CV_ICC
|
||||
#pragma unroll(4)
|
||||
#endif
|
||||
for(x = 0; x < avxWidth; x += 16)
|
||||
{
|
||||
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
|
||||
__m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val;
|
||||
|
||||
const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
|
||||
__m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val;
|
||||
|
||||
const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000);
|
||||
__m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask);
|
||||
|
||||
__m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask);
|
||||
__m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask);
|
||||
__lasx_xvst(ints_permuted, (int*)D, 0);
|
||||
D += 32;
|
||||
}
|
||||
for(; x < width; x++)
|
||||
{
|
||||
*(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(y = range.start; y < range.end; y++)
|
||||
{
|
||||
uchar* D = dst.data + dst.step*y;
|
||||
uchar* Dstart = D;
|
||||
int sy = std::min(cvFloor(y*ify), ssize.height-1);
|
||||
const uchar* S = src.data + sy*src.step;
|
||||
const uchar* S2 = S - 2;
|
||||
#ifdef CV_ICC
|
||||
#pragma unroll(4)
|
||||
#endif
|
||||
for(x = 0; x < avxWidth; x += 16)
|
||||
{
|
||||
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
|
||||
__m256i CV_DECL_ALIGNED(64) pixels1 = v256_lut_quads((schar *)S, (int *)addr).val;
|
||||
|
||||
const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
|
||||
__m256i CV_DECL_ALIGNED(64) pixels2 = v256_lut_quads((schar *)S2, (int *)addr2).val;
|
||||
|
||||
const __m256i h_mask = __lasx_xvreplgr2vr_w(0xFFFF0000);
|
||||
__m256i CV_DECL_ALIGNED(64) unpacked = __lasx_xvbitsel_v(pixels1, pixels2, h_mask);
|
||||
|
||||
__m256i CV_DECL_ALIGNED(64) bytes_shuffled = __lasx_xvshuf_b(unpacked, unpacked, shuffle_mask);
|
||||
__m256i CV_DECL_ALIGNED(64) ints_permuted = __lasx_xvperm_w(bytes_shuffled, permute_mask);
|
||||
__lasx_xvst(ints_permuted, (int*)D, 0);
|
||||
D += 32;
|
||||
}
|
||||
for(; x < width; x++)
|
||||
{
|
||||
*(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const Mat& src;
|
||||
Mat& dst;
|
||||
int* x_ofs;
|
||||
double ify;
|
||||
|
||||
resizeNNInvokerLASX2(const resizeNNInvokerLASX2&);
|
||||
resizeNNInvokerLASX2& operator=(const resizeNNInvokerLASX2&);
|
||||
};
|
||||
|
||||
void resizeNN2_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
|
||||
{
|
||||
resizeNNInvokerLASX2 invoker(src, dst, x_ofs, ify);
|
||||
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
|
||||
}
|
||||
|
||||
void resizeNN4_LASX(const Range& range, const Mat& src, Mat &dst, int *x_ofs, double ify)
|
||||
{
|
||||
resizeNNInvokerLASX4 invoker(src, dst, x_ofs, ify);
|
||||
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
/* End of file. */
|