mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 06:03:15 +08:00
Merge pull request #17858 from vpisarev:dnn_depthwise_conv
* added depth-wise convolution; gives ~20-30% performance improvement in MobileSSD networks * hopefully, eliminated compile warnings, errors, as well as failure in one test * * fixed a few typos * decreased buffer size in some cases * added more optimal im2row branch in the case of 1x1 convolutions * tuned fastConv to reduce the number of passes over arrays
This commit is contained in:
parent
f53f491cd2
commit
77b01deb80
@ -829,6 +829,7 @@ public:
|
||||
bool useAVX;
|
||||
bool useAVX2;
|
||||
bool useAVX512;
|
||||
int blk_size_cn;
|
||||
|
||||
ParallelConv()
|
||||
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
|
||||
@ -885,12 +886,17 @@ public:
|
||||
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
|
||||
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
|
||||
|
||||
int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
|
||||
|
||||
int kernel_d = !isConv2D? kernel_size[0] : 1;
|
||||
int kernel_h = kernel_size[kernel_size.size() - 2];
|
||||
int kernel_w = kernel_size.back();
|
||||
|
||||
int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h));
|
||||
int ncn = 16;
|
||||
while (ncn*2 < blk_size_cn0 && ncn < inpCn)
|
||||
ncn *= 2;
|
||||
ncn = std::min(ncn, inpCn);
|
||||
p.blk_size_cn = ncn;
|
||||
|
||||
int dil_d = !isConv2D? dilations[0] : 1;
|
||||
int dil_h = dilations[dilations.size() - 2];
|
||||
int dil_w = dilations.back();
|
||||
@ -958,18 +964,26 @@ public:
|
||||
int dilation_w = dilations.back();
|
||||
|
||||
int i, j, k, d;
|
||||
size_t inpPlaneSize = input_->total(2);
|
||||
size_t outPlaneSize = output_->total(2);
|
||||
int inpPlaneSize = (int)input_->total(2);
|
||||
int outPlaneSize = (int)output_->total(2);
|
||||
bool is1x1 = is1x1_;
|
||||
|
||||
int stripesPerSample;
|
||||
size_t stripeSize;
|
||||
int stripeSize;
|
||||
Range r = r0;
|
||||
bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 &&
|
||||
outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
|
||||
width >= 16 + dilation_w*(kernel_w - 1);
|
||||
// for now only 3x3 depth-wise convolutions are supported
|
||||
depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 &&
|
||||
// computing at most 1 pixel from each side can involve padding
|
||||
max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t &&
|
||||
pad_l <= 1 && pad_t <= 1;
|
||||
|
||||
if( nstripes >= batchSize*2 )
|
||||
if( !depthWiseConvolution && nstripes >= batchSize*2 )
|
||||
{
|
||||
stripesPerSample = nstripes/batchSize;
|
||||
stripeSize = alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
|
||||
stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
|
||||
stripeSize = std::min(stripeSize, outPlaneSize);
|
||||
}
|
||||
else
|
||||
@ -988,20 +1002,29 @@ public:
|
||||
const float* biasptr_ = &biasvec_->at(0);
|
||||
const float* reluptr_ = reluslope_->empty() ? 0 : &reluslope_->at(0);
|
||||
float* data_out0_ = output_->ptr<float>();
|
||||
size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
|
||||
AutoBuffer<float> rowbuf0_(rowbufsz + valign);
|
||||
float* rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
|
||||
AutoBuffer<float> rowbuf0_;
|
||||
float* rowbuf0 = 0;
|
||||
bool use_rowbuf = !depthWiseConvolution;
|
||||
int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize);
|
||||
|
||||
// we clear the buffer once; ultimately, it lets us to avoid
|
||||
// tail processing after running the unrolled/vectorized loop.
|
||||
// the main idea is to make sure that the tail (a.k.a. padding) of each row
|
||||
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
|
||||
// does not contain NaNs or Infs. Because the padding in the weights
|
||||
// matrix is explicitly initialized with 0's, we handle all other
|
||||
// cases nicely, i.e. we can skip expliciting re-initialization
|
||||
// of the padding - we just retain elements from the previous iteration
|
||||
// of the loop over channels (cn0).
|
||||
memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
|
||||
// im2row buffer is not used for depth-wise convolution
|
||||
if(use_rowbuf)
|
||||
{
|
||||
size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size);
|
||||
//printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize);
|
||||
rowbuf0_.allocate(rowbufsz + valign);
|
||||
rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
|
||||
// we clear the buffer once; ultimately, it lets us to avoid
|
||||
// tail processing after running the unrolled/vectorized loop.
|
||||
// the main idea is to make sure that the tail (a.k.a. padding) of each row
|
||||
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
|
||||
// does not contain NaNs or Infs. Because the padding in the weights
|
||||
// matrix is explicitly initialized with 0's, we handle all other
|
||||
// cases nicely, i.e. we can skip expliciting re-initialization
|
||||
// of the padding - we just retain elements from the previous iteration
|
||||
// of the loop over channels (cn0).
|
||||
memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
|
||||
}
|
||||
|
||||
for( int stripe = r.start; stripe < r.end; stripe++ )
|
||||
{
|
||||
@ -1016,28 +1039,213 @@ public:
|
||||
const float* wptr_orig = wptr_orig_ + wstep*startOutCn;
|
||||
const float* biasptr = biasptr_ + startOutCn;
|
||||
|
||||
for( int cn0 = 0; cn0 < inpCn; cn0 += BLK_SIZE_CN )
|
||||
for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn )
|
||||
{
|
||||
int cn1 = std::min(cn0 + BLK_SIZE_CN, inpCn);
|
||||
int cn1 = std::min(cn0 + blk_size_cn, inpCn);
|
||||
int ncn = cn1 - cn0, vsz = karea*ncn;
|
||||
int vsz_a = (int)alignSize(vsz, valign);
|
||||
const float* wptr = wptr_orig + cn0*karea;
|
||||
// we apply [Channels][P]ReLU (if any) during the final pass only.
|
||||
const float* relu = cn1 == inpCn && reluptr_ ? reluptr_ + startOutCn : 0;
|
||||
|
||||
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
|
||||
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size )
|
||||
{
|
||||
int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
|
||||
int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd);
|
||||
int bsz = ofs1 - ofs0;
|
||||
|
||||
int out_d = ofs0 / (outH * outW);
|
||||
int out_i = (ofs0 - out_d * outH * outW) / outW;
|
||||
int out_j = ofs0 % outW;
|
||||
|
||||
if (depthWiseConvolution)
|
||||
{
|
||||
CV_Assert(out_i == 0 && out_j == 0);
|
||||
int in_d = out_d * stride_d - pad_d;
|
||||
const float* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width;
|
||||
float* outptr_ = data_out0 + ofs0;
|
||||
|
||||
#if CV_TRY_AVX2
|
||||
if(useAVX2)
|
||||
opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w,
|
||||
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
|
||||
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_AVX
|
||||
if(useAVX)
|
||||
opt_AVX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
|
||||
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
|
||||
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
|
||||
else
|
||||
#endif
|
||||
{
|
||||
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
|
||||
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
|
||||
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
|
||||
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
|
||||
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
|
||||
|
||||
for (int out_i = 0; out_i < outH; out_i++)
|
||||
{
|
||||
int in_i = out_i * stride_h - pad_t, out_j = 0;
|
||||
const float* imgptr0 = inptr_ + in_i*width;
|
||||
const float* imgptr1 = imgptr0 + dilation_h*width;
|
||||
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
|
||||
float out, w00 = w00_, w01 = w01_, w02 = w02_;
|
||||
float w20 = w20_, w21 = w21_, w22 = w22_;
|
||||
if (in_i < 0)
|
||||
{
|
||||
w00 = w01 = w02 = 0.f;
|
||||
imgptr0 = imgptr1;
|
||||
}
|
||||
else if (in_i + dilation_h*(kernel_h-1) >= height)
|
||||
{
|
||||
w20 = w21 = w22 = 0.f;
|
||||
imgptr2 = imgptr1;
|
||||
}
|
||||
float* outptr = outptr_ + out_i*outW;
|
||||
if (pad_l > 0)
|
||||
{
|
||||
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
|
||||
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
|
||||
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
|
||||
if (relu)
|
||||
out = out > 0.f ? out : out*relu_coeff;
|
||||
outptr[0] = out;
|
||||
out_j = 1;
|
||||
}
|
||||
|
||||
#if CV_SIMD
|
||||
// maybe with AVX or AVX512 strided depthwise convolution
|
||||
// can be accelerated with vector code, but with 4xfloat vectors
|
||||
// it's hardly the case
|
||||
if( stride_w == 1 )
|
||||
{
|
||||
const int VECSZ = v_float32::nlanes;
|
||||
const int out_delta = VECSZ/stride_w;
|
||||
v_float32 vw00 = vx_setall_f32(w00), vw01 = vx_setall_f32(w01), vw02 = vx_setall_f32(w02),
|
||||
vw10 = vx_setall_f32(w10), vw11 = vx_setall_f32(w11), vw12 = vx_setall_f32(w12),
|
||||
vw20 = vx_setall_f32(w20), vw21 = vx_setall_f32(w21), vw22 = vx_setall_f32(w22);
|
||||
v_float32 z = vx_setzero_f32(), vbias = vx_setall_f32(bias), vrc = vx_setall_f32(relu_coeff);
|
||||
for( ; out_j < outW1; out_j += out_delta )
|
||||
{
|
||||
if (out_j + out_delta > outW1)
|
||||
{
|
||||
if (out_j <= pad_l)
|
||||
break;
|
||||
out_j = outW1 - out_delta;
|
||||
}
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
v_float32 v00 = vx_load(imgptr0 + in_j),
|
||||
v01 = vx_load(imgptr0 + in_j + dilation_w),
|
||||
v02 = vx_load(imgptr0 + in_j + dilation_w*2),
|
||||
v10 = vx_load(imgptr1 + in_j),
|
||||
v11 = vx_load(imgptr1 + in_j + dilation_w),
|
||||
v12 = vx_load(imgptr1 + in_j + dilation_w*2),
|
||||
v20 = vx_load(imgptr2 + in_j),
|
||||
v21 = vx_load(imgptr2 + in_j + dilation_w),
|
||||
v22 = vx_load(imgptr2 + in_j + dilation_w*2);
|
||||
|
||||
v_float32 vout = v00*vw00 + v01*vw01 + v02*vw02 +
|
||||
v10*vw10 + v11*vw11 + v12*vw12 +
|
||||
v20*vw20 + v21*vw21 + v22*vw22 + vbias;
|
||||
if (relu)
|
||||
vout = v_select(vout > z, vout, vout*vrc);
|
||||
vx_store(outptr + out_j, vout);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; out_j < outW1; out_j++)
|
||||
{
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
|
||||
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
|
||||
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
|
||||
if (relu)
|
||||
out = out > 0.f ? out : out*relu_coeff;
|
||||
outptr[out_j] = out;
|
||||
}
|
||||
|
||||
for (; out_j < outW; out_j++ )
|
||||
{
|
||||
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
|
||||
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
|
||||
if (in_j0 >= width)
|
||||
{
|
||||
in_j0 = 0;
|
||||
s0 = 0.f;
|
||||
}
|
||||
if (in_j1 >= width)
|
||||
{
|
||||
in_j1 = 0;
|
||||
s1 = 0.f;
|
||||
}
|
||||
if (in_j2 >= width)
|
||||
{
|
||||
in_j2 = 0;
|
||||
s2 = 0.f;
|
||||
}
|
||||
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
|
||||
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
|
||||
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
|
||||
if (relu)
|
||||
out = out > 0.f ? out : out*relu_coeff;
|
||||
outptr[out_j] = out;
|
||||
}
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// do im2row for a part of input tensor
|
||||
float* rowbuf = rowbuf0;
|
||||
|
||||
if (isConv2D)
|
||||
{
|
||||
if( is1x1 && stride_w == 1 && stride_h == 1 )
|
||||
{
|
||||
const float* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j;
|
||||
for( int j = 0; j < bsz; j++, rowbuf += vsz_a )
|
||||
{
|
||||
if( j + 4 <= bsz )
|
||||
{
|
||||
k = 0;
|
||||
#if CV_SIMD128
|
||||
for( ; k <= vsz - 4; k += 4 )
|
||||
{
|
||||
const float* inp = imgptr + j + k*inpPlaneSize;
|
||||
v_float32x4 p0 = v_load(inp), p1 = v_load(inp + inpPlaneSize);
|
||||
v_float32x4 p2 = v_load(inp + inpPlaneSize*2), p3 = v_load(inp + inpPlaneSize*3);
|
||||
v_float32x4 r0, r1, r2, r3;
|
||||
v_transpose4x4(p0, p1, p2, p3, r0, r1, r2, r3);
|
||||
v_store(rowbuf + k, r0);
|
||||
v_store(rowbuf + k + vsz_a, r1);
|
||||
v_store(rowbuf + k + vsz_a*2, r2);
|
||||
v_store(rowbuf + k + vsz_a*3, r3);
|
||||
}
|
||||
#endif
|
||||
for( ; k < vsz; k++ )
|
||||
{
|
||||
const float* inp = imgptr + j + k*inpPlaneSize;
|
||||
float v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3];
|
||||
rowbuf[k] = v0;
|
||||
rowbuf[k + vsz_a] = v1;
|
||||
rowbuf[k + vsz_a*2] = v2;
|
||||
rowbuf[k + vsz_a*3] = v3;
|
||||
}
|
||||
j += 3;
|
||||
rowbuf += vsz_a*3;
|
||||
}
|
||||
else
|
||||
{
|
||||
for( k = 0; k < vsz; k++ )
|
||||
{
|
||||
rowbuf[k] = imgptr[j + k*inpPlaneSize];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
|
||||
{
|
||||
int delta = std::min(ofs1 - ofs, outW - out_j);
|
||||
@ -1157,7 +1365,6 @@ public:
|
||||
|
||||
// now compute dot product of the weights
|
||||
// and im2row-transformed part of the tensor
|
||||
int bsz = ofs1 - ofs0;
|
||||
#if CV_TRY_AVX512_SKX
|
||||
/* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */
|
||||
if(useAVX512)
|
||||
|
@ -50,6 +50,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
|
||||
const float* rowbuf, float* output, const int* outShape,
|
||||
int blockSize, int vecsize, int vecsize_aligned,
|
||||
const float* relu, bool initOutput );
|
||||
void fastDepthwiseConv( const float* weights,
|
||||
int kernel_h, int kernel_w,
|
||||
int stride_h, int stride_w,
|
||||
int dilation_h, int dilation_w,
|
||||
int pad_t, int pad_l,
|
||||
const float* bias, const float* relu,
|
||||
const float* inptr,
|
||||
int height, int width,
|
||||
float* outptr,
|
||||
int out_d, int outH, int outW );
|
||||
void fastGEMM1T( const float* vec, const float* weights,
|
||||
size_t wstep, const float* bias,
|
||||
float* dst, int nvecs, int vecsize );
|
||||
@ -64,6 +74,8 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
|
||||
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
|
||||
#endif
|
||||
|
||||
enum { FASCONV_BASE_VECSZ = 4 };
|
||||
|
||||
void fastConv( const float* weights, size_t wstep, const float* bias,
|
||||
const float* rowbuf, float* output, const int* outShape,
|
||||
int blockSize, int vecsize, int vecsize_aligned,
|
||||
@ -73,6 +85,11 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
|
||||
size_t outPlaneSize = outShape[2]*outShape[3];
|
||||
float r0 = 1.f, r1 = 1.f, r2 = 1.f;
|
||||
__m128 vr0 = _mm_set1_ps(1.f), vr1 = vr0, vr2 = vr0, z = _mm_setzero_ps();
|
||||
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
|
||||
int rsz = blockSize % FASCONV_BASE_VECSZ;
|
||||
for( int i = 0; i < rsz; i++ )
|
||||
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
|
||||
__m128 mask = _mm_loadu_ps((const float*)maskbuf);
|
||||
|
||||
// now compute dot product of the weights
|
||||
// and im2row-transformed part of the tensor
|
||||
@ -114,8 +131,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
|
||||
}
|
||||
|
||||
int j = 0;
|
||||
for( ; j <= blockSize - 4; j += 4 )
|
||||
for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
|
||||
{
|
||||
bool tail = false;
|
||||
if (j + FASCONV_BASE_VECSZ > blockSize)
|
||||
{
|
||||
if (j == 0)
|
||||
break;
|
||||
j = blockSize - FASCONV_BASE_VECSZ;
|
||||
tail = true;
|
||||
}
|
||||
int k = 0;
|
||||
const float* rptr = rowbuf + j*vecsize_aligned;
|
||||
|
||||
@ -243,9 +268,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
|
||||
__m128 m0 = _mm_cmp_ps(s0, z, _CMP_GT_OS);
|
||||
__m128 m1 = _mm_cmp_ps(s1, z, _CMP_GT_OS);
|
||||
__m128 m2 = _mm_cmp_ps(s2, z, _CMP_GT_OS);
|
||||
s0 = _mm_xor_ps(s0, _mm_andnot_ps(m0, _mm_xor_ps(_mm_mul_ps(s0, vr0), s0)));
|
||||
s1 = _mm_xor_ps(s1, _mm_andnot_ps(m1, _mm_xor_ps(_mm_mul_ps(s1, vr1), s1)));
|
||||
s2 = _mm_xor_ps(s2, _mm_andnot_ps(m2, _mm_xor_ps(_mm_mul_ps(s2, vr2), s2)));
|
||||
s0 = _mm_blendv_ps(_mm_mul_ps(s0, vr0), s0, m0);
|
||||
s1 = _mm_blendv_ps(_mm_mul_ps(s1, vr1), s1, m1);
|
||||
s2 = _mm_blendv_ps(_mm_mul_ps(s2, vr2), s2, m2);
|
||||
}
|
||||
|
||||
if( tail )
|
||||
{
|
||||
s0 = _mm_blendv_ps(_mm_loadu_ps(outptr0 + j), s0, mask);
|
||||
s1 = _mm_blendv_ps(_mm_loadu_ps(outptr1 + j), s1, mask);
|
||||
s2 = _mm_blendv_ps(_mm_loadu_ps(outptr2 + j), s2, mask);
|
||||
}
|
||||
|
||||
_mm_storeu_ps(outptr0 + j, s0);
|
||||
@ -253,9 +285,55 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
|
||||
_mm_storeu_ps(outptr2 + j, s2);
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 2; j += 2 )
|
||||
{
|
||||
const float* rptr0 = rowbuf + j*vecsize_aligned;
|
||||
const float* rptr1 = rowbuf + (j+1)*vecsize_aligned;
|
||||
float s00, s01, s10, s11, s20, s21;
|
||||
|
||||
if( initOutput )
|
||||
{
|
||||
s00 = s01 = bias0;
|
||||
s10 = s11 = bias1;
|
||||
s20 = s21 = bias2;
|
||||
}
|
||||
else
|
||||
{
|
||||
s00 = outptr0[j]; s01 = outptr0[j+1];
|
||||
s10 = outptr1[j]; s11 = outptr1[j+1];
|
||||
s20 = outptr2[j]; s21 = outptr2[j+1];
|
||||
}
|
||||
|
||||
for( int k = 0; k < vecsize; k++ )
|
||||
{
|
||||
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
|
||||
float r = rptr0[k];
|
||||
s00 += w0*r; s10 += w1*r; s20 += w2*r;
|
||||
r = rptr1[k];
|
||||
s01 += w0*r; s11 += w1*r; s21 += w2*r;
|
||||
}
|
||||
|
||||
if( relu )
|
||||
{
|
||||
s00 = s00 > 0.f ? s00 : s00*r0;
|
||||
s01 = s01 > 0.f ? s01 : s01*r0;
|
||||
s10 = s10 > 0.f ? s10 : s10*r1;
|
||||
s11 = s11 > 0.f ? s11 : s11*r1;
|
||||
s20 = s20 > 0.f ? s20 : s20*r2;
|
||||
s21 = s21 > 0.f ? s21 : s21*r2;
|
||||
}
|
||||
|
||||
outptr0[j] = s00;
|
||||
outptr0[j+1] = s01;
|
||||
outptr1[j] = s10;
|
||||
outptr1[j+1] = s11;
|
||||
outptr2[j] = s20;
|
||||
outptr2[j+1] = s21;
|
||||
}
|
||||
|
||||
for( ; j < blockSize; j++ )
|
||||
{
|
||||
const float* rptr = rowbuf + j*vecsize_aligned;
|
||||
const float* rptr0 = rowbuf + j*vecsize_aligned;
|
||||
float s00, s10, s20;
|
||||
|
||||
if( initOutput )
|
||||
@ -273,10 +351,9 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
|
||||
|
||||
for( int k = 0; k < vecsize; k++ )
|
||||
{
|
||||
float r0 = rptr[k];
|
||||
s00 += wptr0[k]*r0;
|
||||
s10 += wptr1[k]*r0;
|
||||
s20 += wptr2[k]*r0;
|
||||
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
|
||||
float r = rptr0[k];
|
||||
s00 += w0*r; s10 += w1*r; s20 += w2*r;
|
||||
}
|
||||
|
||||
if( relu )
|
||||
@ -294,6 +371,185 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
|
||||
{
|
||||
__m256 t0 = _mm256_loadu_ps(ptr);
|
||||
__m256 t1 = _mm256_loadu_ps(ptr + 8);
|
||||
|
||||
__m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16);
|
||||
__m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16);
|
||||
a = _mm256_shuffle_ps(lo, hi, 0x88);
|
||||
b = _mm256_shuffle_ps(lo, hi, 0xdd);
|
||||
}
|
||||
|
||||
void fastDepthwiseConv( const float* wptr,
|
||||
int kernel_h, int kernel_w,
|
||||
int stride_h, int stride_w,
|
||||
int dilation_h, int dilation_w,
|
||||
int pad_t, int pad_l,
|
||||
const float* biasptr, const float* relu,
|
||||
const float* inptr_,
|
||||
int height, int width,
|
||||
float* outptr_,
|
||||
int out_d, int outH, int outW )
|
||||
{
|
||||
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
|
||||
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
|
||||
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
|
||||
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
|
||||
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
|
||||
|
||||
for (int out_i = 0; out_i < outH; out_i++)
|
||||
{
|
||||
int in_i = out_i * stride_h - pad_t, out_j = 0;
|
||||
const float* imgptr0 = inptr_ + in_i*width;
|
||||
const float* imgptr1 = imgptr0 + dilation_h*width;
|
||||
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
|
||||
float out, w00 = w00_, w01 = w01_, w02 = w02_;
|
||||
float w20 = w20_, w21 = w21_, w22 = w22_;
|
||||
if (in_i < 0)
|
||||
{
|
||||
w00 = w01 = w02 = 0.f;
|
||||
imgptr0 = imgptr1;
|
||||
}
|
||||
else if (in_i + dilation_h*(kernel_h-1) >= height)
|
||||
{
|
||||
w20 = w21 = w22 = 0.f;
|
||||
imgptr2 = imgptr1;
|
||||
}
|
||||
float* outptr = outptr_ + out_i*outW;
|
||||
if (pad_l > 0)
|
||||
{
|
||||
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
|
||||
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
|
||||
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
|
||||
if (relu)
|
||||
out = out > 0.f ? out : out*relu_coeff;
|
||||
outptr[0] = out;
|
||||
out_j = 1;
|
||||
}
|
||||
|
||||
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
|
||||
{
|
||||
const int VECSZ = 8;
|
||||
__m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02),
|
||||
vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12),
|
||||
vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22);
|
||||
__m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff);
|
||||
|
||||
if( stride_w == 1 )
|
||||
for( ; out_j < outW1; out_j += VECSZ )
|
||||
{
|
||||
if (out_j + VECSZ > outW1 && out_j > pad_l)
|
||||
out_j = outW1 - VECSZ;
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
__m256 v00 = _mm256_loadu_ps(imgptr0 + in_j),
|
||||
v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w),
|
||||
v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2),
|
||||
v10 = _mm256_loadu_ps(imgptr1 + in_j),
|
||||
v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w),
|
||||
v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2),
|
||||
v20 = _mm256_loadu_ps(imgptr2 + in_j),
|
||||
v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w),
|
||||
v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2);
|
||||
|
||||
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
|
||||
__m256 vout1 = _mm256_mul_ps(v01, vw01);
|
||||
__m256 vout2 = _mm256_mul_ps(v02, vw02);
|
||||
|
||||
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
|
||||
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
|
||||
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
|
||||
|
||||
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
|
||||
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
|
||||
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
|
||||
|
||||
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
|
||||
if (relu)
|
||||
{
|
||||
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
|
||||
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
|
||||
}
|
||||
_mm256_storeu_ps(outptr + out_j, vout0);
|
||||
}
|
||||
else
|
||||
for( ; out_j < outW1; out_j += VECSZ )
|
||||
{
|
||||
if (out_j + VECSZ > outW1 && out_j > pad_l)
|
||||
out_j = outW1 - VECSZ;
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
|
||||
_mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
|
||||
_mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
|
||||
_mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
|
||||
_mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
|
||||
_mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
|
||||
_mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
|
||||
|
||||
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
|
||||
__m256 vout1 = _mm256_mul_ps(v01, vw01);
|
||||
__m256 vout2 = _mm256_mul_ps(v02, vw02);
|
||||
|
||||
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
|
||||
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
|
||||
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
|
||||
|
||||
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
|
||||
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
|
||||
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
|
||||
|
||||
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
|
||||
if (relu)
|
||||
{
|
||||
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
|
||||
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
|
||||
}
|
||||
_mm256_storeu_ps(outptr + out_j, vout0);
|
||||
}
|
||||
}
|
||||
|
||||
for (; out_j < outW1; out_j++)
|
||||
{
|
||||
int in_j = out_j * stride_w - pad_l;
|
||||
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
|
||||
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
|
||||
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
|
||||
if (relu)
|
||||
out = out > 0.f ? out : out*relu_coeff;
|
||||
outptr[out_j] = out;
|
||||
}
|
||||
|
||||
for (; out_j < outW; out_j++ )
|
||||
{
|
||||
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
|
||||
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
|
||||
if (in_j0 >= width)
|
||||
{
|
||||
in_j0 = 0;
|
||||
s0 = 0.f;
|
||||
}
|
||||
if (in_j1 >= width)
|
||||
{
|
||||
in_j1 = 0;
|
||||
s1 = 0.f;
|
||||
}
|
||||
if (in_j2 >= width)
|
||||
{
|
||||
in_j2 = 0;
|
||||
s2 = 0.f;
|
||||
}
|
||||
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
|
||||
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
|
||||
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
|
||||
if (relu)
|
||||
out = out > 0.f ? out : out*relu_coeff;
|
||||
outptr[out_j] = out;
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
// dst = vec * weights^t + bias
|
||||
void fastGEMM1T( const float* vec, const float* weights,
|
||||
size_t wstep, const float* bias,
|
||||
|
Loading…
Reference in New Issue
Block a user