Merge pull request #17858 from vpisarev:dnn_depthwise_conv

* added depth-wise convolution; gives ~20-30% performance improvement in MobileSSD networks

* hopefully eliminated compile warnings and errors, as well as a failure in one test

* fixed a few typos
* decreased buffer size in some cases
* added a more optimal im2row branch for the case of 1x1 convolutions
* tuned fastConv to reduce the number of passes over arrays
Vadim Pisarevsky 2020-08-01 15:05:05 +03:00 committed by GitHub
parent f53f491cd2
commit 77b01deb80
2 changed files with 497 additions and 34 deletions
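
At its core, the new fast path computes a per-channel 3x3 convolution with zero padding, optional bias and channel-wise PReLU. As a minimal scalar sketch of that computation (a hypothetical reference helper written for illustration, not code from this patch; the real implementation below is blocked, vectorized and border-aware):

// Reference per-channel 3x3 depth-wise convolution (hypothetical helper, for illustration).
// img: one input plane (height x width); w: the 9 kernel taps; relu_slope == 1 means "no PReLU".
static void depthwiseConv3x3Ref(const float* img, int height, int width,
                                const float w[9], float bias, float relu_slope,
                                int stride, int pad, int dilation,
                                float* out, int outH, int outW)
{
    for (int oy = 0; oy < outH; oy++)
        for (int ox = 0; ox < outW; ox++)
        {
            float s = bias;
            for (int ky = 0; ky < 3; ky++)
                for (int kx = 0; kx < 3; kx++)
                {
                    int iy = oy*stride - pad + ky*dilation;
                    int ix = ox*stride - pad + kx*dilation;
                    if (0 <= iy && iy < height && 0 <= ix && ix < width)
                        s += img[iy*width + ix]*w[ky*3 + kx];
                }
            out[oy*outW + ox] = s > 0.f ? s : s*relu_slope;  // PReLU with a per-channel slope
        }
}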


@@ -829,6 +829,7 @@ public:
bool useAVX;
bool useAVX2;
bool useAVX512;
int blk_size_cn;
ParallelConv()
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
@@ -885,12 +886,17 @@ public:
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
int kernel_d = !isConv2D? kernel_size[0] : 1;
int kernel_h = kernel_size[kernel_size.size() - 2];
int kernel_w = kernel_size.back();
int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h));
int ncn = 16;
while (ncn*2 < blk_size_cn0 && ncn < inpCn)
ncn *= 2;
ncn = std::min(ncn, inpCn);
p.blk_size_cn = ncn;
int dil_d = !isConv2D? dilations[0] : 1;
int dil_h = dilations[dilations.size() - 2];
int dil_w = dilations.back();
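
The channel block size is now derived from the kernel area instead of the fixed BLK_SIZE_CN: ncn grows in powers of two from 16 while twice its value still fits the ~800-float budget per im2row row, then it is clipped to the number of input channels. A small sketch of the arithmetic under assumed sizes (3x3 kernel, 256 input channels; std::ceil stands in for cvCeil):

#include <algorithm>
#include <cmath>

// Hypothetical standalone version of the block-size selection above.
// For kernel_w = kernel_h = 3, inpCn = 256: blk_size_cn0 = ceil(800/9) = 89,
// ncn: 16 -> 32 -> 64 (stops since 64*2 = 128 >= 89), result min(64, 256) = 64,
// i.e. each im2row row holds karea*ncn = 9*64 = 576 floats.
static int chooseBlkSizeCn(int kernel_w, int kernel_h, int inpCn)
{
    int blk_size_cn0 = (int)std::ceil(800./(kernel_w*kernel_h));
    int ncn = 16;
    while (ncn*2 < blk_size_cn0 && ncn < inpCn)
        ncn *= 2;
    return std::min(ncn, inpCn);
}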
@@ -958,18 +964,26 @@ public:
int dilation_w = dilations.back();
int i, j, k, d;
size_t inpPlaneSize = input_->total(2);
size_t outPlaneSize = output_->total(2);
int inpPlaneSize = (int)input_->total(2);
int outPlaneSize = (int)output_->total(2);
bool is1x1 = is1x1_;
int stripesPerSample;
size_t stripeSize;
int stripeSize;
Range r = r0;
bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 &&
outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
width >= 16 + dilation_w*(kernel_w - 1);
// for now only 3x3 depth-wise convolutions are supported
depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 &&
// computing at most 1 pixel from each side can involve padding
max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t &&
pad_l <= 1 && pad_t <= 1;
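
Restated as a standalone predicate (a sketch mirroring the condition above, not code from the patch; the 3D terms kernel_d/dilation_d/stride_d/pad_d, trivially satisfied for 2D convolutions, are omitted), a layer takes the new path when it is a 2D grouped convolution with one input and one output channel per group, a 3x3 kernel, at most one padded pixel on the left/top, and rows wide enough for the vector loop:

#include <algorithm>

// Hypothetical restatement of the depthWiseConvolution check above.
static bool takesDepthwisePath(bool is1x1, bool isConv2D, int ngroups, int inpCn, int outCn,
                               int kernel_w, int kernel_h, int dilation_w, int dilation_h,
                               int stride_w, int stride_h, int pad_l, int pad_t, int width)
{
    bool dw = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 && outCn == 1 &&
              width >= 16 + dilation_w*(kernel_w - 1);
    // for now only 3x3 kernels, with at most one padded pixel on the left/top
    return dw && kernel_w == 3 && kernel_h == 3 &&
           std::max(stride_w, dilation_w) >= pad_l &&
           std::max(stride_h, dilation_h) >= pad_t &&
           pad_l <= 1 && pad_t <= 1;
}
// e.g. a typical MobileNet depth-wise layer (groups == channels, 3x3 kernel, stride 1 or 2,
// pad 1, dilation 1) qualifies whenever the input plane is at least 18 pixels wide.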
if( nstripes >= batchSize*2 )
if( !depthWiseConvolution && nstripes >= batchSize*2 )
{
stripesPerSample = nstripes/batchSize;
stripeSize = alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
stripeSize = std::min(stripeSize, outPlaneSize);
}
else
@@ -988,20 +1002,29 @@ public:
const float* biasptr_ = &biasvec_->at(0);
const float* reluptr_ = reluslope_->empty() ? 0 : &reluslope_->at(0);
float* data_out0_ = output_->ptr<float>();
size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
AutoBuffer<float> rowbuf0_(rowbufsz + valign);
float* rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
AutoBuffer<float> rowbuf0_;
float* rowbuf0 = 0;
bool use_rowbuf = !depthWiseConvolution;
int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize);
// we clear the buffer once; ultimately, it lets us avoid
// tail processing after running the unrolled/vectorized loop.
// the main idea is to make sure that the tail (a.k.a. padding) of each row
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
// does not contain NaNs or Infs. Because the padding in the weights
// matrix is explicitly initialized with 0's, we handle all other
// cases nicely, i.e. we can skip explicit re-initialization
// of the padding - we just retain elements from the previous iteration
// of the loop over channels (cn0).
memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
// im2row buffer is not used for depth-wise convolution
if(use_rowbuf)
{
size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size);
//printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize);
rowbuf0_.allocate(rowbufsz + valign);
rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
// we clear the buffer once; ultimately, it lets us avoid
// tail processing after running the unrolled/vectorized loop.
// the main idea is to make sure that the tail (a.k.a. padding) of each row
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
// does not contain NaNs or Infs. Because the padding in the weights
// matrix is explicitly initialized with 0's, we handle all other
// cases nicely, i.e. we can skip explicit re-initialization
// of the padding - we just retain elements from the previous iteration
// of the loop over channels (cn0).
memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
}
for( int stripe = r.start; stripe < r.end; stripe++ )
{
@@ -1016,28 +1039,213 @@ public:
const float* wptr_orig = wptr_orig_ + wstep*startOutCn;
const float* biasptr = biasptr_ + startOutCn;
for( int cn0 = 0; cn0 < inpCn; cn0 += BLK_SIZE_CN )
for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn )
{
int cn1 = std::min(cn0 + BLK_SIZE_CN, inpCn);
int cn1 = std::min(cn0 + blk_size_cn, inpCn);
int ncn = cn1 - cn0, vsz = karea*ncn;
int vsz_a = (int)alignSize(vsz, valign);
const float* wptr = wptr_orig + cn0*karea;
// we apply [Channels][P]ReLU (if any) during the final pass only.
const float* relu = cn1 == inpCn && reluptr_ ? reluptr_ + startOutCn : 0;
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size )
{
int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd);
int bsz = ofs1 - ofs0;
int out_d = ofs0 / (outH * outW);
int out_i = (ofs0 - out_d * outH * outW) / outW;
int out_j = ofs0 % outW;
if (depthWiseConvolution)
{
CV_Assert(out_i == 0 && out_j == 0);
int in_d = out_d * stride_d - pad_d;
const float* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width;
float* outptr_ = data_out0 + ofs0;
#if CV_TRY_AVX2
if(useAVX2)
opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
else
#endif
#if CV_TRY_AVX
if(useAVX)
opt_AVX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
else
#endif
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
#if CV_SIMD
// strided depth-wise convolution could perhaps be accelerated with AVX or
// AVX512 vector code, but with 4-float vectors it is hardly worth it
if( stride_w == 1 )
{
const int VECSZ = v_float32::nlanes;
const int out_delta = VECSZ/stride_w;
v_float32 vw00 = vx_setall_f32(w00), vw01 = vx_setall_f32(w01), vw02 = vx_setall_f32(w02),
vw10 = vx_setall_f32(w10), vw11 = vx_setall_f32(w11), vw12 = vx_setall_f32(w12),
vw20 = vx_setall_f32(w20), vw21 = vx_setall_f32(w21), vw22 = vx_setall_f32(w22);
v_float32 z = vx_setzero_f32(), vbias = vx_setall_f32(bias), vrc = vx_setall_f32(relu_coeff);
for( ; out_j < outW1; out_j += out_delta )
{
if (out_j + out_delta > outW1)
{
if (out_j <= pad_l)
break;
out_j = outW1 - out_delta;
}
int in_j = out_j * stride_w - pad_l;
v_float32 v00 = vx_load(imgptr0 + in_j),
v01 = vx_load(imgptr0 + in_j + dilation_w),
v02 = vx_load(imgptr0 + in_j + dilation_w*2),
v10 = vx_load(imgptr1 + in_j),
v11 = vx_load(imgptr1 + in_j + dilation_w),
v12 = vx_load(imgptr1 + in_j + dilation_w*2),
v20 = vx_load(imgptr2 + in_j),
v21 = vx_load(imgptr2 + in_j + dilation_w),
v22 = vx_load(imgptr2 + in_j + dilation_w*2);
v_float32 vout = v00*vw00 + v01*vw01 + v02*vw02 +
v10*vw10 + v11*vw11 + v12*vw12 +
v20*vw20 + v21*vw21 + v22*vw22 + vbias;
if (relu)
vout = v_select(vout > z, vout, vout*vrc);
vx_store(outptr + out_j, vout);
}
}
#endif
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
continue;
}
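
The outW1 bound used in both the vector and the scalar loops above marks the last output column whose whole 3x3 window stays inside the row; everything to the right of it is finished by the final loop that zeroes the out-of-range taps through s0/s1/s2. A worked example with assumed sizes (not taken from the patch):

// width = 20, outW = 20, kernel_w = 3, dilation_w = 1, stride_w = 1, pad_l = 1:
// outW1 = min(20, (20 - 1*(3-1) + 1)/1) = 19, so columns 0..18 go through the
// main/vector loops and only column 19 needs the border handling.
int outW1_demo = std::min(20, (20 - 1*(3 - 1) + 1)/1);   // == 19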
// do im2row for a part of input tensor
float* rowbuf = rowbuf0;
if (isConv2D)
{
if( is1x1 && stride_w == 1 && stride_h == 1 )
{
const float* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j;
for( int j = 0; j < bsz; j++, rowbuf += vsz_a )
{
if( j + 4 <= bsz )
{
k = 0;
#if CV_SIMD128
for( ; k <= vsz - 4; k += 4 )
{
const float* inp = imgptr + j + k*inpPlaneSize;
v_float32x4 p0 = v_load(inp), p1 = v_load(inp + inpPlaneSize);
v_float32x4 p2 = v_load(inp + inpPlaneSize*2), p3 = v_load(inp + inpPlaneSize*3);
v_float32x4 r0, r1, r2, r3;
v_transpose4x4(p0, p1, p2, p3, r0, r1, r2, r3);
v_store(rowbuf + k, r0);
v_store(rowbuf + k + vsz_a, r1);
v_store(rowbuf + k + vsz_a*2, r2);
v_store(rowbuf + k + vsz_a*3, r3);
}
#endif
for( ; k < vsz; k++ )
{
const float* inp = imgptr + j + k*inpPlaneSize;
float v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3];
rowbuf[k] = v0;
rowbuf[k + vsz_a] = v1;
rowbuf[k + vsz_a*2] = v2;
rowbuf[k + vsz_a*3] = v3;
}
j += 3;
rowbuf += vsz_a*3;
}
else
{
for( k = 0; k < vsz; k++ )
{
rowbuf[k] = imgptr[j + k*inpPlaneSize];
}
}
}
}
else
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
{
int delta = std::min(ofs1 - ofs, outW - out_j);
@@ -1157,7 +1365,6 @@ public:
// now compute dot product of the weights
// and im2row-transformed part of the tensor
int bsz = ofs1 - ofs0;
#if CV_TRY_AVX512_SKX
/* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */
if(useAVX512)


@@ -50,6 +50,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
const float* relu, bool initOutput );
void fastDepthwiseConv( const float* weights,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* bias, const float* relu,
const float* inptr,
int height, int width,
float* outptr,
int out_d, int outH, int outW );
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
@@ -64,6 +74,8 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
enum { FASCONV_BASE_VECSZ = 4 };
void fastConv( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
@@ -73,6 +85,11 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
size_t outPlaneSize = outShape[2]*outShape[3];
float r0 = 1.f, r1 = 1.f, r2 = 1.f;
__m128 vr0 = _mm_set1_ps(1.f), vr1 = vr0, vr2 = vr0, z = _mm_setzero_ps();
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
int rsz = blockSize % FASCONV_BASE_VECSZ;
for( int i = 0; i < rsz; i++ )
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
__m128 mask = _mm_loadu_ps((const float*)maskbuf);
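
The 4-wide loop now covers its tail with this blend mask: the last blockSize % FASCONV_BASE_VECSZ entries of maskbuf are set to -1, and on the final, overlapping iteration only those lanes take the freshly computed values (the 2-wide and scalar loops further down remain for blocks shorter than FASCONV_BASE_VECSZ). A worked example with an assumed blockSize of 10:

// Hypothetical illustration of the tail mask for blockSize = 10, FASCONV_BASE_VECSZ = 4:
// rsz = 10 % 4 = 2, so maskbuf = {0, 0, -1, -1}. The j-loop runs j = 0, 4; at j = 8
// the block would overrun (8 + 4 > 10), so j rewinds to 10 - 4 = 6 and tail = true.
// _mm_blendv_ps then keeps the previously stored lanes 6..7 (mask sign bit clear)
// and takes the newly computed lanes 8..9 (sign bit set), so the overlapping lanes
// are not accumulated twice.
int maskbuf_demo[4] = {0};
for (int i = 0; i < 10 % 4; i++)
    maskbuf_demo[4 - i - 1] = -1;    // -> {0, 0, -1, -1}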
// now compute dot product of the weights
// and im2row-transformed part of the tensor
@@ -114,8 +131,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
}
int j = 0;
for( ; j <= blockSize - 4; j += 4 )
for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
{
bool tail = false;
if (j + FASCONV_BASE_VECSZ > blockSize)
{
if (j == 0)
break;
j = blockSize - FASCONV_BASE_VECSZ;
tail = true;
}
int k = 0;
const float* rptr = rowbuf + j*vecsize_aligned;
@@ -243,9 +268,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
__m128 m0 = _mm_cmp_ps(s0, z, _CMP_GT_OS);
__m128 m1 = _mm_cmp_ps(s1, z, _CMP_GT_OS);
__m128 m2 = _mm_cmp_ps(s2, z, _CMP_GT_OS);
s0 = _mm_xor_ps(s0, _mm_andnot_ps(m0, _mm_xor_ps(_mm_mul_ps(s0, vr0), s0)));
s1 = _mm_xor_ps(s1, _mm_andnot_ps(m1, _mm_xor_ps(_mm_mul_ps(s1, vr1), s1)));
s2 = _mm_xor_ps(s2, _mm_andnot_ps(m2, _mm_xor_ps(_mm_mul_ps(s2, vr2), s2)));
s0 = _mm_blendv_ps(_mm_mul_ps(s0, vr0), s0, m0);
s1 = _mm_blendv_ps(_mm_mul_ps(s1, vr1), s1, m1);
s2 = _mm_blendv_ps(_mm_mul_ps(s2, vr2), s2, m2);
}
if( tail )
{
s0 = _mm_blendv_ps(_mm_loadu_ps(outptr0 + j), s0, mask);
s1 = _mm_blendv_ps(_mm_loadu_ps(outptr1 + j), s1, mask);
s2 = _mm_blendv_ps(_mm_loadu_ps(outptr2 + j), s2, mask);
}
_mm_storeu_ps(outptr0 + j, s0);
@@ -253,9 +285,55 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
_mm_storeu_ps(outptr2 + j, s2);
}
for( ; j <= blockSize - 2; j += 2 )
{
const float* rptr0 = rowbuf + j*vecsize_aligned;
const float* rptr1 = rowbuf + (j+1)*vecsize_aligned;
float s00, s01, s10, s11, s20, s21;
if( initOutput )
{
s00 = s01 = bias0;
s10 = s11 = bias1;
s20 = s21 = bias2;
}
else
{
s00 = outptr0[j]; s01 = outptr0[j+1];
s10 = outptr1[j]; s11 = outptr1[j+1];
s20 = outptr2[j]; s21 = outptr2[j+1];
}
for( int k = 0; k < vecsize; k++ )
{
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
float r = rptr0[k];
s00 += w0*r; s10 += w1*r; s20 += w2*r;
r = rptr1[k];
s01 += w0*r; s11 += w1*r; s21 += w2*r;
}
if( relu )
{
s00 = s00 > 0.f ? s00 : s00*r0;
s01 = s01 > 0.f ? s01 : s01*r0;
s10 = s10 > 0.f ? s10 : s10*r1;
s11 = s11 > 0.f ? s11 : s11*r1;
s20 = s20 > 0.f ? s20 : s20*r2;
s21 = s21 > 0.f ? s21 : s21*r2;
}
outptr0[j] = s00;
outptr0[j+1] = s01;
outptr1[j] = s10;
outptr1[j+1] = s11;
outptr2[j] = s20;
outptr2[j+1] = s21;
}
for( ; j < blockSize; j++ )
{
const float* rptr = rowbuf + j*vecsize_aligned;
const float* rptr0 = rowbuf + j*vecsize_aligned;
float s00, s10, s20;
if( initOutput )
@@ -273,10 +351,9 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
for( int k = 0; k < vecsize; k++ )
{
float r0 = rptr[k];
s00 += wptr0[k]*r0;
s10 += wptr1[k]*r0;
s20 += wptr2[k]*r0;
float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
float r = rptr0[k];
s00 += w0*r; s10 += w1*r; s20 += w2*r;
}
if( relu )
@@ -294,6 +371,185 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
_mm256_zeroupper();
}
static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
{
__m256 t0 = _mm256_loadu_ps(ptr);
__m256 t1 = _mm256_loadu_ps(ptr + 8);
__m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16);
__m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16);
a = _mm256_shuffle_ps(lo, hi, 0x88);
b = _mm256_shuffle_ps(lo, hi, 0xdd);
}
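
The helper above splits 16 consecutive floats into their even-indexed and odd-indexed elements, which is what the stride-2 branch of fastDepthwiseConv below needs in order to gather every other input pixel. A scalar equivalent, for illustration only (semantics worked out from the shuffles above, not part of the patch):

// Scalar equivalent of _mm256_load_deinterleave:
// a gets ptr[0], ptr[2], ..., ptr[14]; b gets ptr[1], ptr[3], ..., ptr[15].
static void load_deinterleave_scalar(const float* ptr, float a[8], float b[8])
{
    for (int i = 0; i < 8; i++)
    {
        a[i] = ptr[2*i];
        b[i] = ptr[2*i + 1];
    }
}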
void fastDepthwiseConv( const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW )
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 8;
__m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02),
vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12),
vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22);
__m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff);
if( stride_w == 1 )
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00 = _mm256_loadu_ps(imgptr0 + in_j),
v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w),
v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2),
v10 = _mm256_loadu_ps(imgptr1 + in_j),
v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w),
v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2),
v20 = _mm256_loadu_ps(imgptr2 + in_j),
v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w),
v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2);
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
__m256 vout1 = _mm256_mul_ps(v01, vw01);
__m256 vout2 = _mm256_mul_ps(v02, vw02);
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
if (relu)
{
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
}
_mm256_storeu_ps(outptr + out_j, vout0);
}
else
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1 && out_j > pad_l)
out_j = outW1 - VECSZ;
int in_j = out_j * stride_w - pad_l;
__m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
_mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
_mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
_mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
__m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
__m256 vout1 = _mm256_mul_ps(v01, vw01);
__m256 vout2 = _mm256_mul_ps(v02, vw02);
vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
if (relu)
{
__m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
}
_mm256_storeu_ps(outptr + out_j, vout0);
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
_mm256_zeroupper();
}
// dst = vec * weights^t + bias
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,