mirror of
https://github.com/opencv/opencv.git
synced 2025-06-20 18:10:51 +08:00
Merge pull request #10370 from pengli:dnn
This commit is contained in:
commit
a2620f72c7
@ -258,6 +258,12 @@ class OCL4DNNConvSpatial
|
|||||||
int lx, int ly, int lz,
|
int lx, int ly, int lz,
|
||||||
bool swizzle, bool nullLocal);
|
bool swizzle, bool nullLocal);
|
||||||
void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
|
void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
|
||||||
|
void generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
||||||
|
int blockM, int blockK, int blockN);
|
||||||
|
void generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
||||||
|
int blockM, int blockK, int blockN);
|
||||||
|
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
||||||
|
int blockM, int blockK, int simd_size);
|
||||||
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
|
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
|
||||||
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
|
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
|
||||||
|
|
||||||
|
@ -257,11 +257,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
|
|||||||
addDef("INPUT_DEPTH", channels_ / group_);
|
addDef("INPUT_DEPTH", channels_ / group_);
|
||||||
addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
|
addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
|
||||||
addDef("TOTAL_OUTPUT_DEPTH", num_output_);
|
addDef("TOTAL_OUTPUT_DEPTH", num_output_);
|
||||||
addDef("INPUT_START_X", 0);
|
|
||||||
addDef("INPUT_START_Y", 0);
|
|
||||||
addDef("INPUT_START_Z", 0);
|
|
||||||
addDef("NUM_FILTERS", M_);
|
addDef("NUM_FILTERS", M_);
|
||||||
addDef("OUT_BUFF_OFFSET", 0);
|
|
||||||
addDef("TILE_X", tile_x);
|
addDef("TILE_X", tile_x);
|
||||||
addDef("TILE_Y", tile_y);
|
addDef("TILE_Y", tile_y);
|
||||||
addDef("TILE_Y_STRIDE", tile_y_stride);
|
addDef("TILE_Y_STRIDE", tile_y_stride);
|
||||||
@ -1330,76 +1326,128 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
void OCL4DNNConvSpatial<float>::generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
||||||
|
int blockM, int blockK, int blockN)
|
||||||
|
{
|
||||||
|
if (group_ != 1 || ((M_ % 8 != 0) || (M_ % 32 == 24)))
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (blockM != 1 && blockM != 2)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (blockN != 32)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (blockK != 8 && blockK != 16)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (blockK == 16)
|
||||||
|
{
|
||||||
|
if ((blockM == 1 && (kernel_w_ > 4)) || M_ % 32 != 0)
|
||||||
|
return;
|
||||||
|
if ((blockM == 2) || M_ % 32 != 0)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
||||||
|
int blockM, int blockK, int simd_size)
|
||||||
|
{
|
||||||
|
int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
|
||||||
|
|
||||||
|
if (simd_size != 8 && simd_size != 16)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
|
||||||
|
return;
|
||||||
|
|
||||||
|
int width_max, height_max, block_size_max;
|
||||||
|
width_max = 14;
|
||||||
|
height_max = 14;
|
||||||
|
block_size_max = 32;
|
||||||
|
|
||||||
|
if (blockM > width_max)
|
||||||
|
return;
|
||||||
|
if (blockK > height_max)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (blockM > output_w_)
|
||||||
|
return;
|
||||||
|
if (blockK > output_h_)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Only when the work items count is less than the device
|
||||||
|
// max work items or the M_ is less than 16, we will tune
|
||||||
|
// for simd 8.
|
||||||
|
if (simd_size == 8 && M_ >= 16 &&
|
||||||
|
((num_ * M_ * output_w_ * output_h_ / static_cast<float>(blockM * blockK)) >=
|
||||||
|
max_compute_units * 7 * 16))
|
||||||
|
return;
|
||||||
|
|
||||||
|
int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
|
||||||
|
int tile_x = alignSize(actual_tile_x, 4);
|
||||||
|
int tile_y = kernel_h_ * dilation_h_ + (blockK - 1) * stride_h_;
|
||||||
|
if (tile_x > (4 * simd_size))
|
||||||
|
return;
|
||||||
|
|
||||||
|
if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max)
|
||||||
|
return;
|
||||||
|
|
||||||
|
int tile_y_stride = (4 * simd_size) / tile_x;
|
||||||
|
int invec_size = divUp(tile_y, tile_y_stride);
|
||||||
|
if (invec_size > 4)
|
||||||
|
return;
|
||||||
|
|
||||||
|
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
void OCL4DNNConvSpatial<float>::generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
||||||
|
int blockM, int blockK, int blockN)
|
||||||
|
{
|
||||||
|
if (!dwconv_)
|
||||||
|
return;
|
||||||
|
|
||||||
|
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, blockM, blockK, blockN));
|
||||||
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
|
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
|
||||||
{
|
{
|
||||||
if (ocl::Device::getDefault().intelSubgroupsSupport())
|
if (ocl::Device::getDefault().intelSubgroupsSupport())
|
||||||
{
|
{
|
||||||
//depth_wise kernels
|
// depthwise kernel
|
||||||
if (dwconv_)
|
generate_dwconv_tuneritems(tunerItems, 1, 1, 1);
|
||||||
{
|
if (tunerItems.size() > 0 && group_ > 8)
|
||||||
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
|
|
||||||
if (group_ > 8)
|
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
/* IDLF kernels are using Intel specific extension which make
|
// gemm like kernel
|
||||||
them intel only. */
|
generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
|
||||||
// Generates static key_
|
generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
|
||||||
int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
|
generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
|
||||||
int kernelCnt = 0;
|
|
||||||
if (group_ == 1 && ((M_ % 8 == 0) && (M_ % 32 != 24))) {
|
|
||||||
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 8, 32));
|
|
||||||
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 2, 8, 32));
|
|
||||||
|
|
||||||
if (kernel_w_ < 4 && M_ % 32 == 0)
|
// idlf kernel
|
||||||
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 16, 32));
|
for (int simd_size = 8; simd_size <= 16; simd_size += 8)
|
||||||
}
|
{
|
||||||
|
int width_max, height_max;
|
||||||
for (int simd_size = 8; simd_size <= 16; simd_size += 8) {
|
width_max = 14;
|
||||||
if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
|
height_max = 14;
|
||||||
continue;
|
for (uint32_t width = width_max; width > 0; width--)
|
||||||
if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
|
{
|
||||||
continue;
|
for (uint32_t height = height_max; height > 0; height--)
|
||||||
const int width_max = 14, height_max = 8, block_size_max = 32;
|
{
|
||||||
for (uint32_t width = width_max; width > 0; width--) {
|
generate_idlf_tuneritems(tunerItems, width, height, simd_size);
|
||||||
int candidate = 0;
|
if (tunerItems.size() >= 8 && height == 2)
|
||||||
if (width > output_w_)
|
|
||||||
continue;
|
|
||||||
for (uint32_t height = height_max; height > 0; height--) {
|
|
||||||
if (width * height > block_size_max || height > output_h_)
|
|
||||||
continue;
|
|
||||||
// Only when the work items count is less than the device
|
|
||||||
// max work items or the M_ is less than 16, we will tune
|
|
||||||
// for simd 8.
|
|
||||||
if (simd_size == 8 &&
|
|
||||||
M_ >= 16 &&
|
|
||||||
((num_ * M_ * output_w_ * output_h_ / static_cast<float>(width * height)) >=
|
|
||||||
max_compute_units * 7 * 16))
|
|
||||||
continue;
|
|
||||||
int actual_tile_x = kernel_w_ * dilation_w_ + (width - 1) * stride_w_;
|
|
||||||
int tile_x = alignSize(actual_tile_x, 4);
|
|
||||||
int tile_y = kernel_h_ * dilation_h_ + (height - 1) * stride_h_;
|
|
||||||
if (tile_x > (4 * simd_size))
|
|
||||||
continue;
|
|
||||||
// If actual_tile_x is multiple of 4, we may waste some IO bandwidth.
|
|
||||||
// This could reduce 75% tuning candidates. It has slightly performance
|
|
||||||
// impact for the final tuning result, less than 2% for most cases.
|
|
||||||
if (actual_tile_x % 4 != 0)
|
|
||||||
continue;
|
|
||||||
if ((width * height + divUp(tile_x * tile_y, simd_size)) > block_size_max)
|
|
||||||
continue;
|
|
||||||
int tile_y_stride = (4 * simd_size) / tile_x;
|
|
||||||
|
|
||||||
if (divUp(tile_y, tile_y_stride) < 4) {
|
|
||||||
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, width, height, simd_size));
|
|
||||||
candidate++;
|
|
||||||
}
|
|
||||||
if (candidate >= 4 && height == 2)
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
kernelCnt += candidate;
|
if (tunerItems.size() >= 12 && width == 2)
|
||||||
if (kernelCnt >= 12 && width == 2)
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -189,10 +189,8 @@ __kernel void ConvolveBasic(
|
|||||||
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
|
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
|
||||||
|
|
||||||
// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
|
// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
|
||||||
#ifndef __BEIGNET__
|
|
||||||
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
|
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
|
||||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
|
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
|
||||||
#endif
|
|
||||||
__kernel void
|
__kernel void
|
||||||
convolve_simd(
|
convolve_simd(
|
||||||
ELTWISE_DATA_ARG
|
ELTWISE_DATA_ARG
|
||||||
@ -232,12 +230,12 @@ convolve_simd(
|
|||||||
|
|
||||||
int curr_local_y = ( lid / ( TILE_X / 4 ) );
|
int curr_local_y = ( lid / ( TILE_X / 4 ) );
|
||||||
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
|
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
|
||||||
int curr_y = or * STRIDE_Y + INPUT_START_Y + curr_local_y;
|
int curr_y = or * STRIDE_Y + curr_local_y;
|
||||||
int curr_x = oc * STRIDE_X + INPUT_START_X + curr_local_x;
|
int curr_x = oc * STRIDE_X + curr_local_x;
|
||||||
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
|
||||||
int saved_y = curr_y;
|
int saved_y = curr_y;
|
||||||
#endif
|
#endif
|
||||||
in_addr = input_batch_offset + INPUT_START_Z * input_height * input_width
|
in_addr = input_batch_offset
|
||||||
+ (curr_y - INPUT_PAD_H) * input_width // y tile offset
|
+ (curr_y - INPUT_PAD_H) * input_width // y tile offset
|
||||||
+ curr_x - INPUT_PAD_W; // x tile offset
|
+ curr_x - INPUT_PAD_W; // x tile offset
|
||||||
union {
|
union {
|
||||||
@ -363,7 +361,7 @@ convolve_simd(
|
|||||||
fm = fm % ALIGNED_NUM_FILTERS;
|
fm = fm % ALIGNED_NUM_FILTERS;
|
||||||
|
|
||||||
if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
|
if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
|
||||||
unsigned int out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
|
unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
|
||||||
out_addr += or * output_width + oc;
|
out_addr += or * output_width + oc;
|
||||||
// we need this address calculation for biases because we support views and batching
|
// we need this address calculation for biases because we support views and batching
|
||||||
#if APPLY_BIAS
|
#if APPLY_BIAS
|
||||||
|
Loading…
Reference in New Issue
Block a user