mirror of
https://github.com/opencv/opencv.git
synced 2025-06-08 01:53:19 +08:00
update convolution opencl kernels in dnn module (#11762)
* optimize ocl kernel enqueue in fc layer
  Signed-off-by: Li Peng <peng.li@intel.com>
* use CV_LOG_INFO in convolution auto tuning
  Signed-off-by: Li Peng <peng.li@intel.com>
* update convolution IDLF kernel: extend parameter tuning range, also cleanup ocl kernel implementation
  Signed-off-by: Li Peng <peng.li@intel.com>
* update in-memory convolution cache config: fp16 and fp32 cache config are stored separately
  Signed-off-by: Li Peng <peng.li@intel.com>
This commit is contained in:
parent
a2bc075924
commit
ab8022f74e
@ -310,7 +310,6 @@ public:
|
||||
innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
|
||||
}
|
||||
|
||||
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
|
||||
for (size_t i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
MatShape inshape, outshape;
|
||||
@ -320,7 +319,6 @@ public:
|
||||
UMat srcMat, dstMat;
|
||||
srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
|
||||
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
|
||||
dstMat.setTo(0.0f);
|
||||
|
||||
if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
|
||||
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
|
||||
@ -332,6 +330,7 @@ public:
|
||||
|
||||
if (!use_half && bias && (outerSize > 1))
|
||||
{
|
||||
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
|
||||
UMat& biases = umat_blobs[1];
|
||||
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
|
||||
}
|
||||
@ -354,6 +353,7 @@ public:
|
||||
|
||||
if (bias)
|
||||
{
|
||||
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
|
||||
UMat& biases = umat_blobs[1];
|
||||
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -55,6 +55,7 @@
|
||||
#include "../include/math_functions.hpp"
|
||||
#include "../include/default_kernel_config.hpp"
|
||||
#include "opencv2/dnn/shape_utils.hpp"
|
||||
#include "opencv2/core/utils/logger.hpp"
|
||||
|
||||
#if defined WIN32 || defined _WIN32
|
||||
#include <windows.h>
|
||||
@ -87,10 +88,13 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
|
||||
{
|
||||
CV_Assert(defaultConfigLoaded == false);
|
||||
CV_Assert(kernelConfigMap.empty());
|
||||
const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2;
|
||||
|
||||
/* fp32 config */
|
||||
size_t numConfigs = sizeof(default_kernel_config_intel_fp32) /
|
||||
sizeof(default_kernel_config_intel_fp32[0]) / 2;
|
||||
for (size_t i = 0; i < numConfigs; i++)
|
||||
{
|
||||
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i];
|
||||
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i];
|
||||
if (!cache_path.empty())
|
||||
{
|
||||
std::string cacheFile = cache_path + sanitize(key);
|
||||
@ -100,9 +104,29 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
|
||||
}
|
||||
std::pair<std::string, std::string> entry(
|
||||
key,
|
||||
default_kernel_config_intel[2 * i + 1]);
|
||||
default_kernel_config_intel_fp32[2 * i + 1]);
|
||||
kernelConfigMap.insert(entry);
|
||||
}
|
||||
|
||||
/* fp16 config */
|
||||
numConfigs = sizeof(default_kernel_config_intel_fp16) /
|
||||
sizeof(default_kernel_config_intel_fp16[0]) / 2;
|
||||
for (size_t i = 0; i < numConfigs; i++)
|
||||
{
|
||||
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i];
|
||||
if (!cache_path.empty())
|
||||
{
|
||||
std::string cacheFile = cache_path + sanitize(key);
|
||||
std::ifstream cachedKernel(cacheFile.c_str());
|
||||
if (cachedKernel)
|
||||
continue; // external configuration found, skip builtin
|
||||
}
|
||||
std::pair<std::string, std::string> entry(
|
||||
key,
|
||||
default_kernel_config_intel_fp16[2 * i + 1]);
|
||||
kernelConfigMap.insert(entry);
|
||||
}
|
||||
|
||||
defaultConfigLoaded = true;
|
||||
}
|
||||
|
||||
@ -311,40 +335,38 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
|
||||
|
||||
// options
|
||||
options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_;
|
||||
options_ << " -cl-mad-enable";
|
||||
if (clOptionSupport("-cl-no-subgroup-ifp"))
|
||||
options_ << " -cl-no-subgroup-ifp ";
|
||||
|
||||
// defs
|
||||
int32_t output_width = output_w_;
|
||||
int32_t output_height = output_h_;
|
||||
int32_t output_block_width = blockM;
|
||||
int32_t output_block_height = blockK;
|
||||
const int32_t last_block_width = (output_width % output_block_width == 0) ?
|
||||
output_block_width : output_width % output_block_width;
|
||||
const int32_t last_block_height = (output_height % output_block_height == 0) ?
|
||||
output_block_height : output_height % output_block_height;
|
||||
int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4);
|
||||
int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_;
|
||||
int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_;
|
||||
int tile_y_stride = (4 * simd_size) / tile_x;
|
||||
int invec_size = divUp(tile_y, tile_y_stride);
|
||||
int invec_size = tile_y;
|
||||
|
||||
addDef("SIMD_SIZE", simd_size);
|
||||
addDef("filter_qualifier", "__global");
|
||||
addDef("OUT_BLOCK_WIDTH", output_block_width);
|
||||
addDef("OUT_BLOCK_HEIGHT", output_block_height);
|
||||
addDef("LAST_BLOCK_WIDTH", last_block_width);
|
||||
addDef("LAST_BLOCK_HEIGHT", last_block_height);
|
||||
addDef("INPUT_DEPTH", channels_ / group_);
|
||||
addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
|
||||
addDef("TOTAL_OUTPUT_DEPTH", num_output_);
|
||||
addDef("NUM_FILTERS", M_);
|
||||
addDef("TILE_X", tile_x);
|
||||
addDef("TILE_Y", tile_y);
|
||||
addDef("TILE_Y_STRIDE", tile_y_stride);
|
||||
addDef("INVEC_SIZE", invec_size);
|
||||
addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
|
||||
addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
|
||||
addDef("APPLY_BIAS", bias_term_);
|
||||
addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8);
|
||||
addDef("INPUT_PITCH", (width_ * height_));
|
||||
addDef("OUTPUT_PITCH", (output_w_ * output_h_));
|
||||
addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_));
|
||||
addDef("INPUT_WIDTH", width_);
|
||||
addDef("INPUT_HEIGHT", height_);
|
||||
addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size));
|
||||
|
||||
setFusionDefine(fused_activ_, fused_eltwise_);
|
||||
|
||||
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
|
||||
@ -567,13 +589,6 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
|
||||
return;
|
||||
}
|
||||
|
||||
#define dbg
|
||||
#ifdef dbg
|
||||
#define dbgPrint(x) (x)
|
||||
#else
|
||||
#define dbgPrint(x)
|
||||
#endif
|
||||
|
||||
// For large enough input size, we do not need to tune kernels for different
|
||||
// size. The reason is with large input size, there will be enough work items
|
||||
// to feed all the EUs.
|
||||
@ -584,6 +599,7 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
|
||||
template<typename Dtype>
|
||||
void OCL4DNNConvSpatial<Dtype>::generateKey()
|
||||
{
|
||||
std::string precision = (use_half_) ? "FP16" : "FP32";
|
||||
std::stringstream keyBuilder;
|
||||
// FIXME: to support fuse?
|
||||
keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_"
|
||||
@ -597,7 +613,8 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
|
||||
<< "num" << num_ << "_"
|
||||
<< "M" << M_ << "_"
|
||||
<< "activ" << fused_activ_ << "_"
|
||||
<< "eltwise" << fused_eltwise_;
|
||||
<< "eltwise" << fused_eltwise_ << "_"
|
||||
<< precision;
|
||||
|
||||
|
||||
key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
|
||||
@ -616,11 +633,6 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
|
||||
<< "_" << blockHeight
|
||||
<< "_" << blockDepth;
|
||||
|
||||
if (!use_half_)
|
||||
keyBuilder << "_float";
|
||||
else
|
||||
keyBuilder << "_half";
|
||||
|
||||
return keyBuilder.str();
|
||||
}
|
||||
|
||||
@ -1164,7 +1176,7 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
|
||||
cv::ocl::Timer timer(queue);
|
||||
timer.start();
|
||||
bool res = true;;
|
||||
dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl);
|
||||
CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName);
|
||||
tuned_ = true;
|
||||
int loop_cnt = 4;
|
||||
for (int i = 0; i < loop_cnt; i++) {
|
||||
@ -1181,7 +1193,6 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
|
||||
}
|
||||
|
||||
float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt;
|
||||
#ifdef dbg
|
||||
double out_w = output_w_;
|
||||
double out_h = output_h_;
|
||||
double out_z = M_;
|
||||
@ -1189,16 +1200,8 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
|
||||
double k_h = kernel_h_;
|
||||
double k_z = channels_;
|
||||
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
|
||||
std::cout << "\tEstimated Gflops:" << (totalFlops * 1e-9)
|
||||
<< std::endl;
|
||||
std::cout << "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime))
|
||||
<< std::endl;
|
||||
#if 0
|
||||
std::cout << "Estimated utilization: " <<
|
||||
((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0
|
||||
<< std::endl;
|
||||
#endif
|
||||
#endif
|
||||
CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
|
||||
CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
|
||||
return elapsedTime;
|
||||
}
|
||||
|
||||
@ -1254,18 +1257,18 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
|
||||
if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
|
||||
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
|
||||
{
|
||||
dbgPrint(printf("test verification failed @ image %d group %d"
|
||||
"out_ch %d h %d w %d got %G expected %G\n",
|
||||
n, g, out_ch, h, w, data[offset], verify_data[offset]));
|
||||
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
|
||||
<< " out_ch " << out_ch << " h " << h << " w " << w
|
||||
<< " got " << data[offset] << " expected " << verify_data[offset]);
|
||||
verificationFail = 1;
|
||||
goto out;
|
||||
}
|
||||
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
|
||||
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
|
||||
{
|
||||
dbgPrint(printf("test verification failed @ image %d group %d"
|
||||
"out_ch %d h %d w %d got %G expected %G\n",
|
||||
n, g, out_ch, h, w, data[offset], verify_data[offset]));
|
||||
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
|
||||
<< " out_ch " << out_ch << " h " << h << " w " << w
|
||||
<< " got " << data[offset] << " expected " << verify_data[offset]);
|
||||
verificationFail = 1;
|
||||
goto out;
|
||||
}
|
||||
@ -1546,17 +1549,11 @@ void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tu
|
||||
return;
|
||||
|
||||
int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
|
||||
int tile_x = alignSize(actual_tile_x, 4);
|
||||
int tile_y = kernel_h_ * dilation_h_ + (blockK - 1) * stride_h_;
|
||||
if (tile_x > (4 * simd_size))
|
||||
int tile_x = alignSize(actual_tile_x, simd_size);
|
||||
if (tile_x > simd_size)
|
||||
return;
|
||||
|
||||
if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max)
|
||||
return;
|
||||
|
||||
int tile_y_stride = (4 * simd_size) / tile_x;
|
||||
int invec_size = divUp(tile_y, tile_y_stride);
|
||||
if (invec_size > 4)
|
||||
if (blockM * blockK > block_size_max)
|
||||
return;
|
||||
|
||||
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
|
||||
@ -1599,11 +1596,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
|
||||
for (uint32_t height = height_max; height > 0; height--)
|
||||
{
|
||||
generate_idlf_tuneritems(tunerItems, width, height, simd_size);
|
||||
if (tunerItems.size() >= 8 && height == 2)
|
||||
break;
|
||||
}
|
||||
if (tunerItems.size() >= 12 && width == 2)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1690,10 +1683,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
|
||||
if (kernelQueue[x]->tested == false) {
|
||||
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop);
|
||||
if (verified == false) {
|
||||
dbgPrint(std::cout << "Kernel "
|
||||
<< kernelQueue[x]->kernelName
|
||||
<< " failed verification" << std::endl);
|
||||
dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: "
|
||||
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification");
|
||||
CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: "
|
||||
<< kernelQueue[x]->workItem_output[0] << " "
|
||||
<< "kernelQueue[x]->workItem_output[1]: "
|
||||
<< kernelQueue[x]->workItem_output[1] << " "
|
||||
@ -1714,11 +1705,9 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
|
||||
<< "kernelQueue[x]->local_work_size[2]: "
|
||||
<< kernelQueue[x]->local_work_size[2] << " "
|
||||
<< kernelQueue[x]->swizzle_weights << " "
|
||||
<< kernelQueue[x]->use_null_local << std::endl);
|
||||
<< kernelQueue[x]->use_null_local);
|
||||
} else {
|
||||
dbgPrint(std::cout << "Kernel "
|
||||
<< kernelQueue[x]->kernelName
|
||||
<< " pass verification" << std::endl);
|
||||
CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass verification");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -1747,19 +1736,28 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
|
||||
break;
|
||||
} else {
|
||||
kernelQueue[fastestKernel]->tested = true;
|
||||
dbgPrint(std::cout << "Kernel " <<
|
||||
kernelQueue[fastestKernel]->kernelName <<
|
||||
" failed verification" << std::endl);
|
||||
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName <<
|
||||
" failed verification");
|
||||
failures++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (verification) {
|
||||
dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
|
||||
"> passed verification" << std::endl);
|
||||
dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl);
|
||||
CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
|
||||
"> passed verification");
|
||||
CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime);
|
||||
double out_w = output_w_;
|
||||
double out_h = output_h_;
|
||||
double out_z = M_;
|
||||
double k_w = kernel_w_;
|
||||
double k_h = kernel_h_;
|
||||
double k_z = channels_;
|
||||
float elapsedTime = kernelQueue[kernel_index_]->executionTime;
|
||||
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
|
||||
CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
|
||||
CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
|
||||
} else {
|
||||
dbgPrint(std::cout << "fallback to basic kernel" << std::endl);
|
||||
CV_LOG_INFO(NULL, "fallback to basic kernel");
|
||||
options_.str(""); options_.clear(); // clear contents and state flags
|
||||
createBasicKernel(1, 1, 1);
|
||||
kernel_index_ = kernelQueue.size() - 1;
|
||||
|
@ -206,8 +206,6 @@ __kernel void ConvolveBasic(
|
||||
|
||||
#elif defined KERNEL_IDLF
|
||||
|
||||
#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0)
|
||||
|
||||
// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
|
||||
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
|
||||
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
|
||||
@ -219,124 +217,76 @@ __kernel void
|
||||
convolve_simd(
|
||||
ELTWISE_DATA_ARG
|
||||
FUSED_ARG
|
||||
__global Dtype* inputs_base,
|
||||
filter_qualifier Dtype* weights_base,
|
||||
__global Dtype* inputs,
|
||||
__global Dtype* weights,
|
||||
BIAS_KERNEL_ARG
|
||||
__global Dtype* outputs_base,
|
||||
__global Dtype* outputs,
|
||||
const ushort input_width,
|
||||
const ushort input_height,
|
||||
const ushort output_width,
|
||||
const ushort output_height)
|
||||
{
|
||||
__global Dtype* outputs = outputs_base;
|
||||
__global Dtype* inputs = inputs_base;
|
||||
filter_qualifier Dtype* weights = weights_base;
|
||||
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
|
||||
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
|
||||
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
|
||||
unsigned int fmg = get_group_id(2);
|
||||
unsigned int lid = get_local_id(2);
|
||||
|
||||
Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT];
|
||||
|
||||
int in_addr;
|
||||
Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };
|
||||
|
||||
// find weights address of given neuron (lid is index)
|
||||
unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
|
||||
unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *
|
||||
INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
|
||||
|
||||
for(int i=0;i<OUT_BLOCK_SIZE;i++) {
|
||||
out[i]=0.0f;
|
||||
}
|
||||
unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;
|
||||
|
||||
unsigned int num_in_batch = ( fm ) / ALIGNED_NUM_FILTERS;
|
||||
unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;
|
||||
|
||||
unsigned int input_batch_offset = num_in_batch * input_height * input_width * TOTAL_INPUT_DEPTH_SIZE;
|
||||
|
||||
int curr_local_y = ( lid / ( TILE_X / 4 ) );
|
||||
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
|
||||
int curr_y = or * STRIDE_Y + curr_local_y;
|
||||
int curr_x = oc * STRIDE_X + curr_local_x;
|
||||
int curr_y = or * STRIDE_Y;
|
||||
int curr_x = oc * STRIDE_X + lid;
|
||||
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
|
||||
int saved_y = curr_y;
|
||||
#endif
|
||||
in_addr = input_batch_offset
|
||||
+ (curr_y - INPUT_PAD_H) * input_width // y tile offset
|
||||
int in_addr = input_batch_offset
|
||||
+ (curr_y - INPUT_PAD_H) * INPUT_WIDTH // y tile offset
|
||||
+ curr_x - INPUT_PAD_W; // x tile offset
|
||||
union {
|
||||
Dtype4 in_vec[INVEC_SIZE];
|
||||
Dtype in_array[INVEC_SIZE * 4];
|
||||
} in_buf;
|
||||
|
||||
Dtype in_buf[INVEC_SIZE];
|
||||
|
||||
for(int kd = 0; kd < INPUT_DEPTH; kd++)
|
||||
{
|
||||
int in_offset = in_addr;
|
||||
int reg = 0;
|
||||
LOOP(INVEC_SIZE, reg,
|
||||
__attribute__((opencl_unroll_hint(INVEC_SIZE)))
|
||||
for (int reg = 0; reg < INVEC_SIZE; reg++)
|
||||
{
|
||||
if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) {
|
||||
in_buf[reg] = inputs[in_offset];
|
||||
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
|
||||
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) {
|
||||
if (curr_x < INPUT_PAD_W) {
|
||||
in_buf.in_vec[reg].s0 = 0;
|
||||
if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W)
|
||||
in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1);
|
||||
else
|
||||
in_buf.in_vec[reg].s1 = 0;
|
||||
if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W)
|
||||
in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2);
|
||||
else
|
||||
in_buf.in_vec[reg].s2 = 0;
|
||||
if (curr_x + 3 < input_width + INPUT_PAD_W)
|
||||
in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
|
||||
else
|
||||
in_buf.in_vec[reg].s3 = 0;
|
||||
} else {
|
||||
VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
|
||||
if (curr_x + 1 >= input_width + INPUT_PAD_W)
|
||||
in_buf.in_vec[reg].s1 = 0;
|
||||
if (curr_x + 2 >= input_width + INPUT_PAD_W)
|
||||
in_buf.in_vec[reg].s2 = 0;
|
||||
if (curr_x + 3 >= input_width + INPUT_PAD_W)
|
||||
in_buf.in_vec[reg].s3 = 0;
|
||||
if (!(curr_y >= INPUT_PAD_H && curr_y < INPUT_HEIGHT + INPUT_PAD_H &&
|
||||
curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W))
|
||||
{
|
||||
in_buf[reg] = 0;
|
||||
}
|
||||
} else {
|
||||
in_buf.in_vec[reg] = 0;
|
||||
}
|
||||
curr_y += TILE_Y_STRIDE;
|
||||
#else
|
||||
VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
|
||||
#endif
|
||||
curr_y += 1;
|
||||
in_offset += INPUT_WIDTH;
|
||||
}
|
||||
in_offset += input_width * TILE_Y_STRIDE;
|
||||
});
|
||||
in_addr += input_height * input_width;
|
||||
|
||||
in_addr += INPUT_PITCH;
|
||||
|
||||
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
|
||||
curr_y = saved_y;
|
||||
#endif
|
||||
|
||||
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
|
||||
#define WEIGHT_PREF 8
|
||||
#else
|
||||
#define WEIGHT_PREF 1
|
||||
#endif
|
||||
union {
|
||||
Dtype w[WEIGHT_PREF];
|
||||
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
|
||||
INT_TYPE8 ui8;
|
||||
#endif
|
||||
} weight_buf;
|
||||
Dtype weight_buf[WEIGHT_PREF];
|
||||
int w_idx=0;
|
||||
|
||||
unsigned int orig_weight_addr = weight_addr;
|
||||
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
|
||||
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
|
||||
weight_addr += SIMD_SIZE * WEIGHT_PREF;
|
||||
#else
|
||||
weight_buf.w[0] = as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr]));
|
||||
weight_addr += SIMD_SIZE * 1;
|
||||
#endif
|
||||
for (int i = 0; i < WEIGHT_PREF; i++)
|
||||
{
|
||||
weight_buf[i] = weights[weight_addr];
|
||||
weight_addr += SIMD_SIZE;
|
||||
}
|
||||
|
||||
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4))
|
||||
#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))
|
||||
|
||||
int kr = 0; // kr = Kernel Row
|
||||
LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.
|
||||
@ -344,51 +294,29 @@ convolve_simd(
|
||||
int kc = 0; // kc = Kernel Column
|
||||
LOOP(KERNEL_WIDTH, kc,
|
||||
{
|
||||
for(int br=0; br < OUT_BLOCK_HEIGHT; br++) {
|
||||
for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
|
||||
Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X);
|
||||
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
|
||||
for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
|
||||
{
|
||||
for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
|
||||
{
|
||||
Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
|
||||
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
|
||||
}
|
||||
}
|
||||
#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF
|
||||
// We assume KERNEL_W is equal to KERNEL_H here.
|
||||
if ((w_idx + 1) % WEIGHT_PREF == 0
|
||||
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0
|
||||
&& ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))
|
||||
#endif
|
||||
) {
|
||||
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
|
||||
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
|
||||
}
|
||||
#if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0
|
||||
// need to do nothing
|
||||
#else
|
||||
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)))
|
||||
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1
|
||||
weight_buf.w[0] = weights[weight_addr];
|
||||
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2
|
||||
weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]);
|
||||
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4
|
||||
weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]);
|
||||
#else
|
||||
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
weight_buf[w_idx % WEIGHT_PREF] = weights[weight_addr];
|
||||
weight_addr += SIMD_SIZE;
|
||||
++w_idx;
|
||||
});
|
||||
});
|
||||
weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE;
|
||||
weight_addr -= WEIGHT_PREF * SIMD_SIZE;
|
||||
}
|
||||
|
||||
}
|
||||
// dead code to work around possible compiler bug.
|
||||
if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) {
|
||||
outputs[0] = BLOCK_IN(fm % SIMD_SIZE);
|
||||
}
|
||||
fm = fm % ALIGNED_NUM_FILTERS;
|
||||
|
||||
if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
|
||||
unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
|
||||
#if LEFT_FILTERS > 0
|
||||
if (fm < NUM_FILTERS)
|
||||
#endif
|
||||
{
|
||||
unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;
|
||||
out_addr += or * output_width + oc;
|
||||
// we need this address calculation for biases because we support views and batching
|
||||
#if APPLY_BIAS
|
||||
@ -396,13 +324,16 @@ convolve_simd(
|
||||
#else
|
||||
Dtype bias = 0;
|
||||
#endif
|
||||
for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) {
|
||||
if (r + or >= output_height) break;
|
||||
for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) {
|
||||
if (c + oc >= output_width) break;
|
||||
// this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
|
||||
ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
|
||||
|
||||
for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)
|
||||
{
|
||||
if (r + or >= output_height) break;
|
||||
for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)
|
||||
{
|
||||
if (c + oc >= output_width) break;
|
||||
// this does a scattered write to SIMD_SIZE different feature maps,
|
||||
// so that data within one map is contiguous, thus ready for input to next layer.
|
||||
ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user