mirror of https://github.com/opencv/opencv.git

dnn(ocl): avoid mess FP16/FP32 in convolution layer

commit c240355cc6 (parent 1bfc75ac23)
@@ -138,7 +138,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
                                   sdepth == CV_32F ? "half" : "float",
                                   rowsPerWI,
                                   sdepth == CV_32F ? " -D FLOAT_TO_HALF " : "");
-    ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt);
+    ocl::Kernel k(sdepth == CV_32F ? "convertFp16_FP32_to_FP16" : "convertFp16_FP16_to_FP32", ocl::core::halfconvert_oclsrc, build_opt);
     if (k.empty())
         return false;
@@ -47,8 +47,17 @@
 #endif
 #endif
 
-__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset,
-                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)
+__kernel void
+#ifdef FLOAT_TO_HALF
+convertFp16_FP32_to_FP16
+#else
+convertFp16_FP16_to_FP32
+#endif
+(
+    __global const uchar * srcptr, int src_step, int src_offset,
+    __global uchar * dstptr, int dst_step, int dst_offset,
+    int dst_rows, int dst_cols
+)
 {
     int x = get_global_id(0);
     int y0 = get_global_id(1) * rowsPerWI;
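Note (not part of the commit): these two kernel entry points back the host-side cv::convertFp16() call. A minimal sketch of that round trip, assuming an OpenCV build of this era where FP16 data is carried in CV_16S containers:

    #include <opencv2/core.hpp>

    int main()
    {
        cv::Mat src = cv::Mat::ones(4, 4, CV_32F);   // FP32 source

        cv::Mat half;
        cv::convertFp16(src, half);                  // FP32 -> FP16 ("convertFp16_FP32_to_FP16")
        CV_Assert(half.type() == CV_16SC1);          // FP16 payload stored as CV_16S

        cv::Mat restored;
        cv::convertFp16(half, restored);             // FP16 -> FP32 ("convertFp16_FP16_to_FP32")
        CV_Assert(restored.type() == CV_32FC1);
        return 0;
    }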
@@ -1461,16 +1461,7 @@ public:
             umat_blobs.resize(n);
             for (size_t i = 0; i < n; i++)
             {
-                if (use_half)
-                {
-                    Mat matFP32;
-                    convertFp16(inputs[i + 1], matFP32);
-                    matFP32.copyTo(umat_blobs[i]);
-                }
-                else
-                {
-                    inputs[i + 1].copyTo(umat_blobs[i]);
-                }
+                inputs[i + 1].copyTo(umat_blobs[i]);
             }
             inputs.resize(1);
         }
@@ -1481,7 +1472,10 @@ public:
             umat_blobs.resize(n);
             for (size_t i = 0; i < n; i++)
             {
-                blobs[i].copyTo(umat_blobs[i]);
+                if (use_half)
+                    convertFp16(blobs[i], umat_blobs[i]);
+                else
+                    blobs[i].copyTo(umat_blobs[i]);
             }
         }
 
@@ -1537,14 +1531,20 @@ public:
 
         if (fusedWeights)
         {
-            weightsMat.copyTo(umat_blobs[0]);
+            if (use_half)
+                convertFp16(weightsMat, umat_blobs[0]);
+            else
+                weightsMat.copyTo(umat_blobs[0]);
             fusedWeights = false;
         }
         if (fusedBias)
         {
             if ( umat_blobs.size() < 2 )
                 umat_blobs.resize(2);
-            umat_blobs[1] = UMat(biasvec, true);
+            if (use_half)
+                convertFp16(Mat(biasvec, true), umat_blobs[1]);
+            else
+                Mat(biasvec, true).copyTo(umat_blobs[1]);
             convolutionOp->setBias(true);
             fusedBias = false;
         }
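For context: the pattern introduced in the three hunks above converts weights and bias to FP16 once, when they are uploaded into UMat device buffers, instead of converting back and forth inside the layer. A hedged sketch of that upload pattern (uploadBlob is a hypothetical helper, not part of the commit):

    #include <opencv2/core.hpp>

    // Hypothetical helper mirroring the use_half branch above.
    cv::UMat uploadBlob(const cv::Mat& blob, bool use_half)
    {
        cv::UMat ublob;
        if (use_half)
            cv::convertFp16(blob, ublob);   // convert once at upload: CV_32F -> CV_16S (FP16 payload)
        else
            blob.copyTo(ublob);             // FP32 path: plain copy to the device
        return ublob;
    }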
@@ -274,8 +274,6 @@ class OCL4DNNConvSpatial
     int32_t group_;
     bool bias_term_;
     UMat swizzled_weights_umat;
-    UMat weights_half;
-    UMat bias_half;
     UMat bottom_data2_;
 
     int32_t bottom_index_;
@@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
         fused_eltwise_ = false;
     }
 
-    if (use_half_ && bias_half.empty() && !bias.empty())
-        convertFp16(bias, bias_half);
+    if (use_half_ && !bias.empty())
+        CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
 
-    if (use_half_ && weights_half.empty())
-        convertFp16(weight, weights_half);
+    if (use_half_)
+        CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
 
-    prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
+    prepareKernel(bottom, top, weight, bias, numImages);
     if (bestKernelConfig.empty())
         return false;
-    return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
+    return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
 }
 
 template<typename Dtype>
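Forward() now requires its inputs to already be in the working precision instead of converting them on the fly; the CV_CheckTypeEQ checks fail fast if a caller passes FP32 buffers in half mode. A small guard in the same spirit (checkHalfInputs is an illustrative name, not from the commit):

    #include <opencv2/core.hpp>
    #include <opencv2/core/check.hpp>

    // Hypothetical guard: in half mode the caller must already supply
    // FP16 (CV_16S storage) weights and bias.
    static void checkHalfInputs(const cv::UMat& weight, const cv::UMat& bias, bool use_half)
    {
        if (!use_half)
            return;
        CV_CheckTypeEQ(weight.type(), CV_16SC1, "FP16 weights expected");
        if (!bias.empty())
            CV_CheckTypeEQ(bias.type(), CV_16SC1, "FP16 bias expected");
    }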
@@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
                                      kernel_h_ * (int)alignSize(kernel_w_, 2),
                                      (use_half_) ? CV_16SC1 : CV_32FC1);
 
-    UMat swizzled_weights_tmp;
-    if (use_half_)
-        swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
-
     if (!interleave) {
-        cl_uint argIdx = 0;
         int32_t channels = channels_ / group_;
 
-        ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"),
-                                     cv::ocl::dnn::conv_spatial_helper_oclsrc);
+        ocl::Kernel oclk_copy_weight(
+            use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float",
+            cv::ocl::dnn::conv_spatial_helper_oclsrc,
+            use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float"
+        );
         if (oclk_copy_weight.empty())
             return false;
 
-        oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
-        if (use_half_)
-            oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
-        else
-            oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
-        oclk_copy_weight.set(argIdx++, kernel_w_);
-        oclk_copy_weight.set(argIdx++, kernel_h_);
-        oclk_copy_weight.set(argIdx++, channels);
-        oclk_copy_weight.set(argIdx++, num_output_);
-        oclk_copy_weight.set(argIdx++, swizzled_factor);
+        oclk_copy_weight.args(
+            ocl::KernelArg::PtrReadOnly(weight),
+            ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat),
+            kernel_w_,
+            kernel_h_,
+            channels,
+            num_output_,
+            swizzled_factor
+        );
 
         size_t global_work_size_copy[3] = {
            (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
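The rewritten kernel setup selects the precision through the kernel name and the -D build options, and binds all arguments with a single args() call instead of repeated set(argIdx++, ...). A self-contained sketch of that cv::ocl::Kernel pattern (kernel source and names here are invented for illustration; the buffer is assumed to be a continuous CV_32F UMat):

    #include <opencv2/core.hpp>
    #include <opencv2/core/ocl.hpp>

    static bool runScale(cv::UMat& buf, float factor)
    {
        static const char* src =
            "__kernel void scale_buf(__global float* data, int n, float f) {\n"
            "    int i = get_global_id(0);\n"
            "    if (i < n) data[i] *= f;\n"
            "}\n";
        cv::ocl::ProgramSource program(src);

        // Constructor arguments: kernel name, program source, build options.
        cv::ocl::Kernel k("scale_buf", program, "");
        if (k.empty())
            return false;

        // args() binds every kernel argument in one call.
        k.args(cv::ocl::KernelArg::PtrReadWrite(buf),
               (int)buf.total(),
               factor);

        size_t globalsize[1] = { buf.total() };
        return k.run(1, globalsize, NULL, false);   // non-blocking enqueue
    }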
@@ -778,13 +775,24 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
         }
     } else {
         // assumption: kernel dimension is 2
-        Mat weightMat = weight.getMat(ACCESS_READ);
-        Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
+        Mat weightMat;
         Mat swizzledWeightMat;
+        UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
         if (use_half_)
-            swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
+        {
+            CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
+            convertFp16(weight, weight_tmp);
+            weightMat = weight_tmp.getMat(ACCESS_READ);
+            swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
+        }
         else
+        {
+            weightMat = weight.getMat(ACCESS_READ);
             swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
+        }
+
+        CV_CheckTypeEQ(weightMat.type(), CV_32FC1, "");
+        Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
         Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
 
         int interleavedRows = (kernel_w_ / 2) * 2;
@@ -792,26 +800,28 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
         int blockWidth = swizzled_factor;  // should equal to simd size.
         int rowAlignment = 32;
         size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
-        Dtype * tmpSwizzledWeight = reinterpret_cast<Dtype*>(malloc(interleaved_filter_size));
-        CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight";
+        cv::AutoBuffer<Dtype, 0> tmpSwizzledWeight(interleaved_filter_size);
         for (int od = 0; od < M_; od++)
            for (int id = 0; id < channels_; id++)
               for (int r = 0; r < kernel_h_; r++)
                  for (int c = 0; c < kernel_w_; c++)
                     tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
                        cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
+
         interleaveMatrix(cpu_swizzled_weight,
-                         tmpSwizzledWeight,
+                         tmpSwizzledWeight.data(),
                          kernel_w_ * kernel_h_ * channels_, M_,
                          interleavedRows,
                          nonInterleavedRows,
                          blockWidth,
                          rowAlignment);
-        free(tmpSwizzledWeight);
-    }
 
-    if (use_half_)
-        convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
+        // unmap OpenCL buffers
+        weightMat.release();
+
+        if (use_half_)
+            convertFp16(swizzledWeightMat, swizzled_weights_umat);
+    }
 
     return true;
 }
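Replacing the raw malloc/free pair with cv::AutoBuffer makes the temporary swizzled-weight storage RAII-managed, so it is released automatically and cannot leak on an early return. A minimal standalone example of the same idiom (sumOfSquares is illustrative, not from the commit):

    #include <opencv2/core.hpp>

    static float sumOfSquares(const float* src, size_t n)
    {
        cv::AutoBuffer<float, 0> tmp(n);    // heap-allocated temporary, freed when tmp leaves scope
        for (size_t i = 0; i < n; i++)
            tmp[i] = src[i] * src[i];

        float s = 0.f;
        for (size_t i = 0; i < n; i++)
            s += tmp[i];
        return s;                           // no explicit free() needed
    }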
@@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             cl_uint argIdx = 0;
             setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
             kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            if (use_half_)
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
-            else
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
             if (bias_term_)
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
             kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
             kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
             kernel.set(argIdx++, image_offset);
-            if (use_half_)
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
-            else
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
             kernel.set(argIdx++, kernel_offset);
             if (bias_term_)
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
 
     UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
 
-    calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
+    calculateBenchmark(bottom, benchData, weight, bias, numImages);
 
     if (run_auto_tuning_ || force_auto_tuning_)
     {
@@ -39,9 +39,14 @@
 //
 //M*/
 
+#ifdef HALF_SUPPORT
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16:enable
+#endif
+#endif
+
 #define CONCAT(A,B) A##_##B
 #define TEMPLATE(name,type) CONCAT(name,type)
-#define Dtype float
 
 __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
 (__global Dtype* weightIn,
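The element type is no longer hard-coded in the .cl file; Dtype now comes from the host-side build options ("-DDtype=half" or "-DDtype=float"), and the CONCAT/TEMPLATE pair pastes it onto the kernel name. The same preprocessor behaviour can be reproduced in plain C++ (illustration only, not part of the commit):

    #include <cstdio>

    // Same CONCAT/TEMPLATE pattern as the kernel above; here Dtype is defined
    // directly instead of via a -DDtype=... build option.
    #define CONCAT(A,B) A##_##B
    #define TEMPLATE(name,type) CONCAT(name,type)
    #define Dtype half

    #define STR2(x) #x
    #define STR(x) STR2(x)

    int main()
    {
        // Prints "copyWeightsSwizzled_half" -- the name the host asks ocl::Kernel for.
        std::printf("%s\n", STR(TEMPLATE(copyWeightsSwizzled, Dtype)));
        return 0;
    }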