Mirror of https://github.com/opencv/opencv.git
dnn(ocl): avoid mess FP16/FP32 in convolution layer
commit c240355cc6
parent 1bfc75ac23
@@ -138,7 +138,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
                               sdepth == CV_32F ? "half" : "float",
                               rowsPerWI,
                               sdepth == CV_32F ? " -D FLOAT_TO_HALF " : "");
-    ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt);
+    ocl::Kernel k(sdepth == CV_32F ? "convertFp16_FP32_to_FP16" : "convertFp16_FP16_to_FP32", ocl::core::halfconvert_oclsrc, build_opt);
     if (k.empty())
         return false;
 
@@ -47,8 +47,17 @@
 #endif
 #endif
 
-__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset,
-                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)
+__kernel void
+#ifdef FLOAT_TO_HALF
+    convertFp16_FP32_to_FP16
+#else
+    convertFp16_FP16_to_FP32
+#endif
+(
+    __global const uchar * srcptr, int src_step, int src_offset,
+    __global uchar * dstptr, int dst_step, int dst_offset,
+    int dst_rows, int dst_cols
+)
 {
     int x = get_global_id(0);
     int y0 = get_global_id(1) * rowsPerWI;
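Note (illustrative, not part of the patch): the kernel is now split into two entry points, convertFp16_FP32_to_FP16 and convertFp16_FP16_to_FP32, and the host picks one from the source depth instead of relying on a single ambiguous "convertFp16" kernel. A minimal host-side sketch of the round trip through the public cv::convertFp16() API, assuming the CV_16S storage convention for FP16 data that the rest of this patch checks for:

#include <opencv2/core.hpp>

int main()
{
    cv::Mat fp32 = cv::Mat::ones(4, 4, CV_32F);
    cv::Mat fp16, back;
    cv::convertFp16(fp32, fp16);   // OpenCL path would select "convertFp16_FP32_to_FP16"
    CV_Assert(fp16.type() == CV_16SC1);
    cv::convertFp16(fp16, back);   // OpenCL path would select "convertFp16_FP16_to_FP32"
    CV_Assert(back.type() == CV_32FC1);
    return 0;
}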
@@ -1461,16 +1461,7 @@ public:
             umat_blobs.resize(n);
             for (size_t i = 0; i < n; i++)
             {
-                if (use_half)
-                {
-                    Mat matFP32;
-                    convertFp16(inputs[i + 1], matFP32);
-                    matFP32.copyTo(umat_blobs[i]);
-                }
-                else
-                {
-                    inputs[i + 1].copyTo(umat_blobs[i]);
-                }
+                inputs[i + 1].copyTo(umat_blobs[i]);
             }
             inputs.resize(1);
         }
@@ -1481,7 +1472,10 @@ public:
             umat_blobs.resize(n);
             for (size_t i = 0; i < n; i++)
             {
-                blobs[i].copyTo(umat_blobs[i]);
+                if (use_half)
+                    convertFp16(blobs[i], umat_blobs[i]);
+                else
+                    blobs[i].copyTo(umat_blobs[i]);
             }
         }
 
@@ -1537,14 +1531,20 @@ public:
 
         if (fusedWeights)
         {
-            weightsMat.copyTo(umat_blobs[0]);
+            if (use_half)
+                convertFp16(weightsMat, umat_blobs[0]);
+            else
+                weightsMat.copyTo(umat_blobs[0]);
             fusedWeights = false;
         }
         if (fusedBias)
         {
             if ( umat_blobs.size() < 2 )
                 umat_blobs.resize(2);
-            umat_blobs[1] = UMat(biasvec, true);
+            if (use_half)
+                convertFp16(Mat(biasvec, true), umat_blobs[1]);
+            else
+                Mat(biasvec, true).copyTo(umat_blobs[1]);
             convolutionOp->setBias(true);
             fusedBias = false;
         }
@@ -274,8 +274,6 @@ class OCL4DNNConvSpatial
     int32_t group_;
     bool bias_term_;
     UMat swizzled_weights_umat;
-    UMat weights_half;
-    UMat bias_half;
     UMat bottom_data2_;
 
     int32_t bottom_index_;
@@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
         fused_eltwise_ = false;
     }
 
-    if (use_half_ && bias_half.empty() && !bias.empty())
-        convertFp16(bias, bias_half);
+    if (use_half_ && !bias.empty())
+        CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
 
-    if (use_half_ && weights_half.empty())
-        convertFp16(weight, weights_half);
+    if (use_half_)
+        CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
 
-    prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
+    prepareKernel(bottom, top, weight, bias, numImages);
     if (bestKernelConfig.empty())
         return false;
-    return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
+    return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
 }
 
 template<typename Dtype>
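Note (illustrative, not part of the patch): Forward() no longer converts weights and bias into the lazily filled weights_half/bias_half copies; in half mode it now expects the caller to pass blobs that are already FP16. A small sketch of what a CV_CheckTypeEQ guard does at runtime, with a hypothetical error message:

#include <opencv2/core.hpp>
#include <opencv2/core/check.hpp>
#include <iostream>

int main()
{
    cv::Mat weights(3, 3, CV_32FC1);  // hypothetical FP32 input handed to a half-precision path
    try
    {
        CV_CheckTypeEQ(weights.type(), CV_16SC1, "half mode expects FP16 weights (CV_16S storage)");
    }
    catch (const cv::Exception& e)
    {
        std::cout << "check failed as expected: " << e.what() << std::endl;
    }
    return 0;
}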
@@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
                                      kernel_h_ * (int)alignSize(kernel_w_, 2),
                                      (use_half_) ? CV_16SC1 : CV_32FC1);
 
-    UMat swizzled_weights_tmp;
-    if (use_half_)
-        swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
-
     if (!interleave) {
-        cl_uint argIdx = 0;
         int32_t channels = channels_ / group_;
 
-        ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"),
-                                     cv::ocl::dnn::conv_spatial_helper_oclsrc);
+        ocl::Kernel oclk_copy_weight(
+            use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float",
+            cv::ocl::dnn::conv_spatial_helper_oclsrc,
+            use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float"
+        );
         if (oclk_copy_weight.empty())
             return false;
 
-        oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
-        if (use_half_)
-            oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
-        else
-            oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
-        oclk_copy_weight.set(argIdx++, kernel_w_);
-        oclk_copy_weight.set(argIdx++, kernel_h_);
-        oclk_copy_weight.set(argIdx++, channels);
-        oclk_copy_weight.set(argIdx++, num_output_);
-        oclk_copy_weight.set(argIdx++, swizzled_factor);
+        oclk_copy_weight.args(
+            ocl::KernelArg::PtrReadOnly(weight),
+            ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat),
+            kernel_w_,
+            kernel_h_,
+            channels,
+            num_output_,
+            swizzled_factor
+        );
 
         size_t global_work_size_copy[3] = {
             (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
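Note (illustrative, not part of the patch): the swizzle kernel is now built per element type by combining a type-suffixed kernel name with matching -D build options, rather than baking "#define Dtype float" into the .cl file. A self-contained sketch of that pattern with a hypothetical "scale" kernel; useHalf stands in for use_half_:

#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>
#include <iostream>

int main()
{
    // TEMPLATE(scale, Dtype) expands to scale_float or scale_half, driven by -DDtype.
    static const char* src =
        "#define CONCAT(A,B) A##_##B\n"
        "#define TEMPLATE(name,type) CONCAT(name,type)\n"
        "#ifdef HALF_SUPPORT\n"
        "#ifdef cl_khr_fp16\n"
        "#pragma OPENCL EXTENSION cl_khr_fp16:enable\n"
        "#endif\n"
        "#endif\n"
        "__kernel void TEMPLATE(scale, Dtype)(__global Dtype* buf) {\n"
        "    buf[get_global_id(0)] *= (Dtype)2;\n"
        "}\n";

    bool useHalf = false;  // hypothetical switch, analogous to use_half_
    cv::ocl::ProgramSource source(src);
    cv::ocl::Kernel k(useHalf ? "scale_half" : "scale_float", source,
                      useHalf ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float");
    std::cout << (k.empty() ? "kernel not available (no OpenCL?)" : "kernel built") << std::endl;
    return 0;
}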
@@ -778,13 +775,24 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
         }
     } else {
         // assumption: kernel dimension is 2
-        Mat weightMat = weight.getMat(ACCESS_READ);
-        Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
+        Mat weightMat;
         Mat swizzledWeightMat;
+        UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
         if (use_half_)
-            swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
+        {
+            CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
+            convertFp16(weight, weight_tmp);
+            weightMat = weight_tmp.getMat(ACCESS_READ);
+            swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
+        }
         else
+        {
+            weightMat = weight.getMat(ACCESS_READ);
             swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
+        }
+
+        CV_CheckTypeEQ(weightMat.type(), CV_32FC1, "");
+        Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
         Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
 
         int interleavedRows = (kernel_w_ / 2) * 2;
|
|||||||
int blockWidth = swizzled_factor; // should equal to simd size.
|
int blockWidth = swizzled_factor; // should equal to simd size.
|
||||||
int rowAlignment = 32;
|
int rowAlignment = 32;
|
||||||
size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
|
size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
|
||||||
Dtype * tmpSwizzledWeight = reinterpret_cast<Dtype*>(malloc(interleaved_filter_size));
|
cv::AutoBuffer<Dtype, 0> tmpSwizzledWeight(interleaved_filter_size);
|
||||||
CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight";
|
|
||||||
for (int od = 0; od < M_; od++)
|
for (int od = 0; od < M_; od++)
|
||||||
for (int id = 0; id < channels_; id++)
|
for (int id = 0; id < channels_; id++)
|
||||||
for (int r = 0; r < kernel_h_; r++)
|
for (int r = 0; r < kernel_h_; r++)
|
||||||
for (int c = 0; c < kernel_w_; c++)
|
for (int c = 0; c < kernel_w_; c++)
|
||||||
tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
|
tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
|
||||||
cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
|
cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
|
||||||
|
|
||||||
interleaveMatrix(cpu_swizzled_weight,
|
interleaveMatrix(cpu_swizzled_weight,
|
||||||
tmpSwizzledWeight,
|
tmpSwizzledWeight.data(),
|
||||||
kernel_w_ * kernel_h_ * channels_, M_,
|
kernel_w_ * kernel_h_ * channels_, M_,
|
||||||
interleavedRows,
|
interleavedRows,
|
||||||
nonInterleavedRows,
|
nonInterleavedRows,
|
||||||
blockWidth,
|
blockWidth,
|
||||||
rowAlignment);
|
rowAlignment);
|
||||||
free(tmpSwizzledWeight);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (use_half_)
|
// unmap OpenCL buffers
|
||||||
convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
|
weightMat.release();
|
||||||
|
|
||||||
|
if (use_half_)
|
||||||
|
convertFp16(swizzledWeightMat, swizzled_weights_umat);
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
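Note (illustrative, not part of the patch): cv::AutoBuffer replaces the raw malloc/free pair above, so the temporary swizzled weights are released automatically even on early return. A minimal sketch of the idiom:

#include <opencv2/core.hpp>
#include <numeric>
#include <iostream>

int main()
{
    const size_t n = 16;
    cv::AutoBuffer<float, 0> tmp(n);            // heap allocation, freed by the destructor
    std::iota(tmp.data(), tmp.data() + n, 0.f); // fill 0..15
    float sum = std::accumulate(tmp.data(), tmp.data() + n, 0.f);
    std::cout << "sum = " << sum << std::endl;  // 120
    return 0;
}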
@@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             cl_uint argIdx = 0;
             setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
             kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            if (use_half_)
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
-            else
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
             if (bias_term_)
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
             kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
             kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
             kernel.set(argIdx++, image_offset);
-            if (use_half_)
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
-            else
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
             kernel.set(argIdx++, kernel_offset);
             if (bias_term_)
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
 
     UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
 
-    calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
+    calculateBenchmark(bottom, benchData, weight, bias, numImages);
 
     if (run_auto_tuning_ || force_auto_tuning_)
     {
@@ -39,9 +39,14 @@
 //
 //M*/
 
+#ifdef HALF_SUPPORT
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16:enable
+#endif
+#endif
+
 #define CONCAT(A,B) A##_##B
 #define TEMPLATE(name,type) CONCAT(name,type)
-#define Dtype float
 
 __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
 (__global Dtype* weightIn,