mirror of
https://github.com/opencv/opencv.git
synced 2025-01-19 06:53:50 +08:00
fp16 ocl support for more layers
Signed-off-by: Li Peng <peng.li@intel.com>
This commit is contained in:
parent
3dd916882a
commit
ba5e8befa9
@ -120,12 +120,16 @@ public:
|
||||
std::vector<UMat> inputs;
|
||||
std::vector<UMat> outputs;
|
||||
|
||||
bool use_half = (inputs_.depth() == CV_16S);
|
||||
inputs_.getUMatVector(inputs);
|
||||
outputs_.getUMatVector(outputs);
|
||||
|
||||
CV_Assert(blobs.size() >= 2);
|
||||
CV_Assert(inputs.size() == 1);
|
||||
|
||||
if (use_half && inputs[0].dims == 2)
|
||||
return false;
|
||||
|
||||
if (umat_weight.empty())
|
||||
{
|
||||
umat_weight = weights_.getUMat(ACCESS_READ);
|
||||
@ -139,6 +143,7 @@ public:
|
||||
int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1;
|
||||
int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1;
|
||||
|
||||
String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
|
||||
for (size_t ii = 0; ii < outputs.size(); ii++)
|
||||
{
|
||||
if (inpBlob.dims == 2)
|
||||
@ -154,8 +159,12 @@ public:
|
||||
UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
|
||||
UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
|
||||
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
|
||||
String buildopt = format("-DNUM=%d", number);
|
||||
String buildopt = format("-DNUM=%d", number) + opts;
|
||||
String kname = format("batch_norm%d", number);
|
||||
if (number == 1)
|
||||
buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
|
||||
else
|
||||
buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
|
||||
ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
|
||||
if (kernel.empty())
|
||||
return false;
|
||||
@ -181,7 +190,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -95,7 +95,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -307,8 +307,24 @@ public:
|
||||
std::vector<UMat> inputs;
|
||||
std::vector<UMat> outputs;
|
||||
|
||||
inps.getUMatVector(inputs);
|
||||
outs.getUMatVector(outputs);
|
||||
bool use_half = (inps.depth() == CV_16S);
|
||||
if (use_half)
|
||||
{
|
||||
std::vector<UMat> orig_inputs;
|
||||
std::vector<UMat> orig_outputs;
|
||||
|
||||
inps.getUMatVector(orig_inputs);
|
||||
outs.getUMatVector(orig_outputs);
|
||||
|
||||
inputs.resize(orig_inputs.size());
|
||||
for (size_t i = 0; i < orig_inputs.size(); i++)
|
||||
convertFp16(orig_inputs[i], inputs[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
inps.getUMatVector(inputs);
|
||||
outs.getUMatVector(outputs);
|
||||
}
|
||||
|
||||
std::vector<LabelBBox> allDecodedBBoxes;
|
||||
std::vector<Mat> allConfidenceScores;
|
||||
@ -342,7 +358,13 @@ public:
|
||||
{
|
||||
// Set confidences to zeros.
|
||||
Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
|
||||
outputs[0](ranges).setTo(0);
|
||||
if (use_half)
|
||||
{
|
||||
std::vector<UMat> orig_outputs;
|
||||
outs.getUMatVector(orig_outputs);
|
||||
orig_outputs[0](ranges).setTo(0);
|
||||
} else
|
||||
outputs[0](ranges).setTo(0);
|
||||
return true;
|
||||
}
|
||||
int outputShape[] = {1, 1, (int)numKept, 7};
|
||||
@ -360,9 +382,23 @@ public:
|
||||
}
|
||||
CV_Assert(count == numKept);
|
||||
}
|
||||
outputs.clear();
|
||||
outputs.push_back(umat);
|
||||
outs.assign(outputs);
|
||||
|
||||
if (use_half)
|
||||
{
|
||||
UMat half_umat;
|
||||
convertFp16(umat, half_umat);
|
||||
|
||||
std::vector<UMat> orig_outputs;
|
||||
outs.getUMatVector(orig_outputs);
|
||||
orig_outputs.clear();
|
||||
orig_outputs.push_back(half_umat);
|
||||
outs.assign(orig_outputs);
|
||||
} else {
|
||||
outputs.clear();
|
||||
outputs.push_back(umat);
|
||||
outs.assign(outputs);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
@ -372,7 +408,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -87,6 +87,9 @@ public:
|
||||
std::vector<UMat> outputs;
|
||||
std::vector<UMat> internals;
|
||||
|
||||
if (inputs_.depth() == CV_16S)
|
||||
return false;
|
||||
|
||||
inputs_.getUMatVector(inputs);
|
||||
outputs_.getUMatVector(outputs);
|
||||
internals_.getUMatVector(internals);
|
||||
@ -162,7 +165,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -288,9 +288,11 @@ public:
|
||||
if (!_needsPermute)
|
||||
return false;
|
||||
|
||||
bool use_half = (inps.depth() == CV_16S);
|
||||
String opts = format("-DDtype=%s", use_half ? "half" : "float");
|
||||
for (size_t i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc);
|
||||
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts);
|
||||
|
||||
kernel.set(0, (int)_count);
|
||||
kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));
|
||||
@ -313,7 +315,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -316,6 +316,7 @@ public:
|
||||
std::vector<UMat> inputs;
|
||||
std::vector<UMat> outputs;
|
||||
|
||||
bool use_half = (inps.depth() == CV_16S);
|
||||
inps.getUMatVector(inputs);
|
||||
outs.getUMatVector(outputs);
|
||||
|
||||
@ -340,9 +341,15 @@ public:
|
||||
heights.copyTo(umat_heights);
|
||||
}
|
||||
|
||||
size_t nthreads = _layerHeight * _layerWidth;
|
||||
String opts;
|
||||
if (use_half)
|
||||
opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4";
|
||||
else
|
||||
opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4";
|
||||
|
||||
size_t nthreads = _layerHeight * _layerWidth;
|
||||
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts);
|
||||
|
||||
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
|
||||
kernel.set(0, (int)nthreads);
|
||||
kernel.set(1, (float)_stepX);
|
||||
kernel.set(2, (float)_stepY);
|
||||
@ -375,7 +382,7 @@ public:
|
||||
|
||||
// set the variance.
|
||||
{
|
||||
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc);
|
||||
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts);
|
||||
int offset = total(shape(outputs[0]), 2);
|
||||
size_t nthreads = _layerHeight * _layerWidth * _numPriors;
|
||||
kernel.set(0, (int)nthreads);
|
||||
@ -395,7 +402,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -158,6 +158,9 @@ public:
|
||||
std::vector<UMat> outputs;
|
||||
std::vector<UMat> internals;
|
||||
|
||||
if (inputs_.depth() == CV_16S)
|
||||
return false;
|
||||
|
||||
inputs_.getUMatVector(inputs);
|
||||
outputs_.getUMatVector(outputs);
|
||||
internals_.getUMatVector(internals);
|
||||
@ -237,7 +240,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -127,7 +127,7 @@ public:
|
||||
std::vector<UMat> outputs;
|
||||
|
||||
// TODO: implement a logistic activation to classification scores.
|
||||
if (useLogistic)
|
||||
if (useLogistic || inps.depth() == CV_16S)
|
||||
return false;
|
||||
|
||||
inps.getUMatVector(inputs);
|
||||
@ -191,7 +191,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -96,9 +96,10 @@ public:
|
||||
std::vector<UMat> inputs;
|
||||
std::vector<UMat> outputs;
|
||||
|
||||
bool use_half = (inps.depth() == CV_16S);
|
||||
inps.getUMatVector(inputs);
|
||||
outs.getUMatVector(outputs);
|
||||
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");
|
||||
String buildopt= format("-DDtype=%s ", use_half ? "half" : "float");
|
||||
|
||||
for (size_t i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
@ -134,7 +135,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -219,7 +219,7 @@ public:
|
||||
CV_TRACE_FUNCTION();
|
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
|
||||
|
||||
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
|
||||
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
|
||||
forward_ocl(inputs_arr, outputs_arr, internals_arr))
|
||||
|
||||
|
@ -40,24 +40,27 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define Dtype float
|
||||
#define Dtype4 float4
|
||||
#define Dtype8 float8
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#if NUM == 8
|
||||
#define load(src, index) vload8(0, src + index)
|
||||
#define store(vec, dst, index) vstore8(vec, 0, dst + index)
|
||||
#define vec_type Dtype8
|
||||
#define float_type float8
|
||||
#define convert_f convert_float8
|
||||
#define BATCH_NORM batch_norm8
|
||||
#elif NUM == 4
|
||||
#define load(src, index) vload4(0, src + index)
|
||||
#define store(vec, dst, index) vstore4(vec, 0, dst + index)
|
||||
#define vec_type Dtype4
|
||||
#define float_type float4
|
||||
#define convert_f convert_float4
|
||||
#define BATCH_NORM batch_norm4
|
||||
#elif NUM == 1
|
||||
#define load(src, index) src[index]
|
||||
#define store(vec, dst, index) dst[index] = vec
|
||||
#define vec_type Dtype
|
||||
#define float_type float
|
||||
#define convert_f convert_float
|
||||
#define BATCH_NORM batch_norm1
|
||||
#endif
|
||||
|
||||
@ -65,8 +68,8 @@ __kernel void BATCH_NORM(__global const Dtype* src,
|
||||
const int rows,
|
||||
const int cols,
|
||||
const int channels,
|
||||
__global const Dtype* weight,
|
||||
__global const Dtype* bias,
|
||||
__global const float* weight,
|
||||
__global const float* bias,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@ -76,9 +79,9 @@ __kernel void BATCH_NORM(__global const Dtype* src,
|
||||
if (x >= rows || y >= cols)
|
||||
return;
|
||||
|
||||
Dtype w = weight[x % channels];
|
||||
Dtype b = bias[x % channels];
|
||||
vec_type src_vec = load(src, index);
|
||||
vec_type dst_vec = src_vec * w + (vec_type)b;
|
||||
store(dst_vec, dst, index);
|
||||
float w = weight[x % channels];
|
||||
float b = bias[x % channels];
|
||||
float_type src_vec = convert_f(load(src, index));
|
||||
float_type dst_vec = src_vec * w + (float_type)b;
|
||||
store(convert_T(dst_vec), dst, index);
|
||||
}
|
||||
|
@ -40,7 +40,9 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define Dtype float
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void permute(const int nthreads,
|
||||
__global Dtype* bottom_data,
|
||||
|
@ -39,17 +39,18 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define Dtype float
|
||||
#define Dtype4 float4
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void prior_box(const int nthreads,
|
||||
const Dtype stepX,
|
||||
const Dtype stepY,
|
||||
__global const Dtype* _offsetsX,
|
||||
__global const Dtype* _offsetsY,
|
||||
const float stepX,
|
||||
const float stepY,
|
||||
__global const float* _offsetsX,
|
||||
__global const float* _offsetsY,
|
||||
const int offsetsX_size,
|
||||
__global const Dtype* _widths,
|
||||
__global const Dtype* _heights,
|
||||
__global const float* _widths,
|
||||
__global const float* _heights,
|
||||
const int widths_size,
|
||||
__global Dtype* dst,
|
||||
const int _layerHeight,
|
||||
@ -65,7 +66,7 @@ __kernel void prior_box(const int nthreads,
|
||||
|
||||
outputPtr = dst + index * 4 * offsetsX_size * widths_size;
|
||||
|
||||
Dtype _boxWidth, _boxHeight;
|
||||
float _boxWidth, _boxHeight;
|
||||
Dtype4 vec;
|
||||
for (int i = 0; i < widths_size; ++i)
|
||||
{
|
||||
@ -73,8 +74,8 @@ __kernel void prior_box(const int nthreads,
|
||||
_boxHeight = _heights[i];
|
||||
for (int j = 0; j < offsetsX_size; ++j)
|
||||
{
|
||||
float center_x = (w + _offsetsX[j]) * stepX;
|
||||
float center_y = (h + _offsetsY[j]) * stepY;
|
||||
Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;
|
||||
Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;
|
||||
|
||||
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
|
||||
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
|
||||
@ -91,7 +92,7 @@ __kernel void prior_box(const int nthreads,
|
||||
__kernel void set_variance(const int nthreads,
|
||||
const int offset,
|
||||
const int variance_size,
|
||||
__global const Dtype* variance,
|
||||
__global const float* variance,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
|
||||
@ -101,7 +102,7 @@ __kernel void set_variance(const int nthreads,
|
||||
if (variance_size == 1)
|
||||
var_vec = (Dtype4)(variance[0]);
|
||||
else
|
||||
var_vec = vload4(0, variance);
|
||||
var_vec = convert_T(vload4(0, variance));
|
||||
|
||||
vstore4(var_vec, 0, dst + offset + index * 4);
|
||||
}
|
||||
|
@ -39,6 +39,10 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void reorg(const int count,
|
||||
__global const Dtype* src,
|
||||
const int channels,
|
||||
|
Loading…
Reference in New Issue
Block a user