mirror of
https://github.com/opencv/opencv.git
synced 2025-06-12 20:42:53 +08:00
dnn(ocl4dnn): refactor pooling OpenCL calls
This commit is contained in:
parent
f071a48ec7
commit
e34b64c979
@ -351,8 +351,6 @@ class OCL4DNNPool
|
|||||||
UMat& top_data,
|
UMat& top_data,
|
||||||
UMat& top_mask);
|
UMat& top_mask);
|
||||||
private:
|
private:
|
||||||
UMat mask_idx_;
|
|
||||||
|
|
||||||
// Pooling parameters
|
// Pooling parameters
|
||||||
std::vector<int32_t> pad_;
|
std::vector<int32_t> pad_;
|
||||||
std::vector<int32_t> stride_;
|
std::vector<int32_t> stride_;
|
||||||
|
@ -88,7 +88,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
|
|||||||
template<typename Dtype>
|
template<typename Dtype>
|
||||||
OCL4DNNPool<Dtype>::~OCL4DNNPool()
|
OCL4DNNPool<Dtype>::~OCL4DNNPool()
|
||||||
{
|
{
|
||||||
mask_idx_.release();
|
// nothing
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Dtype>
|
template<typename Dtype>
|
||||||
@ -99,99 +99,103 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
|
|||||||
bool ret = true;
|
bool ret = true;
|
||||||
size_t global[] = { 128 * 128 };
|
size_t global[] = { 128 * 128 };
|
||||||
size_t local[] = { 128 };
|
size_t local[] = { 128 };
|
||||||
cl_uint argIdx = 0;
|
|
||||||
|
|
||||||
// support 2D case
|
// support 2D case
|
||||||
switch (pool_method_)
|
switch (pool_method_)
|
||||||
{
|
{
|
||||||
case LIBDNN_POOLING_METHOD_MAX:
|
case LIBDNN_POOLING_METHOD_MAX:
|
||||||
{
|
{
|
||||||
if (top_mask.empty() && mask_idx_.empty())
|
bool haveMask = !top_mask.empty();
|
||||||
{
|
ocl::Kernel oclk_max_pool_forward(
|
||||||
mask_idx_.create(1, count_, CV_32FC1);
|
haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
|
||||||
}
|
ocl::dnn::ocl4dnn_pooling_oclsrc,
|
||||||
ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
|
format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
|
||||||
cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
|
" -D STRIDE_W=%d -D STRIDE_H=%d"
|
||||||
|
" -D PAD_W=%d -D PAD_H=%d%s",
|
||||||
|
kernel_w_, kernel_h_,
|
||||||
|
stride_w_, stride_h_,
|
||||||
|
pad_w_, pad_h_,
|
||||||
|
haveMask ? " -D HAVE_MASK=1" : ""
|
||||||
|
));
|
||||||
|
|
||||||
if (oclk_max_pool_forward.empty())
|
if (oclk_max_pool_forward.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
argIdx = 0;
|
oclk_max_pool_forward.args(
|
||||||
oclk_max_pool_forward.set(argIdx++, count_);
|
count_,
|
||||||
oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
ocl::KernelArg::PtrReadOnly(bottom),
|
||||||
oclk_max_pool_forward.set(argIdx++, batch_size_);
|
batch_size_,
|
||||||
oclk_max_pool_forward.set(argIdx++, channels_);
|
channels_,
|
||||||
oclk_max_pool_forward.set(argIdx++, height_);
|
height_,
|
||||||
oclk_max_pool_forward.set(argIdx++, width_);
|
width_,
|
||||||
oclk_max_pool_forward.set(argIdx++, pooled_height_);
|
pooled_height_,
|
||||||
oclk_max_pool_forward.set(argIdx++, pooled_width_);
|
pooled_width_,
|
||||||
oclk_max_pool_forward.set(argIdx++, kernel_h_);
|
ocl::KernelArg::PtrWriteOnly(top),
|
||||||
oclk_max_pool_forward.set(argIdx++, kernel_w_);
|
ocl::KernelArg::PtrWriteOnly(top_mask)
|
||||||
oclk_max_pool_forward.set(argIdx++, stride_h_);
|
);
|
||||||
oclk_max_pool_forward.set(argIdx++, stride_w_);
|
|
||||||
oclk_max_pool_forward.set(argIdx++, pad_h_);
|
|
||||||
oclk_max_pool_forward.set(argIdx++, pad_w_);
|
|
||||||
oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
|
||||||
oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
|
|
||||||
if (mask_idx_.empty())
|
|
||||||
oclk_max_pool_forward.set(argIdx++, (void *)NULL);
|
|
||||||
else
|
|
||||||
oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
|
|
||||||
oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
|
|
||||||
|
|
||||||
ret = oclk_max_pool_forward.run(1, global, local, false);
|
ret = oclk_max_pool_forward.run(1, global, local, false);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case LIBDNN_POOLING_METHOD_AVE:
|
case LIBDNN_POOLING_METHOD_AVE:
|
||||||
{
|
{
|
||||||
|
CV_Assert(top_mask.empty());
|
||||||
|
|
||||||
ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
|
ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
|
||||||
cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
|
ocl::dnn::ocl4dnn_pooling_oclsrc,
|
||||||
|
format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
|
||||||
|
" -D STRIDE_W=%d -D STRIDE_H=%d"
|
||||||
|
" -D PAD_W=%d -D PAD_H=%d",
|
||||||
|
kernel_w_, kernel_h_,
|
||||||
|
stride_w_, stride_h_,
|
||||||
|
pad_w_, pad_h_
|
||||||
|
));
|
||||||
|
|
||||||
if (oclk_ave_pool_forward.empty())
|
if (oclk_ave_pool_forward.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
argIdx = 0;
|
oclk_ave_pool_forward.args(
|
||||||
oclk_ave_pool_forward.set(argIdx++, count_);
|
count_,
|
||||||
oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
ocl::KernelArg::PtrReadOnly(bottom),
|
||||||
oclk_ave_pool_forward.set(argIdx++, batch_size_);
|
batch_size_,
|
||||||
oclk_ave_pool_forward.set(argIdx++, channels_);
|
channels_,
|
||||||
oclk_ave_pool_forward.set(argIdx++, height_);
|
height_,
|
||||||
oclk_ave_pool_forward.set(argIdx++, width_);
|
width_,
|
||||||
oclk_ave_pool_forward.set(argIdx++, pooled_height_);
|
pooled_height_,
|
||||||
oclk_ave_pool_forward.set(argIdx++, pooled_width_);
|
pooled_width_,
|
||||||
oclk_ave_pool_forward.set(argIdx++, kernel_h_);
|
ocl::KernelArg::PtrWriteOnly(top)
|
||||||
oclk_ave_pool_forward.set(argIdx++, kernel_w_);
|
);
|
||||||
oclk_ave_pool_forward.set(argIdx++, stride_h_);
|
|
||||||
oclk_ave_pool_forward.set(argIdx++, stride_w_);
|
|
||||||
oclk_ave_pool_forward.set(argIdx++, pad_h_);
|
|
||||||
oclk_ave_pool_forward.set(argIdx++, pad_w_);
|
|
||||||
oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
|
||||||
|
|
||||||
ret = oclk_ave_pool_forward.run(1, global, local, false);
|
ret = oclk_ave_pool_forward.run(1, global, local, false);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case LIBDNN_POOLING_METHOD_STO:
|
case LIBDNN_POOLING_METHOD_STO:
|
||||||
{
|
{
|
||||||
|
CV_Assert(top_mask.empty());
|
||||||
|
|
||||||
ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
|
ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
|
||||||
cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
|
ocl::dnn::ocl4dnn_pooling_oclsrc,
|
||||||
|
format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
|
||||||
|
" -D STRIDE_W=%d -D STRIDE_H=%d",
|
||||||
|
kernel_w_, kernel_h_,
|
||||||
|
stride_w_, stride_h_
|
||||||
|
));
|
||||||
|
|
||||||
|
|
||||||
if (oclk_sto_pool_forward.empty())
|
if (oclk_sto_pool_forward.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
argIdx = 0;
|
oclk_sto_pool_forward.args(
|
||||||
oclk_sto_pool_forward.set(argIdx++, count_);
|
count_,
|
||||||
oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
ocl::KernelArg::PtrReadOnly(bottom),
|
||||||
oclk_sto_pool_forward.set(argIdx++, batch_size_);
|
batch_size_,
|
||||||
oclk_sto_pool_forward.set(argIdx++, channels_);
|
channels_,
|
||||||
oclk_sto_pool_forward.set(argIdx++, height_);
|
height_,
|
||||||
oclk_sto_pool_forward.set(argIdx++, width_);
|
width_,
|
||||||
oclk_sto_pool_forward.set(argIdx++, pooled_height_);
|
pooled_height_,
|
||||||
oclk_sto_pool_forward.set(argIdx++, pooled_width_);
|
pooled_width_,
|
||||||
oclk_sto_pool_forward.set(argIdx++, kernel_h_);
|
ocl::KernelArg::PtrWriteOnly(top)
|
||||||
oclk_sto_pool_forward.set(argIdx++, kernel_w_);
|
);
|
||||||
oclk_sto_pool_forward.set(argIdx++, stride_h_);
|
|
||||||
oclk_sto_pool_forward.set(argIdx++, stride_w_);
|
|
||||||
oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
|
||||||
|
|
||||||
ret = oclk_sto_pool_forward.run(1, global, local, false);
|
ret = oclk_sto_pool_forward.run(1, global, local, false);
|
||||||
}
|
}
|
||||||
|
@ -44,14 +44,23 @@
|
|||||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||||
#define Dtype float
|
#define Dtype float
|
||||||
|
|
||||||
void TEMPLATE(max_pool_forward_impl, Dtype)(
|
#if defined KERNEL_MAX_POOL
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
#ifdef HAVE_MASK
|
||||||
|
TEMPLATE(max_pool_forward_mask, Dtype)
|
||||||
|
#else
|
||||||
|
TEMPLATE(max_pool_forward, Dtype)
|
||||||
|
#endif
|
||||||
|
(
|
||||||
const int nthreads, __global const Dtype* bottom_data, const int num,
|
const int nthreads, __global const Dtype* bottom_data, const int num,
|
||||||
const int channels, const int height, const int width,
|
const int channels, const int height, const int width,
|
||||||
const int pooled_height, const int pooled_width, const int kernel_h,
|
const int pooled_height, const int pooled_width,
|
||||||
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
|
__global Dtype* top_data
|
||||||
const int pad_w,
|
#ifdef HAVE_MASK
|
||||||
__global Dtype* top_data,
|
, __global Dtype* mask
|
||||||
const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask)
|
#endif
|
||||||
|
)
|
||||||
{
|
{
|
||||||
for (int index = get_global_id(0); index < nthreads;
|
for (int index = get_global_id(0); index < nthreads;
|
||||||
index += get_global_size(0))
|
index += get_global_size(0))
|
||||||
@ -60,10 +69,10 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
|
|||||||
const int ph = (index / pooled_width) % pooled_height;
|
const int ph = (index / pooled_width) % pooled_height;
|
||||||
const int c = (index / pooled_width / pooled_height) % channels;
|
const int c = (index / pooled_width / pooled_height) % channels;
|
||||||
const int n = index / pooled_width / pooled_height / channels;
|
const int n = index / pooled_width / pooled_height / channels;
|
||||||
int hstart = ph * stride_h - pad_h;
|
int hstart = ph * STRIDE_H - PAD_H;
|
||||||
int wstart = pw * stride_w - pad_w;
|
int wstart = pw * STRIDE_W - PAD_W;
|
||||||
const int hend = min(hstart + kernel_h, height);
|
const int hend = min(hstart + KERNEL_H, height);
|
||||||
const int wend = min(wstart + kernel_w, width);
|
const int wend = min(wstart + KERNEL_W, width);
|
||||||
hstart = max(hstart, (int)0);
|
hstart = max(hstart, (int)0);
|
||||||
wstart = max(wstart, (int)0);
|
wstart = max(wstart, (int)0);
|
||||||
Dtype maxval = -FLT_MAX;
|
Dtype maxval = -FLT_MAX;
|
||||||
@ -79,38 +88,19 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
top_data[index] = maxval;
|
top_data[index] = maxval;
|
||||||
if (!no_mask) {
|
#ifdef HAVE_MASK
|
||||||
if (use_mask == 1) {
|
mask[index] = maxidx;
|
||||||
mask[index] = maxidx;
|
#endif
|
||||||
} else {
|
|
||||||
top_mask[index] = maxidx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void TEMPLATE(max_pool_forward, Dtype)(
|
#elif defined KERNEL_AVE_POOL
|
||||||
const int nthreads, __global const Dtype* bottom_data, const int num,
|
|
||||||
const int channels, const int height, const int width,
|
|
||||||
const int pooled_height, const int pooled_width, const int kernel_h,
|
|
||||||
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
|
|
||||||
const int pad_w,
|
|
||||||
__global Dtype* top_data,
|
|
||||||
const int use_mask, __global int* mask, __global Dtype* top_mask)
|
|
||||||
{
|
|
||||||
TEMPLATE(max_pool_forward_impl, Dtype)(
|
|
||||||
nthreads, bottom_data, num, channels, height, width,
|
|
||||||
pooled_height, pooled_width, kernel_h,
|
|
||||||
kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void TEMPLATE(ave_pool_forward, Dtype)(
|
__kernel void TEMPLATE(ave_pool_forward, Dtype)(
|
||||||
const int nthreads, __global const Dtype* const bottom_data, const int num,
|
const int nthreads, __global const Dtype* const bottom_data, const int num,
|
||||||
const int channels, const int height, const int width,
|
const int channels, const int height, const int width,
|
||||||
const int pooled_height, const int pooled_width, const int kernel_h,
|
const int pooled_height, const int pooled_width,
|
||||||
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
|
__global Dtype* top_data)
|
||||||
const int pad_w, __global Dtype* top_data)
|
|
||||||
{
|
{
|
||||||
for (int index = get_global_id(0); index < nthreads;
|
for (int index = get_global_id(0); index < nthreads;
|
||||||
index += get_global_size(0))
|
index += get_global_size(0))
|
||||||
@ -120,10 +110,10 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
|
|||||||
const int ph = (index / pooled_width) % pooled_height;
|
const int ph = (index / pooled_width) % pooled_height;
|
||||||
const int c = (index / pooled_width / pooled_height) % channels;
|
const int c = (index / pooled_width / pooled_height) % channels;
|
||||||
const int n = index / pooled_width / pooled_height / channels;
|
const int n = index / pooled_width / pooled_height / channels;
|
||||||
int hstart = ph * stride_h - pad_h;
|
int hstart = ph * STRIDE_H - PAD_H;
|
||||||
int wstart = pw * stride_w - pad_w;
|
int wstart = pw * STRIDE_W - PAD_W;
|
||||||
int hend = min(hstart + kernel_h, height + pad_h);
|
int hend = min(hstart + KERNEL_H, height + PAD_H);
|
||||||
int wend = min(wstart + kernel_w, width + pad_w);
|
int wend = min(wstart + KERNEL_W, width + PAD_W);
|
||||||
const int pool_size = (hend - hstart) * (wend - wstart);
|
const int pool_size = (hend - hstart) * (wend - wstart);
|
||||||
hstart = max(hstart, (int)0);
|
hstart = max(hstart, (int)0);
|
||||||
wstart = max(wstart, (int)0);
|
wstart = max(wstart, (int)0);
|
||||||
@ -142,11 +132,12 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#elif defined KERNEL_STO_POOL
|
||||||
|
|
||||||
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
|
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
|
||||||
const int nthreads, __global const Dtype* const bottom_data, const int num,
|
const int nthreads, __global const Dtype* const bottom_data, const int num,
|
||||||
const int channels, const int height, const int width,
|
const int channels, const int height, const int width,
|
||||||
const int pooled_height, const int pooled_width, const int kernel_h,
|
const int pooled_height, const int pooled_width,
|
||||||
const int kernel_w, const int stride_h, const int stride_w,
|
|
||||||
__global Dtype* top_data)
|
__global Dtype* top_data)
|
||||||
{
|
{
|
||||||
for (int index = get_global_id(0); index < nthreads;
|
for (int index = get_global_id(0); index < nthreads;
|
||||||
@ -156,10 +147,10 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
|
|||||||
const int ph = (index / pooled_width) % pooled_height;
|
const int ph = (index / pooled_width) % pooled_height;
|
||||||
const int c = (index / pooled_width / pooled_height) % channels;
|
const int c = (index / pooled_width / pooled_height) % channels;
|
||||||
const int n = index / pooled_width / pooled_height / channels;
|
const int n = index / pooled_width / pooled_height / channels;
|
||||||
const int hstart = ph * stride_h;
|
const int hstart = ph * STRIDE_H;
|
||||||
const int hend = min(hstart + kernel_h, height);
|
const int hend = min(hstart + KERNEL_H, height);
|
||||||
const int wstart = pw * stride_w;
|
const int wstart = pw * STRIDE_W;
|
||||||
const int wend = min(wstart + kernel_w, width);
|
const int wend = min(wstart + KERNEL_W, width);
|
||||||
// We set cumsum to be 0 to avoid divide-by-zero problems
|
// We set cumsum to be 0 to avoid divide-by-zero problems
|
||||||
Dtype cumsum = FLT_MIN;
|
Dtype cumsum = FLT_MIN;
|
||||||
Dtype cumvalues = 0.;
|
Dtype cumvalues = 0.;
|
||||||
@ -168,10 +159,13 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
|
|||||||
// First pass: get sum
|
// First pass: get sum
|
||||||
for (int h = hstart; h < hend; ++h) {
|
for (int h = hstart; h < hend; ++h) {
|
||||||
for (int w = wstart; w < wend; ++w) {
|
for (int w = wstart; w < wend; ++w) {
|
||||||
cumsum += bottom_slice[h * width + w];
|
Dtype v = bottom_slice[h * width + w];
|
||||||
cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
|
cumsum += v;
|
||||||
|
cumvalues += v * v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
top_data[index] = cumvalues / cumsum;
|
top_data[index] = cumvalues / cumsum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // KERNEL_*
|
||||||
|
Loading…
Reference in New Issue
Block a user