mirror of
https://github.com/opencv/opencv.git
synced 2024-11-25 03:30:34 +08:00
Merge pull request #2840 from ilya-lavrenov:tapi_calchist
This commit is contained in:
commit
7688a18264
@ -1477,42 +1477,44 @@ enum
|
||||
|
||||
static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32S)
|
||||
{
|
||||
int compunits = ocl::Device::getDefault().maxComputeUnits();
|
||||
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
|
||||
const ocl::Device & dev = ocl::Device::getDefault();
|
||||
int compunits = dev.maxComputeUnits();
|
||||
size_t wgs = dev.maxWorkGroupSize();
|
||||
Size size = _src.size();
|
||||
bool use16 = size.width % 16 == 0 && _src.offset() % 16 == 0 && _src.step() % 16 == 0;
|
||||
int kercn = dev.isAMD() && use16 ? 16 : std::min(4, ocl::predictOptimalVectorWidth(_src));
|
||||
|
||||
ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc,
|
||||
format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D cn=%d",
|
||||
BINS, compunits, wgs, use16 ? 16 : 1));
|
||||
format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D kercn=%d -D T=%s%s",
|
||||
BINS, compunits, wgs, kercn,
|
||||
kercn == 4 ? "int" : ocl::typeToStr(CV_8UC(kercn)),
|
||||
_src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
|
||||
if (k1.empty())
|
||||
return false;
|
||||
|
||||
_hist.create(BINS, 1, ddepth);
|
||||
UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1),
|
||||
hist = ddepth == CV_32S ? _hist.getUMat() : UMat(BINS, 1, CV_32SC1);
|
||||
hist = _hist.getUMat();
|
||||
|
||||
k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total());
|
||||
k1.args(ocl::KernelArg::ReadOnly(src),
|
||||
ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total());
|
||||
|
||||
size_t globalsize = compunits * wgs;
|
||||
if (!k1.run(1, &globalsize, &wgs, false))
|
||||
return false;
|
||||
|
||||
char cvt[40];
|
||||
ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc,
|
||||
format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, (int)wgs));
|
||||
format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D convertToHT=%s -D HT=%s",
|
||||
BINS, compunits, (int)wgs, ocl::convertTypeStr(CV_32S, ddepth, 1, cvt),
|
||||
ocl::typeToStr(ddepth)));
|
||||
if (k2.empty())
|
||||
return false;
|
||||
|
||||
k2.args(ocl::KernelArg::PtrReadOnly(ghist), ocl::KernelArg::PtrWriteOnly(hist));
|
||||
if (!k2.run(1, &wgs, &wgs, false))
|
||||
return false;
|
||||
k2.args(ocl::KernelArg::PtrReadOnly(ghist),
|
||||
ocl::KernelArg::WriteOnlyNoSize(hist));
|
||||
|
||||
if (hist.depth() != ddepth)
|
||||
hist.convertTo(_hist, ddepth);
|
||||
else
|
||||
_hist.getUMatRef() = hist;
|
||||
|
||||
return true;
|
||||
return k2.run(1, &wgs, &wgs, false);
|
||||
}
|
||||
|
||||
static bool ocl_calcHist(InputArrayOfArrays images, OutputArray hist)
|
||||
@ -3428,24 +3430,40 @@ namespace cv {
|
||||
|
||||
static bool ocl_equalizeHist(InputArray _src, OutputArray _dst)
|
||||
{
|
||||
size_t wgs = std::min<size_t>(ocl::Device::getDefault().maxWorkGroupSize(), BINS);
|
||||
const ocl::Device & dev = ocl::Device::getDefault();
|
||||
int compunits = dev.maxComputeUnits();
|
||||
size_t wgs = dev.maxWorkGroupSize();
|
||||
Size size = _src.size();
|
||||
bool use16 = size.width % 16 == 0 && _src.offset() % 16 == 0 && _src.step() % 16 == 0;
|
||||
int kercn = dev.isAMD() && use16 ? 16 : std::min(4, ocl::predictOptimalVectorWidth(_src));
|
||||
|
||||
// calculation of histogram
|
||||
UMat hist;
|
||||
if (!ocl_calcHist1(_src, hist))
|
||||
ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc,
|
||||
format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D kercn=%d -D T=%s%s",
|
||||
BINS, compunits, wgs, kercn,
|
||||
kercn == 4 ? "int" : ocl::typeToStr(CV_8UC(kercn)),
|
||||
_src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
|
||||
if (k1.empty())
|
||||
return false;
|
||||
|
||||
UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1);
|
||||
|
||||
k1.args(ocl::KernelArg::ReadOnly(src),
|
||||
ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total());
|
||||
|
||||
size_t globalsize = compunits * wgs;
|
||||
if (!k1.run(1, &globalsize, &wgs, false))
|
||||
return false;
|
||||
|
||||
wgs = std::min<size_t>(ocl::Device::getDefault().maxWorkGroupSize(), BINS);
|
||||
UMat lut(1, 256, CV_8UC1);
|
||||
ocl::Kernel k("calcLUT", ocl::imgproc::histogram_oclsrc,
|
||||
format("-D BINS=%d -D HISTS_COUNT=1 -D WGS=%d", BINS, (int)wgs));
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
k.args(ocl::KernelArg::PtrWriteOnly(lut),
|
||||
ocl::KernelArg::PtrReadOnly(hist), (int)_src.total());
|
||||
ocl::Kernel k2("calcLUT", ocl::imgproc::histogram_oclsrc,
|
||||
format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d",
|
||||
BINS, compunits, (int)wgs));
|
||||
k2.args(ocl::KernelArg::PtrWriteOnly(lut),
|
||||
ocl::KernelArg::PtrReadOnly(ghist), (int)_src.total());
|
||||
|
||||
// calculation of LUT
|
||||
if (!k.run(1, &wgs, &wgs, false))
|
||||
if (!k2.run(1, &wgs, &wgs, false))
|
||||
return false;
|
||||
|
||||
// execute LUT transparently
|
||||
|
@ -37,86 +37,153 @@
|
||||
//
|
||||
//
|
||||
|
||||
#ifndef cn
|
||||
#define cn 1
|
||||
#ifndef kercn
|
||||
#define kercn 1
|
||||
#endif
|
||||
|
||||
#if cn == 16
|
||||
#define T uchar16
|
||||
#else
|
||||
#ifndef T
|
||||
#define T uchar
|
||||
#endif
|
||||
|
||||
#define noconvert
|
||||
|
||||
__kernel void calculate_histogram(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * hist, int total)
|
||||
__global uchar * histptr, int total)
|
||||
{
|
||||
int lid = get_local_id(0);
|
||||
int id = get_global_id(0) * cn;
|
||||
int id = get_global_id(0) * kercn;
|
||||
int gid = get_group_id(0);
|
||||
|
||||
__local int localhist[BINS];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = lid; i < BINS; i += WGS)
|
||||
localhist[i] = 0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int grain = HISTS_COUNT * WGS * cn; id < total; id += grain)
|
||||
int src_index;
|
||||
|
||||
for (int grain = HISTS_COUNT * WGS * kercn; id < total; id += grain)
|
||||
{
|
||||
int src_index = mad24(id / src_cols, src_step, src_offset + id % src_cols);
|
||||
#if cn == 1
|
||||
atomic_inc(localhist + convert_int(src[src_index]));
|
||||
#ifdef HAVE_SRC_CONT
|
||||
src_index = id;
|
||||
#else
|
||||
src_index = mad24(id / src_cols, src_step, src_offset + id % src_cols);
|
||||
#endif
|
||||
|
||||
#if kercn == 1
|
||||
atomic_inc(localhist + convert_int(src[src_index]));
|
||||
#elif kercn == 4
|
||||
int value = *(__global const int *)(src + src_index);
|
||||
atomic_inc(localhist + (value & 0xff));
|
||||
atomic_inc(localhist + ((value >> 8) & 0xff));
|
||||
atomic_inc(localhist + ((value >> 16) & 0xff));
|
||||
atomic_inc(localhist + ((value >> 24) & 0xff));
|
||||
#elif kercn >= 2
|
||||
T value = *(__global const T *)(src + src_index);
|
||||
atomic_inc(localhist + convert_int(value.s0));
|
||||
atomic_inc(localhist + convert_int(value.s1));
|
||||
atomic_inc(localhist + convert_int(value.s2));
|
||||
atomic_inc(localhist + convert_int(value.s3));
|
||||
atomic_inc(localhist + convert_int(value.s4));
|
||||
atomic_inc(localhist + convert_int(value.s5));
|
||||
atomic_inc(localhist + convert_int(value.s6));
|
||||
atomic_inc(localhist + convert_int(value.s7));
|
||||
atomic_inc(localhist + convert_int(value.s8));
|
||||
atomic_inc(localhist + convert_int(value.s9));
|
||||
atomic_inc(localhist + convert_int(value.sA));
|
||||
atomic_inc(localhist + convert_int(value.sB));
|
||||
atomic_inc(localhist + convert_int(value.sC));
|
||||
atomic_inc(localhist + convert_int(value.sD));
|
||||
atomic_inc(localhist + convert_int(value.sE));
|
||||
atomic_inc(localhist + convert_int(value.sF));
|
||||
atomic_inc(localhist + value.s0);
|
||||
atomic_inc(localhist + value.s1);
|
||||
#if kercn >= 4
|
||||
atomic_inc(localhist + value.s2);
|
||||
atomic_inc(localhist + value.s3);
|
||||
#if kercn >= 8
|
||||
atomic_inc(localhist + value.s4);
|
||||
atomic_inc(localhist + value.s5);
|
||||
atomic_inc(localhist + value.s6);
|
||||
atomic_inc(localhist + value.s7);
|
||||
#if kercn == 16
|
||||
atomic_inc(localhist + value.s8);
|
||||
atomic_inc(localhist + value.s9);
|
||||
atomic_inc(localhist + value.sA);
|
||||
atomic_inc(localhist + value.sB);
|
||||
atomic_inc(localhist + value.sC);
|
||||
atomic_inc(localhist + value.sD);
|
||||
atomic_inc(localhist + value.sE);
|
||||
atomic_inc(localhist + value.sF);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__global int * hist = (__global int *)(histptr + gid * BINS * (int)sizeof(int));
|
||||
#pragma unroll
|
||||
for (int i = lid; i < BINS; i += WGS)
|
||||
*(__global int *)(hist + mad24(gid, BINS * (int)sizeof(int), i * (int)sizeof(int))) = localhist[i];
|
||||
hist[i] = localhist[i];
|
||||
}
|
||||
|
||||
__kernel void merge_histogram(__global const int * ghist, __global int * hist)
|
||||
#ifndef HT
|
||||
#define HT int
|
||||
#endif
|
||||
|
||||
#ifndef convertToHT
|
||||
#define convertToHT noconvert
|
||||
#endif
|
||||
|
||||
__kernel void merge_histogram(__global const int * ghist, __global uchar * histptr, int hist_step, int hist_offset)
|
||||
{
|
||||
int lid = get_local_id(0);
|
||||
|
||||
__global HT * hist = (__global HT *)(histptr + hist_offset);
|
||||
#if WGS >= BINS
|
||||
HT res = (HT)(0);
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i = lid; i < BINS; i += WGS)
|
||||
hist[i] = 0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
hist[i] = (HT)(0);
|
||||
#endif
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < HISTS_COUNT; ++i)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int j = lid; j < BINS; j += WGS)
|
||||
hist[j] += ghist[mad24(i, BINS, j)];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#if WGS >= BINS
|
||||
res += convertToHT(ghist[j]);
|
||||
#else
|
||||
hist[j] += convertToHT(ghist[j]);
|
||||
#endif
|
||||
ghist += BINS;
|
||||
}
|
||||
|
||||
#if WGS >= BINS
|
||||
if (lid < BINS)
|
||||
*(__global HT *)(histptr + mad24(lid, hist_step, hist_offset)) = res;
|
||||
#endif
|
||||
}
|
||||
|
||||
__kernel void calcLUT(__global uchar * dst, __constant int * hist, int total)
|
||||
__kernel void calcLUT(__global uchar * dst, __global const int * ghist, int total)
|
||||
{
|
||||
int lid = get_local_id(0);
|
||||
__local int sumhist[BINS];
|
||||
__local float scale;
|
||||
|
||||
sumhist[lid] = hist[lid];
|
||||
#if WGS >= BINS
|
||||
int res = 0;
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i = lid; i < BINS; i += WGS)
|
||||
sumhist[i] = 0;
|
||||
#endif
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < HISTS_COUNT; ++i)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int j = lid; j < BINS; j += WGS)
|
||||
#if WGS >= BINS
|
||||
res += ghist[j];
|
||||
#else
|
||||
sumhist[j] += ghist[j];
|
||||
#endif
|
||||
ghist += BINS;
|
||||
}
|
||||
|
||||
#if WGS >= BINS
|
||||
if (lid < BINS)
|
||||
sumhist[lid] = res;
|
||||
#endif
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lid == 0)
|
||||
|
Loading…
Reference in New Issue
Block a user