mirror of
https://github.com/opencv/opencv.git
synced 2024-11-30 06:10:02 +08:00
Merge branch 'master' into tvl1_ali
This commit is contained in:
commit
32707317fa
@ -308,6 +308,23 @@ OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine(
|
||||
SANITY_CHECK(dst);
|
||||
}
|
||||
|
||||
OCL_PERF_TEST_P(TransposeFixture, TransposeInplace, ::testing::Combine(
|
||||
OCL_PERF_ENUM(Size(640, 640), Size(1280, 1280), Size(2160, 2160)), OCL_TEST_TYPES_134))
|
||||
{
|
||||
const Size_MatType_t params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int type = get<1>(params);
|
||||
|
||||
checkDeviceMaxMemoryAllocSize(srcSize, type);
|
||||
|
||||
UMat src(srcSize, type);
|
||||
declare.in(src, WARMUP_RNG).out(src, WARMUP_NONE);
|
||||
|
||||
OCL_TEST_CYCLE() cv::transpose(src, src);
|
||||
|
||||
SANITY_CHECK_NOTHING();
|
||||
}
|
||||
|
||||
///////////// Flip ////////////////////////
|
||||
|
||||
enum
|
||||
|
@ -3132,9 +3132,16 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
|
||||
(!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
|
||||
return false;
|
||||
|
||||
ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc,
|
||||
format("%s-D cn=%d -D T=%s%s", haveScalar ? "-D HAVE_SCALAR " : "",
|
||||
cn, ocl::typeToStr(sdepth), doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
|
||||
int kercn = haveScalar ? cn : std::max(std::min(ocl::predictOptimalVectorWidth(_src, _lowerb, _upperb, _dst), 4), cn);
|
||||
if (kercn % cn != 0)
|
||||
kercn = cn;
|
||||
int colsPerWI = kercn / cn;
|
||||
String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
|
||||
haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
|
||||
ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);
|
||||
|
||||
ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, opts);
|
||||
if (ker.empty())
|
||||
return false;
|
||||
|
||||
@ -3182,7 +3189,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
|
||||
}
|
||||
|
||||
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
|
||||
dstarg = ocl::KernelArg::WriteOnly(dst);
|
||||
dstarg = ocl::KernelArg::WriteOnly(dst, 1, colsPerWI);
|
||||
|
||||
if (haveScalar)
|
||||
{
|
||||
@ -3196,7 +3203,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
|
||||
ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
|
||||
ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI);
|
||||
|
||||
size_t globalsize[2] = { ssize.width, (ssize.height + rowsPerWI - 1) / rowsPerWI };
|
||||
size_t globalsize[2] = { ssize.width / colsPerWI, (ssize.height + rowsPerWI - 1) / rowsPerWI };
|
||||
return ker.run(2, globalsize, NULL, false);
|
||||
}
|
||||
|
||||
|
@ -851,6 +851,175 @@ void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
|
||||
namespace cv
|
||||
{
|
||||
|
||||
template<typename T, typename DT, typename WT>
|
||||
struct cvtScaleAbs_SSE2
|
||||
{
|
||||
int operator () (const T *, DT *, int, WT, WT) const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_SSE2
|
||||
|
||||
template <>
|
||||
struct cvtScaleAbs_SSE2<uchar, uchar, float>
|
||||
{
|
||||
int operator () (const uchar * src, uchar * dst, int width,
|
||||
float scale, float shift) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
|
||||
v_zero_f = _mm_setzero_ps();
|
||||
__m128i v_zero_i = _mm_setzero_si128();
|
||||
|
||||
for ( ; x <= width - 16; x += 16)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
|
||||
__m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i);
|
||||
__m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift);
|
||||
v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
|
||||
__m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift);
|
||||
v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
|
||||
__m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
|
||||
v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
|
||||
__m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
|
||||
v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
|
||||
|
||||
__m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
|
||||
_mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
|
||||
_mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct cvtScaleAbs_SSE2<ushort, uchar, float>
|
||||
{
|
||||
int operator () (const ushort * src, uchar * dst, int width,
|
||||
float scale, float shift) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
|
||||
v_zero_f = _mm_setzero_ps();
|
||||
__m128i v_zero_i = _mm_setzero_si128();
|
||||
|
||||
for ( ; x <= width - 8; x += 8)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
|
||||
__m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift);
|
||||
v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
|
||||
__m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift);
|
||||
v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
|
||||
|
||||
__m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
|
||||
_mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct cvtScaleAbs_SSE2<short, uchar, float>
|
||||
{
|
||||
int operator () (const short * src, uchar * dst, int width,
|
||||
float scale, float shift) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
|
||||
v_zero_f = _mm_setzero_ps();
|
||||
__m128i v_zero_i = _mm_setzero_si128();
|
||||
|
||||
for ( ; x <= width - 8; x += 8)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
|
||||
__m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift);
|
||||
v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
|
||||
__m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift);
|
||||
v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
|
||||
|
||||
__m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
|
||||
_mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct cvtScaleAbs_SSE2<int, uchar, float>
|
||||
{
|
||||
int operator () (const int * src, uchar * dst, int width,
|
||||
float scale, float shift) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
|
||||
v_zero_f = _mm_setzero_ps();
|
||||
__m128i v_zero_i = _mm_setzero_si128();
|
||||
|
||||
for ( ; x <= width - 8; x += 4)
|
||||
{
|
||||
__m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
|
||||
__m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
|
||||
v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
|
||||
|
||||
__m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i);
|
||||
_mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct cvtScaleAbs_SSE2<float, uchar, float>
|
||||
{
|
||||
int operator () (const float * src, uchar * dst, int width,
|
||||
float scale, float shift) const
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if (USE_SSE2)
|
||||
{
|
||||
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
|
||||
v_zero_f = _mm_setzero_ps();
|
||||
__m128i v_zero_i = _mm_setzero_si128();
|
||||
|
||||
for ( ; x <= width - 8; x += 4)
|
||||
{
|
||||
__m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift);
|
||||
v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst);
|
||||
|
||||
__m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i);
|
||||
_mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
|
||||
}
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
template<typename T, typename DT, typename WT> static void
|
||||
cvtScaleAbs_( const T* src, size_t sstep,
|
||||
DT* dst, size_t dstep, Size size,
|
||||
@ -858,10 +1027,12 @@ cvtScaleAbs_( const T* src, size_t sstep,
|
||||
{
|
||||
sstep /= sizeof(src[0]);
|
||||
dstep /= sizeof(dst[0]);
|
||||
cvtScaleAbs_SSE2<T, DT, WT> vop;
|
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep )
|
||||
{
|
||||
int x = 0;
|
||||
int x = vop(src, dst, size.width, scale, shift);
|
||||
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; x <= size.width - 4; x += 4 )
|
||||
{
|
||||
@ -879,7 +1050,6 @@ cvtScaleAbs_( const T* src, size_t sstep,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T, typename DT, typename WT> static void
|
||||
cvtScale_( const T* src, size_t sstep,
|
||||
DT* dst, size_t dstep, Size size,
|
||||
|
@ -2973,8 +2973,10 @@ static inline int divUp(int a, int b)
|
||||
|
||||
static bool ocl_transpose( InputArray _src, OutputArray _dst )
|
||||
{
|
||||
const ocl::Device & dev = ocl::Device::getDefault();
|
||||
const int TILE_DIM = 32, BLOCK_ROWS = 8;
|
||||
int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
|
||||
int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type),
|
||||
rowsPerWI = dev.isIntel() ? 4 : 1;
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
_dst.create(src.cols, src.rows, type);
|
||||
@ -2990,9 +2992,9 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
|
||||
}
|
||||
|
||||
ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
|
||||
format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d",
|
||||
format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d -D rowsPerWI=%d",
|
||||
ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth),
|
||||
cn, TILE_DIM, BLOCK_ROWS));
|
||||
cn, TILE_DIM, BLOCK_ROWS, rowsPerWI));
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
@ -3002,8 +3004,14 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
|
||||
k.args(ocl::KernelArg::ReadOnly(src),
|
||||
ocl::KernelArg::WriteOnlyNoSize(dst));
|
||||
|
||||
size_t localsize[3] = { TILE_DIM, BLOCK_ROWS, 1 };
|
||||
size_t globalsize[3] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS, 1 };
|
||||
size_t localsize[2] = { TILE_DIM, BLOCK_ROWS };
|
||||
size_t globalsize[2] = { src.cols, inplace ? (src.rows + rowsPerWI - 1) / rowsPerWI : (divUp(src.rows, TILE_DIM) * BLOCK_ROWS) };
|
||||
|
||||
if (inplace && dev.isIntel())
|
||||
{
|
||||
localsize[0] = 16;
|
||||
localsize[1] = dev.maxWorkGroupSize() / localsize[0];
|
||||
}
|
||||
|
||||
return k.run(2, globalsize, localsize, false);
|
||||
}
|
||||
|
@ -1416,7 +1416,16 @@ bool useOpenCL()
|
||||
{
|
||||
CoreTLSData* data = coreTlsData.get();
|
||||
if( data->useOpenCL < 0 )
|
||||
data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
|
||||
{
|
||||
try
|
||||
{
|
||||
data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
data->useOpenCL = 0;
|
||||
}
|
||||
}
|
||||
return data->useOpenCL > 0;
|
||||
}
|
||||
|
||||
@ -2228,7 +2237,8 @@ static cl_device_id selectOpenCLDevice()
|
||||
if (!isID)
|
||||
{
|
||||
deviceTypes.push_back("GPU");
|
||||
deviceTypes.push_back("CPU");
|
||||
if (configuration)
|
||||
deviceTypes.push_back("CPU");
|
||||
}
|
||||
else
|
||||
deviceTypes.push_back("ALL");
|
||||
@ -4427,11 +4437,13 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
|
||||
d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
|
||||
d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
|
||||
d.preferredVectorWidthDouble(), -1 }, kercn = vectorWidths[depth];
|
||||
if (d.isIntel())
|
||||
|
||||
// if the device says don't use vectors
|
||||
if (vectorWidths[0] == 1)
|
||||
{
|
||||
// it's heuristic
|
||||
int vectorWidthsIntel[] = { 16, 16, 8, 8, 1, 1, 1, -1 };
|
||||
kercn = vectorWidthsIntel[depth];
|
||||
int vectorWidthsOthers[] = { 16, 16, 8, 8, 1, 1, 1, -1 };
|
||||
kercn = vectorWidthsOthers[depth];
|
||||
}
|
||||
|
||||
if (ssize.width * cn < kercn || kercn <= 0)
|
||||
|
@ -52,7 +52,7 @@
|
||||
__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
#ifdef HAVE_SCALAR
|
||||
__global const T * src2, __global const T * src3,
|
||||
__global const srcT1 * src2, __global const srcT1 * src3,
|
||||
#else
|
||||
__global const uchar * src2ptr, int src2_step, int src2_offset,
|
||||
__global const uchar * src3ptr, int src3_step, int src3_offset,
|
||||
@ -64,31 +64,56 @@ __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_of
|
||||
|
||||
if (x < dst_cols)
|
||||
{
|
||||
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset));
|
||||
int dst_index = mad24(y0, dst_step, x + dst_offset);
|
||||
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(srcT1) * kercn, src1_offset));
|
||||
int dst_index = mad24(y0, dst_step, mad24(x, colsPerWI, dst_offset));
|
||||
#ifndef HAVE_SCALAR
|
||||
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset));
|
||||
int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset));
|
||||
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(srcT1) * kercn, src2_offset));
|
||||
int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(srcT1) * kercn, src3_offset));
|
||||
#endif
|
||||
|
||||
for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step)
|
||||
{
|
||||
__global const T * src1 = (__global const T *)(src1ptr + src1_index);
|
||||
#if kercn >= cn && kercn == 4 && depth <= 4 && !defined HAVE_SCALAR
|
||||
srcT src1 = *(__global const srcT *)(src1ptr + src1_index);
|
||||
srcT src2 = *(__global const srcT *)(src2ptr + src2_index);
|
||||
srcT src3 = *(__global const srcT *)(src3ptr + src3_index);
|
||||
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
|
||||
#if cn == 1
|
||||
dst[0] = src2 > src1 || src3 < src1 ? (dstT)(0) : (dstT)(255);
|
||||
#elif cn == 2
|
||||
dst[0] = (dstT)(src2.xy > src1.xy || src3.xy < src1.xy ||
|
||||
src2.zw > src1.zw || src3.zw < src1.zw ? (dstT)(0) : (dstT)(255);
|
||||
#elif cn == 4
|
||||
dst[0] = (dstT)(src2.x > src1.x || src3.x < src1.x ||
|
||||
src2.y > src1.y || src3.y < src1.y ||
|
||||
src2.z > src1.z || src3.z < src1.z ||
|
||||
src2.w > src1.w || src3.w < src1.w ? 0 : 255);
|
||||
#endif
|
||||
#else
|
||||
__global const srcT1 * src1 = (__global const srcT1 *)(src1ptr + src1_index);
|
||||
__global uchar * dst = dstptr + dst_index;
|
||||
#ifndef HAVE_SCALAR
|
||||
__global const T * src2 = (__global const T *)(src2ptr + src2_index);
|
||||
__global const T * src3 = (__global const T *)(src3ptr + src3_index);
|
||||
__global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);
|
||||
__global const srcT1 * src3 = (__global const srcT1 *)(src3ptr + src3_index);
|
||||
#endif
|
||||
|
||||
dst[0] = 255;
|
||||
|
||||
for (int c = 0; c < cn; ++c)
|
||||
if (src2[c] > src1[c] || src3[c] < src1[c])
|
||||
{
|
||||
dst[0] = 0;
|
||||
break;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int px = 0; px < colsPerWI; ++px, src1 += cn
|
||||
#ifndef HAVE_SCALAR
|
||||
, src2 += cn, src3 += cn
|
||||
#endif
|
||||
)
|
||||
{
|
||||
dst[px] = 255;
|
||||
|
||||
for (int c = 0; c < cn; ++c)
|
||||
if (src2[c] > src1[c] || src3[c] < src1[c])
|
||||
{
|
||||
dst[px] = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif // kercn >= cn
|
||||
#ifndef HAVE_SCALAR
|
||||
src2_index += src2_step;
|
||||
src3_index += src3_step;
|
||||
|
@ -53,7 +53,7 @@
|
||||
#define TSIZE ((int)sizeof(T1)*3)
|
||||
#endif
|
||||
|
||||
#define LDS_STEP TILE_DIM
|
||||
#define LDS_STEP (TILE_DIM + 1)
|
||||
|
||||
__kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset)
|
||||
@ -90,6 +90,7 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
|
||||
{
|
||||
int index_src = mad24(y, src_step, mad24(x, TSIZE, src_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
|
||||
if (y + i < src_rows)
|
||||
{
|
||||
@ -103,6 +104,7 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
|
||||
{
|
||||
int index_dst = mad24(y_index, dst_step, mad24(x_index, TSIZE, dst_offset));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
|
||||
if ((y_index + i) < src_cols)
|
||||
{
|
||||
@ -115,18 +117,24 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
|
||||
__kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_offset, int src_rows)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (y < src_rows && x < y)
|
||||
if (x < y + rowsPerWI)
|
||||
{
|
||||
int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));
|
||||
int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset));
|
||||
T tmp;
|
||||
|
||||
__global const uchar * src = srcptr + src_index;
|
||||
__global uchar * dst = srcptr + dst_index;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rowsPerWI; ++i, ++y, src_index += src_step, dst_index += TSIZE)
|
||||
if (y < src_rows && x < y)
|
||||
{
|
||||
__global uchar * src = srcptr + src_index;
|
||||
__global uchar * dst = srcptr + dst_index;
|
||||
|
||||
T tmp = loadpix(dst);
|
||||
storepix(loadpix(src), dst);
|
||||
storepix(tmp, src);
|
||||
tmp = loadpix(dst);
|
||||
storepix(loadpix(src), dst);
|
||||
storepix(tmp, src);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -448,13 +448,12 @@ double CvCaptureCAM::getProperty(int property_id){
|
||||
QTFormatDescription* format = [[connections objectAtIndex:0] formatDescription];
|
||||
NSSize s1 = [[format attributeForKey:QTFormatDescriptionVideoCleanApertureDisplaySizeAttribute] sizeValue];
|
||||
|
||||
int width=s1.width, height=s1.height;
|
||||
switch (property_id) {
|
||||
case CV_CAP_PROP_FRAME_WIDTH:
|
||||
retval = width;
|
||||
retval = s1.width;
|
||||
break;
|
||||
case CV_CAP_PROP_FRAME_HEIGHT:
|
||||
retval = height;
|
||||
retval = s1.height;
|
||||
break;
|
||||
default:
|
||||
retval = 0;
|
||||
@ -1011,22 +1010,22 @@ bool CvVideoWriter_QT::writeFrame(const IplImage* image) {
|
||||
cvCvtColor(image, argbimage, CV_BGR2BGRA);
|
||||
|
||||
|
||||
unsigned char* imagedata = (unsigned char*)argbimage->imageData;
|
||||
unsigned char* imagedata_ = (unsigned char*)argbimage->imageData;
|
||||
//BGRA --> ARGB
|
||||
|
||||
for (int j = 0; j < argbimage->height; j++) {
|
||||
int rowstart = argbimage->widthStep * j;
|
||||
for (int i = rowstart; i < rowstart+argbimage->widthStep; i+=4) {
|
||||
unsigned char temp = imagedata[i];
|
||||
imagedata[i] = 255;
|
||||
imagedata[i+3] = temp;
|
||||
temp = imagedata[i+2];
|
||||
imagedata[i+2] = imagedata[i+1];
|
||||
imagedata[i+1] = temp;
|
||||
unsigned char temp = imagedata_[i];
|
||||
imagedata_[i] = 255;
|
||||
imagedata_[i+3] = temp;
|
||||
temp = imagedata_[i+2];
|
||||
imagedata_[i+2] = imagedata_[i+1];
|
||||
imagedata_[i+1] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
NSBitmapImageRep* imageRep = [[NSBitmapImageRep alloc] initWithBitmapDataPlanes:&imagedata
|
||||
NSBitmapImageRep* imageRep = [[NSBitmapImageRep alloc] initWithBitmapDataPlanes:&imagedata_
|
||||
pixelsWide:movieSize.width
|
||||
pixelsHigh:movieSize.height
|
||||
bitsPerSample:8
|
||||
|
@ -231,7 +231,7 @@ OCL_PERF_TEST_P(IntegralFixture, Integral1, ::testing::Combine(OCL_TEST_SIZES, O
|
||||
|
||||
OCL_TEST_CYCLE() cv::integral(src, dst, ddepth);
|
||||
|
||||
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
|
||||
SANITY_CHECK(dst, 2e-6, ERROR_RELATIVE);
|
||||
}
|
||||
|
||||
OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32S, CV_32F)))
|
||||
@ -243,11 +243,11 @@ OCL_PERF_TEST_P(IntegralFixture, Integral2, ::testing::Combine(OCL_TEST_SIZES, O
|
||||
checkDeviceMaxMemoryAllocSize(srcSize, ddepth);
|
||||
|
||||
UMat src(srcSize, CV_8UC1), sum(srcSize + Size(1, 1), ddepth), sqsum(srcSize + Size(1, 1), CV_32F);
|
||||
declare.in(src, WARMUP_RNG).out(sum).out(sqsum);
|
||||
declare.in(src, WARMUP_RNG).out(sum, sqsum);
|
||||
|
||||
OCL_TEST_CYCLE() cv::integral(src, sum, sqsum, ddepth, CV_32F);
|
||||
|
||||
SANITY_CHECK(sum, 1e-6, ERROR_RELATIVE);
|
||||
SANITY_CHECK(sum, 2e-4, ERROR_RELATIVE);
|
||||
SANITY_CHECK(sqsum, 5e-5, ERROR_RELATIVE);
|
||||
}
|
||||
|
||||
|
38
modules/imgproc/perf/perf_moments.cpp
Normal file
38
modules/imgproc/perf/perf_moments.cpp
Normal file
@ -0,0 +1,38 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
|
||||
#include "perf_precomp.hpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace cv;
|
||||
using namespace perf;
|
||||
using namespace testing;
|
||||
using std::tr1::make_tuple;
|
||||
using std::tr1::get;
|
||||
|
||||
typedef std::tr1::tuple<Size, MatDepth, bool> MomentsParams_t;
|
||||
typedef perf::TestBaseWithParam<MomentsParams_t> MomentsFixture_val;
|
||||
|
||||
PERF_TEST_P(MomentsFixture_val, Moments1,
|
||||
::testing::Combine(
|
||||
testing::Values(TYPICAL_MAT_SIZES),
|
||||
testing::Values(CV_16U, CV_16S, CV_32F, CV_64F),
|
||||
testing::Bool()))
|
||||
{
|
||||
const MomentsParams_t params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const MatDepth srcDepth = get<1>(params);
|
||||
const bool binaryImage = get<2>(params);
|
||||
|
||||
cv::Moments m;
|
||||
Mat src(srcSize, srcDepth);
|
||||
declare.in(src, WARMUP_RNG);
|
||||
|
||||
TEST_CYCLE() m = cv::moments(src, binaryImage);
|
||||
|
||||
SANITY_CHECK_MOMENTS(m, 1e-4, ERROR_RELATIVE);
|
||||
}
|
@ -2730,8 +2730,6 @@ struct mRGBA2RGBA
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
|
||||
|
||||
static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
|
||||
{
|
||||
bool ok = false;
|
||||
@ -2739,23 +2737,17 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
|
||||
Size sz = src.size(), dstSz = sz;
|
||||
int scn = src.channels(), depth = src.depth(), bidx;
|
||||
int dims = 2, stripeSize = 1;
|
||||
size_t globalsize[] = { src.cols, src.rows };
|
||||
ocl::Kernel k;
|
||||
|
||||
if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
|
||||
return false;
|
||||
|
||||
cv::String opts = format("-D depth=%d -D scn=%d ", depth, scn);
|
||||
|
||||
ocl::Device dev = ocl::Device::getDefault();
|
||||
int pxPerWIy = 1;
|
||||
if (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) &&
|
||||
!(code == CV_BGR2Luv || code == CV_RGB2Luv || code == CV_LBGR2Luv || code == CV_LRGB2Luv ||
|
||||
code == CV_Luv2BGR || code == CV_Luv2RGB || code == CV_Luv2LBGR || code == CV_Luv2LRGB))
|
||||
pxPerWIy = 4;
|
||||
int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
|
||||
|
||||
globalsize[1] = DIVUP(globalsize[1], pxPerWIy);
|
||||
opts += format("-D PIX_PER_WI_Y=%d ", pxPerWIy);
|
||||
size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy };
|
||||
cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
|
||||
depth, scn, pxPerWIy);
|
||||
|
||||
switch (code)
|
||||
{
|
||||
|
@ -608,6 +608,11 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
|
||||
factor *= 255;
|
||||
factor = 1./(factor * factor * factor);
|
||||
|
||||
#if CV_SSE2
|
||||
volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
|
||||
__m128 v_factor = _mm_set1_ps((float)factor), v_m2 = _mm_set1_ps(-2.0f);
|
||||
#endif
|
||||
|
||||
Size size = src.size();
|
||||
int i, j;
|
||||
for( i = 0; i < size.height; i++ )
|
||||
@ -619,7 +624,26 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
|
||||
const float* d2ydata = (const float*)(D2y.data + i*D2y.step);
|
||||
const float* dxydata = (const float*)(Dxy.data + i*Dxy.step);
|
||||
|
||||
for( j = 0; j < size.width; j++ )
|
||||
j = 0;
|
||||
|
||||
#if CV_SSE2
|
||||
if (haveSSE2)
|
||||
{
|
||||
for( ; j <= size.width - 4; j += 4 )
|
||||
{
|
||||
__m128 v_dx = _mm_loadu_ps((const float *)(dxdata + j));
|
||||
__m128 v_dy = _mm_loadu_ps((const float *)(dydata + j));
|
||||
|
||||
__m128 v_s1 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dx), _mm_loadu_ps((const float *)(d2ydata + j)));
|
||||
__m128 v_s2 = _mm_mul_ps(_mm_mul_ps(v_dy, v_dy), _mm_loadu_ps((const float *)(d2xdata + j)));
|
||||
__m128 v_s3 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dy), _mm_loadu_ps((const float *)(dxydata + j)));
|
||||
v_s1 = _mm_mul_ps(v_factor, _mm_add_ps(v_s1, _mm_add_ps(v_s2, _mm_mul_ps(v_s3, v_m2))));
|
||||
_mm_storeu_ps(dstdata + j, v_s1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for( ; j < size.width; j++ )
|
||||
{
|
||||
float dx = dxdata[j];
|
||||
float dy = dydata[j];
|
||||
|
@ -3471,7 +3471,8 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
|
||||
return k.run(2, globalsize, localsize, false);
|
||||
}
|
||||
|
||||
const int optimizedSepFilterLocalSize = 16;
|
||||
const int optimizedSepFilterLocalWidth = 16;
|
||||
const int optimizedSepFilterLocalHeight = 8;
|
||||
|
||||
static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
|
||||
Mat row_kernel, Mat col_kernel,
|
||||
@ -3491,8 +3492,8 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
|
||||
borderType == BORDER_REFLECT_101))
|
||||
return false;
|
||||
|
||||
size_t lt2[2] = { optimizedSepFilterLocalSize, optimizedSepFilterLocalSize };
|
||||
size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) };
|
||||
size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
|
||||
size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]};
|
||||
|
||||
char cvt[2][40];
|
||||
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
|
||||
@ -3584,8 +3585,8 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
|
||||
}
|
||||
|
||||
CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
|
||||
imgSize.width > optimizedSepFilterLocalSize + anchor.x &&
|
||||
imgSize.height > optimizedSepFilterLocalSize + anchor.y &&
|
||||
imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
|
||||
imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
|
||||
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
|
||||
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
|
||||
(d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
|
||||
|
@ -4188,7 +4188,8 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
|
||||
const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
|
||||
|
||||
int scalarcn = cn == 3 ? 4 : cn;
|
||||
int wdepth = interpolation == INTER_NEAREST ? depth : std::max(CV_32S, depth);
|
||||
bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
|
||||
int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
|
||||
int sctype = CV_MAKETYPE(wdepth, scalarcn);
|
||||
|
||||
ocl::Kernel k;
|
||||
|
@ -1257,8 +1257,8 @@ static bool IPPMorphReplicate(int op, const Mat &src, Mat &dst, const Mat &kerne
|
||||
}
|
||||
#undef IPP_MORPH_CASE
|
||||
|
||||
#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8
|
||||
return false; /// It disables false positive warning in GCC 4.8.2
|
||||
#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 8
|
||||
return false; /// It disables false positive warning in GCC 4.8 and further
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@ -1333,69 +1333,98 @@ static bool IPPMorphOp(int op, InputArray _src, OutputArray _dst,
|
||||
if( iterations > 1 )
|
||||
return false;
|
||||
|
||||
if (IPPMorphReplicate( op, src, dst, kernel, ksize, anchor, rectKernel ))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
return IPPMorphReplicate( op, src, dst, kernel, ksize, anchor, rectKernel );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
|
||||
const Size & ksize, const Point & anchor, int iterations, int op)
|
||||
static bool ocl_morphOp(InputArray _src, OutputArray _dst, InputArray _kernel,
|
||||
Point anchor, int iterations, int op, int borderType,
|
||||
const Scalar &, int actual_op = -1, InputArray _extraMat = noArray())
|
||||
{
|
||||
CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
|
||||
|
||||
const ocl::Device & dev = ocl::Device::getDefault();
|
||||
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
bool doubleSupport = dev.doubleFPConfig() > 0;
|
||||
|
||||
if (depth == CV_64F && !doubleSupport)
|
||||
if ((depth == CV_64F && !doubleSupport) || borderType != BORDER_CONSTANT)
|
||||
return false;
|
||||
|
||||
UMat kernel8U;
|
||||
kernel.convertTo(kernel8U, CV_8U);
|
||||
kernel8U = kernel8U.reshape(1, 1);
|
||||
Mat kernel = _kernel.getMat();
|
||||
bool haveExtraMat = !_extraMat.empty();
|
||||
Size ksize = kernel.data ? kernel.size() : Size(3, 3), ssize = _src.size();
|
||||
CV_Assert(actual_op <= 3 || haveExtraMat);
|
||||
|
||||
bool rectKernel = true;
|
||||
if (iterations == 0 || kernel.rows*kernel.cols == 1)
|
||||
{
|
||||
Mat m = kernel.reshape(1, 1);
|
||||
for (int i = 0; i < m.size().area(); ++i)
|
||||
if (m.at<uchar>(i) != 1)
|
||||
{
|
||||
rectKernel = false;
|
||||
break;
|
||||
}
|
||||
_src.copyTo(_dst);
|
||||
return true;
|
||||
}
|
||||
|
||||
UMat src = _src.getUMat();
|
||||
if (!kernel.data)
|
||||
{
|
||||
kernel = getStructuringElement(MORPH_RECT, Size(1+iterations*2,1+iterations*2));
|
||||
anchor = Point(iterations, iterations);
|
||||
iterations = 1;
|
||||
}
|
||||
else if( iterations > 1 && countNonZero(kernel) == kernel.rows*kernel.cols )
|
||||
{
|
||||
anchor = Point(anchor.x*iterations, anchor.y*iterations);
|
||||
kernel = getStructuringElement(MORPH_RECT,
|
||||
Size(ksize.width + (iterations-1)*(ksize.width-1),
|
||||
ksize.height + (iterations-1)*(ksize.height-1)),
|
||||
anchor);
|
||||
iterations = 1;
|
||||
}
|
||||
|
||||
#ifdef ANDROID
|
||||
size_t localThreads[3] = {16, 8, 1};
|
||||
size_t localThreads[2] = { 16, 8 };
|
||||
#else
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
size_t localThreads[2] = { 16, 16 };
|
||||
#endif
|
||||
size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1};
|
||||
size_t globalThreads[2] = { ssize.width, ssize.height };
|
||||
|
||||
if (localThreads[0]*localThreads[1] * 2 < (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1))
|
||||
return false;
|
||||
|
||||
static const char * const op2str[] = { "ERODE", "DILATE" };
|
||||
String buildOptions = format("-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s%s%s"
|
||||
" -D T=%s -D DEPTH_%d -D cn=%d -D T1=%s", anchor.x, anchor.y,
|
||||
(int)localThreads[0], (int)localThreads[1], op2str[op],
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "", rectKernel ? " -D RECTKERNEL" : "",
|
||||
ocl::typeToStr(_src.type()), _src.depth(), cn, ocl::typeToStr(depth));
|
||||
// build processing
|
||||
String processing;
|
||||
Mat kernel8u;
|
||||
kernel.convertTo(kernel8u, CV_8U);
|
||||
for (int y = 0; y < kernel8u.rows; ++y)
|
||||
for (int x = 0; x < kernel8u.cols; ++x)
|
||||
if (kernel8u.at<uchar>(y, x) != 0)
|
||||
processing += format("PROCESS(%d,%d)", y, x);
|
||||
|
||||
std::vector<ocl::Kernel> kernels;
|
||||
static const char * const op2str[] = { "OP_ERODE", "OP_DILATE", NULL, NULL, "OP_GRADIENT", "OP_TOPHAT", "OP_BLACKHAT" };
|
||||
|
||||
char cvt[2][50];
|
||||
int wdepth = std::max(depth, CV_32F), scalarcn = cn == 3 ? 4 : cn;
|
||||
|
||||
if (actual_op < 0)
|
||||
actual_op = op;
|
||||
|
||||
std::vector<ocl::Kernel> kernels(iterations);
|
||||
for (int i = 0; i < iterations; i++)
|
||||
{
|
||||
ocl::Kernel k("morph", ocl::imgproc::morph_oclsrc, buildOptions);
|
||||
if (k.empty())
|
||||
int current_op = iterations == i + 1 ? actual_op : op;
|
||||
String buildOptions = format("-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s%s"
|
||||
" -D PROCESS_ELEMS=%s -D T=%s -D DEPTH_%d -D cn=%d -D T1=%s"
|
||||
" -D convertToWT=%s -D convertToT=%s -D ST=%s%s",
|
||||
anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], op2str[op],
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "", processing.c_str(),
|
||||
ocl::typeToStr(type), depth, cn, ocl::typeToStr(depth),
|
||||
ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
|
||||
ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
|
||||
ocl::typeToStr(CV_MAKE_TYPE(depth, scalarcn)),
|
||||
current_op == op ? "" : cv::format(" -D %s", op2str[current_op]).c_str());
|
||||
|
||||
kernels[i].create("morph", ocl::imgproc::morph_oclsrc, buildOptions);
|
||||
if (kernels[i].empty())
|
||||
return false;
|
||||
kernels.push_back(k);
|
||||
}
|
||||
|
||||
UMat src = _src.getUMat(), extraMat = _extraMat.getUMat();
|
||||
_dst.create(src.size(), src.type());
|
||||
UMat dst = _dst.getUMat();
|
||||
|
||||
@ -1406,9 +1435,13 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
|
||||
src.locateROI(wholesize, ofs);
|
||||
int wholecols = wholesize.width, wholerows = wholesize.height;
|
||||
|
||||
kernels[0].args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst),
|
||||
ofs.x, ofs.y, src.cols, src.rows, ocl::KernelArg::PtrReadOnly(kernel8U),
|
||||
wholecols, wholerows);
|
||||
if (haveExtraMat)
|
||||
kernels[0].args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst),
|
||||
ofs.x, ofs.y, src.cols, src.rows, wholecols, wholerows,
|
||||
ocl::KernelArg::ReadOnlyNoSize(extraMat));
|
||||
else
|
||||
kernels[0].args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst),
|
||||
ofs.x, ofs.y, src.cols, src.rows, wholecols, wholerows);
|
||||
|
||||
return kernels[0].run(2, globalThreads, localThreads, false);
|
||||
}
|
||||
@ -1422,19 +1455,20 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
|
||||
if (i == 0)
|
||||
{
|
||||
int cols = src.cols, rows = src.rows;
|
||||
src.locateROI(wholesize,ofs);
|
||||
src.locateROI(wholesize, ofs);
|
||||
src.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x);
|
||||
if(src.u != dst.u)
|
||||
source = src;
|
||||
else
|
||||
src.copyTo(source);
|
||||
|
||||
src.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
|
||||
source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
|
||||
}
|
||||
else
|
||||
{
|
||||
int cols = dst.cols, rows = dst.rows;
|
||||
dst.locateROI(wholesize,ofs);
|
||||
dst.locateROI(wholesize, ofs);
|
||||
dst.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x);
|
||||
dst.copyTo(source);
|
||||
dst.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
|
||||
@ -1442,13 +1476,18 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
|
||||
}
|
||||
source.locateROI(wholesize, ofs);
|
||||
|
||||
kernels[i].args(ocl::KernelArg::ReadOnlyNoSize(source), ocl::KernelArg::WriteOnlyNoSize(dst),
|
||||
ofs.x, ofs.y, source.cols, source.rows, ocl::KernelArg::PtrReadOnly(kernel8U),
|
||||
wholesize.width, wholesize.height);
|
||||
if (haveExtraMat && iterations == i + 1)
|
||||
kernels[i].args(ocl::KernelArg::ReadOnlyNoSize(source), ocl::KernelArg::WriteOnlyNoSize(dst),
|
||||
ofs.x, ofs.y, source.cols, source.rows, wholesize.width, wholesize.height,
|
||||
ocl::KernelArg::ReadOnlyNoSize(extraMat));
|
||||
else
|
||||
kernels[i].args(ocl::KernelArg::ReadOnlyNoSize(source), ocl::KernelArg::WriteOnlyNoSize(dst),
|
||||
ofs.x, ofs.y, source.cols, source.rows, wholesize.width, wholesize.height);
|
||||
|
||||
if (!kernels[i].run(2, globalThreads, localThreads, false))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1459,15 +1498,16 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
|
||||
Point anchor, int iterations,
|
||||
int borderType, const Scalar& borderValue )
|
||||
{
|
||||
#ifdef HAVE_OPENCL
|
||||
int src_type = _src.type(),
|
||||
src_cn = CV_MAT_CN(src_type), src_depth = CV_MAT_DEPTH(src_type);
|
||||
#endif
|
||||
|
||||
Mat kernel = _kernel.getMat();
|
||||
Size ksize = kernel.data ? kernel.size() : Size(3,3);
|
||||
anchor = normalizeAnchor(anchor, ksize);
|
||||
|
||||
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && _src.channels() <= 4 &&
|
||||
borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue() &&
|
||||
(op == MORPH_ERODE || op == MORPH_DILATE) &&
|
||||
anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1,
|
||||
ocl_morphOp(_src, _dst, kernel, anchor, iterations, op, borderType, borderValue) )
|
||||
|
||||
if (iterations == 0 || kernel.rows*kernel.cols == 1)
|
||||
{
|
||||
_src.copyTo(_dst);
|
||||
@ -1490,12 +1530,6 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
|
||||
iterations = 1;
|
||||
}
|
||||
|
||||
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && src_cn <= 4 &&
|
||||
(src_depth == CV_8U || src_depth == CV_32F || src_depth == CV_64F ) &&
|
||||
borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue() &&
|
||||
(op == MORPH_ERODE || op == MORPH_DILATE),
|
||||
ocl_morphology_op(_src, _dst, kernel, ksize, anchor, iterations, op) )
|
||||
|
||||
#if IPP_VERSION_X100 >= 801
|
||||
if( IPPMorphOp(op, _src, _dst, kernel, anchor, iterations, borderType, borderValue) )
|
||||
return;
|
||||
@ -1515,13 +1549,6 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
|
||||
|
||||
parallel_for_(Range(0, nStripes),
|
||||
MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
|
||||
|
||||
//Ptr<FilterEngine> f = createMorphologyFilter(op, src.type(),
|
||||
// kernel, anchor, borderType, borderType, borderValue );
|
||||
|
||||
//f->apply( src, dst );
|
||||
//for( int i = 1; i < iterations; i++ )
|
||||
// f->apply( dst, dst );
|
||||
}
|
||||
|
||||
}
|
||||
@ -1541,97 +1568,122 @@ void cv::dilate( InputArray src, OutputArray dst, InputArray kernel,
|
||||
morphOp( MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
}
|
||||
|
||||
void cv::morphologyEx( InputArray _src, OutputArray _dst, int op,
|
||||
InputArray kernel, Point anchor, int iterations,
|
||||
int borderType, const Scalar& borderValue )
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
namespace cv {
|
||||
|
||||
static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op,
|
||||
InputArray kernel, Point anchor, int iterations,
|
||||
int borderType, const Scalar& borderValue)
|
||||
{
|
||||
int src_type = _src.type(), dst_type = _dst.type(),
|
||||
src_cn = CV_MAT_CN(src_type), src_depth = CV_MAT_DEPTH(src_type);
|
||||
|
||||
bool use_opencl = cv::ocl::useOpenCL() && _src.isUMat() && _src.size() == _dst.size() && src_type == dst_type &&
|
||||
_src.dims()<=2 && (src_cn == 1 || src_cn == 4) && (anchor.x == -1) && (anchor.y == -1) &&
|
||||
(src_depth == CV_8U || src_depth == CV_32F || src_depth == CV_64F ) &&
|
||||
(borderType == cv::BORDER_CONSTANT) && (borderValue == morphologyDefaultBorderValue());
|
||||
|
||||
_dst.create(_src.size(), _src.type());
|
||||
Mat src, dst, temp;
|
||||
UMat usrc, udst, utemp;
|
||||
_dst.createSameSize(_src, _src.type());
|
||||
bool submat = _dst.isSubmatrix();
|
||||
UMat temp;
|
||||
_OutputArray _temp = submat ? _dst : _OutputArray(temp);
|
||||
|
||||
switch( op )
|
||||
{
|
||||
case MORPH_ERODE:
|
||||
erode( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
if (!ocl_morphOp( _src, _dst, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
|
||||
return false;
|
||||
break;
|
||||
case MORPH_DILATE:
|
||||
dilate( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
if (!ocl_morphOp( _src, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue ))
|
||||
return false;
|
||||
break;
|
||||
case MORPH_OPEN:
|
||||
erode( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
dilate( _dst, _dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
|
||||
return false;
|
||||
if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue ))
|
||||
return false;
|
||||
break;
|
||||
case MORPH_CLOSE:
|
||||
if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue ))
|
||||
return false;
|
||||
if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
|
||||
return false;
|
||||
break;
|
||||
case MORPH_GRADIENT:
|
||||
if (!ocl_morphOp( _src, temp, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
|
||||
return false;
|
||||
if (!ocl_morphOp( _src, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue, MORPH_GRADIENT, temp ))
|
||||
return false;
|
||||
break;
|
||||
case MORPH_TOPHAT:
|
||||
if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue ))
|
||||
return false;
|
||||
if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue, MORPH_TOPHAT, _src ))
|
||||
return false;
|
||||
break;
|
||||
case MORPH_BLACKHAT:
|
||||
if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue ))
|
||||
return false;
|
||||
if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue, MORPH_BLACKHAT, _src ))
|
||||
return false;
|
||||
break;
|
||||
default:
|
||||
CV_Error( CV_StsBadArg, "unknown morphological operation" );
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void cv::morphologyEx( InputArray _src, OutputArray _dst, int op,
|
||||
InputArray kernel, Point anchor, int iterations,
|
||||
int borderType, const Scalar& borderValue )
|
||||
{
|
||||
#ifdef HAVE_OPENCL
|
||||
Size ksize = kernel.size();
|
||||
anchor = normalizeAnchor(anchor, ksize);
|
||||
|
||||
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && _src.channels() <= 4 &&
|
||||
anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1 &&
|
||||
borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue(),
|
||||
ocl_morphologyEx(_src, _dst, op, kernel, anchor, iterations, borderType, borderValue))
|
||||
#endif
|
||||
|
||||
Mat src = _src.getMat(), temp;
|
||||
_dst.create(src.size(), src.type());
|
||||
Mat dst = _dst.getMat();
|
||||
|
||||
switch( op )
|
||||
{
|
||||
case MORPH_ERODE:
|
||||
erode( src, dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
break;
|
||||
case MORPH_DILATE:
|
||||
dilate( src, dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
break;
|
||||
case MORPH_OPEN:
|
||||
erode( src, dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
dilate( dst, dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
break;
|
||||
case CV_MOP_CLOSE:
|
||||
dilate( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
erode( _dst, _dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
dilate( src, dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
erode( dst, dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
break;
|
||||
case CV_MOP_GRADIENT:
|
||||
erode( _src, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel, anchor, iterations, borderType, borderValue );
|
||||
dilate( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
if(use_opencl)
|
||||
{
|
||||
udst = _dst.getUMat();
|
||||
subtract(udst, utemp, udst);
|
||||
}
|
||||
else
|
||||
{
|
||||
dst = _dst.getMat();
|
||||
dst -= temp;
|
||||
}
|
||||
erode( src, temp, kernel, anchor, iterations, borderType, borderValue );
|
||||
dilate( src, dst, kernel, anchor, iterations, borderType, borderValue );
|
||||
dst -= temp;
|
||||
break;
|
||||
case CV_MOP_TOPHAT:
|
||||
if(use_opencl)
|
||||
{
|
||||
usrc = _src.getUMat();
|
||||
udst = _dst.getUMat();
|
||||
if( usrc.u != udst.u )
|
||||
utemp = udst;
|
||||
}
|
||||
else
|
||||
{
|
||||
src = _src.getMat();
|
||||
dst = _dst.getMat();
|
||||
if( src.data != dst.data )
|
||||
temp = dst;
|
||||
}
|
||||
erode( _src, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel, anchor, iterations, borderType, borderValue );
|
||||
dilate( use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel,
|
||||
anchor, iterations, borderType, borderValue );
|
||||
if(use_opencl)
|
||||
subtract(usrc, utemp, udst);
|
||||
else
|
||||
dst = src - temp;
|
||||
if( src.data != dst.data )
|
||||
temp = dst;
|
||||
erode( src, temp, kernel, anchor, iterations, borderType, borderValue );
|
||||
dilate( temp, temp, kernel, anchor, iterations, borderType, borderValue );
|
||||
dst = src - temp;
|
||||
break;
|
||||
case CV_MOP_BLACKHAT:
|
||||
if(use_opencl)
|
||||
{
|
||||
usrc = _src.getUMat();
|
||||
udst = _dst.getUMat();
|
||||
if( usrc.u != udst.u )
|
||||
utemp = udst;
|
||||
}
|
||||
else
|
||||
{
|
||||
src = _src.getMat();
|
||||
dst = _dst.getMat();
|
||||
if( src.data != dst.data )
|
||||
temp = dst;
|
||||
}
|
||||
dilate( _src, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel, anchor, iterations, borderType, borderValue );
|
||||
erode( use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel,
|
||||
anchor, iterations, borderType, borderValue );
|
||||
if(use_opencl)
|
||||
subtract(utemp, usrc, udst);
|
||||
else
|
||||
dst = temp - src;
|
||||
if( src.data != dst.data )
|
||||
temp = dst;
|
||||
dilate( src, temp, kernel, anchor, iterations, borderType, borderValue );
|
||||
erode( temp, temp, kernel, anchor, iterations, borderType, borderValue );
|
||||
dst = temp - src;
|
||||
break;
|
||||
default:
|
||||
CV_Error( CV_StsBadArg, "unknown morphological operation" );
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -119,20 +119,17 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
||||
int liy = get_local_id(1);
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
// calculate pixel position in source image taking image offset into account
|
||||
int srcX = x + srcOffsetX - RADIUSX;
|
||||
int srcY = y + srcOffsetY - RADIUSY;
|
||||
|
||||
// extrapolate coordinates, if needed
|
||||
// and read my own source pixel into local memory
|
||||
// with account for extra border pixels, which will be read by starting workitems
|
||||
int clocY = liy;
|
||||
int cSrcY = srcY;
|
||||
do
|
||||
{
|
||||
int yb = cSrcY;
|
||||
int yb = clocY + srcOffsetY - RADIUSY;
|
||||
EXTRAPOLATE(yb, (height));
|
||||
|
||||
int clocX = lix;
|
||||
@ -149,53 +146,80 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
||||
while(clocX < BLK_X+(RADIUSX*2));
|
||||
|
||||
clocY += BLK_Y;
|
||||
cSrcY += BLK_Y;
|
||||
}
|
||||
while (clocY < BLK_Y+(RADIUSY*2));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do vertical filter pass
|
||||
// and store intermediate results to second local memory array
|
||||
int i, clocX = lix;
|
||||
WT sum = (WT) 0;
|
||||
do
|
||||
for (int y = 0; y < dst_rows; y+=BLK_Y)
|
||||
{
|
||||
sum = (WT) 0;
|
||||
for (i=0; i<=2*RADIUSY; i++)
|
||||
// do vertical filter pass
|
||||
// and store intermediate results to second local memory array
|
||||
int i, clocX = lix;
|
||||
WT sum = (WT) 0;
|
||||
do
|
||||
{
|
||||
sum = (WT) 0;
|
||||
for (i=0; i<=2*RADIUSY; i++)
|
||||
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||
sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||
sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum);
|
||||
#else
|
||||
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||
sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum);
|
||||
#endif
|
||||
lsmemDy[liy][clocX] = sum;
|
||||
clocX += BLK_X;
|
||||
}
|
||||
while(clocX < BLK_X+(RADIUSX*2));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
lsmemDy[liy][clocX] = sum;
|
||||
clocX += BLK_X;
|
||||
}
|
||||
while(clocX < BLK_X+(RADIUSX*2));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// if this pixel happened to be out of image borders because of global size rounding,
|
||||
// then just return
|
||||
if( x >= dst_cols || y >=dst_rows )
|
||||
return;
|
||||
|
||||
// do second horizontal filter pass
|
||||
// and calculate final result
|
||||
sum = 0.0f;
|
||||
for (i=0; i<=2*RADIUSX; i++)
|
||||
// if this pixel happened to be out of image borders because of global size rounding,
|
||||
// then just return
|
||||
if ((x < dst_cols) && (y + liy < dst_rows))
|
||||
{
|
||||
// do second horizontal filter pass
|
||||
// and calculate final result
|
||||
sum = 0.0f;
|
||||
for (i=0; i<=2*RADIUSX; i++)
|
||||
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
#else
|
||||
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||
#endif
|
||||
|
||||
#ifdef INTEGER_ARITHMETIC
|
||||
#ifdef INTEL_DEVICE
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
|
||||
#else
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
||||
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
||||
#endif
|
||||
#endif
|
||||
// store result into destination image
|
||||
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
|
||||
}
|
||||
|
||||
for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y)
|
||||
{
|
||||
int clocX = i % (BLK_X+(RADIUSX*2));
|
||||
int clocY = i / (BLK_X+(RADIUSX*2));
|
||||
lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int yb = y + liy + BLK_Y + srcOffsetY + RADIUSY;
|
||||
EXTRAPOLATE(yb, (height));
|
||||
|
||||
clocX = lix;
|
||||
int cSrcX = x + srcOffsetX - RADIUSX;
|
||||
do
|
||||
{
|
||||
int xb = cSrcX;
|
||||
EXTRAPOLATE(xb,(width));
|
||||
lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, yb, (width), (height), 0 );
|
||||
|
||||
clocX += BLK_X;
|
||||
cSrcX += BLK_X;
|
||||
}
|
||||
while(clocX < BLK_X+(RADIUSX*2));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// store result into destination image
|
||||
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
|
||||
}
|
||||
|
@ -1,512 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Shengen Yan,yanshengen@gmail.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifdef DOUBLE_SUPPORT
|
||||
#ifdef cl_amd_fp64
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#elif defined (cl_khr_fp64)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if sqdepth == 6
|
||||
#define CONVERT(step) ((step)>>1)
|
||||
#else
|
||||
#define CONVERT(step) ((step))
|
||||
#endif
|
||||
|
||||
#define LSIZE 256
|
||||
#define LSIZE_1 255
|
||||
#define LSIZE_2 254
|
||||
#define HF_LSIZE 128
|
||||
#define LOG_LSIZE 8
|
||||
#define LOG_NUM_BANKS 5
|
||||
#define NUM_BANKS 32
|
||||
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
|
||||
|
||||
#define noconvert
|
||||
|
||||
#if sdepth == 4
|
||||
|
||||
kernel void integral_cols(__global uchar4 *src, __global int *sum, __global TYPE *sqsum,
|
||||
int src_offset, int pre_invalid, int rows, int cols, int src_step, int dst_step, int dst1_step)
|
||||
{
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int4 src_t[2], sum_t[2];
|
||||
TYPE4 sqsum_t[2];
|
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
|
||||
__local int* sum_p;
|
||||
__local TYPE* sqsum_p;
|
||||
src_step = src_step >> 2;
|
||||
gid = gid << 1;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : 0);
|
||||
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : 0);
|
||||
|
||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[1] = (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
lm_sum[0][bf_loc] = src_t[0];
|
||||
lm_sqsum[0][bf_loc] = convert_TYPE4(src_t[0] * src_t[0]);
|
||||
|
||||
lm_sum[1][bf_loc] = src_t[1];
|
||||
lm_sqsum[1][bf_loc] = convert_TYPE4(src_t[1] * src_t[1]);
|
||||
|
||||
int offset = 1;
|
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
|
||||
}
|
||||
offset <<= 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < 2)
|
||||
{
|
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
}
|
||||
for(int d = 1; d < LSIZE; d <<= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
offset >>= 1;
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
|
||||
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
|
||||
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step /4, loc_s1 = loc_s0 + dst_step ;
|
||||
int loc_sq0 = gid * CONVERT(dst1_step) + i + lid - 1 - pre_invalid * dst1_step / sizeof(TYPE),loc_sq1 = loc_sq0 + CONVERT(dst1_step);
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
||||
lm_sqsum[1][bf_loc] += sqsum_t[1];
|
||||
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
|
||||
sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
|
||||
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
|
||||
sqsum[loc_sq0 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
|
||||
}
|
||||
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
|
||||
sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
|
||||
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
|
||||
sqsum[loc_sq1 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void integral_rows(__global int4 *srcsum, __global TYPE4 * srcsqsum,__global int *sum,
|
||||
__global TYPE *sqsum, int rows, int cols, int src_step, int src1_step, int sum_step,
|
||||
int sqsum_step, int sum_offset, int sqsum_offset)
|
||||
{
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int4 src_t[2], sum_t[2];
|
||||
TYPE4 sqsrc_t[2],sqsum_t[2];
|
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
|
||||
__local int *sum_p;
|
||||
__local TYPE *sqsum_p;
|
||||
src_step = src_step >> 4;
|
||||
src1_step = (src1_step / sizeof(TYPE)) >> 2 ;
|
||||
gid <<= 1;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid ] : (int4)0;
|
||||
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid ] : (TYPE4)0;
|
||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid + 1] : (int4)0;
|
||||
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid + 1] : (TYPE4)0;
|
||||
|
||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[1] = (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
lm_sum[0][bf_loc] = src_t[0];
|
||||
lm_sqsum[0][bf_loc] = sqsrc_t[0];
|
||||
|
||||
lm_sum[1][bf_loc] = src_t[1];
|
||||
lm_sqsum[1][bf_loc] = sqsrc_t[1];
|
||||
|
||||
int offset = 1;
|
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
|
||||
}
|
||||
offset <<= 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < 2)
|
||||
{
|
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
}
|
||||
for(int d = 1; d < LSIZE; d <<= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
offset >>= 1;
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
|
||||
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
|
||||
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(gid == 0 && (i + lid) <= rows)
|
||||
{
|
||||
sum[sum_offset + i + lid] = 0;
|
||||
sqsum[sqsum_offset + i + lid] = 0;
|
||||
}
|
||||
if(i + lid == 0)
|
||||
{
|
||||
int loc0 = gid * sum_step;
|
||||
int loc1 = gid * CONVERT(sqsum_step);
|
||||
for(int k = 1; k <= 8; k++)
|
||||
{
|
||||
if(gid * 4 + k > cols) break;
|
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
|
||||
sqsum[sqsum_offset + loc1 + k * sqsum_step / sizeof(TYPE)] = 0;
|
||||
}
|
||||
}
|
||||
int loc_s0 = sum_offset + gid * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
|
||||
int loc_sq0 = sqsum_offset + gid * CONVERT(sqsum_step) + sqsum_step / sizeof(TYPE) + i + lid, loc_sq1 = loc_sq0 + CONVERT(sqsum_step) ;
|
||||
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
||||
lm_sqsum[1][bf_loc] += sqsum_t[1];
|
||||
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
|
||||
sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 4 + k >= cols) break;
|
||||
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
|
||||
sqsum[loc_sq0 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
|
||||
}
|
||||
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
|
||||
sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 4 + 4 + k >= cols) break;
|
||||
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
|
||||
sqsum[loc_sq1 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
#elif sdepth == 5
|
||||
|
||||
// Column-wise (vertical) pass of the integral-image computation, float variant
// (compiled when sdepth == 5; `sum` accumulates in float, `sqsum` in TYPE).
// Each work-group processes two source columns of uchar4 pixels (gid is doubled
// below); for every column it produces running column sums of the pixel values
// and of their squares, writing partial results to `sum` / `sqsum` for the
// row-wise pass that follows.
//
// Parameters:
//   src          - source image viewed as packed uchar4 elements
//   sum, sqsum   - destination buffers for column prefix sums of v and v*v
//   src_offset   - element offset of the image start inside `src`
//   pre_invalid  - NOTE(review): appears to be a count of leading padding
//                  columns that must not be written to the output (see the
//                  `continue` guard in the first store loop) — confirm with
//                  the host-side caller
//   rows, cols   - image size in pixels
//   src_step, dst_step, dst1_step - row strides (in bytes) of src / sum / sqsum
kernel void integral_cols(__global uchar4 *src, __global float *sum, __global TYPE *sqsum,
                          int src_offset, int pre_invalid, int rows, int cols, int src_step, int dst_step, int dst1_step)
{
    int lid = get_local_id(0);
    int gid = get_group_id(0);
    float4 src_t[2], sum_t[2];       // current pixels / carry-in sums for the two columns
    TYPE4 sqsum_t[2];                // carry-in squared sums for the two columns
    // +LOG_LSIZE extra slots absorb the bank-conflict padding added by
    // GET_CONFLICT_OFFSET (defined elsewhere in this file as lid >> LOG_NUM_BANKS).
    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
    __local float* sum_p;
    __local TYPE* sqsum_p;
    src_step = src_step >> 2;        // byte stride -> uchar4-element stride
    gid = gid << 1;                  // each group owns two adjacent columns
    // Process the rows in tiles of LSIZE_1; the last scan element of one tile
    // is carried into the next iteration via sum_t / sqsum_t.
    for(int i = 0; i < rows; i =i + LSIZE_1)
    {
        // Load one pixel per lane for each of the two columns; out-of-range
        // rows contribute zero, out-of-range columns are clamped to cols-1.
        src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : (float4)0);
        src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : (float4)0);

        // Carry-in: total of the previous tile, stashed at slot LSIZE_2+LOG_LSIZE.
        sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
        sqsum_t[0] = (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
        sqsum_t[1] = (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        // Store into padded local memory; squared values are formed here.
        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
        lm_sqsum[0][bf_loc] = convert_TYPE4(src_t[0] * src_t[0]);
        // printf("%f\n", src_t[0].s0);

        lm_sum[1][bf_loc] = src_t[1];
        lm_sqsum[1][bf_loc] = convert_TYPE4(src_t[1] * src_t[1]);

        // Two-phase work-efficient scan. The group is split into two halves of
        // 128 lanes (lid & 127 selects the lane within a half, lid >> 7 selects
        // which column's buffer the lane works on).
        // Phase 1: up-sweep (pairwise reduction at doubling strides).
        int offset = 1;
        for(int d = LSIZE >> 1 ; d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // Clear the carry slot before the down-sweep rebuilds the scan.
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
        }
        // Phase 2: down-sweep — propagates partial sums back down, leaving a
        // prefix scan in lm_sum / lm_sqsum.
        for(int d = 1; d < LSIZE; d <<= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];

                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // Destination indices for the two columns; shifted left by one row
        // (i + lid - 1) and by pre_invalid padding columns.
        int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
        int loc_sq0 = gid * CONVERT(dst1_step) + i + lid - 1 - pre_invalid * dst1_step / sizeof(TYPE), loc_sq1 = loc_sq0 + CONVERT(dst1_step);
        // lid 0 holds no output (exclusive-scan shift); lanes past `rows` are idle.
        if(lid > 0 && (i+lid) <= rows)
        {
            // Add the carry from previous tiles, then scatter the 4 channel
            // lanes of each float4/TYPE4 to 4 consecutive output columns.
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
            lm_sqsum[0][bf_loc] += sqsum_t[0];
            lm_sqsum[1][bf_loc] += sqsum_t[1];
            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
            for(int k = 0; k < 4; k++)
            {
                // Skip columns inside the pre_invalid padding or past the image.
                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
                sqsum[loc_sq0 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
            }
            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
            for(int k = 0; k < 4; k++)
            {
                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
                sqsum[loc_sq1 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
|
||||
|
||||
// Row-wise (horizontal) pass of the integral-image computation, float variant
// (sdepth == 5). Consumes the per-column partial sums produced by
// integral_cols (float4 value sums and TYPE4 squared sums) and scans them
// along the rows, producing the final `sum` and `sqsum` integral images.
// Each work-group handles two float4/TYPE4 source elements per row
// (gid * 2 and gid * 2 + 1), i.e. 8 output columns.
//
// Parameters:
//   srcsum, srcsqsum     - intermediate buffers from the column pass
//   sum, sqsum           - final integral images (with a leading zero row/col)
//   rows, cols           - size of the intermediate data
//   src_step, src1_step  - byte strides of srcsum / srcsqsum
//   sum_step, sqsum_step - strides of the outputs
//   sum_offset, sqsum_offset - element offsets of the output starts
kernel void integral_rows(__global float4 *srcsum, __global TYPE4 * srcsqsum, __global float *sum ,
                          __global TYPE *sqsum, int rows, int cols, int src_step, int src1_step, int sum_step,
                          int sqsum_step, int sum_offset, int sqsum_offset)
{
    int lid = get_local_id(0);
    int gid = get_group_id(0);
    float4 src_t[2], sum_t[2];       // loaded elements / tile carry for value sums
    TYPE4 sqsrc_t[2],sqsum_t[2];     // loaded elements / tile carry for squared sums
    // +LOG_LSIZE padding absorbs GET_CONFLICT_OFFSET bank-conflict offsets.
    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
    __local float *sum_p;
    __local TYPE *sqsum_p;
    src_step = src_step >> 4;                     // bytes -> float4 elements
    src1_step = (src1_step / sizeof(TYPE)) >> 2;  // bytes -> TYPE4 elements
    // Tile over the rows; the running total is carried between tiles.
    for(int i = 0; i < rows; i =i + LSIZE_1)
    {
        // Each lane loads the two vector elements this group owns in its row;
        // rows beyond the image contribute zero.
        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid * 2] : (TYPE4)0;
        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid * 2 + 1] : (TYPE4)0;

        // Carry-in from the previous tile, stored at slot LSIZE_2 + LOG_LSIZE.
        sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
        sqsum_t[0] = (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
        sqsum_t[1] = (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
        lm_sqsum[0][bf_loc] = sqsrc_t[0];

        lm_sum[1][bf_loc] = src_t[1];
        lm_sqsum[1][bf_loc] = sqsrc_t[1];

        // Two-phase scan over each half of the work-group (128 lanes per half;
        // lid >> 7 picks the buffer, lid & 127 is the lane within the half).
        // Phase 1: up-sweep reduction.
        int offset = 1;
        for(int d = LSIZE >> 1 ; d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
        }
        // Phase 2: down-sweep, leaving prefix sums in local memory.
        for(int d = 1; d < LSIZE; d <<= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];

                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // Group 0 zero-fills the first output column (integral images carry a
        // leading zero border).
        if(gid == 0 && (i + lid) <= rows)
        {
            sum[sum_offset + i + lid] = 0;
            sqsum[sqsum_offset + i + lid] = 0;
        }
        // Lane handling the very first row zero-fills this group's slice of
        // the top border row (up to 8 columns per group).
        if(i + lid == 0)
        {
            int loc0 = gid * 2 * sum_step;
            int loc1 = gid * 2 * CONVERT(sqsum_step);
            for(int k = 1; k <= 8; k++)
            {
                if(gid * 8 + k > cols) break;
                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
                sqsum[sqsum_offset + loc1 + k * sqsum_step / sizeof(TYPE)] = 0;
            }
        }
        // Output locations, shifted by one row and one column for the border.
        int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
        int loc_sq0 = sqsum_offset + gid * 2 * CONVERT(sqsum_step) + sqsum_step / sizeof(TYPE) + i + lid, loc_sq1 = loc_sq0 + CONVERT(sqsum_step) ;
        if(lid > 0 && (i+lid) <= rows)
        {
            // Fold in the carry from previous tiles, then scatter the 4 vector
            // lanes of each element to 4 consecutive output columns.
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
            lm_sqsum[0][bf_loc] += sqsum_t[0];
            lm_sqsum[1][bf_loc] += sqsum_t[1];
            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
            for(int k = 0; k < 4; k++)
            {
                if(gid * 8 + k >= cols) break;
                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
                sqsum[loc_sq0 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
            }
            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
            for(int k = 0; k < 4; k++)
            {
                if(gid * 8 + 4 + k >= cols) break;
                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
                sqsum[loc_sq1 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
|
||||
|
||||
#endif
|
@ -1,46 +1,9 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Shengen Yan,yanshengen@gmail.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifdef DOUBLE_SUPPORT
|
||||
@ -51,237 +14,170 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define LSIZE 256
|
||||
#define LSIZE_1 255
|
||||
#define LSIZE_2 254
|
||||
#define HF_LSIZE 128
|
||||
#define LOG_LSIZE 8
|
||||
#define LOG_NUM_BANKS 5
|
||||
#define NUM_BANKS 32
|
||||
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
|
||||
|
||||
#if sdepth == 4
|
||||
#define sumT int
|
||||
#define vecSumT int4
|
||||
#define convertToSum4 convert_int4
|
||||
#elif sdepth == 5
|
||||
#define sumT float
|
||||
#define vecSumT float4
|
||||
#define convertToSum4 convert_float4
|
||||
#ifndef LOCAL_SUM_SIZE
|
||||
#define LOCAL_SUM_SIZE 16
|
||||
#endif
|
||||
|
||||
#define LOCAL_SUM_STRIDE (LOCAL_SUM_SIZE + 1)
|
||||
|
||||
kernel void integral_sum_cols(__global const uchar4 *src, __global uchar *sum_ptr,
|
||||
int src_offset, int rows, int cols, int src_step, int dst_step)
|
||||
|
||||
kernel void integral_sum_cols(__global const uchar *src_ptr, int src_step, int src_offset, int rows, int cols,
|
||||
__global uchar *buf_ptr, int buf_step, int buf_offset
|
||||
#ifdef SUM_SQUARE
|
||||
,__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
__global sumT *sum = (__global sumT *)sum_ptr;
|
||||
__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
|
||||
#ifdef SUM_SQUARE
|
||||
__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
|
||||
#endif
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
vecSumT src_t[2], sum_t[2];
|
||||
__local vecSumT lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local sumT* sum_p;
|
||||
src_step = src_step >> 2;
|
||||
gid = gid << 1;
|
||||
int lid_prim = ((lid & 127) << 1) + 1;
|
||||
for (int i = 0; i < rows; i += LSIZE_1)
|
||||
|
||||
int x = get_global_id(0);
|
||||
int src_index = x + src_offset;
|
||||
|
||||
sumT accum = 0;
|
||||
#ifdef SUM_SQUARE
|
||||
sumSQT accum_sq = 0;
|
||||
#endif
|
||||
for (int y = 0; y < rows; y += LOCAL_SUM_SIZE)
|
||||
{
|
||||
if (i + lid < rows)
|
||||
int lsum_index = lid;
|
||||
#pragma unroll
|
||||
for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, src_index+=src_step, lsum_index += LOCAL_SUM_STRIDE)
|
||||
{
|
||||
int src_index = mad24((lid+i), src_step, gid + src_offset);
|
||||
src_t[0] = convertToSum4(src[src_index]);
|
||||
src_t[1] = convertToSum4(src[src_index + 1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
src_t[0] = (vecSumT)0;
|
||||
src_t[1] = (vecSumT)0;
|
||||
}
|
||||
|
||||
if (i == 0)
|
||||
{
|
||||
sum_t[0] = (vecSumT)0;
|
||||
sum_t[1] = (vecSumT)0;
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_t[0] = lm_sum[0][LSIZE_2 + LOG_LSIZE];
|
||||
sum_t[1] = lm_sum[1][LSIZE_2 + LOG_LSIZE];
|
||||
if ((x < cols) && (y + yin < rows))
|
||||
{
|
||||
__global const uchar *src = src_ptr + src_index;
|
||||
accum += src[0];
|
||||
#ifdef SUM_SQUARE
|
||||
sumSQT temp = src[0] * src[0];
|
||||
accum_sq += temp;
|
||||
#endif
|
||||
}
|
||||
lm_sum[lsum_index] = accum;
|
||||
#ifdef SUM_SQUARE
|
||||
lm_sum_sq[lsum_index] = accum_sq;
|
||||
#endif
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
//int buf_index = buf_offset + buf_step * LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
|
||||
int buf_index = mad24(buf_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, buf_offset));
|
||||
#ifdef SUM_SQUARE
|
||||
int buf_sq_index = mad24(buf_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, buf_sq_offset));
|
||||
#endif
|
||||
|
||||
lm_sum[0][bf_loc] = src_t[0];
|
||||
lm_sum[1][bf_loc] = src_t[1];
|
||||
|
||||
int offset = 1;
|
||||
for (int d = LSIZE >> 1 ; d > 0; d>>=1)
|
||||
lsum_index = LOCAL_SUM_STRIDE * lid;
|
||||
#pragma unroll
|
||||
for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index ++)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int ai = offset * lid_prim - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
}
|
||||
offset <<= 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid < 2)
|
||||
{
|
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
}
|
||||
for (int d = 1; d < LSIZE; d <<= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
offset >>= 1;
|
||||
int ai = offset * lid_prim - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
int loc_s0 = mad24(gid, dst_step, i + lid - 1), loc_s1 = loc_s0 + dst_step;
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
sum_p = (__local sumT*)(&(lm_sum[0][bf_loc]));
|
||||
for (int k = 0; k < 4; k++)
|
||||
{
|
||||
if (gid * 4 + k >= cols)
|
||||
break;
|
||||
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
|
||||
}
|
||||
sum_p = (__local sumT*)(&(lm_sum[1][bf_loc]));
|
||||
for (int k = 0; k < 4; k++)
|
||||
{
|
||||
if (gid * 4 + k + 4 >= cols)
|
||||
break;
|
||||
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
|
||||
}
|
||||
__global sumT *buf = (__global sumT *)(buf_ptr + buf_index);
|
||||
buf[0] = lm_sum[lsum_index];
|
||||
buf_index += buf_step;
|
||||
#ifdef SUM_SQUARE
|
||||
__global sumSQT *bufsq = (__global sumSQT *)(buf_sq_ptr + buf_sq_index);
|
||||
bufsq[0] = lm_sum_sq[lsum_index];
|
||||
buf_sq_index += buf_sq_step;
|
||||
#endif
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
kernel void integral_sum_rows(__global const uchar *srcsum_ptr, __global uchar *sum_ptr,
|
||||
int rows, int cols, int src_step, int sum_step, int sum_offset)
|
||||
kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int buf_offset,
|
||||
#ifdef SUM_SQUARE
|
||||
__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset,
|
||||
#endif
|
||||
__global uchar *dst_ptr, int dst_step, int dst_offset, int rows, int cols
|
||||
#ifdef SUM_SQUARE
|
||||
,__global uchar *dst_sq_ptr, int dst_sq_step, int dst_sq_offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
__global const vecSumT *srcsum = (__global const vecSumT *)srcsum_ptr;
|
||||
__global sumT *sum = (__global sumT *)sum_ptr;
|
||||
__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
|
||||
#ifdef SUM_SQUARE
|
||||
__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
|
||||
#endif
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
vecSumT src_t[2], sum_t[2];
|
||||
__local vecSumT lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local sumT *sum_p;
|
||||
src_step = src_step >> 4;
|
||||
int lid_prim = ((lid & 127) << 1) + 1;
|
||||
for (int i = 0; i < rows; i += LSIZE_1)
|
||||
|
||||
int gs = get_global_size(0);
|
||||
|
||||
int x = get_global_id(0);
|
||||
|
||||
__global sumT *dst = (__global sumT *)(dst_ptr + dst_offset);
|
||||
for (int xin = x; xin < cols; xin += gs)
|
||||
{
|
||||
if (i + lid < rows)
|
||||
dst[xin] = 0;
|
||||
}
|
||||
dst_offset += dst_step;
|
||||
|
||||
if (x < rows - 1)
|
||||
{
|
||||
dst = (__global sumT *)(dst_ptr + mad24(x, dst_step, dst_offset));
|
||||
dst[0] = 0;
|
||||
}
|
||||
|
||||
int buf_index = mad24((int)sizeof(sumT), x, buf_offset);
|
||||
sumT accum = 0;
|
||||
|
||||
#ifdef SUM_SQUARE
|
||||
__global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset);
|
||||
for (int xin = x; xin < cols; xin += gs)
|
||||
{
|
||||
dst_sq[xin] = 0;
|
||||
}
|
||||
dst_sq_offset += dst_sq_step;
|
||||
|
||||
dst_sq = (__global sumSQT *)(dst_sq_ptr + mad24(x, dst_sq_step, dst_sq_offset));
|
||||
dst_sq[0] = 0;
|
||||
|
||||
int buf_sq_index = mad24((int)sizeof(sumSQT), x, buf_sq_offset);
|
||||
sumSQT accum_sq = 0;
|
||||
#endif
|
||||
|
||||
for (int y = 1; y < cols; y += LOCAL_SUM_SIZE)
|
||||
{
|
||||
int lsum_index = lid;
|
||||
#pragma unroll
|
||||
for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index += LOCAL_SUM_STRIDE)
|
||||
{
|
||||
int sum_idx = mad24(lid + i, src_step, gid * 2);
|
||||
src_t[0] = srcsum[sum_idx];
|
||||
src_t[1] = srcsum[sum_idx + 1];
|
||||
}
|
||||
else
|
||||
{
|
||||
src_t[0] = 0;
|
||||
src_t[1] = 0;
|
||||
}
|
||||
if (i == 0)
|
||||
{
|
||||
sum_t[0] = 0;
|
||||
sum_t[1] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_t[0] = lm_sum[0][LSIZE_2 + LOG_LSIZE];
|
||||
sum_t[1] = lm_sum[1][LSIZE_2 + LOG_LSIZE];
|
||||
__global const sumT *buf = (__global const sumT *)(buf_ptr + buf_index);
|
||||
accum += buf[0];
|
||||
lm_sum[lsum_index] = accum;
|
||||
buf_index += buf_step;
|
||||
#ifdef SUM_SQUARE
|
||||
__global const sumSQT *buf_sq = (__global const sumSQT *)(buf_sq_ptr + buf_sq_index);
|
||||
accum_sq += buf_sq[0];
|
||||
lm_sum_sq[lsum_index] = accum_sq;
|
||||
buf_sq_index += buf_sq_step;
|
||||
#endif
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
|
||||
lm_sum[0][bf_loc] = src_t[0];
|
||||
lm_sum[1][bf_loc] = src_t[1];
|
||||
|
||||
int offset = 1;
|
||||
for (int d = LSIZE >> 1 ; d > 0; d>>=1)
|
||||
if (y + lid < cols)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int ai = offset * lid_prim - 1, bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
//int dst_index = dst_offset + dst_step * LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
|
||||
int dst_index = mad24(dst_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, dst_offset));
|
||||
#ifdef SUM_SQUARE
|
||||
int dst_sq_index = mad24(dst_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, dst_sq_offset));
|
||||
#endif
|
||||
lsum_index = LOCAL_SUM_STRIDE * lid;
|
||||
int yin_max = min(rows - 1 - LOCAL_SUM_SIZE * gid, LOCAL_SUM_SIZE);
|
||||
#pragma unroll
|
||||
for (int yin = 0; yin < yin_max; yin++, lsum_index++)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
}
|
||||
offset <<= 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid < 2)
|
||||
{
|
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
}
|
||||
for (int d = 1; d < LSIZE; d <<= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
offset >>= 1;
|
||||
int ai = offset * lid_prim - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if ((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (gid == 0 && (i + lid) <= rows)
|
||||
{
|
||||
sum[sum_offset + i + lid] = 0;
|
||||
}
|
||||
if (i + lid == 0)
|
||||
{
|
||||
int loc0 = gid * 2 * sum_step;
|
||||
for(int k = 1; k <= 8; k++)
|
||||
{
|
||||
if (gid * 8 + k > cols)
|
||||
break;
|
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
sum_p = (__local sumT*)(&(lm_sum[0][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if (gid * 8 + k >= cols)
|
||||
break;
|
||||
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
|
||||
}
|
||||
sum_p = (__local sumT*)(&(lm_sum[1][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if (gid * 8 + 4 + k >= cols)
|
||||
break;
|
||||
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
|
||||
dst = (__global sumT *)(dst_ptr + dst_index);
|
||||
dst[0] = lm_sum[lsum_index];
|
||||
dst_index += dst_step;
|
||||
#ifdef SUM_SQUARE
|
||||
dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_index);
|
||||
dst_sq[0] = lm_sum_sq[lsum_index];
|
||||
dst_sq_index += dst_sq_step;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
@ -43,6 +43,8 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define noconvert
|
||||
|
||||
#if cn != 3
|
||||
#define loadpix(addr) *(__global const T *)(addr)
|
||||
#define storepix(val, addr) *(__global T *)(addr) = val
|
||||
@ -54,59 +56,75 @@
|
||||
#endif
|
||||
|
||||
#ifdef DEPTH_0
|
||||
#ifdef ERODE
|
||||
#define VAL 255
|
||||
#endif
|
||||
#ifdef DILATE
|
||||
#define VAL 0
|
||||
#endif
|
||||
#define MIN_VAL 0
|
||||
#define MAX_VAL UCHAR_MAX
|
||||
#elif defined DEPTH_1
|
||||
#define MIN_VAL SCHAR_MIN
|
||||
#define MAX_VAL SCHAR_MAX
|
||||
#elif defined DEPTH_2
|
||||
#define MIN_VAL 0
|
||||
#define MAX_VAL USHRT_MAX
|
||||
#elif defined DEPTH_3
|
||||
#define MIN_VAL SHRT_MIN
|
||||
#define MAX_VAL SHRT_MAX
|
||||
#elif defined DEPTH_4
|
||||
#define MIN_VAL INT_MIN
|
||||
#define MAX_VAL INT_MAX
|
||||
#elif defined DEPTH_5
|
||||
#ifdef ERODE
|
||||
#define VAL FLT_MAX
|
||||
#endif
|
||||
#ifdef DILATE
|
||||
#define VAL -FLT_MAX
|
||||
#endif
|
||||
#define MIN_VAL (-FLT_MAX)
|
||||
#define MAX_VAL FLT_MAX
|
||||
#elif defined DEPTH_6
|
||||
#ifdef ERODE
|
||||
#define VAL DBL_MAX
|
||||
#endif
|
||||
#ifdef DILATE
|
||||
#define VAL -DBL_MAX
|
||||
#endif
|
||||
#define MIN_VAL (-DBL_MAX)
|
||||
#define MAX_VAL DBL_MAX
|
||||
#endif
|
||||
|
||||
#ifdef ERODE
|
||||
#if defined(INTEL_DEVICE) && (DEPTH_0)
|
||||
#ifdef OP_ERODE
|
||||
#define VAL MAX_VAL
|
||||
#elif defined OP_DILATE
|
||||
#define VAL MIN_VAL
|
||||
#else
|
||||
#error "Unknown operation"
|
||||
#endif
|
||||
|
||||
#ifdef OP_ERODE
|
||||
#if defined INTEL_DEVICE && defined DEPTH_0
|
||||
// workaround for bug in Intel HD graphics drivers (10.18.10.3496 or older)
|
||||
#define __CAT(x, y) x##y
|
||||
#define CAT(x, y) __CAT(x, y)
|
||||
#define WA_CONVERT_1 CAT(convert_uint, cn)
|
||||
#define WA_CONVERT_2 CAT(convert_, T)
|
||||
#define convert_uint1 convert_uint
|
||||
#define MORPH_OP(A,B) WA_CONVERT_2(min(WA_CONVERT_1(A),WA_CONVERT_1(B)))
|
||||
#define MORPH_OP(A, B) WA_CONVERT_2(min(WA_CONVERT_1(A), WA_CONVERT_1(B)))
|
||||
#else
|
||||
#define MORPH_OP(A,B) min((A),(B))
|
||||
#define MORPH_OP(A, B) min((A), (B))
|
||||
#endif
|
||||
#endif
|
||||
#ifdef DILATE
|
||||
#define MORPH_OP(A,B) max((A),(B))
|
||||
#ifdef OP_DILATE
|
||||
#define MORPH_OP(A, B) max((A), (B))
|
||||
#endif
|
||||
|
||||
#define PROCESS(y, x) \
|
||||
res = MORPH_OP(res, LDS_DAT[mad24(l_y + y, width, l_x + x)]);
|
||||
|
||||
// BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
|
||||
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) < (l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
|
||||
|
||||
#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT
|
||||
#define EXTRA_PARAMS , __global const uchar * matptr, int mat_step, int mat_offset
|
||||
#else
|
||||
#define EXTRA_PARAMS
|
||||
#endif
|
||||
|
||||
__kernel void morph(__global const uchar * srcptr, int src_step, int src_offset,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset,
|
||||
int src_offset_x, int src_offset_y, int cols, int rows,
|
||||
__constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
|
||||
int src_whole_cols, int src_whole_rows EXTRA_PARAMS)
|
||||
{
|
||||
int gidx = get_global_id(0), gidy = get_global_id(1);
|
||||
int l_x = get_local_id(0), l_y = get_local_id(1);
|
||||
int x = get_group_id(0) * LSIZE0, y = get_group_id(1) * LSIZE1;
|
||||
int start_x = x + src_offset_x - RADIUSX;
|
||||
int end_x = x + src_offset_x + LSIZE0 + RADIUSX;
|
||||
int width = end_x - (x + src_offset_x - RADIUSX) + 1;
|
||||
int width = mad24(RADIUSX, 2, LSIZE0 + 1);
|
||||
int start_y = y + src_offset_y - RADIUSY;
|
||||
int point1 = mad24(l_y, LSIZE0, l_x);
|
||||
int point2 = point1 + LSIZE0 * LSIZE1;
|
||||
@ -117,7 +135,7 @@ __kernel void morph(__global const uchar * srcptr, int src_step, int src_offset,
|
||||
int start_addr = mad24(cur_y, src_step, cur_x * TSIZE);
|
||||
int start_addr2 = mad24(cur_y2, src_step, cur_x2 * TSIZE);
|
||||
|
||||
__local T LDS_DAT[2*LSIZE1*LSIZE0];
|
||||
__local T LDS_DAT[2 * LSIZE1 * LSIZE0];
|
||||
|
||||
// read pixels from src
|
||||
int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * TSIZE);
|
||||
@ -128,8 +146,8 @@ __kernel void morph(__global const uchar * srcptr, int src_step, int src_offset,
|
||||
T temp1 = loadpix(srcptr + start_addr2);
|
||||
|
||||
// judge if read out of boundary
|
||||
temp0 = ELEM(cur_x, 0, src_whole_cols, (T)(VAL),temp0);
|
||||
temp0 = ELEM(cur_y, 0, src_whole_rows, (T)(VAL),temp0);
|
||||
temp0 = ELEM(cur_x, 0, src_whole_cols, (T)(VAL), temp0);
|
||||
temp0 = ELEM(cur_y, 0, src_whole_rows, (T)(VAL), temp0);
|
||||
|
||||
temp1 = ELEM(cur_x2, 0, src_whole_cols, (T)(VAL), temp1);
|
||||
temp1 = ELEM(cur_y2, 0, src_whole_rows, (T)(VAL), temp1);
|
||||
@ -138,24 +156,26 @@ __kernel void morph(__global const uchar * srcptr, int src_step, int src_offset,
|
||||
LDS_DAT[point2] = temp1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
T res = (T)(VAL);
|
||||
for (int i = 0, sizey = 2 * RADIUSY + 1; i < sizey; i++)
|
||||
for (int j = 0, sizex = 2 * RADIUSX + 1; j < sizex; j++)
|
||||
{
|
||||
res =
|
||||
#ifndef RECTKERNEL
|
||||
mat_kernel[i*(2*RADIUSX+1)+j] ?
|
||||
#endif
|
||||
MORPH_OP(res, LDS_DAT[mad24(l_y + i, width, l_x + j)])
|
||||
#ifndef RECTKERNEL
|
||||
: res
|
||||
#endif
|
||||
;
|
||||
}
|
||||
|
||||
if (gidx < cols && gidy < rows)
|
||||
{
|
||||
T res = (T)(VAL);
|
||||
PROCESS_ELEMS;
|
||||
|
||||
int dst_index = mad24(gidy, dst_step, mad24(gidx, TSIZE, dst_offset));
|
||||
|
||||
#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT
|
||||
int mat_index = mad24(gidy, mat_step, mad24(gidx, TSIZE, mat_offset));
|
||||
T value = loadpix(matptr + mat_index);
|
||||
|
||||
#ifdef OP_GRADIENT
|
||||
storepix(convertToT(convertToWT(res) - convertToWT(value)), dstptr + dst_index);
|
||||
#elif defined OP_TOPHAT
|
||||
storepix(convertToT(convertToWT(value) - convertToWT(res)), dstptr + dst_index);
|
||||
#elif defined OP_BLACKHAT
|
||||
storepix(convertToT(convertToWT(res) - convertToWT(value)), dstptr + dst_index);
|
||||
#endif
|
||||
#else // erode or dilate
|
||||
storepix(res, dstptr + dst_index);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -61,6 +61,7 @@
|
||||
#define AB_SCALE (1 << AB_BITS)
|
||||
#define INTER_REMAP_COEF_BITS 15
|
||||
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
|
||||
#define ROUND_DELTA (1 << (AB_BITS - INTER_BITS - 1))
|
||||
|
||||
#define noconvert
|
||||
|
||||
@ -122,6 +123,14 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
|
||||
#elif defined INTER_LINEAR
|
||||
|
||||
__constant float coeffs[64] =
|
||||
{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
|
||||
0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
|
||||
0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
|
||||
0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
|
||||
0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
|
||||
0.062500f, 0.937500f, 0.031250f, 0.968750f };
|
||||
|
||||
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__constant CT * M, ST scalar_)
|
||||
@ -131,24 +140,21 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
|
||||
if (dx < dst_cols)
|
||||
{
|
||||
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
|
||||
|
||||
int tmp = (dx << AB_BITS);
|
||||
int tmp = dx << AB_BITS;
|
||||
int X0_ = rint(M[0] * tmp);
|
||||
int Y0_ = rint(M[3] * tmp);
|
||||
|
||||
for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
|
||||
{
|
||||
int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;
|
||||
int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;
|
||||
int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
|
||||
int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;
|
||||
X0 = X0 >> (AB_BITS - INTER_BITS);
|
||||
Y0 = Y0 >> (AB_BITS - INTER_BITS);
|
||||
|
||||
short sx = convert_short_sat(X0 >> INTER_BITS);
|
||||
short sy = convert_short_sat(Y0 >> INTER_BITS);
|
||||
short ax = convert_short(X0 & (INTER_TAB_SIZE-1));
|
||||
short ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
|
||||
short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);
|
||||
short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
|
||||
|
||||
#if defined AMD_DEVICE || depth > 4
|
||||
WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
|
||||
if (sx >= 0 && sx < src_cols)
|
||||
{
|
||||
@ -180,8 +186,48 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
|
||||
#else
|
||||
float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
|
||||
WT val = fma(v0, tabx2 * taby2, fma(v1, tabx * taby2, fma(v2, tabx2 * taby, v3 * tabx * taby)));
|
||||
WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));
|
||||
storepix(convertToT(val), dstptr + dst_index);
|
||||
#endif
|
||||
#else // INTEL_DEVICE
|
||||
__constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);
|
||||
|
||||
int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
|
||||
int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
|
||||
|
||||
WT sum = (WT)(0), xsum;
|
||||
#pragma unroll
|
||||
for (int y = 0; y < 2; y++)
|
||||
{
|
||||
src_index = mad24(y, src_step, src_index0);
|
||||
if (sy + y >= 0 && sy + y < src_rows)
|
||||
{
|
||||
xsum = (WT)(0);
|
||||
if (sx >= 0 && sx + 2 < src_cols)
|
||||
{
|
||||
#if depth == 0 && cn == 1
|
||||
uchar2 value = vload2(0, srcptr + src_index);
|
||||
xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int x = 0; x < 2; x++)
|
||||
xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#pragma unroll
|
||||
for (int x = 0; x < 2; x++)
|
||||
xsum = fma(sx + x >= 0 && sx + x < src_cols ?
|
||||
convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
|
||||
}
|
||||
sum = fma(xsum, coeffs_y[y], sum);
|
||||
}
|
||||
else
|
||||
sum = fma(scalar, coeffs_y[y], sum);
|
||||
}
|
||||
|
||||
storepix(convertToT(sum), dstptr + dst_index);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@ -189,6 +235,8 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
|
||||
#elif defined INTER_CUBIC
|
||||
|
||||
#ifdef AMD_DEVICE
|
||||
|
||||
inline void interpolateCubic( float x, float* coeffs )
|
||||
{
|
||||
const float A = -0.75f;
|
||||
@ -199,6 +247,23 @@ inline void interpolateCubic( float x, float* coeffs )
|
||||
coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
__constant float coeffs[128] =
|
||||
{ 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,
|
||||
-0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,
|
||||
-0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,
|
||||
-0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,
|
||||
-0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,
|
||||
-0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,
|
||||
-0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,
|
||||
-0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,
|
||||
-0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,
|
||||
-0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,
|
||||
-0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };
|
||||
|
||||
#endif
|
||||
|
||||
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__constant CT * M, ST scalar_)
|
||||
@ -208,22 +273,17 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
|
||||
if (dx < dst_cols && dy < dst_rows)
|
||||
{
|
||||
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
|
||||
|
||||
int tmp = (dx << AB_BITS);
|
||||
int X0 = rint(M[0] * tmp);
|
||||
int Y0 = rint(M[3] * tmp);
|
||||
int X0 = rint(M[0] * tmp) + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
|
||||
int Y0 = rint(M[3] * tmp) + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;
|
||||
|
||||
X0 += rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;
|
||||
Y0 += rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;
|
||||
X0 = X0 >> (AB_BITS - INTER_BITS);
|
||||
Y0 = Y0 >> (AB_BITS - INTER_BITS);
|
||||
|
||||
int sx = (short)(X0 >> INTER_BITS) - 1;
|
||||
int sy = (short)(Y0 >> INTER_BITS) - 1;
|
||||
int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
|
||||
int ax = (short)(X0 & (INTER_TAB_SIZE-1));
|
||||
int sx = (short)(X0 >> INTER_BITS) - 1, sy = (short)(Y0 >> INTER_BITS) - 1;
|
||||
int ay = (short)(Y0 & (INTER_TAB_SIZE - 1)), ax = (short)(X0 & (INTER_TAB_SIZE - 1));
|
||||
|
||||
#ifdef AMD_DEVICE
|
||||
WT v[16];
|
||||
#pragma unroll
|
||||
for (int y = 0; y < 4; y++)
|
||||
@ -269,6 +329,46 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
for (int i = 0; i < 16; i++)
|
||||
sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);
|
||||
storepix(convertToT( sum ), dstptr + dst_index);
|
||||
#endif
|
||||
#else // INTEL_DEVICE
|
||||
__constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);
|
||||
|
||||
int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
|
||||
int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
|
||||
|
||||
WT sum = (WT)(0), xsum;
|
||||
#pragma unroll
|
||||
for (int y = 0; y < 4; y++)
|
||||
{
|
||||
src_index = mad24(y, src_step, src_index0);
|
||||
if (sy + y >= 0 && sy + y < src_rows)
|
||||
{
|
||||
xsum = (WT)(0);
|
||||
if (sx >= 0 && sx + 4 < src_cols)
|
||||
{
|
||||
#if depth == 0 && cn == 1
|
||||
uchar4 value = vload4(0, srcptr + src_index);
|
||||
xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int x = 0; x < 4; x++)
|
||||
xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#pragma unroll
|
||||
for (int x = 0; x < 4; x++)
|
||||
xsum = fma(sx + x >= 0 && sx + x < src_cols ?
|
||||
convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
|
||||
}
|
||||
sum = fma(xsum, coeffs_y[y], sum);
|
||||
}
|
||||
else
|
||||
sum = fma(scalar, coeffs_y[y], sum);
|
||||
}
|
||||
|
||||
storepix(convertToT(sum), dstptr + dst_index);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -235,97 +235,87 @@ typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
enum { vlen = 4 };
|
||||
|
||||
static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
|
||||
{
|
||||
if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 ||
|
||||
!(sdepth == CV_32S || sdepth == CV_32F) )
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
|
||||
if ( (_src.type() != CV_8UC1) ||
|
||||
!(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F)))
|
||||
return false;
|
||||
|
||||
ocl::Kernel k1("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc,
|
||||
format("-D sdepth=%d", sdepth));
|
||||
if (k1.empty())
|
||||
static const int tileSize = 16;
|
||||
|
||||
String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s",
|
||||
ocl::typeToStr(sdepth), tileSize,
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
|
||||
ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
|
||||
if (kcols.empty())
|
||||
return false;
|
||||
|
||||
Size size = _src.size(), t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width),
|
||||
ssize(size.width + 1, size.height + 1);
|
||||
_sum.create(ssize, sdepth);
|
||||
UMat src = _src.getUMat(), t_sum(t_size, sdepth), sum = _sum.getUMat();
|
||||
t_sum = t_sum(Range::all(), Range(0, size.height));
|
||||
|
||||
int offset = (int)src.offset / vlen;
|
||||
int vcols = (src.cols + vlen - 1) / vlen;
|
||||
int sum_offset = (int)sum.offset / vlen;
|
||||
|
||||
k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum),
|
||||
offset, src.rows, src.cols, (int)src.step, (int)t_sum.step);
|
||||
size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
|
||||
if (!k1.run(1, >, <, false))
|
||||
UMat src = _src.getUMat();
|
||||
Size src_size = src.size();
|
||||
Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
|
||||
UMat buf(bufsize, sdepth);
|
||||
kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
|
||||
size_t gt = src.cols, lt = tileSize;
|
||||
if (!kcols.run(1, >, <, false))
|
||||
return false;
|
||||
|
||||
ocl::Kernel k2("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc,
|
||||
format("-D sdepth=%d", sdepth));
|
||||
k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrWriteOnly(sum),
|
||||
t_sum.rows, t_sum.cols, (int)t_sum.step, (int)sum.step, sum_offset);
|
||||
ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
|
||||
if (krows.empty())
|
||||
return false;
|
||||
|
||||
size_t gt2 = t_sum.cols * 32, lt2 = 256;
|
||||
return k2.run(1, >2, <2, false);
|
||||
Size sumsize(src_size.width + 1, src_size.height + 1);
|
||||
_sum.create(sumsize, sdepth);
|
||||
UMat sum = _sum.getUMat();
|
||||
|
||||
krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum));
|
||||
gt = src.rows;
|
||||
return krows.run(1, >, <, false);
|
||||
}
|
||||
|
||||
static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
|
||||
{
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
|
||||
if ( _src.type() != CV_8UC1 || _src.step() % vlen != 0 || _src.offset() % vlen != 0 ||
|
||||
(!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
|
||||
if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
|
||||
return false;
|
||||
|
||||
char cvt[40];
|
||||
String opts = format("-D sdepth=%d -D sqdepth=%d -D TYPE=%s -D TYPE4=%s4 -D convert_TYPE4=%s%s",
|
||||
sdepth, sqdepth, ocl::typeToStr(sqdepth), ocl::typeToStr(sqdepth),
|
||||
ocl::convertTypeStr(sdepth, sqdepth, 4, cvt),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
static const int tileSize = 16;
|
||||
|
||||
ocl::Kernel k1("integral_cols", ocl::imgproc::integral_sqrsum_oclsrc, opts);
|
||||
if (k1.empty())
|
||||
String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s",
|
||||
ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth),
|
||||
tileSize,
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
|
||||
ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
|
||||
if (kcols.empty())
|
||||
return false;
|
||||
|
||||
Size size = _src.size(), dsize = Size(size.width + 1, size.height + 1),
|
||||
t_size = Size(((size.height + vlen - 1) / vlen) * vlen, size.width);
|
||||
UMat src = _src.getUMat(), t_sum(t_size, sdepth), t_sqsum(t_size, sqdepth);
|
||||
t_sum = t_sum(Range::all(), Range(0, size.height));
|
||||
t_sqsum = t_sqsum(Range::all(), Range(0, size.height));
|
||||
|
||||
_sum.create(dsize, sdepth);
|
||||
_sqsum.create(dsize, sqdepth);
|
||||
UMat sum = _sum.getUMat(), sqsum = _sqsum.getUMat();
|
||||
|
||||
int offset = (int)src.offset / vlen;
|
||||
int pre_invalid = src.offset % vlen;
|
||||
int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
|
||||
int sum_offset = (int)(sum.offset / sum.elemSize());
|
||||
int sqsum_offset = (int)(sqsum.offset / sqsum.elemSize());
|
||||
|
||||
k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum),
|
||||
ocl::KernelArg::PtrWriteOnly(t_sqsum), offset, pre_invalid, src.rows,
|
||||
src.cols, (int)src.step, (int)t_sum.step, (int)t_sqsum.step);
|
||||
|
||||
size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
|
||||
if (!k1.run(1, >, <, false))
|
||||
UMat src = _src.getUMat();
|
||||
Size src_size = src.size();
|
||||
Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
|
||||
UMat buf(bufsize, sdepth);
|
||||
UMat buf_sq(bufsize, sqdepth);
|
||||
kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf), ocl::KernelArg::WriteOnlyNoSize(buf_sq));
|
||||
size_t gt = src.cols, lt = tileSize;
|
||||
if (!kcols.run(1, >, <, false))
|
||||
return false;
|
||||
|
||||
ocl::Kernel k2("integral_rows", ocl::imgproc::integral_sqrsum_oclsrc, opts);
|
||||
if (k2.empty())
|
||||
ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
|
||||
if (krows.empty())
|
||||
return false;
|
||||
|
||||
k2.args(ocl::KernelArg::PtrReadOnly(t_sum), ocl::KernelArg::PtrReadOnly(t_sqsum),
|
||||
ocl::KernelArg::PtrWriteOnly(sum), ocl::KernelArg::PtrWriteOnly(sqsum),
|
||||
t_sum.rows, t_sum.cols, (int)t_sum.step, (int)t_sqsum.step,
|
||||
(int)sum.step, (int)sqsum.step, sum_offset, sqsum_offset);
|
||||
Size sumsize(src_size.width + 1, src_size.height + 1);
|
||||
_sum.create(sumsize, sdepth);
|
||||
UMat sum = _sum.getUMat();
|
||||
_sqsum.create(sumsize, sqdepth);
|
||||
UMat sum_sq = _sqsum.getUMat();
|
||||
|
||||
size_t gt2 = t_sum.cols * 32, lt2 = 256;
|
||||
return k2.run(1, >2, <2, false);
|
||||
krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::ReadOnlyNoSize(buf_sq), ocl::KernelArg::WriteOnly(sum), ocl::KernelArg::WriteOnlyNoSize(sum_sq));
|
||||
gt = src.rows;
|
||||
return krows.run(1, >, <, false);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -305,11 +305,11 @@ OCL_TEST_P(CvtColor8u32f, Lab2LRGBA) { performTest(3, 4, CVTCODE(Lab2LRGB), dept
|
||||
OCL_TEST_P(CvtColor8u32f, BGR2Luv) { performTest(3, 3, CVTCODE(BGR2Luv), depth == CV_8U ? 1 : 1e-2); }
|
||||
OCL_TEST_P(CvtColor8u32f, RGB2Luv) { performTest(3, 3, CVTCODE(RGB2Luv), depth == CV_8U ? 1 : 1e-2); }
|
||||
OCL_TEST_P(CvtColor8u32f, LBGR2Luv) { performTest(3, 3, CVTCODE(LBGR2Luv), depth == CV_8U ? 1 : 4e-3); }
|
||||
OCL_TEST_P(CvtColor8u32f, LRGB2Luv) { performTest(3, 3, CVTCODE(LRGB2Luv), depth == CV_8U ? 1 : 4e-3); }
|
||||
OCL_TEST_P(CvtColor8u32f, LRGB2Luv) { performTest(3, 3, CVTCODE(LRGB2Luv), depth == CV_8U ? 1 : 5e-3); }
|
||||
OCL_TEST_P(CvtColor8u32f, BGRA2Luv) { performTest(4, 3, CVTCODE(BGR2Luv), depth == CV_8U ? 1 : 8e-3); }
|
||||
OCL_TEST_P(CvtColor8u32f, RGBA2Luv) { performTest(4, 3, CVTCODE(RGB2Luv), depth == CV_8U ? 1 : 9e-3); }
|
||||
OCL_TEST_P(CvtColor8u32f, LBGRA2Luv) { performTest(4, 3, CVTCODE(LBGR2Luv), depth == CV_8U ? 1 : 4e-3); }
|
||||
OCL_TEST_P(CvtColor8u32f, LRGBA2Luv) { performTest(4, 3, CVTCODE(LRGB2Luv), depth == CV_8U ? 1 : 4e-3); }
|
||||
OCL_TEST_P(CvtColor8u32f, LBGRA2Luv) { performTest(4, 3, CVTCODE(LBGR2Luv), depth == CV_8U ? 1 : 5e-3); }
|
||||
OCL_TEST_P(CvtColor8u32f, LRGBA2Luv) { performTest(4, 3, CVTCODE(LRGB2Luv), depth == CV_8U ? 1 : 5e-3); }
|
||||
|
||||
OCL_TEST_P(CvtColor8u32f, Luv2BGR) { performTest(3, 3, CVTCODE(Luv2BGR), depth == CV_8U ? 1 : 7e-5); }
|
||||
OCL_TEST_P(CvtColor8u32f, Luv2RGB) { performTest(3, 3, CVTCODE(Luv2RGB), depth == CV_8U ? 1 : 7e-5); }
|
||||
|
@ -63,7 +63,7 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
|
||||
BorderType, // border type
|
||||
double, // optional parameter
|
||||
bool, // roi or not
|
||||
int) //width multiplier
|
||||
int) // width multiplier
|
||||
{
|
||||
int type, borderType, ksize;
|
||||
Size size;
|
||||
@ -244,8 +244,8 @@ OCL_TEST_P(Erode, Mat)
|
||||
random_roi();
|
||||
Mat kernel = randomMat(kernelSize, CV_8UC1, 0, 3);
|
||||
|
||||
OCL_OFF(cv::erode(src_roi, dst_roi, kernel, Point(-1,-1), iterations) );
|
||||
OCL_ON(cv::erode(usrc_roi, udst_roi, kernel, Point(-1,-1), iterations) );
|
||||
OCL_OFF(cv::erode(src_roi, dst_roi, kernel, Point(-1, -1), iterations) );
|
||||
OCL_ON(cv::erode(usrc_roi, udst_roi, kernel, Point(-1, -1), iterations) );
|
||||
|
||||
Near();
|
||||
}
|
||||
@ -266,8 +266,8 @@ OCL_TEST_P(Dilate, Mat)
|
||||
random_roi();
|
||||
Mat kernel = randomMat(kernelSize, CV_8UC1, 0, 3);
|
||||
|
||||
OCL_OFF(cv::dilate(src_roi, dst_roi, kernel, Point(-1,-1), iterations) );
|
||||
OCL_ON(cv::dilate(usrc_roi, udst_roi, kernel, Point(-1,-1), iterations) );
|
||||
OCL_OFF(cv::dilate(src_roi, dst_roi, kernel, Point(-1, -1), iterations) );
|
||||
OCL_ON(cv::dilate(usrc_roi, udst_roi, kernel, Point(-1, -1), iterations) );
|
||||
|
||||
Near();
|
||||
}
|
||||
@ -289,8 +289,8 @@ OCL_TEST_P(MorphologyEx, Mat)
|
||||
random_roi();
|
||||
Mat kernel = randomMat(kernelSize, CV_8UC1, 0, 3);
|
||||
|
||||
OCL_OFF(cv::morphologyEx(src_roi, dst_roi, op, kernel, Point(-1,-1), iterations) );
|
||||
OCL_ON(cv::morphologyEx(usrc_roi, udst_roi, op, kernel, Point(-1,-1), iterations) );
|
||||
OCL_OFF(cv::morphologyEx(src_roi, dst_roi, op, kernel, Point(-1, -1), iterations) );
|
||||
OCL_ON(cv::morphologyEx(usrc_roi, udst_roi, op, kernel, Point(-1, -1), iterations) );
|
||||
|
||||
Near();
|
||||
}
|
||||
@ -360,8 +360,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine(
|
||||
OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4),
|
||||
Values(3, 5, 7),
|
||||
Values(Size(0,0)),//not used
|
||||
Values((BorderType)BORDER_CONSTANT),//not used
|
||||
Values(Size(0, 0)), //not used
|
||||
Values((BorderType)BORDER_CONSTANT),
|
||||
Values(1.0, 2.0, 3.0),
|
||||
Bool(),
|
||||
Values(1))); // not used
|
||||
@ -369,20 +369,20 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(
|
||||
OCL_INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4),
|
||||
Values(3, 5, 7),
|
||||
Values(Size(0,0)),//not used
|
||||
Values((BorderType)BORDER_CONSTANT),//not used
|
||||
Values(Size(0, 0)), // not used
|
||||
Values((BorderType)BORDER_CONSTANT),
|
||||
Values(1.0, 2.0, 3.0),
|
||||
Bool(),
|
||||
Values(1))); //not used
|
||||
Values(1))); // not used
|
||||
|
||||
OCL_INSTANTIATE_TEST_CASE_P(Filter, MorphologyEx, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4),
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(3, 5, 7),
|
||||
Values(Size(0, 0), Size(0, 1), Size(0, 2), Size(0, 3), Size(0, 4), Size(0, 5), Size(0, 6)), // used as generator of operations
|
||||
Values((BorderType)BORDER_CONSTANT),// not used
|
||||
Values(Size(0, 2), Size(0, 3), Size(0, 4), Size(0, 5), Size(0, 6)), // used as generator of operations
|
||||
Values((BorderType)BORDER_CONSTANT),
|
||||
Values(1.0, 2.0, 3.0),
|
||||
Bool(),
|
||||
Values(1))); //not used
|
||||
Values(1))); // not used
|
||||
|
||||
|
||||
} } // namespace cvtest::ocl
|
||||
|
@ -42,6 +42,7 @@
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include <fstream>
|
||||
#include <queue>
|
||||
|
||||
#if defined _MSC_VER && _MSC_VER == 1500
|
||||
typedef int int_fast32_t;
|
||||
@ -57,6 +58,27 @@ using namespace std;
|
||||
namespace cv
|
||||
{
|
||||
|
||||
// Deletes a tree of ERStat regions starting at root. Used only
|
||||
// internally to this implementation.
|
||||
static void deleteERStatTree(ERStat* root) {
|
||||
queue<ERStat*> to_delete;
|
||||
to_delete.push(root);
|
||||
while (!to_delete.empty()) {
|
||||
ERStat* n = to_delete.front();
|
||||
to_delete.pop();
|
||||
ERStat* c = n->child;
|
||||
if (c != NULL) {
|
||||
to_delete.push(c);
|
||||
ERStat* sibling = c->next;
|
||||
while (sibling != NULL) {
|
||||
to_delete.push(sibling);
|
||||
sibling = sibling->next;
|
||||
}
|
||||
}
|
||||
delete n;
|
||||
}
|
||||
}
|
||||
|
||||
ERStat::ERStat(int init_level, int init_pixel, int init_x, int init_y) : pixel(init_pixel),
|
||||
level(init_level), area(0), perimeter(0), euler(0), probability(1.0),
|
||||
parent(0), child(0), next(0), prev(0), local_maxima(0),
|
||||
@ -497,7 +519,7 @@ void ERFilterNM::er_tree_extract( InputArray image )
|
||||
delete(stat->crossings);
|
||||
stat->crossings = NULL;
|
||||
}
|
||||
delete stat;
|
||||
deleteERStatTree(stat);
|
||||
}
|
||||
er_stack.clear();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user