mirror of
https://github.com/opencv/opencv.git
synced 2024-12-11 14:39:11 +08:00
189 lines
6.2 KiB
Common Lisp
189 lines
6.2 KiB
Common Lisp
/*M///////////////////////////////////////////////////////////////////////////////////////
|
|
// This file is part of OpenCV project.
|
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
// of this distribution and at http://opencv.org/license.html.
|
|
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
|
|
// Third party copyrights are property of their respective owners.
|
|
//M*/
|
|
|
|
#ifdef DOUBLE_SUPPORT
|
|
#ifdef cl_amd_fp64
|
|
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
|
#elif defined (cl_khr_fp64)
|
|
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
|
#endif
|
|
#endif
|
|
|
|
#ifndef LOCAL_SUM_SIZE
|
|
#define LOCAL_SUM_SIZE 16
|
|
#endif
|
|
|
|
#define LOCAL_SUM_STRIDE (LOCAL_SUM_SIZE + 1)
|
|
|
|
|
|
kernel void integral_sum_cols(__global const uchar *src_ptr, int src_step, int src_offset, int rows, int cols,
|
|
__global uchar *buf_ptr, int buf_step, int buf_offset
|
|
#ifdef SUM_SQUARE
|
|
,__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset
|
|
#endif
|
|
)
|
|
{
|
|
__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
|
|
#ifdef SUM_SQUARE
|
|
__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
|
|
#endif
|
|
int lid = get_local_id(0);
|
|
int gid = get_group_id(0);
|
|
|
|
int x = get_global_id(0);
|
|
int src_index = x + src_offset;
|
|
|
|
sumT accum = 0;
|
|
#ifdef SUM_SQUARE
|
|
sumSQT accum_sq = 0;
|
|
#endif
|
|
for (int y = 0; y < rows; y += LOCAL_SUM_SIZE)
|
|
{
|
|
int lsum_index = lid;
|
|
#pragma unroll
|
|
for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, src_index+=src_step, lsum_index += LOCAL_SUM_STRIDE)
|
|
{
|
|
if ((x < cols) && (y + yin < rows))
|
|
{
|
|
__global const uchar *src = src_ptr + src_index;
|
|
accum += src[0];
|
|
#ifdef SUM_SQUARE
|
|
sumSQT temp = src[0] * src[0];
|
|
accum_sq += temp;
|
|
#endif
|
|
}
|
|
lm_sum[lsum_index] = accum;
|
|
#ifdef SUM_SQUARE
|
|
lm_sum_sq[lsum_index] = accum_sq;
|
|
#endif
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
//int buf_index = buf_offset + buf_step * LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
|
|
int buf_index = mad24(buf_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, buf_offset));
|
|
#ifdef SUM_SQUARE
|
|
int buf_sq_index = mad24(buf_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, buf_sq_offset));
|
|
#endif
|
|
|
|
lsum_index = LOCAL_SUM_STRIDE * lid;
|
|
#pragma unroll
|
|
for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index ++)
|
|
{
|
|
__global sumT *buf = (__global sumT *)(buf_ptr + buf_index);
|
|
buf[0] = lm_sum[lsum_index];
|
|
buf_index += buf_step;
|
|
#ifdef SUM_SQUARE
|
|
__global sumSQT *bufsq = (__global sumSQT *)(buf_sq_ptr + buf_sq_index);
|
|
bufsq[0] = lm_sum_sq[lsum_index];
|
|
buf_sq_index += buf_sq_step;
|
|
#endif
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
}
|
|
|
|
kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int buf_offset,
|
|
#ifdef SUM_SQUARE
|
|
__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset,
|
|
#endif
|
|
__global uchar *dst_ptr, int dst_step, int dst_offset, int rows, int cols
|
|
#ifdef SUM_SQUARE
|
|
,__global uchar *dst_sq_ptr, int dst_sq_step, int dst_sq_offset
|
|
#endif
|
|
)
|
|
{
|
|
__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
|
|
#ifdef SUM_SQUARE
|
|
__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
|
|
#endif
|
|
int lid = get_local_id(0);
|
|
int gid = get_group_id(0);
|
|
|
|
int gs = get_global_size(0);
|
|
|
|
int x = get_global_id(0);
|
|
|
|
__global sumT *dst = (__global sumT *)(dst_ptr + dst_offset);
|
|
for (int xin = x; xin < cols; xin += gs)
|
|
{
|
|
dst[xin] = 0;
|
|
}
|
|
dst_offset += dst_step;
|
|
|
|
if (x < rows - 1)
|
|
{
|
|
dst = (__global sumT *)(dst_ptr + mad24(x, dst_step, dst_offset));
|
|
dst[0] = 0;
|
|
}
|
|
|
|
int buf_index = mad24((int)sizeof(sumT), x, buf_offset);
|
|
sumT accum = 0;
|
|
|
|
#ifdef SUM_SQUARE
|
|
__global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset);
|
|
for (int xin = x; xin < cols; xin += gs)
|
|
{
|
|
dst_sq[xin] = 0;
|
|
}
|
|
dst_sq_offset += dst_sq_step;
|
|
|
|
if (x < rows - 1)
|
|
{
|
|
dst_sq = (__global sumSQT *)(dst_sq_ptr + mad24(x, dst_sq_step, dst_sq_offset));
|
|
dst_sq[0] = 0;
|
|
}
|
|
|
|
int buf_sq_index = mad24((int)sizeof(sumSQT), x, buf_sq_offset);
|
|
sumSQT accum_sq = 0;
|
|
#endif
|
|
|
|
for (int y = 1; y < cols; y += LOCAL_SUM_SIZE)
|
|
{
|
|
int lsum_index = lid;
|
|
#pragma unroll
|
|
for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index += LOCAL_SUM_STRIDE)
|
|
{
|
|
__global const sumT *buf = (__global const sumT *)(buf_ptr + buf_index);
|
|
accum += buf[0];
|
|
lm_sum[lsum_index] = accum;
|
|
buf_index += buf_step;
|
|
#ifdef SUM_SQUARE
|
|
__global const sumSQT *buf_sq = (__global const sumSQT *)(buf_sq_ptr + buf_sq_index);
|
|
accum_sq += buf_sq[0];
|
|
lm_sum_sq[lsum_index] = accum_sq;
|
|
buf_sq_index += buf_sq_step;
|
|
#endif
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
if (y + lid < cols)
|
|
{
|
|
//int dst_index = dst_offset + dst_step * LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
|
|
int dst_index = mad24(dst_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, dst_offset));
|
|
#ifdef SUM_SQUARE
|
|
int dst_sq_index = mad24(dst_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, dst_sq_offset));
|
|
#endif
|
|
lsum_index = LOCAL_SUM_STRIDE * lid;
|
|
int yin_max = min(rows - 1 - LOCAL_SUM_SIZE * gid, LOCAL_SUM_SIZE);
|
|
#pragma unroll
|
|
for (int yin = 0; yin < yin_max; yin++, lsum_index++)
|
|
{
|
|
dst = (__global sumT *)(dst_ptr + dst_index);
|
|
dst[0] = lm_sum[lsum_index];
|
|
dst_index += dst_step;
|
|
#ifdef SUM_SQUARE
|
|
dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_index);
|
|
dst_sq[0] = lm_sum_sq[lsum_index];
|
|
dst_sq_index += dst_sq_step;
|
|
#endif
|
|
}
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
}
|