From a5c9d83617778e0f858c9f7fa5ae995e29a775ca Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 15 Sep 2013 19:56:05 +0400 Subject: [PATCH] fixed ocl::pyrUp for 2-byte types --- modules/ocl/src/opencl/pyr_up.cl | 766 +------------------------------ modules/ocl/src/pyrup.cpp | 20 +- 2 files changed, 34 insertions(+), 752 deletions(-) diff --git a/modules/ocl/src/opencl/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl index f58205c02a..88efa9539f 100644 --- a/modules/ocl/src/opencl/pyr_up.cl +++ b/modules/ocl/src/opencl/pyr_up.cl @@ -46,330 +46,25 @@ // //M*/ -uchar get_valid_uchar(float data) -{ - return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0); -} - /////////////////////////////////////////////////////////////////////// -////////////////////////// CV_8UC1 ////////////////////////////////// +//////////////////////// Generic PyrUp ////////////////////////////// /////////////////////////////////////////////////////////////////////// -__kernel void pyrUp_C1_D0(__global uchar* src, __global uchar* dst, +__kernel void pyrUp(__global Type* src, __global Type* dst, int srcRows, int dstRows, int srcCols, int dstCols, int srcOffset, int dstOffset, int srcStep, int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); - __local float s_srcPatch[10][10]; - __local float s_dstPatch[20][16]; - const int tidx = get_local_id(0); - const int tidy = get_local_id(1); + const int lsizex = get_local_size(0); const int lsizey = get_local_size(1); - if( tidx < 10 && tidy < 10 ) - { - int srcx = mad24((int)get_group_id(0), (lsizex>>1), tidx) - 1; - int srcy = mad24((int)get_group_id(1), (lsizey>>1), tidy) - 1; - - srcx = abs(srcx); - srcx = min(srcCols - 1,srcx); - - srcy = abs(srcy); - srcy = min(srcRows -1 ,srcy); - - s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]); - - } - - barrier(CLK_LOCAL_MEM_FENCE); - - float sum = 0; - const int evenFlag = (int)((tidx & 1) == 0); - const int oddFlag = (int)((tidx & 1) != 0); - const bool eveny = ((tidy & 1) == 0); - - if(eveny) - { - sum = (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; - - if (get_local_id(1) < 2) - { - sum = 0; - - if (eveny) - { - sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { - sum = 0; - - if (eveny) - { - sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; - } - s_dstPatch[4 + tidy][tidx] = sum; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0; - - sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx]; - sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx]; - sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx]; - sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx]; - sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx]; - - if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = convert_uchar_sat_rte(4.0f * sum); -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_16UC1 ///////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrUp_C1_D2(__global ushort* src, __global ushort* dst, - int srcRows, int dstRows, int srcCols, int dstCols, - int srcOffset, int dstOffset, int srcStep, int dstStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - __local float s_srcPatch[10][10]; - __local float s_dstPatch[20][16]; - - srcStep = srcStep >> 1; - dstStep = dstStep >> 1; - srcOffset = srcOffset >> 1; - dstOffset = dstOffset >> 1; - - - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) - { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; - - srcx = abs(srcx); - srcx = min(srcCols - 1,srcx); - - srcy = abs(srcy); - srcy = min(srcRows -1 ,srcy); - - s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); - - } - - barrier(CLK_LOCAL_MEM_FENCE); - - float sum = 0; - - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); - - if(eveny) - { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; - - if (get_local_id(1) < 2) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; - } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; - sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; - sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; - - if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = convert_ushort_sat_rte(4.0f * sum); -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_16SC1 ///////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrUp_C1_D3(__global short* src, __global short* dst, - int srcRows, int dstRows, int srcCols, int dstCols, - int srcOffset, int dstOffset, int srcStep, int dstStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - __local float s_srcPatch[10][10]; - __local float s_dstPatch[20][16]; - - srcStep = srcStep >> 1; - dstStep = dstStep >> 1; - srcOffset = srcOffset >> 1; - dstOffset = dstOffset >> 1; - - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) - { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; - - srcx = abs(srcx); - srcx = min(srcCols - 1,srcx); - - srcy = abs(srcy); - srcy = min(srcRows -1 ,srcy); - - s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - float sum = 0; - - const int evenFlag = (int)((get_local_id(0) & 1) == 0); - const int oddFlag = (int)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); - - if (eveny) - { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; - - if (get_local_id(1) < 2) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; - } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; - sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; - sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; - sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; - - if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum); -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_32FC1 ///////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrUp_C1_D5(__global float* src, __global float* dst, - int srcRows, int dstRows, int srcCols, int dstCols, - int srcOffset, int dstOffset, int srcStep, int dstStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); const int tidx = get_local_id(0); const int tidy = get_local_id(1); - const int lsizex = get_local_size(0); - const int lsizey = get_local_size(1); - __local float s_srcPatch[10][10]; - __local float s_dstPatch[20][16]; - - srcOffset = srcOffset >> 2; - dstOffset = dstOffset >> 2; - srcStep = srcStep >> 2; - dstStep = dstStep >> 2; + __local floatType s_srcPatch[10][10]; + __local floatType s_dstPatch[20][16]; if( tidx < 10 && tidy < 10 ) { @@ -382,451 +77,27 @@ __kernel void pyrUp_C1_D5(__global float* src, __global float* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]); - + s_srcPatch[tidy][tidx] = convertToFloat(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); - float sum = 0; - const int evenFlag = (int)((tidx & 1) == 0); - const int oddFlag = (int)((tidx & 1) != 0); + floatType sum = (floatType)0; + const floatType evenFlag = (floatType)((tidx & 1) == 0); + const floatType oddFlag = (floatType)((tidx & 1) != 0); const bool eveny = ((tidy & 1) == 0); + const floatType co1 = (floatType)0.375f; + const floatType co2 = (floatType)0.25f; + const floatType co3 = (floatType)0.0625f; if(eveny) { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[2 + tidy][tidx] = sum; - - if (tidy < 2) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[tidy][tidx] = sum; - } - - if (tidy > 13) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx + 2) >> 1)]; - } - s_dstPatch[4 + tidy][tidx] = sum; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx]; - sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][tidx]; - sum = sum + 0.375f * s_dstPatch[2 + tidy ][tidx]; - sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][tidx]; - sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx]; - - if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = (float)(4.0f * sum); -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_8UC4 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrUp_C4_D0(__global uchar4* src, __global uchar4* dst, - int srcRows, int dstRows, int srcCols, int dstCols, - int srcOffset, int dstOffset, int srcStep, int dstStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - const int tidx = get_local_id(0); - const int tidy = get_local_id(1); - const int lsizex = get_local_size(0); - const int lsizey = get_local_size(1); - __local float4 s_srcPatch[10][10]; - __local float4 s_dstPatch[20][16]; - - srcOffset >>= 2; - dstOffset >>= 2; - srcStep >>= 2; - dstStep >>= 2; - - - if( tidx < 10 && tidy < 10 ) - { - int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1; - int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1; - - srcx = abs(srcx); - srcx = min(srcCols - 1,srcx); - - srcy = abs(srcy); - srcy = min(srcRows -1 ,srcy); - - s_srcPatch[tidy][tidx] = convert_float4(src[srcx + srcy * srcStep]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - float4 sum = (float4)(0,0,0,0); - - const float4 evenFlag = (float4)((tidx & 1) == 0); - const float4 oddFlag = (float4)((tidx & 1) != 0); - const bool eveny = ((tidy & 1) == 0); - - float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); - float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); - float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); - - - if(eveny) - { - sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + ( evenFlag * co1) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; - - } - - s_dstPatch[2 + tidy][tidx] = sum; - - if (tidy < 2) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[tidy][tidx] = sum; - } - - if (tidy > 13) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)]; - - } - s_dstPatch[4 + tidy][tidx] = sum; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = co3 * s_dstPatch[2 + tidy - 2][tidx]; - sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx]; - sum = sum + co1 * s_dstPatch[2 + tidy ][tidx]; - sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx]; - sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx]; - - if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum); -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_16UC4 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrUp_C4_D2(__global ushort4* src, __global ushort4* dst, - int srcRows, int dstRows, int srcCols, int dstCols, - int srcOffset, int dstOffset, int srcStep, int dstStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - __local float4 s_srcPatch[10][10]; - __local float4 s_dstPatch[20][16]; - - srcOffset >>= 3; - dstOffset >>= 3; - srcStep >>= 3; - dstStep >>= 3; - - - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) - { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; - - srcx = abs(srcx); - srcx = min(srcCols - 1,srcx); - - srcy = abs(srcy); - srcy = min(srcRows -1 ,srcy); - - s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - float4 sum = (float4)(0,0,0,0); - - const float4 evenFlag = (float4)((get_local_id(0) & 1) == 0); - const float4 oddFlag = (float4)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); - - float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); - float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); - float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); - - - if(eveny) - { - sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; - - } - - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; - - if (get_local_id(1) < 2) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; - sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; - sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; - - } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)]; - sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)]; - sum = sum + co1 * s_dstPatch[2 + tidy ][get_local_id(0)]; - sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)]; - sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)]; - - if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = convert_ushort4_sat_rte(4.0f * sum); -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_16SC4 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrUp_C4_D3(__global short4* src, __global short4* dst, - int srcRows, int dstRows, int srcCols, int dstCols, - int srcOffset, int dstOffset, int srcStep, int dstStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - __local float4 s_srcPatch[10][10]; - __local float4 s_dstPatch[20][16]; - - srcOffset >>= 3; - dstOffset >>= 3; - srcStep >>= 3; - dstStep >>= 3; - - if( get_local_id(0) < 10 && get_local_id(1) < 10 ) - { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; - - srcx = abs(srcx); - srcx = min(srcCols - 1,srcx); - - srcy = abs(srcy); - srcy = min(srcRows -1 ,srcy); - - s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - float4 sum = (float4)(0,0,0,0); - - const float4 evenFlag = (float4)((get_local_id(0) & 1) == 0); - const float4 oddFlag = (float4)((get_local_id(0) & 1) != 0); - const bool eveny = ((get_local_id(1) & 1) == 0); - const int tidx = get_local_id(0); - - float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); - float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); - float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); - - - if(eveny) - { - sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; - sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; - sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; - - } - - s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; - - if (get_local_id(1) < 2) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; - sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; - sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; - } - - s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; - } - - if (get_local_id(1) > 13) - { - sum = 0; - - if (eveny) - { - sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; - sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; - - } - s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - sum = 0; - - const int tidy = get_local_id(1); - - sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)]; - sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)]; - sum = sum + co1 * s_dstPatch[2 + tidy ][get_local_id(0)]; - sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)]; - sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)]; - - if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = convert_short4_sat_rte(4.0f * sum); -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_32FC4 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrUp_C4_D5(__global float4* src, __global float4* dst, - int srcRows, int dstRows, int srcCols, int dstCols, - int srcOffset, int dstOffset, int srcStep, int dstStep) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - const int tidx = get_local_id(0); - const int tidy = get_local_id(1); - const int lsizex = get_local_size(0); - const int lsizey = get_local_size(1); - __local float4 s_srcPatch[10][10]; - __local float4 s_dstPatch[20][16]; - - srcOffset >>= 4; - dstOffset >>= 4; - srcStep >>= 4; - dstStep >>= 4; - - - if( tidx < 10 && tidy < 10 ) - { - int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + tidx) - 1; - int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + tidy) - 1; - - srcx = abs(srcx); - srcx = min(srcCols - 1,srcx); - - srcy = abs(srcy); - srcy = min(srcRows -1 ,srcy); - - s_srcPatch[tidy][tidx] = (float4)(src[srcx + srcy * srcStep]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - float4 sum = (float4)(0,0,0,0); - - const float4 evenFlag = (float4)((tidx & 1) == 0); - const float4 oddFlag = (float4)((tidx & 1) != 0); - const bool eveny = ((tidy & 1) == 0); - - float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); - float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); - float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); - - - if(eveny) - { - sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; + sum = ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)]; sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)]; sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx ) >> 1)]; sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)]; sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)]; - } s_dstPatch[2 + tidy][tidx] = sum; @@ -837,8 +108,8 @@ __kernel void pyrUp_C4_D5(__global float4* src, __global float4* dst, if (eveny) { - sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; - sum = sum + (oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; + sum = (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)]; sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)]; sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)]; @@ -853,24 +124,23 @@ __kernel void pyrUp_C4_D5(__global float4* src, __global float4* dst, if (eveny) { - sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)]; + sum = (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)]; sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)]; sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx ) >> 1)]; sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)]; sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)]; - } s_dstPatch[4 + tidy][tidx] = sum; } barrier(CLK_LOCAL_MEM_FENCE); - sum = co3 * s_dstPatch[2 + tidy - 2][tidx]; + sum = co3 * s_dstPatch[2 + tidy - 2][tidx]; sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx]; sum = sum + co1 * s_dstPatch[2 + tidy ][tidx]; sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx]; sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = 4.0f * sum; + dst[x + y * dstStep] = convertToType(4.0f * sum); } diff --git a/modules/ocl/src/pyrup.cpp b/modules/ocl/src/pyrup.cpp index 95a2915f41..043031072c 100644 --- a/modules/ocl/src/pyrup.cpp +++ b/modules/ocl/src/pyrup.cpp @@ -59,9 +59,10 @@ namespace cv namespace ocl { extern const char *pyr_up; + void pyrUp(const cv::ocl::oclMat &src, cv::ocl::oclMat &dst) { - int depth = src.depth(), channels = src.channels(); + int depth = src.depth(), channels = src.channels(), oclChannels = src.oclchannels(); CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F); CV_Assert(channels == 1 || channels == 3 || channels == 4); @@ -70,7 +71,17 @@ namespace cv Context *clCxt = src.clCxt; + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float" }; + char buildOptions[250], convertString[50]; + const char * const channelsString = oclChannels == 1 ? "" : "4"; + sprintf(convertString, "convert_%s%s_sat_rte", typeMap[depth], channelsString); + sprintf(buildOptions, "-D Type=%s%s -D floatType=float%s -D convertToType=%s -D convertToFloat=%s", + typeMap[depth], channelsString, channelsString, + depth == CV_32F ? "" : convertString, + oclChannels == 4 ? "convert_float4" : "(float)"); + const std::string kernelName = "pyrUp"; + int dststep = dst.step / dst.elemSize(), srcstep = src.step / src.elemSize(); std::vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); @@ -81,14 +92,15 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcstep)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dststep)); size_t globalThreads[3] = {dst.cols, dst.rows, 1}; size_t localThreads[3] = {16, 16, 1}; - openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth()); + openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, -1, -1, + buildOptions); } } }