Merge pull request #1045 from bitwangyaoyao:2.4_fix

2025-07-22 12:17:04 +08:00 · 2013-06-26 16:55:06 +04:00 · 2013-06-26 16:55:06 +04:00 · bf6b119a3f
commit bf6b119a3f
parent 1cfe5aa41e 2ec1140b25
6 changed files with 325 additions and 329 deletions
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@ -44,43 +44,21 @@

 int main(int argc, const char *argv[])
 {
-    vector<ocl::Info> oclinfo;
-    int num_devices = getDevice(oclinfo);
-
-    if (num_devices < 1)
-    {
-        cerr << "no device found\n";
-        return -1;
-    }
-    // set this to overwrite binary cache every time the test starts
-    ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
-
-    int devidx = 0;
-
-    for (size_t i = 0; i < oclinfo.size(); i++)
-    {
-        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
-        {
-            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
-        }
-    }
-
-    redirectError(cvErrorCallback);
-
    const char *keys =
        "{ h | help    | false | print help message }"
        "{ f | filter  |       | filter for test }"
        "{ w | workdir |       | set working directory }"
        "{ l | list    | false | show all tests }"
        "{ d | device  | 0     | device id }"
+        "{ c | cpu_ocl | false | use cpu as ocl device}"
        "{ i | iters   | 10    | iteration count }"
        "{ m | warmup  | 1     | gpu warm up iteration count}"
-        "{ t | xtop    | 1.1	  | xfactor top boundary}"
-        "{ b | xbottom | 0.9	  | xfactor bottom boundary}"
+        "{ t | xtop    | 1.1   | xfactor top boundary}"
+        "{ b | xbottom | 0.9   | xfactor bottom boundary}"
        "{ v | verify  | false | only run gpu once to verify if problems occur}";

+    redirectError(cvErrorCallback);
    CommandLineParser cmd(argc, argv, keys);
-
    if (cmd.get<bool>("help"))
    {
        cout << "Avaible options:" << endl;
@ -88,14 +66,40 @@ int main(int argc, const char *argv[])
        return 0;
    }

-    int device = cmd.get<int>("device");
+    // get ocl devices
+    bool use_cpu = cmd.get<bool>("c");
+    vector<ocl::Info> oclinfo;
+    int num_devices = 0;
+    if(use_cpu)
+        num_devices = getDevice(oclinfo, ocl::CVCL_DEVICE_TYPE_CPU);
+    else
+        num_devices = getDevice(oclinfo);
+    if (num_devices < 1)
+    {
+        cerr << "no device found\n";
+        return -1;
+    }

+    // show device info
+    int devidx = 0;
+    for (size_t i = 0; i < oclinfo.size(); i++)
+    {
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
+        {
+            cout << "device " << devidx++ << ": " << oclinfo[i].DeviceName[j] << endl;
+        }
+    }
+
+    int device = cmd.get<int>("device");
    if (device < 0 || device >= num_devices)
    {
        cerr << "Invalid device ID" << endl;
        return -1;
    }

+    // set this to overwrite binary cache every time the test starts
+    ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
+    
    if (cmd.get<bool>("verify"))
    {
        TestSystem::instance().setNumIters(1);
@ -104,7 +108,6 @@ int main(int argc, const char *argv[])
    }

    devidx = 0;
-
    for (size_t i = 0; i < oclinfo.size(); i++)
    {
        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
@ -113,7 +116,7 @@ int main(int argc, const char *argv[])
            {
                ocl::setDevice(oclinfo[i], (int)j);
                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
-                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
+                cout << "use " << devidx << ": " <<oclinfo[i].DeviceName[j] << endl;
                goto END_DEV;
            }
        }
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@ -149,7 +149,7 @@ namespace cv
            cl_image_format format;
            int err;
            int depth    = mat.depth();
-            int channels = mat.channels();
+            int channels = mat.oclchannels();

            switch(depth)
            {
--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@ -16,7 +16,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Sen Liu, sen@multicorewareinc.com
+//    Sen Liu, swjtuls1987@126.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@ -277,8 +277,8 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
        blocky = size.height/TILE_SIZE;
    else
        blocky = size.height/TILE_SIZE + 1;
-    cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1);
-    cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
+    oclMat dst_m(blocky * 10, blockx, CV_64FC1);
+    oclMat sum(1, 10, CV_64FC1);
    int tile_width  = std::min(size.width,TILE_SIZE);
    int tile_height = std::min(size.height,TILE_SIZE);
    size_t localThreads[3]  = { tile_height, 1, 1};
@ -288,19 +288,16 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&type ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&depth ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&cn ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&coi ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&binary ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
-    openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);

    size_t localThreadss[3]  = { 128, 1, 1};
    size_t globalThreadss[3] = { 128, 1, 1};
@ -309,25 +306,23 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height ));
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width ));
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum ));
+    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
-    openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
-    double* dstsum = new double[10];
-    memset(dstsum,0,10*sizeof(double));
-    openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double));
-    mom->m00 = dstsum[0];
-    mom->m10 = dstsum[1];
-    mom->m01 = dstsum[2];
-    mom->m20 = dstsum[3];
-    mom->m11 = dstsum[4];
-    mom->m02 = dstsum[5];
-    mom->m30 = dstsum[6];
-    mom->m21 = dstsum[7];
-    mom->m12 = dstsum[8];
-    mom->m03 = dstsum[9];
-    delete [] dstsum;
-    openCLSafeCall(clReleaseMemObject(sum));
+    openCLExecuteKernel(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
+
+    Mat dstsum(sum);
+    mom->m00 = dstsum.at<double>(0, 0);
+    mom->m10 = dstsum.at<double>(0, 1);
+    mom->m01 = dstsum.at<double>(0, 2);
+    mom->m20 = dstsum.at<double>(0, 3);
+    mom->m11 = dstsum.at<double>(0, 4);
+    mom->m02 = dstsum.at<double>(0, 5);
+    mom->m30 = dstsum.at<double>(0, 6);
+    mom->m21 = dstsum.at<double>(0, 7);
+    mom->m12 = dstsum.at<double>(0, 8);
+    mom->m03 = dstsum.at<double>(0, 9);
+
    icvCompleteMomentState( mom );
 }

--- a/modules/ocl/src/opencl/moments.cl
+++ b/modules/ocl/src/opencl/moments.cl
@ -173,10 +173,10 @@ __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_widt
            sum[i] = dst_sum[i][0];
 }

-__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, int TILE_SIZE)
 {
    uchar tmp_coi[16]; // get the coi data
    uchar16 tmp[16];
@ -192,35 +192,43 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
    int x = wgidx*TILE_SIZE;  // vector length of uchar
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols - x);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if ( y+lidy < src_rows )
+    {
+        if( tileSize_width < TILE_SIZE )
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
+
+        if( coi > 0 )	//channel of interest
+            for(int i = 0; i < tileSize_width; i += VLEN_C)
+            {
+                for(int j=0; j<VLEN_C; j++)
+                    tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
+                                          tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_C)
+                tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
+    }

-    if( tileSize_width < TILE_SIZE )
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
-    if( coi > 0 )	//channel of interest
-        for(int i = 0; i < tileSize_width; i += VLEN_C)
-        {
-            for(int j=0; j<VLEN_C; j++)
-                tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
-                                      tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_C)
-            tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
    uchar16 zero = (uchar16)(0);
    uchar16 full = (uchar16)(255);
    if( binary )
        for(int i=0; i < tileSize_width; i+=VLEN_C)
            tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
+
    F mom[10];
    __local int m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
+    {
        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
+    }
    barrier(CLK_LOCAL_MEM_FENCE);
+
    int lm[10] = {0};
    int16 x0 = (int16)(0);
    int16 x1 = (int16)(0);
@ -281,6 +289,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
                m[i][lidy-j/2] = lm[i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
+
    if(lidy == 0&&lidx == 0)
    {
        for( int mt = 0; mt < 10; mt++ )
@ -328,10 +337,10 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
    }
 }

-__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    ushort tmp_coi[8]; // get the coi data
    ushort8 tmp[32];
@ -346,21 +355,26 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step/2, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols -x);
-    if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
-        for(int i=tileSize_width; i < rstep; i++ )
-            *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_US)
-        {
-            for(int j=0; j<VLEN_US; j++)
-                tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_US)
-            tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
+
+    if ( y+lidy < src_rows )
+    {
+        if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
+            for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_US)
+            {
+                for(int j=0; j<VLEN_US; j++)
+                    tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_US)
+                tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
+    }
+
    ushort8 zero = (ushort8)(0);
    ushort8 full = (ushort8)(255);
    if( binary )
@ -368,11 +382,11 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
            tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
    F mom[10];
    __local long m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
    barrier(CLK_LOCAL_MEM_FENCE);
+
    long lm[10] = {0};
    int8 x0 = (int8)(0);
    int8 x1 = (int8)(0);
@ -422,17 +436,22 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
        lm[0] = x0.s0;             // m00
    }
    barrier(CLK_LOCAL_MEM_FENCE);
+
    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
    {
        if(lidy < j)
            for( int i = 0; i < 10; i++ )
                lm[i] = lm[i] + m[i][lidy];
-        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
        if(lidy >= j/2&&lidy < j)
            for( int i = 0; i < 10; i++ )
                m[i][lidy-j/2] = lm[i];
-        barrier(CLK_LOCAL_MEM_FENCE);
    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
    if(lidy == 0&&lidx == 0)
    {
        for(int mt = 0; mt < 10; mt++ )
@ -482,10 +501,10 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
    }
 }

-__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
                           __global F* dst_m,
                           int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    short tmp_coi[8]; // get the coi data
    short8 tmp[32];
@ -500,21 +519,26 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
    int rstep = min(src_step/2, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols -x);
-    if(tileSize_width < TILE_SIZE)
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_S)
-        {
-            for(int j=0; j<VLEN_S; j++)
-                tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_S)
-            tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_S)
+            {
+                for(int j=0; j<VLEN_S; j++)
+                    tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_S)
+                tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
+    }
+
    short8 zero = (short8)(0);
    short8 full = (short8)(255);
    if( binary )
@ -523,10 +547,9 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols

    F mom[10];
    __local long m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
    barrier(CLK_LOCAL_MEM_FENCE);
    long lm[10] = {0};
    int8 x0 = (int8)(0);
@ -637,10 +660,10 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
    }
 }

-__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
                            __global F* dst_m,
                            int dst_cols, int dst_step, int blocky,
-                            int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                            int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    float tmp_coi[4]; // get the coi data
    float4 tmp[64] ;
@ -654,33 +677,30 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
    int y = wgidy*TILE_SIZE;  // real Y index of pixel
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
-    src_step /= sizeof(*src_data);
-    int rstep = min(src_step, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols -x);
+    int rstep = min(src_step/4, TILE_SIZE);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
    int maxIdx = mul24(src_rows, src_cols);
    int yOff = (y+lidy)*src_step;
    int index;
-    if(tileSize_width < TILE_SIZE && yOff < src_rows)
-        for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ )
-            *(src_data+yOff+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_F)
-        {
-#pragma unroll
-            for(int j=0; j<4; j++)
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_F)
            {
-                index = yOff+(x+i+j)*kcn+coi-1;
-                if (index < maxIdx)
-                    tmp_coi[j] = *(src_data+index);
-                else
-                    tmp_coi[j] = 0;
+                for(int j=0; j<4; j++)
+                    tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
            }
-            tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
-        }
-    else
-        for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
-            tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_F)
+                tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
+    }
+
    float4 zero = (float4)(0);
    float4 full = (float4)(255);
    if( binary )
@ -688,10 +708,9 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
            tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
    F mom[10];
    __local F m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
        for(int i = 0; i < 10; i ++)
-            for(int j = 0; j < 128; j ++)
-                m[i][j] = 0;
+            m[i][lidy] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    F lm[10] = {0};
    F4 x0 = (F4)(0);
@ -729,185 +748,6 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
        m[0][lidy-bheight] = x0.s0;             // m00
    }

-    else if(lidy < bheight)
-    {
-        lm[9] = ((F)py) * sy;  // m03
-        lm[8] = ((F)x1.s0) * sy;  // m12
-        lm[7] = ((F)x2.s0) * lidy;  // m21
-        lm[6] = x3.s0;             // m30
-        lm[5] = x0.s0 * sy;        // m02
-        lm[4] = x1.s0 * lidy;         // m11
-        lm[3] = x2.s0;             // m20
-        lm[2] = py;             // m01
-        lm[1] = x1.s0;             // m10
-        lm[0] = x0.s0;             // m00
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
-    {
-        if(lidy < j)
-            for( int i = 0; i < 10; i++ )
-                lm[i] = lm[i] + m[i][lidy];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lidy >= j/2&&lidy < j)
-            for( int i = 0; i < 10; i++ )
-                m[i][lidy-j/2] = lm[i];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if(lidy == 0&&lidx == 0)
-    {
-        for( int mt = 0; mt < 10; mt++ )
-            mom[mt] = (F)lm[mt];
-        if(binary)
-        {
-            F s = 1./255;
-            for( int mt = 0; mt < 10; mt++ )
-                mom[mt] *= s;
-        }
-
-        F xm = x * mom[0], ym = y * mom[0];
-
-        // accumulate moments computed in each tile
-        dst_step /= sizeof(F);
-
-        int dst_x_off = mad24(wgidy, dst_cols, wgidx);
-        int dst_off = 0;
-        int max_dst_index = 10 * blocky * get_global_size(1);
-
-        // + m00 ( = m00' )
-        dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[0];
-
-        // + m10 ( = m10' + x*m00' )
-        dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[1] + xm;
-
-        // + m01 ( = m01' + y*m00' )
-        dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[2] + ym;
-
-        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);
-
-        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];
-
-        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);
-
-        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
-
-        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
-
-        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
-
-        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
-    }
-}
-
-__kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
-                           __global F* dst_m,
-                           int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
-{
-    F tmp_coi[4]; // get the coi data
-    F4 tmp[64];
-    int VLEN_D = 4; // length of vetor
-    int gidy = get_global_id(0);
-    int gidx = get_global_id(1);
-    int wgidy = get_group_id(0);
-    int wgidx = get_group_id(1);
-    int lidy = get_local_id(0);
-    int lidx = get_local_id(1);
-    int y = wgidy*TILE_SIZE;  // real Y index of pixel
-    int x = wgidx*TILE_SIZE;  // real X index of pixel
-    int kcn = (cn==2)?2:4;
-    int rstep = min(src_step/8, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE,  src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols - x);
-
-    if(tileSize_width < TILE_SIZE)
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-        {
-            for(int j=0; j<4; j++)
-                tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-            tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
-    F4 zero = (F4)(0);
-    F4 full = (F4)(255);
-    if( binary )
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
-    F mom[10];
-    __local F m[10][128];
-    if(lidy == 0)
-        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    F lm[10] = {0};
-    F4 x0 = (F4)(0);
-    F4 x1 = (F4)(0);
-    F4 x2 = (F4)(0);
-    F4 x3 = (F4)(0);
-    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
-    {
-        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
-        F4 p = tmp[xt/VLEN_D];
-        F4 xp = v_xt * p, xxp = xp * v_xt;
-        x0 += p;
-        x1 += xp;
-        x2 += xxp;
-        x3 += xxp *v_xt;
-    }
-    x0.s0 += x0.s1 + x0.s2 + x0.s3;
-    x1.s0 += x1.s1 + x1.s2 + x1.s3;
-    x2.s0 += x2.s1 + x2.s2 + x2.s3;
-    x3.s0 += x3.s1 + x3.s2 + x3.s3;
-
-    F py = lidy * x0.s0, sy = lidy*lidy;
-    int bheight = min(tileSize_height, TILE_SIZE/2);
-    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
-    {
-        m[9][lidy-bheight] = ((F)py) * sy;  // m03
-        m[8][lidy-bheight] = ((F)x1.s0) * sy;  // m12
-        m[7][lidy-bheight] = ((F)x2.s0) * lidy;  // m21
-        m[6][lidy-bheight] = x3.s0;             // m30
-        m[5][lidy-bheight] = x0.s0 * sy;        // m02
-        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
-        m[3][lidy-bheight] = x2.s0;             // m20
-        m[2][lidy-bheight] = py;             // m01
-        m[1][lidy-bheight] = x1.s0;             // m10
-        m[0][lidy-bheight] = x0.s0;             // m00
-    }
-
    else if(lidy < bheight)
    {
        lm[9] = ((F)py) * sy;  // m03
@ -922,6 +762,164 @@ __kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, in
        lm[0] = x0.s0;             // m00
    }
    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (F)lm[mt];
+        if(binary)
+        {
+            F s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        F xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+        dst_step /= sizeof(F);
+
+        // + m00 ( = m00' )
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+
+__kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, int src_step,
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    F tmp_coi[4]; // get the coi data
+    F4 tmp[64];
+    int VLEN_D = 4; // length of vetor
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/8, TILE_SIZE);
+    int tileSize_height = min(TILE_SIZE,  src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_D)
+            {
+                for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
+                    tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+            }
+        else
+            for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
+                tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
+    }
+
+    F4 zero = (F4)(0);
+    F4 full = (F4)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
+    F mom[10];
+    __local F m[10][128];
+    if(lidy < 128)
+        for(int i=0; i<10; i++)
+            m[i][lidy]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    F lm[10] = {0};
+    F4 x0 = (F4)(0);
+    F4 x1 = (F4)(0);
+    F4 x2 = (F4)(0);
+    F4 x3 = (F4)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
+    {
+        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
+        F4 p = tmp[xt/VLEN_D];
+        F4 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp *v_xt;
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3;
+
+    F py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((F)py) * sy;  // m03
+        m[8][lidy-bheight] = ((F)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((F)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((F)py) * sy;  // m03
+        lm[8] = ((F)x1.s0) * sy;  // m12
+        lm[7] = ((F)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
    {
        if(lidy < j)
--- a/modules/ocl/src/opencl/stereobm.cl
+++ b/modules/ocl/src/opencl/stereobm.cl
@ -162,8 +162,8 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
    int y_tex;
    int x_tex = X - radius;

-    if (x_tex >= cwidth)
-        return;
+    //if (x_tex >= cwidth)
+    //    return;

    for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
    {
--- a/modules/ocl/test/test_moments.cpp
+++ b/modules/ocl/test/test_moments.cpp
@ -45,12 +45,12 @@ TEST_P(MomentsTest, Mat)
    {
        if(test_contours)
        {
-            Mat src = imread( workdir + "../cpp/pic3.png", 1 );
-            Mat src_gray, canny_output;
-            cvtColor( src, src_gray, CV_BGR2GRAY );
+            Mat src = imread( workdir + "../cpp/pic3.png", IMREAD_GRAYSCALE );
+            ASSERT_FALSE(src.empty());
+            Mat canny_output;
            vector<vector<Point> > contours;
            vector<Vec4i> hierarchy;
-            Canny( src_gray, canny_output, 100, 200, 3 );
+            Canny( src, canny_output, 100, 200, 3 );
            findContours( canny_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) );
            for( size_t i = 0; i < contours.size(); i++ )
            {