Merge pull request #1025 from bitwangyaoyao:2.4_tests

2025-06-07 09:25:45 +08:00 · 2013-06-24 12:11:04 +04:00 · 2013-06-24 12:11:04 +04:00 · 4ed3d33dd7
commit 4ed3d33dd7
parent b0421cafab 6326739b44
21 changed files with 1317 additions and 1261 deletions
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@ -52,6 +52,8 @@ int main(int argc, const char *argv[])
        cerr << "no device found\n";
        return -1;
    }
+    // set this to overwrite binary cache every time the test starts
+    ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);

    int devidx = 0;

--- a/modules/ocl/test/test_columnsum.cpp
+++ b/modules/ocl/test/test_columnsum.cpp
@ -15,8 +15,8 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//	   Chunpeng Zhang chunpeng@multicorewareinc.com
-//
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@ -31,7 +31,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@ -45,50 +45,57 @@
 //M*/

 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-
-PARAM_TEST_CASE(ColumnSum, cv::Size)
+///////////// StereoMatchBM ////////////////////////
+PERFTEST(StereoMatchBM)
 {
-    cv::Size size;
-    cv::Mat src;
+	Mat left_image = imread(abspath("aloeL.jpg"), cv::IMREAD_GRAYSCALE);
+	Mat right_image = imread(abspath("aloeR.jpg"), cv::IMREAD_GRAYSCALE);
+	Mat disp,dst;
+	ocl::oclMat d_left, d_right,d_disp;
+	int n_disp= 128;
+	int winSize =19;

-    virtual void SetUp()
-    {
-        size = GET_PARAM(0);
-    }
-};
+	SUBTEST << left_image.cols << 'x' << left_image.rows << "; aloeL.jpg ;"<< right_image.cols << 'x' << right_image.rows << "; aloeR.jpg ";

-TEST_P(ColumnSum, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_32FC1);
-    cv::ocl::oclMat d_dst;
-    cv::ocl::oclMat d_src(src);
+	StereoBM bm(0, n_disp, winSize);
+	bm(left_image, right_image, dst);

-    cv::ocl::columnSum(d_src, d_dst);
+	CPU_ON;
+	bm(left_image, right_image, dst);
+	CPU_OFF;

-    cv::Mat dst(d_dst);
+	d_left.upload(left_image);
+	d_right.upload(right_image);

-    for (int j = 0; j < src.cols; ++j)
-    {
-        float gold = src.at<float>(0, j);
-        float res = dst.at<float>(0, j);
-        ASSERT_NEAR(res, gold, 1e-5);
-    }
+	ocl::StereoBM_OCL d_bm(0, n_disp, winSize);

-    for (int i = 1; i < src.rows; ++i)
-    {
-        for (int j = 0; j < src.cols; ++j)
-        {
-            float gold = src.at<float>(i, j) += src.at<float>(i - 1, j);
-            float res = dst.at<float>(i, j);
-            ASSERT_NEAR(res, gold, 1e-5);
-        }
-    }
+	WARMUP_ON;
+	d_bm(d_left, d_right, d_disp);
+	WARMUP_OFF;
+
+    cv::Mat ocl_mat;
+    d_disp.download(ocl_mat);
+    ocl_mat.convertTo(ocl_mat, dst.type());
+
+	GPU_ON;
+	d_bm(d_left, d_right, d_disp);
+	GPU_OFF;
+
+	GPU_FULL_ON;
+	d_left.upload(left_image);
+	d_right.upload(right_image);
+	d_bm(d_left, d_right, d_disp);
+	d_disp.download(disp);
+	GPU_FULL_OFF;
+    
+    TestSystem::instance().setAccurate(-1, 0.);
 }

-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, ColumnSum, DIFFERENT_SIZES);


-#endif
+
+
+
+
+
+	
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@ -284,6 +284,7 @@ PERFTEST(GaussianBlur)
    Mat src, dst, ocl_dst;
    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
+    const int ksize = 7;	

    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
@ -291,29 +292,28 @@ PERFTEST(GaussianBlur)
        {
            SUBTEST << size << 'x' << size << "; " << type_name[j] ;

-            gen(src, size, size, all_type[j], 5, 16);
+            gen(src, size, size, all_type[j], 0, 256);

-            GaussianBlur(src, dst, Size(9, 9), 0);
+            GaussianBlur(src, dst, Size(ksize, ksize), 0);

            CPU_ON;
-            GaussianBlur(src, dst, Size(9, 9), 0);
+            GaussianBlur(src, dst, Size(ksize, ksize), 0);
            CPU_OFF;

            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst(src.size(), src.type());
-            ocl::oclMat d_buf;
+            ocl::oclMat d_dst;

            WARMUP_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
            WARMUP_OFF;

            GPU_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
            GPU_OFF;

            GPU_FULL_ON;
            d_src.upload(src);
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
            d_dst.download(ocl_dst);
            GPU_FULL_OFF;

--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@ -46,11 +46,6 @@
 #include "precomp.hpp"

 ///////////// HOG////////////////////////
-bool match_rect(cv::Rect r1, cv::Rect r2, int threshold)
-{
-    return ((abs(r1.x - r2.x) < threshold) && (abs(r1.y - r2.y) < threshold) &&
-        (abs(r1.width - r2.width) < threshold) && (abs(r1.height - r2.height) < threshold));
-}

 PERFTEST(HOG)
 {
@ -61,13 +56,12 @@ PERFTEST(HOG)
        throw runtime_error("can't open road.png");
    }

-
    cv::HOGDescriptor hog;
    hog.setSVMDetector(hog.getDefaultPeopleDetector());
    std::vector<cv::Rect> found_locations;
    std::vector<cv::Rect> d_found_locations;

-    SUBTEST << 768 << 'x' << 576 << "; road.png";
+    SUBTEST << src.cols << 'x' << src.rows << "; road.png";

    hog.detectMultiScale(src, found_locations);

@ -84,70 +78,10 @@ PERFTEST(HOG)
    ocl_hog.detectMultiScale(d_src, d_found_locations);
    WARMUP_OFF;
    
-    // Ground-truth rectangular people window
-    cv::Rect win1_64x128(231, 190, 72, 144);
-    cv::Rect win2_64x128(621, 156, 97, 194);
-    cv::Rect win1_48x96(238, 198, 63, 126);
-    cv::Rect win2_48x96(619, 161, 92, 185);
-    cv::Rect win3_48x96(488, 136, 56, 112);
-
-    // Compare whether ground-truth windows are detected and compare the number of windows detected.
-    std::vector<int> d_comp(4);
-    std::vector<int> comp(4);
-    for(int i = 0; i < (int)d_comp.size(); i++)
-    {
-        d_comp[i] = 0;
-        comp[i] = 0;
-    }
-
-    int threshold = 10;
-    int val = 32;
-    d_comp[0] = (int)d_found_locations.size();
-    comp[0] = (int)found_locations.size();
-
-    cv::Size winSize = hog.winSize;
-
-    if (winSize == cv::Size(48, 96))
-    {
-        for(int i = 0; i < (int)d_found_locations.size(); i++)
-        {
-            if (match_rect(d_found_locations[i], win1_48x96, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found_locations[i], win2_48x96, threshold))
-                d_comp[2] = val;
-            if (match_rect(d_found_locations[i], win3_48x96, threshold))
-                d_comp[3] = val;
-        }
-        for(int i = 0; i < (int)found_locations.size(); i++)
-        {
-            if (match_rect(found_locations[i], win1_48x96, threshold))
-                comp[1] = val;
-            if (match_rect(found_locations[i], win2_48x96, threshold))
-                comp[2] = val;
-            if (match_rect(found_locations[i], win3_48x96, threshold))
-                comp[3] = val;
-        }
-    }
-    else if (winSize == cv::Size(64, 128))
-    {
-        for(int i = 0; i < (int)d_found_locations.size(); i++)
-        {
-            if (match_rect(d_found_locations[i], win1_64x128, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found_locations[i], win2_64x128, threshold))
-                d_comp[2] = val;
-        }
-        for(int i = 0; i < (int)found_locations.size(); i++)
-        {
-            if (match_rect(found_locations[i], win1_64x128, threshold))
-                comp[1] = val;
-            if (match_rect(found_locations[i], win2_64x128, threshold))
-                comp[2] = val;
-        }
-    }
-
-    cv::Mat gpu_rst(d_comp), cpu_rst(comp);
-    TestSystem::instance().ExpectedMatNear(gpu_rst, cpu_rst, 3);
+    if(d_found_locations.size() == found_locations.size())
+        TestSystem::instance().setAccurate(1, 0);
+    else
+        TestSystem::instance().setAccurate(0, abs((int)found_locations.size() - (int)d_found_locations.size()));

    GPU_ON;
    ocl_hog.detectMultiScale(d_src, found_locations);
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
@ -743,12 +743,12 @@ PERFTEST(meanShiftFiltering)
        WARMUP_OFF;

        GPU_ON;
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
        GPU_OFF;

        GPU_FULL_ON;
        d_src.upload(src);
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
        d_dst.download(ocl_dst);
        GPU_FULL_OFF;

@ -969,3 +969,45 @@ PERFTEST(CLAHE)
        }
    }
 }
+
+///////////// columnSum////////////////////////
+PERFTEST(columnSum)
+{
+    Mat src, dst, ocl_dst;
+    ocl::oclMat d_src, d_dst;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; CV_32FC1";
+
+        gen(src, size, size, CV_32FC1, 0, 256);
+
+        CPU_ON;
+        dst.create(src.size(), src.type());
+        for (int j = 0; j < src.cols; j++)
+            dst.at<float>(0, j) = src.at<float>(0, j);
+
+        for (int i = 1; i < src.rows; ++i)
+            for (int j = 0; j < src.cols; ++j)
+                dst.at<float>(i, j) = dst.at<float>(i - 1 , j) + src.at<float>(i , j);
+        CPU_OFF;
+
+        d_src.upload(src);
+
+        WARMUP_ON;
+        ocl::columnSum(d_src, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::columnSum(d_src, d_dst);
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::columnSum(d_src, d_dst);
+        d_dst.download(ocl_dst);
+        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 5e-1);
+    }
+}
--- a/modules/ocl/perf/perf_columnsum.cpp
+++ b/modules/ocl/perf/perf_columnsum.cpp
@ -44,45 +44,49 @@
 //
 //M*/
 #include "precomp.hpp"
-
-///////////// columnSum////////////////////////
-PERFTEST(columnSum)
+///////////// Moments ////////////////////////
+PERFTEST(Moments)
 {
-    Mat src, dst, ocl_dst;
-    ocl::oclMat d_src, d_dst;
+    Mat src;
+    bool binaryImage = 0;
+
+    int all_type[] = {CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_16SC1", "CV_32FC1", "CV_64FC1"};

    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        SUBTEST << size << 'x' << size << "; CV_32FC1";
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j];

-        gen(src, size, size, CV_32FC1, 0, 256);
+            gen(src, size, size, all_type[j], 0, 256);

-        CPU_ON;
-        dst.create(src.size(), src.type());
-        for (int j = 0; j < src.cols; j++)
-            dst.at<float>(0, j) = src.at<float>(0, j);
+            cv::Moments CvMom = moments(src, binaryImage);

-        for (int i = 1; i < src.rows; ++i)
-            for (int j = 0; j < src.cols; ++j)
-                dst.at<float>(i, j) = dst.at<float>(i - 1 , j) + src.at<float>(i , j);
-        CPU_OFF;
+            CPU_ON;
+            moments(src, binaryImage);
+            CPU_OFF;

-        d_src.upload(src);
+            cv::Moments oclMom;
+            WARMUP_ON;
+            oclMom = ocl::ocl_moments(src, binaryImage);
+            WARMUP_OFF;

-        WARMUP_ON;
-        ocl::columnSum(d_src, d_dst);
-        WARMUP_OFF;
+            Mat gpu_dst, cpu_dst;
+            HuMoments(CvMom, cpu_dst);
+            HuMoments(oclMom, gpu_dst);

-        GPU_ON;
-        ocl::columnSum(d_src, d_dst);
-        GPU_OFF;
+            GPU_ON;
+            ocl::ocl_moments(src, binaryImage);
+            GPU_OFF;

-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::columnSum(d_src, d_dst);
-        d_dst.download(ocl_dst);
-        GPU_FULL_OFF;
+            GPU_FULL_ON;
+            ocl::ocl_moments(src, binaryImage);
+            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(gpu_dst, cpu_dst, .5);
+
+        }

-        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 5e-1);
    }
-}
+}
--- a/modules/ocl/perf/precomp.cpp
+++ b/modules/ocl/perf/precomp.cpp
@ -331,20 +331,6 @@ void TestSystem::printMetrics(int is_accurate, double cpu_time, double gpu_time,
    cout << setiosflags(ios_base::left);
    stringstream stream;

-#if 0
-    if(is_accurate == 1)
-            stream << "Pass";
-    else if(is_accurate_ == 0)
-            stream << "Fail";
-    else if(is_accurate == -1)
-        stream << " ";
-    else
-    {
-        std::cout<<"is_accurate errer: "<<is_accurate<<"\n";
-        exit(-1);
-    }
-#endif
-
    std::stringstream &cur_subtest_description = getCurSubtestDescription();
   
 #if GTEST_OS_WINDOWS&&!GTEST_OS_WINDOWS_MOBILE
--- a/modules/ocl/test/test_haar.cpp
+++ b/modules/ocl/test/test_haar.cpp
@ -1,180 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Sen Liu, swjutls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/objdetect/objdetect.hpp"
-#include "precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv;
-extern string workdir;
-
-namespace
-{
-IMPLEMENT_PARAM_CLASS(CascadeName, std::string);
-CascadeName cascade_frontalface_alt(std::string("haarcascade_frontalface_alt.xml"));
-CascadeName cascade_frontalface_alt2(std::string("haarcascade_frontalface_alt2.xml"));
-struct getRect
-{
-    Rect operator ()(const CvAvgComp &e) const
-    {
-        return e.rect;
-    }
-};
-}
-
-PARAM_TEST_CASE(Haar, double, int, CascadeName)
-{
-    cv::ocl::OclCascadeClassifier cascade, nestedCascade;
-    cv::CascadeClassifier cpucascade, cpunestedCascade;
-
-    double scale;
-    int flags;
-    std::string cascadeName;
-
-    virtual void SetUp()
-    {
-        scale = GET_PARAM(0);
-        flags = GET_PARAM(1);
-        cascadeName = (workdir + "../../data/haarcascades/").append(GET_PARAM(2));
-
-        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) )
-        {
-            cout << "ERROR: Could not load classifier cascade" << endl;
-            return;
-        }
-    }
-};
-
-////////////////////////////////faceDetect/////////////////////////////////////////////////
-TEST_P(Haar, FaceDetect)
-{
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
-
-    if(img.empty())
-    {
-        std::cout << "Couldn't read " << imgName << std::endl;
-        return ;
-    }
-
-    vector<Rect> faces, oclfaces;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    MemStorage storage(cvCreateMemStorage(0));
-    cvtColor( img, gray, CV_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
-
-    cv::ocl::oclMat image;
-    CvSeq *_objects;
-    image.upload(smallImg);
-    _objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
-                   3, flags, Size(30, 30), Size(0, 0) );
-    vector<CvAvgComp> vecAvgComp;
-    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-    oclfaces.resize(vecAvgComp.size());
-    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
-    
-    cpucascade.detectMultiScale( smallImg, faces,  1.1, 3,
-                                 flags,
-                                 Size(30, 30), Size(0, 0) );
-    EXPECT_EQ(faces.size(), oclfaces.size());
-}
-
-TEST_P(Haar, FaceDetectUseBuf)
-{
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
-
-    if(img.empty())
-    {
-        std::cout << "Couldn't read " << imgName << std::endl;
-        return ;
-    }
-
-    vector<Rect> faces, oclfaces;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    cvtColor( img, gray, CV_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
-
-    cv::ocl::oclMat image;
-    image.upload(smallImg);
-
-    cv::ocl::OclCascadeClassifierBuf cascadebuf;
-    if( !cascadebuf.load( cascadeName ) )
-    {
-        cout << "ERROR: Could not load classifier cascade for FaceDetectUseBuf!" << endl;
-        return;
-    }
-    cascadebuf.detectMultiScale( image, oclfaces,  1.1, 3,
-                                 flags,
-                                 Size(30, 30), Size(0, 0) );
-
-    cpucascade.detectMultiScale( smallImg, faces,  1.1, 3,
-                                 flags,
-                                 Size(30, 30), Size(0, 0) );
-    EXPECT_EQ(faces.size(), oclfaces.size());
-
-    // intentionally run ocl facedetect again and check if it still works after the first run
-    cascadebuf.detectMultiScale( image, oclfaces,  1.1, 3,
-        flags,
-        Size(30, 30));
-    cascadebuf.release();
-    EXPECT_EQ(faces.size(), oclfaces.size());
-}
-
-INSTANTIATE_TEST_CASE_P(FaceDetect, Haar,
-    Combine(Values(1.0),
-            Values(CV_HAAR_SCALE_IMAGE, 0), Values(cascade_frontalface_alt, cascade_frontalface_alt2)));
-
-#endif // HAVE_OPENCL
--- a/modules/ocl/test/test_imgproc.cpp
+++ b/modules/ocl/test/test_imgproc.cpp
@ -1573,6 +1573,47 @@ TEST_P(Convolve, Mat)
    }
 }

+//////////////////////////////// ColumnSum //////////////////////////////////////
+PARAM_TEST_CASE(ColumnSum, cv::Size)
+{
+    cv::Size size;
+    cv::Mat src;
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+    }
+};
+
+TEST_P(ColumnSum, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_32FC1);
+    cv::ocl::oclMat d_dst;
+    cv::ocl::oclMat d_src(src);
+
+    cv::ocl::columnSum(d_src, d_dst);
+
+    cv::Mat dst(d_dst);
+
+    for (int j = 0; j < src.cols; ++j)
+    {
+        float gold = src.at<float>(0, j);
+        float res = dst.at<float>(0, j);
+        ASSERT_NEAR(res, gold, 1e-5);
+    }
+
+    for (int i = 1; i < src.rows; ++i)
+    {
+        for (int j = 0; j < src.cols; ++j)
+        {
+            float gold = src.at<float>(i, j) += src.at<float>(i - 1, j);
+            float res = dst.at<float>(i, j);
+            ASSERT_NEAR(res, gold, 1e-5);
+        }
+    }
+}
+/////////////////////////////////////////////////////////////////////////////////////
+
 INSTANTIATE_TEST_CASE_P(ImgprocTestBase, equalizeHist, Combine(
                            ONE_TYPE(CV_8UC1),
                            NULL_TYPE,
@ -1688,7 +1729,6 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CLAHE, Combine(
                        Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(1300, 1300)),
                        Values(0.0, 40.0)));

-//INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine(
-//                            Values(CV_32FC1, CV_32FC1),
-//                            Values(false))); // Values(false) is the reserved parameter
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, ColumnSum, DIFFERENT_SIZES);
+
 #endif // HAVE_OPENCL
--- a/modules/ocl/test/test_objdetect.cpp
+++ b/modules/ocl/test/test_objdetect.cpp
@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Wenju He, wenju@multicorewareinc.com
+//		Yao Wang, bitwangyaoyao@gmail.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@ -45,51 +45,58 @@

 #include "precomp.hpp"
 #include "opencv2/core/core.hpp"
-using namespace std;
+#include "opencv2/objdetect/objdetect.hpp"
+
+using namespace cv;
+using namespace testing;
 #ifdef HAVE_OPENCL

 extern string workdir;
-PARAM_TEST_CASE(HOG, cv::Size, int)
+
+///////////////////// HOG /////////////////////////////
+PARAM_TEST_CASE(HOG, Size, int)
 {
-    cv::Size winSize;
+    Size winSize;
    int type;
+    Mat img_rgb;
    virtual void SetUp()
    {
        winSize = GET_PARAM(0);
        type = GET_PARAM(1);
+        img_rgb = readImage(workdir + "../gpu/road.png");
+        if(img_rgb.empty())
+        {
+            std::cout << "Couldn't read road.png" << std::endl;
+        }
    }
 };

 TEST_P(HOG, GetDescriptors)
 {
-    // Load image
-    cv::Mat img_rgb = readImage(workdir + "lena.jpg");
-    ASSERT_FALSE(img_rgb.empty());
-
    // Convert image
-    cv::Mat img;
+    Mat img;
    switch (type)
    {
    case CV_8UC1:
-        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+        cvtColor(img_rgb, img, CV_BGR2GRAY);
        break;
    case CV_8UC4:
    default:
-        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+        cvtColor(img_rgb, img, CV_BGR2BGRA);
        break;
    }
-    cv::ocl::oclMat d_img(img);
+    ocl::oclMat d_img(img);

    // HOGs
-    cv::ocl::HOGDescriptor ocl_hog;
+    ocl::HOGDescriptor ocl_hog;
    ocl_hog.gamma_correction = true;
-    cv::HOGDescriptor hog;
+    HOGDescriptor hog;
    hog.gammaCorrection = true;

    // Compute descriptor
-    cv::ocl::oclMat d_descriptors;
+    ocl::oclMat d_descriptors;
    ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL);
-    cv::Mat down_descriptors;
+    Mat down_descriptors;
    d_descriptors.download(down_descriptors);
    down_descriptors = down_descriptors.reshape(0, down_descriptors.cols * down_descriptors.rows);

@ -105,45 +112,34 @@ TEST_P(HOG, GetDescriptors)
        hog.compute(img_rgb, descriptors, ocl_hog.win_size);
        break;
    }
-    cv::Mat cpu_descriptors(descriptors);
+    Mat cpu_descriptors(descriptors);

    EXPECT_MAT_SIMILAR(down_descriptors, cpu_descriptors, 1e-2);
 }

-
-bool match_rect(cv::Rect r1, cv::Rect r2, int threshold)
-{
-    return ((abs(r1.x - r2.x) < threshold) && (abs(r1.y - r2.y) < threshold) &&
-            (abs(r1.width - r2.width) < threshold) && (abs(r1.height - r2.height) < threshold));
-}
-
 TEST_P(HOG, Detect)
 {
-    // Load image
-    cv::Mat img_rgb = readImage(workdir + "lena.jpg");
-    ASSERT_FALSE(img_rgb.empty());
-
    // Convert image
-    cv::Mat img;
+    Mat img;
    switch (type)
    {
    case CV_8UC1:
-        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+        cvtColor(img_rgb, img, CV_BGR2GRAY);
        break;
    case CV_8UC4:
    default:
-        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+        cvtColor(img_rgb, img, CV_BGR2BGRA);
        break;
    }
-    cv::ocl::oclMat d_img(img);
+    ocl::oclMat d_img(img);

    // HOGs
-    if ((winSize != cv::Size(48, 96)) && (winSize != cv::Size(64, 128)))
-        winSize = cv::Size(64, 128);
-    cv::ocl::HOGDescriptor ocl_hog(winSize);
+    if ((winSize != Size(48, 96)) && (winSize != Size(64, 128)))
+        winSize = Size(64, 128);
+    ocl::HOGDescriptor ocl_hog(winSize);
    ocl_hog.gamma_correction = true;

-    cv::HOGDescriptor hog;
+    HOGDescriptor hog;
    hog.winSize = winSize;
    hog.gammaCorrection = true;

@ -165,88 +161,117 @@ TEST_P(HOG, Detect)
    }

    // OpenCL detection
-    std::vector<cv::Rect> d_found;
-    ocl_hog.detectMultiScale(d_img, d_found, 0, cv::Size(8, 8), cv::Size(0, 0), 1.05, 2);
+    std::vector<Rect> d_found;
+    ocl_hog.detectMultiScale(d_img, d_found, 0, Size(8, 8), Size(0, 0), 1.05, 6);

    // CPU detection
-    std::vector<cv::Rect> found;
+    std::vector<Rect> found;
    switch (type)
    {
    case CV_8UC1:
-        hog.detectMultiScale(img, found, 0, cv::Size(8, 8), cv::Size(0, 0), 1.05, 2);
+        hog.detectMultiScale(img, found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
        break;
    case CV_8UC4:
    default:
-        hog.detectMultiScale(img_rgb, found, 0, cv::Size(8, 8), cv::Size(0, 0), 1.05, 2);
+        hog.detectMultiScale(img_rgb, found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
        break;
    }

-    // Ground-truth rectangular people window
-    cv::Rect win1_64x128(231, 190, 72, 144);
-    cv::Rect win2_64x128(621, 156, 97, 194);
-    cv::Rect win1_48x96(238, 198, 63, 126);
-    cv::Rect win2_48x96(619, 161, 92, 185);
-    cv::Rect win3_48x96(488, 136, 56, 112);
-
-    // Compare whether ground-truth windows are detected and compare the number of windows detected.
-    std::vector<int> d_comp(4);
-    std::vector<int> comp(4);
-    for(int i = 0; i < (int)d_comp.size(); i++)
-    {
-        d_comp[i] = 0;
-        comp[i] = 0;
-    }
-
-    int threshold = 10;
-    int val = 32;
-    d_comp[0] = (int)d_found.size();
-    comp[0] = (int)found.size();
-    if (winSize == cv::Size(48, 96))
-    {
-        for(int i = 0; i < (int)d_found.size(); i++)
-        {
-            if (match_rect(d_found[i], win1_48x96, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found[i], win2_48x96, threshold))
-                d_comp[2] = val;
-            if (match_rect(d_found[i], win3_48x96, threshold))
-                d_comp[3] = val;
-        }
-        for(int i = 0; i < (int)found.size(); i++)
-        {
-            if (match_rect(found[i], win1_48x96, threshold))
-                comp[1] = val;
-            if (match_rect(found[i], win2_48x96, threshold))
-                comp[2] = val;
-            if (match_rect(found[i], win3_48x96, threshold))
-                comp[3] = val;
-        }
-    }
-    else if (winSize == cv::Size(64, 128))
-    {
-        for(int i = 0; i < (int)d_found.size(); i++)
-        {
-            if (match_rect(d_found[i], win1_64x128, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found[i], win2_64x128, threshold))
-                d_comp[2] = val;
-        }
-        for(int i = 0; i < (int)found.size(); i++)
-        {
-            if (match_rect(found[i], win1_64x128, threshold))
-                comp[1] = val;
-            if (match_rect(found[i], win2_64x128, threshold))
-                comp[2] = val;
-        }
-    }
-
-    EXPECT_MAT_NEAR(cv::Mat(d_comp), cv::Mat(comp), 3);
+    EXPECT_LT(checkRectSimilarity(img.size(), found, d_found), 1.0);
 }


 INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
-                            testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
+                            testing::Values(Size(64, 128), Size(48, 96)),
                            testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));

+///////////////////////////// Haar //////////////////////////////
+IMPLEMENT_PARAM_CLASS(CascadeName, std::string);
+CascadeName cascade_frontalface_alt(std::string("haarcascade_frontalface_alt.xml"));
+CascadeName cascade_frontalface_alt2(std::string("haarcascade_frontalface_alt2.xml"));
+struct getRect
+{
+    Rect operator ()(const CvAvgComp &e) const
+    {
+        return e.rect;
+    }
+};

-#endif //HAVE_OPENCL
+PARAM_TEST_CASE(Haar, int, CascadeName)
+{
+    ocl::OclCascadeClassifier cascade, nestedCascade;
+    CascadeClassifier cpucascade, cpunestedCascade;
+
+    int flags;
+    std::string cascadeName;
+    vector<Rect> faces, oclfaces;
+    Mat img;
+    ocl::oclMat d_img;
+
+    virtual void SetUp()
+    {
+        flags = GET_PARAM(0);
+        cascadeName = (workdir + "../../data/haarcascades/").append(GET_PARAM(1));
+        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) )
+        {
+            std::cout << "ERROR: Could not load classifier cascade" << std::endl;
+            return;
+        }
+        img = readImage(workdir + "lena.jpg", IMREAD_GRAYSCALE);
+        if(img.empty())
+        {
+            std::cout << "Couldn't read lena.jpg" << std::endl;
+            return ;
+        }
+        equalizeHist(img, img);
+        d_img.upload(img);
+    }
+};
+
+TEST_P(Haar, FaceDetect)
+{
+    MemStorage storage(cvCreateMemStorage(0));
+    CvSeq *_objects;
+    _objects = cascade.oclHaarDetectObjects(d_img, storage, 1.1, 3, 
+                                            flags, Size(30, 30), Size(0, 0));
+    vector<CvAvgComp> vecAvgComp;
+    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
+    oclfaces.resize(vecAvgComp.size());
+    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
+    
+    cpucascade.detectMultiScale(img, faces,  1.1, 3,
+                                flags,
+                                Size(30, 30), Size(0, 0));
+
+    EXPECT_LT(checkRectSimilarity(img.size(), faces, oclfaces), 1.0);
+}
+
+TEST_P(Haar, FaceDetectUseBuf)
+{
+    ocl::OclCascadeClassifierBuf cascadebuf;
+    if(!cascadebuf.load(cascadeName))
+    {
+        std::cout << "ERROR: Could not load classifier cascade for FaceDetectUseBuf!" << std::endl;
+        return;
+    }
+    cascadebuf.detectMultiScale(d_img, oclfaces,  1.1, 3,
+                                flags,
+                                Size(30, 30), Size(0, 0));
+    cpucascade.detectMultiScale(img, faces,  1.1, 3,
+                                flags,
+                                Size(30, 30), Size(0, 0));
+
+    // intentionally run ocl facedetect again and check if it still works after the first run
+    cascadebuf.detectMultiScale(d_img, oclfaces,  1.1, 3,
+                                flags,
+                                Size(30, 30));
+    cascadebuf.release();
+
+    EXPECT_LT(checkRectSimilarity(img.size(), faces, oclfaces), 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, Haar,
+    Combine(Values(CV_HAAR_SCALE_IMAGE, 0), 
+            Values(cascade_frontalface_alt/*, cascade_frontalface_alt2*/)));
+
+#endif //HAVE_OPENCL
--- a/modules/ocl/test/test_pyramids.cpp
+++ b/modules/ocl/test/test_pyramids.cpp
@ -15,7 +15,6 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Dachuan Zhao, dachuan@multicorewareinc.com
 //    Yao Wang yao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -56,11 +55,12 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;

-PARAM_TEST_CASE(PyrDown, MatType, int)
+PARAM_TEST_CASE(PyrBase, MatType, int)
 {
    int type;
    int channels;
-
+    Mat dst_cpu;
+    oclMat gdst;
    virtual void SetUp()
    {
        type = GET_PARAM(0);
@ -69,19 +69,19 @@ PARAM_TEST_CASE(PyrDown, MatType, int)

 };

+/////////////////////// PyrDown //////////////////////////
+struct PyrDown : PyrBase {};

 TEST_P(PyrDown, Mat)
 {
    for(int j = 0; j < LOOP_TIMES; j++)
    {
-        cv::Size size(MWIDTH, MHEIGHT);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Mat src = randomMat(rng, size, CV_MAKETYPE(type, channels), 0, 100, false);
-
-        cv::ocl::oclMat gsrc(src), gdst;
-        cv::Mat dst_cpu;
-        cv::pyrDown(src, dst_cpu);
-        cv::ocl::pyrDown(gsrc, gdst);
+        Size size(MWIDTH, MHEIGHT);
+        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+        oclMat gsrc(src);
+        
+        pyrDown(src, dst_cpu);
+        pyrDown(gsrc, gdst);

        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), type == CV_32F ? 1e-4f : 1.0f);
    }
@ -90,5 +90,27 @@ TEST_P(PyrDown, Mat)
 INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrDown, Combine(
                            Values(CV_8U, CV_32F), Values(1, 3, 4)));

+/////////////////////// PyrUp //////////////////////////

+struct PyrUp : PyrBase {};
+
+TEST_P(PyrUp, Accuracy)
+{
+    for(int j = 0; j < LOOP_TIMES; j++)
+    {
+        Size size(MWIDTH, MHEIGHT);
+        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+        oclMat gsrc(src);
+
+        pyrUp(src, dst_cpu);
+        pyrUp(gsrc, gdst);
+
+        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), (type == CV_32F ? 1e-4f : 1.0));
+    }
+
+}
+
+
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine(
+                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
 #endif // HAVE_OPENCL
--- a/modules/ocl/test/test_pyrup.cpp
+++ b/modules/ocl/test/test_pyrup.cpp
@ -1,91 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Zhang Chunpeng chunpeng@multicorewareinc.com
-//    Yao Wang yao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencv2/core/core.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(PyrUp, MatType, int)
-{
-    int type;
-    int channels;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-    }
-};
-
-TEST_P(PyrUp, Accuracy)
-{
-    for(int j = 0; j < LOOP_TIMES; j++)
-    {
-        Size size(MWIDTH, MHEIGHT);
-        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
-        Mat dst_gold;
-        pyrUp(src, dst_gold);
-        ocl::oclMat dst;
-        ocl::oclMat srcMat(src);
-        ocl::pyrUp(srcMat, dst);
-
-        EXPECT_MAT_NEAR(dst_gold, Mat(dst), (type == CV_32F ? 1e-4f : 1.0));
-    }
-
-}
-
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine(
-                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
-
-
-#endif // HAVE_OPENCL
--- a/modules/ocl/test/utility.cpp
+++ b/modules/ocl/test/utility.cpp
@ -100,12 +100,6 @@ Mat randomMat(Size size, int type, double minVal, double maxVal)
    return randomMat(TS::ptr()->get_rng(), size, type, minVal, maxVal, false);
 }

-
-
-
-
-
-
 /*
 void showDiff(InputArray gold_, InputArray actual_, double eps)
 {
@ -137,58 +131,7 @@ void showDiff(InputArray gold_, InputArray actual_, double eps)
 }
 */

-/*
-bool supportFeature(const DeviceInfo& info, FeatureSet feature)
-{
-    return TargetArchs::builtWith(feature) && info.supports(feature);
-}

-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
-
-vector<DeviceInfo> devices(FeatureSet feature)
-{
-    const vector<DeviceInfo>& d = devices();
-
-    vector<DeviceInfo> devs_filtered;
-
-    if (TargetArchs::builtWith(feature))
-    {
-        devs_filtered.reserve(d.size());
-
-        for (size_t i = 0, size = d.size(); i < size; ++i)
-        {
-            const DeviceInfo& info = d[i];
-
-            if (info.supports(feature))
-                devs_filtered.push_back(info);
-        }
-    }
-
-    return devs_filtered;
-}
-*/

 vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
 {
@ -264,3 +207,48 @@ void PrintTo(const Inverse &inverse, std::ostream *os)
        (*os) << "direct";
 }

+double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& ob2)
+{
+    double final_test_result = 0.0;
+    size_t sz1 = ob1.size();
+    size_t sz2 = ob2.size();
+
+    if(sz1 != sz2)
+    {
+        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+    }
+    else
+    {
+        if(sz1==0 && sz2==0)
+            return 0;
+        cv::Mat cpu_result(sz, CV_8UC1);
+        cpu_result.setTo(0);
+
+        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
+        {      
+            cv::Mat cpu_result_roi(cpu_result, *r);
+            cpu_result_roi.setTo(1);
+            cpu_result.copyTo(cpu_result);
+        }
+        int cpu_area = cv::countNonZero(cpu_result > 0);
+
+        cv::Mat gpu_result(sz, CV_8UC1);
+        gpu_result.setTo(0);
+        for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
+        {
+            cv::Mat gpu_result_roi(gpu_result, *r2);
+            gpu_result_roi.setTo(1);
+            gpu_result.copyTo(gpu_result);
+        }
+
+        cv::Mat result_;
+        multiply(cpu_result, gpu_result, result_);
+        int result = cv::countNonZero(result_ > 0);
+        if(cpu_area!=0 && result!=0)
+            final_test_result = 1.0 - (double)result/(double)cpu_area;
+        else if(cpu_area==0 && result!=0)
+            final_test_result = -1;
+    }
+    return final_test_result;
+}
+
--- a/modules/ocl/test/utility.hpp
+++ b/modules/ocl/test/utility.hpp
@ -55,13 +55,12 @@ cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal =

 void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);

-//! return true if device supports specified feature and gpu module was built with support the feature.
-//bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
+// This function test if gpu_rst matches cpu_rst.
+// If the two vectors are not equal, it will return the difference in vector size
+// Else it will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
+// The smaller, the better matched
+double checkRectSimilarity(cv::Size sz, std::vector<cv::Rect>& ob1, std::vector<cv::Rect>& ob2);

-//! return all devices compatible with current gpu module build.
-//const std::vector<cv::ocl::DeviceInfo>& devices();
-//! return all devices compatible with current gpu module build which support specified feature.
-//std::vector<cv::ocl::DeviceInfo> devices(cv::gpu::FeatureSet feature);

 //! read image from testdata folder.
 cv::Mat readImage(const std::string &fileName, int flags = cv::IMREAD_COLOR);
--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@ -7,55 +7,67 @@

 using namespace std;
 using namespace cv;
-#define LOOP_NUM 10 
+#define LOOP_NUM 10

 const static Scalar colors[] =  { CV_RGB(0,0,255),
-        CV_RGB(0,128,255),
-        CV_RGB(0,255,255),
-        CV_RGB(0,255,0),
-        CV_RGB(255,128,0),
-        CV_RGB(255,255,0),
-        CV_RGB(255,0,0),
-        CV_RGB(255,0,255)} ;
+                                  CV_RGB(0,128,255),
+                                  CV_RGB(0,255,255),
+                                  CV_RGB(0,255,0),
+                                  CV_RGB(255,128,0),
+                                  CV_RGB(255,255,0),
+                                  CV_RGB(255,0,0),
+                                  CV_RGB(255,0,255)
+                                } ;
+

 int64 work_begin = 0;
 int64 work_end = 0;
+string outputName;

-static void workBegin() 
-{ 
+static void workBegin()
+{
    work_begin = getTickCount();
 }
 static void workEnd()
 {
    work_end += (getTickCount() - work_begin);
 }
-static double getTime(){
+static double getTime()
+{
    return work_end /((double)cvGetTickFrequency() * 1000.);
 }

-void detect( Mat& img, vector<Rect>& faces, 
-    cv::ocl::OclCascadeClassifierBuf& cascade, 
-    double scale, bool calTime);

-void detectCPU( Mat& img, vector<Rect>& faces, 
-    CascadeClassifier& cascade, 
-    double scale, bool calTime);
+void detect( Mat& img, vector<Rect>& faces,
+             ocl::OclCascadeClassifierBuf& cascade,
+             double scale, bool calTime);
+
+
+void detectCPU( Mat& img, vector<Rect>& faces,
+                CascadeClassifier& cascade,
+                double scale, bool calTime);
+

 void Draw(Mat& img, vector<Rect>& faces, double scale);

+
 // This function test if gpu_rst matches cpu_rst.
 // If the two vectors are not equal, it will return the difference in vector size
 // Else if will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
-double checkRectSimilarity(Size sz, std::vector<Rect>& cpu_rst, std::vector<Rect>& gpu_rst);
+double checkRectSimilarity(Size sz, vector<Rect>& cpu_rst, vector<Rect>& gpu_rst);
+

 int main( int argc, const char** argv )
 {
    const char* keys =
        "{ h | help       | false       | print help message }"
        "{ i | input      |             | specify input image }"
-        "{ t | template   | ../../../data/haarcascades/haarcascade_frontalface_alt.xml  | specify template file }"
+        "{ t | template   | haarcascade_frontalface_alt.xml |"
+        " specify template file path }"
        "{ c | scale      |   1.0       | scale image }"
-        "{ s | use_cpu    | false       | use cpu or gpu to process the image }";
+        "{ s | use_cpu    | false       | use cpu or gpu to process the image }"
+        "{ o | output     | facedetect_output.jpg  |"
+        " specify output image save path(only works when input is images) }";

    CommandLineParser cmd(argc, argv, keys);
    if (cmd.get<bool>("help"))
@ -69,9 +81,10 @@ int main( int argc, const char** argv )

    bool useCPU = cmd.get<bool>("s");
    string inputName = cmd.get<string>("i");
+    outputName = cmd.get<string>("o");
    string cascadeName = cmd.get<string>("t");
    double scale = cmd.get<double>("c");
-    cv::ocl::OclCascadeClassifierBuf cascade;
+    ocl::OclCascadeClassifierBuf cascade;
    CascadeClassifier  cpu_cascade;

    if( !cascade.load( cascadeName ) || !cpu_cascade.load(cascadeName) )
@ -83,7 +96,7 @@ int main( int argc, const char** argv )
    if( inputName.empty() )
    {
        capture = cvCaptureFromCAM(0);
-        if(!capture) 
+        if(!capture)
            cout << "Capture from CAM 0 didn't work" << endl;
    }
    else if( inputName.size() )
@ -92,7 +105,7 @@ int main( int argc, const char** argv )
        if( image.empty() )
        {
            capture = cvCaptureFromAVI( inputName.c_str() );
-            if(!capture) 
+            if(!capture)
                cout << "Capture from AVI didn't work" << endl;
            return -1;
        }
@ -100,14 +113,15 @@ int main( int argc, const char** argv )
    else
    {
        image = imread( "lena.jpg", 1 );
-        if(image.empty()) 
+        if(image.empty())
            cout << "Couldn't read lena.jpg" << endl;
        return -1;
    }

+
    cvNamedWindow( "result", 1 );
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = cv::ocl::getDevice(oclinfo);
+    vector<ocl::Info> oclinfo;
+    int devnums = ocl::getDevice(oclinfo);
    if( devnums < 1 )
    {
        std::cout << "no device found\n";
@ -130,19 +144,23 @@ int main( int argc, const char** argv )
                frame.copyTo( frameCopy );
            else
                flip( frame, frameCopy, 0 );
-            if(useCPU){
+            if(useCPU)
+            {
                detectCPU(frameCopy, faces, cpu_cascade, scale, false);
            }
-            else{
-                detect(frameCopy, faces, cascade, scale, false);     
+            else
+            {
+                detect(frameCopy, faces, cascade, scale, false);
            }
            Draw(frameCopy, faces, scale);
            if( waitKey( 10 ) >= 0 )
                goto _cleanup_;
        }

+
        waitKey(0);

+
 _cleanup_:
        cvReleaseCapture( &capture );
    }
@ -152,18 +170,21 @@ _cleanup_:
        vector<Rect> faces;
        vector<Rect> ref_rst;
        double accuracy = 0.;
-        for(int i = 0; i <= LOOP_NUM;i ++) 
+        for(int i = 0; i <= LOOP_NUM; i ++)
        {
            cout << "loop" << i << endl;
-            if(useCPU){
-                detectCPU(image, faces, cpu_cascade, scale, i==0?false:true);  
+            if(useCPU)
+            {
+                detectCPU(image, faces, cpu_cascade, scale, i==0?false:true);
            }
-            else{
+            else
+            {
                detect(image, faces, cascade, scale, i==0?false:true);
-                if(i == 0){
+                if(i == 0)
+                {
                    detectCPU(image, ref_rst, cpu_cascade, scale, false);
                    accuracy = checkRectSimilarity(image.size(), ref_rst, faces);
-                }                    
+                }
            }
            if (i == LOOP_NUM)
            {
@ -180,31 +201,31 @@ _cleanup_:
    }

    cvDestroyWindow("result");
-
    return 0;
 }

-void detect( Mat& img, vector<Rect>& faces, 
-    cv::ocl::OclCascadeClassifierBuf& cascade, 
-    double scale, bool calTime)
+void detect( Mat& img, vector<Rect>& faces,
+             ocl::OclCascadeClassifierBuf& cascade,
+             double scale, bool calTime)
 {
-    cv::ocl::oclMat image(img);
-    cv::ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
+    ocl::oclMat image(img);
+    ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
    if(calTime) workBegin();
-    cv::ocl::cvtColor( image, gray, CV_BGR2GRAY );
-    cv::ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    cv::ocl::equalizeHist( smallImg, smallImg );
+    ocl::cvtColor( image, gray, CV_BGR2GRAY );
+    ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
+    ocl::equalizeHist( smallImg, smallImg );

    cascade.detectMultiScale( smallImg, faces, 1.1,
-        3, 0
-        |CV_HAAR_SCALE_IMAGE
-        , Size(30,30), Size(0, 0) );
+                              3, 0
+                              |CV_HAAR_SCALE_IMAGE
+                              , Size(30,30), Size(0, 0) );
    if(calTime) workEnd();
 }

-void detectCPU( Mat& img, vector<Rect>& faces, 
-    CascadeClassifier& cascade, 
-    double scale, bool calTime)
+
+void detectCPU( Mat& img, vector<Rect>& faces,
+                CascadeClassifier& cascade,
+                double scale, bool calTime)
 {
    if(calTime) workBegin();
    Mat cpu_gray, cpu_smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
@ -212,11 +233,12 @@ void detectCPU( Mat& img, vector<Rect>& faces,
    resize(cpu_gray, cpu_smallImg, cpu_smallImg.size(), 0, 0, INTER_LINEAR);
    equalizeHist(cpu_smallImg, cpu_smallImg);
    cascade.detectMultiScale(cpu_smallImg, faces, 1.1,
-        3, 0 | CV_HAAR_SCALE_IMAGE,
-        Size(30, 30), Size(0, 0));
-    if(calTime) workEnd(); 
+                             3, 0 | CV_HAAR_SCALE_IMAGE,
+                             Size(30, 30), Size(0, 0));
+    if(calTime) workEnd();
 }

+
 void Draw(Mat& img, vector<Rect>& faces, double scale)
 {
    int i = 0;
@ -230,31 +252,38 @@ void Draw(Mat& img, vector<Rect>& faces, double scale)
        radius = cvRound((r->width + r->height)*0.25*scale);
        circle( img, center, radius, color, 3, 8, 0 );
    }
-    cv::imshow( "result", img );
+    imshow( "result", img );
+    imwrite( outputName, img );
 }

-double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& ob2)
+
+double checkRectSimilarity(Size sz, vector<Rect>& ob1, vector<Rect>& ob2)
 {
    double final_test_result = 0.0;
    size_t sz1 = ob1.size();
    size_t sz2 = ob2.size();

    if(sz1 != sz2)
+    {
        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+    }
    else
    {
-        cv::Mat cpu_result(sz, CV_8UC1);
+        if(sz1==0 && sz2==0)
+            return 0;
+        Mat cpu_result(sz, CV_8UC1);
        cpu_result.setTo(0);

        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
-        {      
-            cv::Mat cpu_result_roi(cpu_result, *r);
+        {
+            Mat cpu_result_roi(cpu_result, *r);
            cpu_result_roi.setTo(1);
            cpu_result.copyTo(cpu_result);
        }
-        int cpu_area = cv::countNonZero(cpu_result > 0);
+        int cpu_area = countNonZero(cpu_result > 0);

-        cv::Mat gpu_result(sz, CV_8UC1);
+
+        Mat gpu_result(sz, CV_8UC1);
        gpu_result.setTo(0);
        for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
        {
@ -263,11 +292,13 @@ double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& o
            gpu_result.copyTo(gpu_result);
        }

-        cv::Mat result_;
+        Mat result_;
        multiply(cpu_result, gpu_result, result_);
-        int result = cv::countNonZero(result_ > 0);
-
-        final_test_result = 1.0 - (double)result/(double)cpu_area;
+        int result = countNonZero(result_ > 0);
+        if(cpu_area!=0 && result!=0)
+            final_test_result = 1.0 - (double)result/(double)cpu_area;
+        else if(cpu_area==0 && result!=0)
+            final_test_result = -1;
    }
    return final_test_result;
 }
--- a/samples/ocl/hog.cpp
+++ b/samples/ocl/hog.cpp
@ -10,75 +10,39 @@
 using namespace std;
 using namespace cv;

-bool help_showed = false;
-
-class Args
-{
-public:
-    Args();
-    static Args read(int argc, char** argv);
-
-    string src;
-    bool src_is_video;
-    bool src_is_camera;
-    int camera_id;
-
-    bool write_video;
-    string dst_video;
-    double dst_video_fps;
-
-    bool make_gray;
-
-    bool resize_src;
-    int width, height;
-
-    double scale;
-    int nlevels;
-    int gr_threshold;
-
-    double hit_threshold;
-    bool hit_threshold_auto;
-
-    int win_width;
-    int win_stride_width, win_stride_height;
-
-    bool gamma_corr;
-};
-
 class App
 {
 public:
-    App(const Args& s);
+    App(CommandLineParser& cmd);
    void run();
-
    void handleKey(char key);
-
    void hogWorkBegin();
    void hogWorkEnd();
    string hogWorkFps() const;
-
    void workBegin();
    void workEnd();
    string workFps() const;
-
    string message() const;

+
 // This function test if gpu_rst matches cpu_rst.
 // If the two vectors are not equal, it will return the difference in vector size
-// Else if will return 
+// Else if will return
 // (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
-    double checkRectSimilarity(Size sz, 
-                               std::vector<Rect>& cpu_rst, 
+    double checkRectSimilarity(Size sz,
+                               std::vector<Rect>& cpu_rst,
                               std::vector<Rect>& gpu_rst);
 private:
    App operator=(App&);

-    Args args;
+    //Args args;
    bool running;
-
    bool use_gpu;
    bool make_gray;
    double scale;
+    double resize_scale;
+    int win_width;
+    int win_stride_width, win_stride_height;
    int gr_threshold;
    int nlevels;
    double hit_threshold;
@ -86,119 +50,49 @@ private:

    int64 hog_work_begin;
    double hog_work_fps;
-
    int64 work_begin;
    double work_fps;
-};

-static void printHelp()
-{
-    cout << "Histogram of Oriented Gradients descriptor and detector sample.\n"
-         << "\nUsage: hog_gpu\n"
-         << "  (<image>|--video <vide>|--camera <camera_id>) # frames source\n"
-         << "  [--make_gray <true/false>] # convert image to gray one or not\n"
-         << "  [--resize_src <true/false>] # do resize of the source image or not\n"
-         << "  [--width <int>] # resized image width\n"
-         << "  [--height <int>] # resized image height\n"
-         << "  [--hit_threshold <double>] # classifying plane distance threshold (0.0 usually)\n"
-         << "  [--scale <double>] # HOG window scale factor\n"
-         << "  [--nlevels <int>] # max number of HOG window scales\n"
-         << "  [--win_width <int>] # width of the window (48 or 64)\n"
-         << "  [--win_stride_width <int>] # distance by OX axis between neighbour wins\n"
-         << "  [--win_stride_height <int>] # distance by OY axis between neighbour wins\n"
-         << "  [--gr_threshold <int>] # merging similar rects constant\n"
-         << "  [--gamma_correct <int>] # do gamma correction or not\n"
-         << "  [--write_video <bool>] # write video or not\n"
-         << "  [--dst_video <path>] # output video path\n"
-         << "  [--dst_video_fps <double>] # output video fps\n";
-    help_showed = true;
-}
+    string img_source;
+    string vdo_source;
+    string output;
+    int camera_id;
+};

 int main(int argc, char** argv)
 {
+    const char* keys =
+        "{ h |  help    | false          | print help message }"
+        "{ i |  input   |                | specify input image}"
+        "{ c | camera   | -1             | enable camera capturing }"
+        "{ v | video    |                | use video as input }"
+        "{ g |  gray    | false          | convert image to gray one or not}"
+        "{ s |  scale   | 1.0            | resize the image before detect}"
+        "{ l |larger_win| false          | use 64x128 window}"
+        "{ o |  output  |                | specify output path when input is images}";
+    CommandLineParser cmd(argc, argv, keys);
+    App app(cmd);
    try
    {
-        if (argc < 2)
-            printHelp();
-        Args args = Args::read(argc, argv);
-        if (help_showed)
-            return -1;
-        App app(args);
        app.run();
    }
-    catch (const Exception& e) { return cout << "error: "  << e.what() << endl, 1; }
-    catch (const exception& e) { return cout << "error: "  << e.what() << endl, 1; }
-    catch(...) { return cout << "unknown exception" << endl, 1; }
+    catch (const Exception& e)
+    {
+        return cout << "error: "  << e.what() << endl, 1;
+    }
+    catch (const exception& e)
+    {
+        return cout << "error: "  << e.what() << endl, 1;
+    }
+    catch(...)
+    {
+        return cout << "unknown exception" << endl, 1;
+    }
    return 0;
 }

-
-Args::Args()
+App::App(CommandLineParser& cmd)
 {
-    src_is_video = false;
-    src_is_camera = false;
-    camera_id = 0;
-
-    write_video = false;
-    dst_video_fps = 24.;
-
-    make_gray = false;
-
-    resize_src = false;
-    width = 640;
-    height = 480;
-
-    scale = 1.05;
-    nlevels = 13;
-    gr_threshold = 8;
-    hit_threshold = 1.4;
-    hit_threshold_auto = true;
-
-    win_width = 48;
-    win_stride_width = 8;
-    win_stride_height = 8;
-
-    gamma_corr = true;
-}
-
-
-Args Args::read(int argc, char** argv)
-{
-    Args args;
-    for (int i = 1; i < argc; i++)
-    {
-        if (string(argv[i]) == "--make_gray") args.make_gray = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--resize_src") args.resize_src = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--width") args.width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--height") args.height = atoi(argv[++i]);
-        else if (string(argv[i]) == "--hit_threshold")
-        {
-            args.hit_threshold = atof(argv[++i]);
-            args.hit_threshold_auto = false;
-        }
-        else if (string(argv[i]) == "--scale") args.scale = atof(argv[++i]);
-        else if (string(argv[i]) == "--nlevels") args.nlevels = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_width") args.win_width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_stride_width") args.win_stride_width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_stride_height") args.win_stride_height = atoi(argv[++i]);
-        else if (string(argv[i]) == "--gr_threshold") args.gr_threshold = atoi(argv[++i]);
-        else if (string(argv[i]) == "--gamma_correct") args.gamma_corr = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--write_video") args.write_video = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--dst_video") args.dst_video = argv[++i];
-        else if (string(argv[i]) == "--dst_video_fps") args.dst_video_fps = atof(argv[++i]);
-        else if (string(argv[i]) == "--help") printHelp();
-        else if (string(argv[i]) == "--video") { args.src = argv[++i]; args.src_is_video = true; }
-        else if (string(argv[i]) == "--camera") { args.camera_id = atoi(argv[++i]); args.src_is_camera = true; }
-        else if (args.src.empty()) args.src = argv[i];
-        else throw runtime_error((string("unknown key: ") + argv[i]));
-    }
-    return args;
-}
-
-
-App::App(const Args& s)
-{
-    args = s;
    cout << "\nControls:\n"
         << "\tESC - exit\n"
         << "\tm - change mode GPU <-> CPU\n"
@ -209,56 +103,56 @@ App::App(const Args& s)
         << "\t4/r - increase/decrease hit threshold\n"
         << endl;

+
    use_gpu = true;
-    make_gray = args.make_gray;
-    scale = args.scale;
-    gr_threshold = args.gr_threshold;
-    nlevels = args.nlevels;
+    make_gray = cmd.get<bool>("g");
+    resize_scale = cmd.get<double>("s");
+    win_width = cmd.get<bool>("l") == true ? 64 : 48;
+    vdo_source = cmd.get<string>("v");
+    img_source = cmd.get<string>("i");
+    output = cmd.get<string>("o");
+    camera_id = cmd.get<int>("c");

-    if (args.hit_threshold_auto)
-        args.hit_threshold = args.win_width == 48 ? 1.4 : 0.;
-    hit_threshold = args.hit_threshold;
+    win_stride_width = 8;
+    win_stride_height = 8;
+    gr_threshold = 8;
+    nlevels = 13;
+    hit_threshold = win_width == 48 ? 1.4 : 0.;
+    scale = 1.05;
+    gamma_corr = true;

-    gamma_corr = args.gamma_corr;
-
-    if (args.win_width != 64 && args.win_width != 48)
-        args.win_width = 64;
-
-    cout << "Scale: " << scale << endl;
-    if (args.resize_src)
-        cout << "Resized source: (" << args.width << ", " << args.height << ")\n";
    cout << "Group threshold: " << gr_threshold << endl;
    cout << "Levels number: " << nlevels << endl;
-    cout << "Win width: " << args.win_width << endl;
-    cout << "Win stride: (" << args.win_stride_width << ", " << args.win_stride_height << ")\n";
+    cout << "Win width: " << win_width << endl;
+    cout << "Win stride: (" << win_stride_width << ", " << win_stride_height << ")\n";
    cout << "Hit threshold: " << hit_threshold << endl;
    cout << "Gamma correction: " << gamma_corr << endl;
    cout << endl;
 }

-
 void App::run()
 {
-    std::vector<ocl::Info> oclinfo;
+    vector<ocl::Info> oclinfo;
    ocl::getDevice(oclinfo);
    running = true;
-    cv::VideoWriter video_writer;
+    VideoWriter video_writer;

-    Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
-    Size win_stride(args.win_stride_width, args.win_stride_height);
+    Size win_size(win_width, win_width * 2);
+    Size win_stride(win_stride_width, win_stride_height);

    // Create HOG descriptors and detectors here
    vector<float> detector;
    if (win_size == Size(64, 128))
-        detector = cv::ocl::HOGDescriptor::getPeopleDetector64x128();
+        detector = ocl::HOGDescriptor::getPeopleDetector64x128();
    else
-        detector = cv::ocl::HOGDescriptor::getPeopleDetector48x96();
+        detector = ocl::HOGDescriptor::getPeopleDetector48x96();

-    cv::ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::ocl::HOGDescriptor::DEFAULT_NLEVELS);
-    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
-                              HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
+
+    ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
+                               ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
+                               ocl::HOGDescriptor::DEFAULT_NLEVELS);
+    HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
+                          HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
    gpu_hog.setSVMDetector(detector);
    cpu_hog.setSVMDetector(detector);

@ -267,29 +161,29 @@ void App::run()
        VideoCapture vc;
        Mat frame;

-        if (args.src_is_video)
+        if (vdo_source!="")
        {
-            vc.open(args.src.c_str());
+            vc.open(vdo_source.c_str());
            if (!vc.isOpened())
-                throw runtime_error(string("can't open video file: " + args.src));
+                throw runtime_error(string("can't open video file: " + vdo_source));
            vc >> frame;
        }
-        else if (args.src_is_camera)
+        else if (camera_id != -1)
        {
-            vc.open(args.camera_id);
+            vc.open(camera_id);
            if (!vc.isOpened())
            {
                stringstream msg;
-                msg << "can't open camera: " << args.camera_id;
+                msg << "can't open camera: " << camera_id;
                throw runtime_error(msg.str());
            }
            vc >> frame;
        }
        else
        {
-            frame = imread(args.src);
+            frame = imread(img_source);
            if (frame.empty())
-                throw runtime_error(string("can't open image file: " + args.src));
+                throw runtime_error(string("can't open image file: " + img_source));
        }

        Mat img_aux, img, img_to_show;
@ -307,13 +201,15 @@ void App::run()
            else frame.copyTo(img_aux);

            // Resize image
-            if (args.resize_src) resize(img_aux, img, Size(args.width, args.height));
+            if (abs(scale-1.0)>0.001)
+            {
+                Size sz((int)((double)img_aux.cols/resize_scale), (int)((double)img_aux.rows/resize_scale));
+                resize(img_aux, img, sz);
+            }
            else img = img_aux;
            img_to_show = img;
-
            gpu_hog.nlevels = nlevels;
            cpu_hog.nlevels = nlevels;
-
            vector<Rect> found;

            // Perform HOG classification
@ -330,15 +226,16 @@ void App::run()
                    vector<Rect> ref_rst;
                    cvtColor(img, img, CV_BGRA2BGR);
                    cpu_hog.detectMultiScale(img, ref_rst, hit_threshold, win_stride,
-                                              Size(0, 0), scale, gr_threshold-2);
+                                             Size(0, 0), scale, gr_threshold-2);
                    double accuracy = checkRectSimilarity(img.size(), ref_rst, found);
-                    cout << "\naccuracy value: " << accuracy << endl;           
-                } 
-           }
+                    cout << "\naccuracy value: " << accuracy << endl;
+                }
+            }
            else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
-                                          Size(0, 0), scale, gr_threshold);
+                                              Size(0, 0), scale, gr_threshold);
            hogWorkEnd();

+
            // Draw positive classified windows
            for (size_t i = 0; i < found.size(); i++)
            {
@ -353,25 +250,31 @@ void App::run()
            putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
            putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
            imshow("opencv_gpu_hog", img_to_show);
-
-            if (args.src_is_video || args.src_is_camera) vc >> frame;
+            if (vdo_source!="" || camera_id!=-1) vc >> frame;

            workEnd();

-            if (args.write_video)
+            if (output!="")
            {
-                if (!video_writer.isOpened())
+                if (img_source!="")     // wirte image
                {
-                    video_writer.open(args.dst_video, CV_FOURCC('x','v','i','d'), args.dst_video_fps,
-                                      img_to_show.size(), true);
-                    if (!video_writer.isOpened())
-                        throw std::runtime_error("can't create video writer");
+                    imwrite(output, img_to_show);
                }
+                else                    //write video
+                {
+                    if (!video_writer.isOpened())
+                    {
+                        video_writer.open(output, CV_FOURCC('x','v','i','d'), 24,
+                                          img_to_show.size(), true);
+                        if (!video_writer.isOpened())
+                            throw std::runtime_error("can't create video writer");
+                    }

-                if (make_gray) cvtColor(img_to_show, img, CV_GRAY2BGR);
-                else cvtColor(img_to_show, img, CV_BGRA2BGR);
+                    if (make_gray) cvtColor(img_to_show, img, CV_GRAY2BGR);
+                    else cvtColor(img_to_show, img, CV_BGRA2BGR);

-                video_writer << img;
+                    video_writer << img;
+                }
            }

            handleKey((char)waitKey(3));
@ -379,7 +282,6 @@ void App::run()
    }
 }

-
 void App::handleKey(char key)
 {
    switch (key)
@ -442,7 +344,10 @@ void App::handleKey(char key)
 }


-inline void App::hogWorkBegin() { hog_work_begin = getTickCount(); }
+inline void App::hogWorkBegin()
+{
+    hog_work_begin = getTickCount();
+}

 inline void App::hogWorkEnd()
 {
@ -458,8 +363,10 @@ inline string App::hogWorkFps() const
    return ss.str();
 }

-
-inline void App::workBegin() { work_begin = getTickCount(); }
+inline void App::workBegin()
+{
+    work_begin = getTickCount();
+}

 inline void App::workEnd()
 {
@ -475,8 +382,9 @@ inline string App::workFps() const
    return ss.str();
 }

-double App::checkRectSimilarity(Size sz, 
-                                std::vector<Rect>& ob1, 
+
+double App::checkRectSimilarity(Size sz,
+                                std::vector<Rect>& ob1,
                                std::vector<Rect>& ob2)
 {
    double final_test_result = 0.0;
@ -484,20 +392,26 @@ double App::checkRectSimilarity(Size sz,
    size_t sz2 = ob2.size();

    if(sz1 != sz2)
+    {
        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+    }
    else
    {
+        if(sz1==0 && sz2==0)
+            return 0;
        cv::Mat cpu_result(sz, CV_8UC1);
        cpu_result.setTo(0);

+
        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
-        {      
+        {
            cv::Mat cpu_result_roi(cpu_result, *r);
            cpu_result_roi.setTo(1);
            cpu_result.copyTo(cpu_result);
        }
        int cpu_area = cv::countNonZero(cpu_result > 0);

+
        cv::Mat gpu_result(sz, CV_8UC1);
        gpu_result.setTo(0);
        for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
@ -510,10 +424,11 @@ double App::checkRectSimilarity(Size sz,
        cv::Mat result_;
        multiply(cpu_result, gpu_result, result_);
        int result = cv::countNonZero(result_ > 0);
-
-        final_test_result = 1.0 - (double)result/(double)cpu_area;
+        if(cpu_area!=0 && result!=0)
+            final_test_result = 1.0 - (double)result/(double)cpu_area;
+        else if(cpu_area==0 && result!=0)
+            final_test_result = -1;
    }
    return final_test_result;
-
 }

--- a/samples/ocl/pyrlk_optical_flow.cpp
+++ b/samples/ocl/pyrlk_optical_flow.cpp
@ -11,19 +11,20 @@ using namespace cv;
 using namespace cv::ocl;

 typedef unsigned char uchar;
-#define LOOP_NUM 10 
+#define LOOP_NUM 10
 int64 work_begin = 0;
 int64 work_end = 0;

-static void workBegin() 
-{ 
+static void workBegin()
+{
    work_begin = getTickCount();
 }
 static void workEnd()
 {
    work_end += (getTickCount() - work_begin);
 }
-static double getTime(){
+static double getTime()
+{
    return work_end * 1000. / getTickFrequency();
 }

@ -93,14 +94,15 @@ int main(int argc, const char* argv[])
    //set this to save kernel compile time from second time you run
    ocl::setBinpath("./");
    const char* keys =
-        "{ h            | help           | false | print help message }"
-        "{ l            | left           |       | specify left image }"
-        "{ r            | right          |       | specify right image }"
-        "{ c            | camera         | 0     | enable camera capturing }"
-        "{ s            | use_cpu        | false | use cpu or gpu to process the image }"
-        "{ v            | video          |       | use video as input }"
-        "{ points       | points         | 1000  | specify points count [GoodFeatureToTrack] }"
-        "{ min_dist     | min_dist       | 0     | specify minimal distance between points [GoodFeatureToTrack] }";
+        "{ h   | help     | false           | print help message }"
+        "{ l   | left     |                 | specify left image }"
+        "{ r   | right    |                 | specify right image }"
+        "{ c   | camera   | 0               | specify camera id }"
+        "{ s   | use_cpu  | false           | use cpu or gpu to process the image }"
+        "{ v   | video    |                 | use video as input }"
+        "{ o   | output   | pyrlk_output.jpg| specify output save path when input is images }"
+        "{ p   | points   | 1000            | specify points count [GoodFeatureToTrack] }"
+        "{ m   | min_dist | 0               | specify minimal distance between points [GoodFeatureToTrack] }";

    CommandLineParser cmd(argc, argv, keys);

@ -113,13 +115,13 @@ int main(int argc, const char* argv[])
    }

    bool defaultPicturesFail = false;
-    string fname0 = cmd.get<string>("left");
-    string fname1 = cmd.get<string>("right");
-    string vdofile = cmd.get<string>("video");
-    int points = cmd.get<int>("points");
-    double minDist = cmd.get<double>("min_dist");
+    string fname0 = cmd.get<string>("l");
+    string fname1 = cmd.get<string>("r");
+    string vdofile = cmd.get<string>("v");
+    string outfile = cmd.get<string>("o");
+    int points = cmd.get<int>("p");
+    double minDist = cmd.get<double>("m");
    bool useCPU = cmd.get<bool>("s");
-    bool useCamera = cmd.get<bool>("c");
    int inputName = cmd.get<int>("c");

    oclMat d_nextPts, d_status;
@ -132,22 +134,9 @@ int main(int argc, const char* argv[])
    vector<unsigned char> status(points);
    vector<float> err;

-    if (frame0.empty() || frame1.empty())
-    {
-        useCamera = true;
-        defaultPicturesFail = true;
-        CvCapture* capture = 0;
-        capture = cvCaptureFromCAM( inputName );
-        if (!capture)
-        {
-            cout << "Can't load input images" << endl;
-            return -1;
-        }
-    }
-
    cout << "Points count : " << points << endl << endl;

-    if (useCamera)
+    if (frame0.empty() || frame1.empty())
    {
        CvCapture* capture = 0;
        Mat frame, frameCopy;
@ -241,10 +230,10 @@ _cleanup_:
    else
    {
 nocamera:
-        for(int i = 0; i <= LOOP_NUM;i ++) 
+        for(int i = 0; i <= LOOP_NUM; i ++)
        {
            cout << "loop" << i << endl;
-            if (i > 0) workBegin();     
+            if (i > 0) workBegin();

            if (useCPU)
            {
@ -274,8 +263,8 @@ nocamera:
                cout << getTime() / LOOP_NUM << " ms" << endl;

                drawArrows(frame0, pts, nextPts, status, Scalar(255, 0, 0));
-
                imshow("PyrLK [Sparse]", frame0);
+                imwrite(outfile, frame0);
            }
        }
    }
--- a/samples/ocl/squares.cpp
+++ b/samples/ocl/squares.cpp
@ -6,7 +6,6 @@
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/ocl/ocl.hpp"
-
 #include <iostream>
 #include <math.h>
 #include <string.h>
@ -14,23 +13,50 @@
 using namespace cv;
 using namespace std;

-static void help()
-{
-    cout <<
-        "\nA program using OCL module pyramid scaling, Canny, dilate functions, threshold, split; cpu contours, contour simpification and\n"
-        "memory storage (it's got it all folks) to find\n"
-        "squares in a list of images pic1-6.png\n"
-        "Returns sequence of squares detected on the image.\n"
-        "the sequence is stored in the specified memory storage\n"
-        "Call:\n"
-        "./squares\n"
-        "Using OpenCV version %s\n" << CV_VERSION << "\n" << endl;
-}
+#define ACCURACY_CHECK 1

+#if ACCURACY_CHECK
+// check if two vectors of vector of points are near or not
+// prior assumption is that they are in correct order
+static bool checkPoints(
+    vector< vector<Point> > set1,
+    vector< vector<Point> > set2,
+    int maxDiff = 5)
+{
+    if(set1.size() != set2.size())
+    {
+        return false;
+    }
+
+    for(vector< vector<Point> >::iterator it1 = set1.begin(), it2 = set2.begin();
+            it1 < set1.end() && it2 < set2.end(); it1 ++, it2 ++)
+    {
+        vector<Point> pts1 = *it1;
+        vector<Point> pts2 = *it2;
+
+
+        if(pts1.size() != pts2.size())
+        {
+            return false;
+        }
+        for(size_t i = 0; i < pts1.size(); i ++)
+        {
+            Point pt1 = pts1[i], pt2 = pts2[i];
+            if(std::abs(pt1.x - pt2.x) > maxDiff ||
+                    std::abs(pt1.y - pt2.y) > maxDiff)
+            {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+#endif

 int thresh = 50, N = 11;
 const char* wndname = "OpenCL Square Detection Demo";

+
 // helper function:
 // finds a cosine of angle between vectors
 // from pt0->pt1 and from pt0->pt2
@ -43,9 +69,92 @@ static double angle( Point pt1, Point pt2, Point pt0 )
    return (dx1*dx2 + dy1*dy2)/sqrt((dx1*dx1 + dy1*dy1)*(dx2*dx2 + dy2*dy2) + 1e-10);
 }

+
 // returns sequence of squares detected on the image.
 // the sequence is stored in the specified memory storage
 static void findSquares( const Mat& image, vector<vector<Point> >& squares )
+{
+    squares.clear();
+    Mat pyr, timg, gray0(image.size(), CV_8U), gray;
+
+    // down-scale and upscale the image to filter out the noise
+    pyrDown(image, pyr, Size(image.cols/2, image.rows/2));
+    pyrUp(pyr, timg, image.size());
+    vector<vector<Point> > contours;
+
+    // find squares in every color plane of the image
+    for( int c = 0; c < 3; c++ )
+    {
+        int ch[] = {c, 0};
+        mixChannels(&timg, 1, &gray0, 1, ch, 1);
+
+        // try several threshold levels
+        for( int l = 0; l < N; l++ )
+        {
+            // hack: use Canny instead of zero threshold level.
+            // Canny helps to catch squares with gradient shading
+            if( l == 0 )
+            {
+                // apply Canny. Take the upper threshold from slider
+                // and set the lower to 0 (which forces edges merging)
+                Canny(gray0, gray, 0, thresh, 5);
+                // dilate canny output to remove potential
+                // holes between edge segments
+                dilate(gray, gray, Mat(), Point(-1,-1));
+            }
+            else
+            {
+                // apply threshold if l!=0:
+                //     tgray(x,y) = gray(x,y) < (l+1)*255/N ? 255 : 0
+                cv::threshold(gray0, gray, (l+1)*255/N, 255, THRESH_BINARY);
+            }
+
+            // find contours and store them all as a list
+            findContours(gray, contours, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE);
+
+            vector<Point> approx;
+
+            // test each contour
+            for( size_t i = 0; i < contours.size(); i++ )
+            {
+                // approximate contour with accuracy proportional
+                // to the contour perimeter
+                approxPolyDP(Mat(contours[i]), approx, arcLength(Mat(contours[i]), true)*0.02, true);
+
+                // square contours should have 4 vertices after approximation
+                // relatively large area (to filter out noisy contours)
+                // and be convex.
+                // Note: absolute value of an area is used because
+                // area may be positive or negative - in accordance with the
+                // contour orientation
+                if( approx.size() == 4 &&
+                        fabs(contourArea(Mat(approx))) > 1000 &&
+                        isContourConvex(Mat(approx)) )
+                {
+                    double maxCosine = 0;
+
+                    for( int j = 2; j < 5; j++ )
+                    {
+                        // find the maximum cosine of the angle between joint edges
+                        double cosine = fabs(angle(approx[j%4], approx[j-2], approx[j-1]));
+                        maxCosine = MAX(maxCosine, cosine);
+                    }
+
+                    // if cosines of all angles are small
+                    // (all angles are ~90 degree) then write quandrange
+                    // vertices to resultant sequence
+                    if( maxCosine < 0.3 )
+                        squares.push_back(approx);
+                }
+            }
+        }
+    }
+}
+
+
+// returns sequence of squares detected on the image.
+// the sequence is stored in the specified memory storage
+static void findSquares_ocl( const Mat& image, vector<vector<Point> >& squares )
 {
    squares.clear();

@ -91,7 +200,6 @@ static void findSquares( const Mat& image, vector<vector<Point> >& squares )
            findContours(gray, contours, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE);

            vector<Point> approx;
-
            // test each contour
            for( size_t i = 0; i < contours.size(); i++ )
            {
@ -106,11 +214,10 @@ static void findSquares( const Mat& image, vector<vector<Point> >& squares )
                // area may be positive or negative - in accordance with the
                // contour orientation
                if( approx.size() == 4 &&
-                    fabs(contourArea(Mat(approx))) > 1000 &&
-                    isContourConvex(Mat(approx)) )
+                        fabs(contourArea(Mat(approx))) > 1000 &&
+                        isContourConvex(Mat(approx)) )
                {
                    double maxCosine = 0;
-
                    for( int j = 2; j < 5; j++ )
                    {
                        // find the maximum cosine of the angle between joint edges
@ -139,40 +246,93 @@ static void drawSquares( Mat& image, const vector<vector<Point> >& squares )
        int n = (int)squares[i].size();
        polylines(image, &p, &n, 1, true, Scalar(0,255,0), 3, CV_AA);
    }
-
-    imshow(wndname, image);
 }


-int main(int /*argc*/, char** /*argv*/)
+// draw both pure-C++ and ocl square results onto a single image
+static Mat drawSquaresBoth( const Mat& image,
+                            const vector<vector<Point> >& sqsCPP,
+                            const vector<vector<Point> >& sqsOCL
+)
 {
+    Mat imgToShow(Size(image.cols * 2, image.rows), image.type());
+    Mat lImg = imgToShow(Rect(Point(0, 0), image.size()));
+    Mat rImg = imgToShow(Rect(Point(image.cols, 0), image.size()));
+    image.copyTo(lImg);
+    image.copyTo(rImg);
+    drawSquares(lImg, sqsCPP);
+    drawSquares(rImg, sqsOCL);
+    float fontScale = 0.8f;
+    Scalar white = Scalar::all(255), black = Scalar::all(0);
+
+    putText(lImg, "C++", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, black, 2);
+    putText(rImg, "OCL", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, black, 2);
+    putText(lImg, "C++", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, white, 1);
+    putText(rImg, "OCL", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, white, 1);
+
+    return imgToShow;
+}
+
+
+int main(int argc, char** argv)
+{
+    const char* keys =
+        "{ i | input   |                    | specify input image }"
+        "{ o | output  | squares_output.jpg | specify output save path}";
+    CommandLineParser cmd(argc, argv, keys);
+    string inputName = cmd.get<string>("i");
+    string outfile = cmd.get<string>("o");
+    if(inputName.empty())
+    {
+        cout << "Avaible options:" << endl;
+        cmd.printParams();
+        return 0;
+    }

-    //ocl::setBinpath("F:/kernel_bin");
    vector<ocl::Info> info;
    CV_Assert(ocl::getDevice(info));
-
-    static const char* names[] = { "pic1.png", "pic2.png", "pic3.png",
-        "pic4.png", "pic5.png", "pic6.png", 0 };
-    help();
+    int iterations = 10;
    namedWindow( wndname, 1 );
-    vector<vector<Point> > squares;
+    vector<vector<Point> > squares_cpu, squares_ocl;

-    for( int i = 0; names[i] != 0; i++ )
+    Mat image = imread(inputName, 1);
+    if( image.empty() )
    {
-        Mat image = imread(names[i], 1);
-        if( image.empty() )
-        {
-            cout << "Couldn't load " << names[i] << endl;
-            continue;
-        }
-
-        findSquares(image, squares);
-        drawSquares(image, squares);
-
-        int c = waitKey();
-        if( (char)c == 27 )
-            break;
+        cout << "Couldn't load " << inputName << endl;
+        return -1;
    }
+    int j = iterations;
+    int64 t_ocl = 0, t_cpp = 0;
+    //warm-ups
+    cout << "warming up ..." << endl;
+    findSquares(image, squares_cpu);
+    findSquares_ocl(image, squares_ocl);
+
+
+#if ACCURACY_CHECK
+    cout << "Checking ocl accuracy ... " << endl;
+    cout << (checkPoints(squares_cpu, squares_ocl) ? "Pass" : "Failed") << endl;
+#endif
+    do
+    {
+        int64 t_start = cv::getTickCount();
+        findSquares(image, squares_cpu);
+        t_cpp += cv::getTickCount() - t_start;
+
+
+        t_start  = cv::getTickCount();
+        findSquares_ocl(image, squares_ocl);
+        t_ocl += cv::getTickCount() - t_start;
+        cout << "run loop: " << j << endl;
+    }
+    while(--j);
+    cout << "cpp average time: " << 1000.0f * (double)t_cpp / getTickFrequency() / iterations << "ms" << endl;
+    cout << "ocl average time: " << 1000.0f * (double)t_ocl / getTickFrequency() / iterations << "ms" << endl;
+
+    Mat result = drawSquaresBoth(image, squares_cpu, squares_ocl);
+    imshow(wndname, result);
+    imwrite(outfile, result);
+    cvWaitKey(0);

    return 0;
 }
--- a/samples/ocl/stereo_match.cpp
+++ b/samples/ocl/stereo_match.cpp
@ -10,56 +10,45 @@ using namespace cv;
 using namespace std;
 using namespace ocl;

-bool help_showed = false;
-
-struct Params
-{
-    Params();
-    static Params read(int argc, char** argv);
-
-    string left;
-    string right;
-
-    string method_str() const
-    {
-        switch (method)
-        {
-        case BM: return "BM";
-        case BP: return "BP";
-        case CSBP: return "CSBP";
-        }
-        return "";
-    }
-    enum {BM, BP, CSBP} method;
-    int ndisp; // Max disparity + 1
-    enum {GPU, CPU} type;
-};
-

 struct App
 {
-    App(const Params& p);
+    App(CommandLineParser& cmd);
    void run();
    void handleKey(char key);
    void printParams() const;

-    void workBegin() { work_begin = getTickCount(); }
+    void workBegin()
+    {
+        work_begin = getTickCount();
+    }
    void workEnd()
    {
        int64 d = getTickCount() - work_begin;
        double f = getTickFrequency();
        work_fps = f / d;
    }
-
+    string method_str() const
+    {
+        switch (method)
+        {
+        case BM:
+            return "BM";
+        case BP:
+            return "BP";
+        case CSBP:
+            return "CSBP";
+        }
+        return "";
+    }
    string text() const
    {
        stringstream ss;
-        ss << "(" << p.method_str() << ") FPS: " << setiosflags(ios::left)
-            << setprecision(4) << work_fps;
+        ss << "(" << method_str() << ") FPS: " << setiosflags(ios::left)
+           << setprecision(4) << work_fps;
        return ss.str();
    }
 private:
-    Params p;
    bool running;

    Mat left_src, right_src;
@ -72,42 +61,45 @@ private:

    int64 work_begin;
    double work_fps;
-};

-static void printHelp()
-{
-    cout << "Usage: stereo_match_gpu\n"
-        << "\t--left <left_view> --right <right_view> # must be rectified\n"
-        << "\t--method <stereo_match_method> # BM | BP | CSBP\n"
-        << "\t--ndisp <number> # number of disparity levels\n"
-        << "\t--type <device_type> # cpu | CPU | gpu | GPU\n";
-    help_showed = true;
-}
+    string l_img, r_img;
+    string out_img;
+    enum {BM, BP, CSBP} method;
+    int ndisp; // Max disparity + 1
+    enum {GPU, CPU} type;
+};

 int main(int argc, char** argv)
 {
+    const char* keys =
+        "{ h | help     | false                     | print help message }"
+        "{ l | left     |                           | specify left image }"
+        "{ r | right    |                           | specify right image }"
+        "{ m | method   | BM                        | specify match method(BM/BP/CSBP) }"
+        "{ n | ndisp    | 64                        |  specify number of disparity levels }"
+        "{ s | cpu_ocl  | false                     | use cpu or gpu as ocl device to process the image }"
+        "{ o | output   | stereo_match_output.jpg   | specify output path when input is images}";
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.get<bool>("help"))
+    {
+        cout << "Avaible options:" << endl;
+        cmd.printParams();
+        return 0;
+    }
    try
    {
-        if (argc < 2)
-        {
-            printHelp();
-            return 1;
-        }
+        App app(cmd);
+        int flag = CVCL_DEVICE_TYPE_GPU;
+        if(cmd.get<bool>("s") == true)
+            flag = CVCL_DEVICE_TYPE_CPU;

-        Params args = Params::read(argc, argv);
-        if (help_showed)
-            return -1;
-
-        int flags[2] = { CVCL_DEVICE_TYPE_GPU, CVCL_DEVICE_TYPE_CPU };
        vector<Info> info;
-
-        if(getDevice(info, flags[args.type]) == 0)
+        if(getDevice(info, flag) == 0)
        {
            throw runtime_error("Error: Did not find a valid OpenCL device!");
        }
        cout << "Device name:" << info[0].DeviceName[0] << endl;

-        App app(args);
        app.run();
    }
    catch (const exception& e)
@ -117,77 +109,39 @@ int main(int argc, char** argv)
    return 0;
 }

-
-Params::Params()
-{
-    method = BM;
-    ndisp = 64;
-    type = GPU;
-}
-
-
-Params Params::read(int argc, char** argv)
-{
-    Params p;
-
-    for (int i = 1; i < argc; i++)
-    {
-        if (string(argv[i]) == "--left") p.left = argv[++i];
-        else if (string(argv[i]) == "--right") p.right = argv[++i];
-        else if (string(argv[i]) == "--method")
-        {
-            if (string(argv[i + 1]) == "BM") p.method = BM;
-            else if (string(argv[i + 1]) == "BP") p.method = BP;
-            else if (string(argv[i + 1]) == "CSBP") p.method = CSBP;
-            else throw runtime_error("unknown stereo match method: " + string(argv[i + 1]));
-            i++;
-        }
-        else if (string(argv[i]) == "--ndisp") p.ndisp = atoi(argv[++i]);
-        else if (string(argv[i]) == "--type")
-        {
-            string t(argv[++i]);
-            if (t == "cpu" || t == "CPU")
-            {
-                p.type = CPU;
-            } 
-            else if (t == "gpu" || t == "GPU")
-            {
-                p.type = GPU;
-            }
-            else throw runtime_error("unknown device type: " + t);
-        }
-        else if (string(argv[i]) == "--help") printHelp();
-        else throw runtime_error("unknown key: " + string(argv[i]));
-    }
-
-    return p;
-}
-
-
-App::App(const Params& params)
-    : p(params), running(false)
+App::App(CommandLineParser& cmd)
+    : running(false),method(BM)
 {
    cout << "stereo_match_ocl sample\n";
    cout << "\nControls:\n"
-        << "\tesc - exit\n"
-        << "\tp - print current parameters\n"
-        << "\tg - convert source images into gray\n"
-        << "\tm - change stereo match method\n"
-        << "\ts - change Sobel prefiltering flag (for BM only)\n"
-        << "\t1/q - increase/decrease maximum disparity\n"
-        << "\t2/w - increase/decrease window size (for BM only)\n"
-        << "\t3/e - increase/decrease iteration count (for BP and CSBP only)\n"
-        << "\t4/r - increase/decrease level count (for BP and CSBP only)\n";
+         << "\tesc - exit\n"
+         << "\tp - print current parameters\n"
+         << "\tg - convert source images into gray\n"
+         << "\tm - change stereo match method\n"
+         << "\ts - change Sobel prefiltering flag (for BM only)\n"
+         << "\t1/q - increase/decrease maximum disparity\n"
+         << "\t2/w - increase/decrease window size (for BM only)\n"
+         << "\t3/e - increase/decrease iteration count (for BP and CSBP only)\n"
+         << "\t4/r - increase/decrease level count (for BP and CSBP only)\n";
+    l_img = cmd.get<string>("l");
+    r_img = cmd.get<string>("r");
+    string mstr = cmd.get<string>("m");
+    if(mstr == "BM") method = BM;
+    else if(mstr == "BP") method = BP;
+    else if(mstr == "CSBP") method = CSBP;
+    else cout << "unknown method!\n";
+    ndisp = cmd.get<int>("n");
+    out_img = cmd.get<string>("o");
 }


 void App::run()
 {
    // Load images
-    left_src = imread(p.left);
-    right_src = imread(p.right);
-    if (left_src.empty()) throw runtime_error("can't open file \"" + p.left + "\"");
-    if (right_src.empty()) throw runtime_error("can't open file \"" + p.right + "\"");
+    left_src = imread(l_img);
+    right_src = imread(r_img);
+    if (left_src.empty()) throw runtime_error("can't open file \"" + l_img + "\"");
+    if (right_src.empty()) throw runtime_error("can't open file \"" + r_img + "\"");

    cvtColor(left_src, left, CV_BGR2GRAY);
    cvtColor(right_src, right, CV_BGR2GRAY);
@ -199,14 +153,15 @@ void App::run()
    imshow("right", right);

    // Set common parameters
-    bm.ndisp = p.ndisp;
-    bp.ndisp = p.ndisp;
-    csbp.ndisp = p.ndisp;
+    bm.ndisp = ndisp;
+    bp.ndisp = ndisp;
+    csbp.ndisp = ndisp;

    cout << endl;
    printParams();

    running = true;
+    bool written = false;
    while (running)
    {

@ -214,9 +169,9 @@ void App::run()
        Mat disp;
        oclMat d_disp;
        workBegin();
-        switch (p.method)
+        switch (method)
        {
-        case Params::BM:
+        case BM:
            if (d_left.channels() > 1 || d_right.channels() > 1)
            {
                cout << "BM doesn't support color images\n";
@ -230,25 +185,27 @@ void App::run()
            }
            bm(d_left, d_right, d_disp);
            break;
-        case Params::BP:
+        case BP:
            bp(d_left, d_right, d_disp);
            break;
-        case Params::CSBP:
+        case CSBP:
            csbp(d_left, d_right, d_disp);
            break;
        }
-        ocl::finish();
-        workEnd();
-
        // Show results
        d_disp.download(disp);
-        if (p.method != Params::BM)
+        workEnd();
+        if (method != BM)
        {
            disp.convertTo(disp, 0);
        }
        putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));
        imshow("disparity", disp);
-
+        if(!written)
+        {
+            imwrite(out_img, disp);
+            written = true;
+        }
        handleKey((char)waitKey(3));
    }
 }
@ -259,19 +216,19 @@ void App::printParams() const
    cout << "--- Parameters ---\n";
    cout << "image_size: (" << left.cols << ", " << left.rows << ")\n";
    cout << "image_channels: " << left.channels() << endl;
-    cout << "method: " << p.method_str() << endl
-        << "ndisp: " << p.ndisp << endl;
-    switch (p.method)
+    cout << "method: " << method_str() << endl
+         << "ndisp: " << ndisp << endl;
+    switch (method)
    {
-    case Params::BM:
+    case BM:
        cout << "win_size: " << bm.winSize << endl;
        cout << "prefilter_sobel: " << bm.preset << endl;
        break;
-    case Params::BP:
+    case BP:
        cout << "iter_count: " << bp.iters << endl;
        cout << "level_count: " << bp.levels << endl;
        break;
-    case Params::CSBP:
+    case CSBP:
        cout << "iter_count: " << csbp.iters << endl;
        cout << "level_count: " << csbp.levels << endl;
        break;
@ -287,11 +244,13 @@ void App::handleKey(char key)
    case 27:
        running = false;
        break;
-    case 'p': case 'P':
+    case 'p':
+    case 'P':
        printParams();
        break;
-    case 'g': case 'G':
-        if (left.channels() == 1 && p.method != Params::BM)
+    case 'g':
+    case 'G':
+        if (left.channels() == 1 && method != BM)
        {
            left = left_src;
            right = right_src;
@ -307,23 +266,25 @@ void App::handleKey(char key)
        imshow("left", left);
        imshow("right", right);
        break;
-    case 'm': case 'M':
-        switch (p.method)
+    case 'm':
+    case 'M':
+        switch (method)
        {
-        case Params::BM:
-            p.method = Params::BP;
+        case BM:
+            method = BP;
            break;
-        case Params::BP:
-            p.method = Params::CSBP;
+        case BP:
+            method = CSBP;
            break;
-        case Params::CSBP:
-            p.method = Params::BM;
+        case CSBP:
+            method = BM;
            break;
        }
-        cout << "method: " << p.method_str() << endl;
+        cout << "method: " << method_str() << endl;
        break;
-    case 's': case 'S':
-        if (p.method == Params::BM)
+    case 's':
+    case 'S':
+        if (method == BM)
        {
            switch (bm.preset)
            {
@ -338,76 +299,80 @@ void App::handleKey(char key)
        }
        break;
    case '1':
-        p.ndisp = p.ndisp == 1 ? 8 : p.ndisp + 8;
-        cout << "ndisp: " << p.ndisp << endl;
-        bm.ndisp = p.ndisp;
-        bp.ndisp = p.ndisp;
-        csbp.ndisp = p.ndisp;
+        ndisp == 1 ? ndisp = 8 : ndisp += 8;
+        cout << "ndisp: " << ndisp << endl;
+        bm.ndisp = ndisp;
+        bp.ndisp = ndisp;
+        csbp.ndisp = ndisp;
        break;
-    case 'q': case 'Q':
-        p.ndisp = max(p.ndisp - 8, 1);
-        cout << "ndisp: " << p.ndisp << endl;
-        bm.ndisp = p.ndisp;
-        bp.ndisp = p.ndisp;
-        csbp.ndisp = p.ndisp;
+    case 'q':
+    case 'Q':
+        ndisp = max(ndisp - 8, 1);
+        cout << "ndisp: " << ndisp << endl;
+        bm.ndisp = ndisp;
+        bp.ndisp = ndisp;
+        csbp.ndisp = ndisp;
        break;
    case '2':
-        if (p.method == Params::BM)
+        if (method == BM)
        {
            bm.winSize = min(bm.winSize + 1, 51);
            cout << "win_size: " << bm.winSize << endl;
        }
        break;
-    case 'w': case 'W':
-        if (p.method == Params::BM)
+    case 'w':
+    case 'W':
+        if (method == BM)
        {
            bm.winSize = max(bm.winSize - 1, 2);
            cout << "win_size: " << bm.winSize << endl;
        }
        break;
    case '3':
-        if (p.method == Params::BP)
+        if (method == BP)
        {
            bp.iters += 1;
            cout << "iter_count: " << bp.iters << endl;
        }
-        else if (p.method == Params::CSBP)
+        else if (method == CSBP)
        {
            csbp.iters += 1;
            cout << "iter_count: " << csbp.iters << endl;
        }
        break;
-    case 'e': case 'E':
-        if (p.method == Params::BP)
+    case 'e':
+    case 'E':
+        if (method == BP)
        {
            bp.iters = max(bp.iters - 1, 1);
            cout << "iter_count: " << bp.iters << endl;
        }
-        else if (p.method == Params::CSBP)
+        else if (method == CSBP)
        {
            csbp.iters = max(csbp.iters - 1, 1);
            cout << "iter_count: " << csbp.iters << endl;
        }
        break;
    case '4':
-        if (p.method == Params::BP)
+        if (method == BP)
        {
            bp.levels += 1;
            cout << "level_count: " << bp.levels << endl;
        }
-        else if (p.method == Params::CSBP)
+        else if (method == CSBP)
        {
            csbp.levels += 1;
            cout << "level_count: " << csbp.levels << endl;
        }
        break;
-    case 'r': case 'R':
-        if (p.method == Params::BP)
+    case 'r':
+    case 'R':
+        if (method == BP)
        {
            bp.levels = max(bp.levels - 1, 1);
            cout << "level_count: " << bp.levels << endl;
        }
-        else if (p.method == Params::CSBP)
+        else if (method == CSBP)
        {
            csbp.levels = max(csbp.levels - 1, 1);
            cout << "level_count: " << csbp.levels << endl;
--- a/samples/ocl/surf_matcher.cpp
+++ b/samples/ocl/surf_matcher.cpp
@ -1,48 +1,3 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
 #include <iostream>
 #include <stdio.h>
 #include "opencv2/core/core.hpp"
@ -61,27 +16,20 @@ const float GOOD_PORTION = 0.15f;

 namespace
 {
-void help();
-
-void help()
-{
-    std::cout << "\nThis program demonstrates using SURF_OCL features detector and descriptor extractor" << std::endl;
-    std::cout << "\nUsage:\n\tsurf_matcher --left <image1> --right <image2> [-c]" << std::endl;
-    std::cout << "\nExample:\n\tsurf_matcher --left box.png --right box_in_scene.png" << std::endl;
-}

 int64 work_begin = 0;
 int64 work_end = 0;

-void workBegin() 
-{ 
+void workBegin()
+{
    work_begin = getTickCount();
 }
 void workEnd()
 {
    work_end = getTickCount() - work_begin;
 }
-double getTime(){
+double getTime()
+{
    return work_end /((double)cvGetTickFrequency() * 1000.);
 }

@ -114,17 +62,17 @@ struct SURFMatcher
 Mat drawGoodMatches(
    const Mat& cpu_img1,
    const Mat& cpu_img2,
-    const vector<KeyPoint>& keypoints1, 
-    const vector<KeyPoint>& keypoints2, 
+    const vector<KeyPoint>& keypoints1,
+    const vector<KeyPoint>& keypoints2,
    vector<DMatch>& matches,
    vector<Point2f>& scene_corners_
-    )
+)
 {
-    //-- Sort matches and preserve top 10% matches 
+    //-- Sort matches and preserve top 10% matches
    std::sort(matches.begin(), matches.end());
    std::vector< DMatch > good_matches;
    double minDist = matches.front().distance,
-        maxDist = matches.back().distance;
+           maxDist = matches.back().distance;

    const int ptsPairs = std::min(GOOD_PTS_MAX, (int)(matches.size() * GOOD_PORTION));
    for( int i = 0; i < ptsPairs; i++ )
@ -139,8 +87,8 @@ Mat drawGoodMatches(
    // drawing the results
    Mat img_matches;
    drawMatches( cpu_img1, keypoints1, cpu_img2, keypoints2,
-        good_matches, img_matches, Scalar::all(-1), Scalar::all(-1),
-        vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS  );
+                 good_matches, img_matches, Scalar::all(-1), Scalar::all(-1),
+                 vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS  );

    //-- Localize the object
    std::vector<Point2f> obj;
@ -154,28 +102,30 @@ Mat drawGoodMatches(
    }
    //-- Get the corners from the image_1 ( the object to be "detected" )
    std::vector<Point2f> obj_corners(4);
-    obj_corners[0] = cvPoint(0,0); obj_corners[1] = cvPoint( cpu_img1.cols, 0 );
-    obj_corners[2] = cvPoint( cpu_img1.cols, cpu_img1.rows ); obj_corners[3] = cvPoint( 0, cpu_img1.rows );
+    obj_corners[0] = cvPoint(0,0);
+    obj_corners[1] = cvPoint( cpu_img1.cols, 0 );
+    obj_corners[2] = cvPoint( cpu_img1.cols, cpu_img1.rows );
+    obj_corners[3] = cvPoint( 0, cpu_img1.rows );
    std::vector<Point2f> scene_corners(4);
-    
+
    Mat H = findHomography( obj, scene, CV_RANSAC );
    perspectiveTransform( obj_corners, scene_corners, H);

    scene_corners_ = scene_corners;
-    
+
    //-- Draw lines between the corners (the mapped object in the scene - image_2 )
-    line( img_matches, 
-        scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), 
-        Scalar( 0, 255, 0), 2, CV_AA );
-    line( img_matches, 
-        scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), 
-        Scalar( 0, 255, 0), 2, CV_AA );
-    line( img_matches, 
-        scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), 
-        Scalar( 0, 255, 0), 2, CV_AA );
-    line( img_matches, 
-        scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), 
-        Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches,
+          scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), scene_corners[1] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches,
+          scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), scene_corners[2] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches,
+          scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), scene_corners[3] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches,
+          scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), scene_corners[0] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, CV_AA );
    return img_matches;
 }

@ -185,6 +135,21 @@ Mat drawGoodMatches(
 // use cpu findHomography interface to calculate the transformation matrix
 int main(int argc, char* argv[])
 {
+    const char* keys =
+        "{ h | help     | false           | print help message  }"
+        "{ l | left     |                 | specify left image  }"
+        "{ r | right    |                 | specify right image }"
+        "{ o | output   | SURF_output.jpg | specify output save path (only works in CPU or GPU only mode) }"
+        "{ c | use_cpu  | false           | use CPU algorithms  }"
+        "{ a | use_all  | false           | use both CPU and GPU algorithms}";
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.get<bool>("help"))
+    {
+        std::cout << "Avaible options:" << std::endl;
+        cmd.printParams();
+        return 0;
+    }
+
    vector<cv::ocl::Info> info;
    if(cv::ocl::getDevice(info) == 0)
    {
@ -195,54 +160,38 @@ int main(int argc, char* argv[])

    Mat cpu_img1, cpu_img2, cpu_img1_grey, cpu_img2_grey;
    oclMat img1, img2;
-    bool useCPU = false;
+    bool useCPU = cmd.get<bool>("c");
    bool useGPU = false;
-    bool useALL = false;
+    bool useALL = cmd.get<bool>("a");

-    for (int i = 1; i < argc; ++i)
+    string outpath = cmd.get<std::string>("o");
+
+    cpu_img1 = imread(cmd.get<std::string>("l"));
+    CV_Assert(!cpu_img1.empty());
+    cvtColor(cpu_img1, cpu_img1_grey, CV_BGR2GRAY);
+    img1 = cpu_img1_grey;
+
+    cpu_img2 = imread(cmd.get<std::string>("r"));
+    CV_Assert(!cpu_img2.empty());
+    cvtColor(cpu_img2, cpu_img2_grey, CV_BGR2GRAY);
+    img2 = cpu_img2_grey;
+
+    if(useALL)
    {
-        if (string(argv[i]) == "--left")
-        {
-            cpu_img1 = imread(argv[++i]);
-            CV_Assert(!cpu_img1.empty());
-            cvtColor(cpu_img1, cpu_img1_grey, CV_BGR2GRAY);
-            img1 = cpu_img1_grey;
-        }
-        else if (string(argv[i]) == "--right")
-        {
-            cpu_img2 = imread(argv[++i]);
-            CV_Assert(!cpu_img2.empty());
-            cvtColor(cpu_img2, cpu_img2_grey, CV_BGR2GRAY);
-            img2 = cpu_img2_grey;
-        }
-        else if (string(argv[i]) == "-c")
-        {
-            useCPU = true;
-            useGPU = false;
-            useALL = false;
-        }else if(string(argv[i]) == "-g")
-        {
-            useGPU = true;
-            useCPU = false;
-            useALL = false;
-        }else if(string(argv[i]) == "-a")
-        {
-            useALL = true;
-            useCPU = false;
-            useGPU = false;
-        }
-        else if (string(argv[i]) == "--help")
-        {
-            help();
-            return -1;
-        }
+        useCPU = false;
+        useGPU = false;
    }
+    else if(useCPU==false && useALL==false)
+    {
+        useGPU = true;
+    }
+
    if(!useCPU)
    {
        std::cout
-            << "Device name:"
-            << info[0].DeviceName[0]
-        << std::endl;
+                << "Device name:"
+                << info[0].DeviceName[0]
+                << std::endl;
    }
    double surf_time = 0.;

@ -262,12 +211,12 @@ int main(int argc, char* argv[])
    //instantiate detectors/matchers
    SURFDetector<SURF>     cpp_surf;
    SURFDetector<SURF_OCL> ocl_surf;
-    
+
    SURFMatcher<BFMatcher>      cpp_matcher;
    SURFMatcher<BFMatcher_OCL>  ocl_matcher;

    //-- start of timing section
-    if (useCPU) 
+    if (useCPU)
    {
        for (int i = 0; i <= LOOP_NUM; i++)
        {
@ -298,7 +247,8 @@ int main(int argc, char* argv[])

        surf_time = getTime();
        std::cout << "SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
-    }else
+    }
+    else
    {
        //cpu runs
        for (int i = 0; i <= LOOP_NUM; i++)
@ -353,14 +303,14 @@ int main(int argc, char* argv[])
            for(size_t i = 0; i < cpu_corner.size(); i++)
            {
                if((std::abs(cpu_corner[i].x - gpu_corner[i].x) > 10)
-                    ||(std::abs(cpu_corner[i].y - gpu_corner[i].y) > 10))
+                        ||(std::abs(cpu_corner[i].y - gpu_corner[i].y) > 10))
                {
                    std::cout<<"Failed\n";
                    result = false;
                    break;
                }
                result = true;
-            } 
+            }
            if(result)
                std::cout<<"Passed\n";
        }
@ -371,12 +321,15 @@ int main(int argc, char* argv[])
    {
        namedWindow("cpu surf matches", 0);
        imshow("cpu surf matches", img_matches);
+        imwrite(outpath, img_matches);
    }
    else if(useGPU)
    {
        namedWindow("ocl surf matches", 0);
        imshow("ocl surf matches", img_matches);
-    }else
+        imwrite(outpath, img_matches);
+    }
+    else
    {
        namedWindow("cpu surf matches", 0);
        imshow("cpu surf matches", img_matches);
--- a/samples/ocl/tvl1_optical_flow.cpp
+++ b/samples/ocl/tvl1_optical_flow.cpp
@ -0,0 +1,265 @@
+#include <iostream>
+#include <vector>
+#include <iomanip>
+
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/ocl/ocl.hpp"
+#include "opencv2/video/video.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::ocl;
+
+typedef unsigned char uchar;
+#define LOOP_NUM 10
+int64 work_begin = 0;
+int64 work_end = 0;
+
+static void workBegin()
+{
+    work_begin = getTickCount();
+}
+static void workEnd()
+{
+    work_end += (getTickCount() - work_begin);
+}
+static double getTime()
+{
+    return work_end * 1000. / getTickFrequency();
+}
+
+template <typename T> inline T clamp (T x, T a, T b)
+{
+    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
+}
+
+template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
+{
+    x = clamp(x, a, b);
+    return c + (d - c) * (x - a) / (b - a);
+}
+
+static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
+{
+    float maxDisplacement = 1.0f;
+
+    for (int i = 0; i < u.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+        for (int j = 0; j < u.cols; ++j)
+        {
+            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
+
+            if (d > maxDisplacement)
+                maxDisplacement = d;
+        }
+    }
+
+    flowField.create(u.size(), CV_8UC4);
+
+    for (int i = 0; i < flowField.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+
+        Vec4b* row = flowField.ptr<Vec4b>(i);
+
+        for (int j = 0; j < flowField.cols; ++j)
+        {
+            row[j][0] = 0;
+            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][3] = 255;
+        }
+    }
+}
+
+
+int main(int argc, const char* argv[])
+{
+    static std::vector<Info> ocl_info;
+    ocl::getDevice(ocl_info);
+    //if you want to use undefault device, set it here
+    setDevice(ocl_info[0]);
+
+    //set this to save kernel compile time from second time you run
+    ocl::setBinpath("./");
+    const char* keys =
+        "{ h   | help       | false           | print help message }"
+        "{ l   | left       |                 | specify left image }"
+        "{ r   | right      |                 | specify right image }"
+        "{ o   | output     | tvl1_output.jpg | specify output save path }"
+        "{ c   | camera     | 0               | enable camera capturing }"
+        "{ s   | use_cpu    | false           | use cpu or gpu to process the image }"
+        "{ v   | video      |                 | use video as input }";
+
+    CommandLineParser cmd(argc, argv, keys);
+
+    if (cmd.get<bool>("help"))
+    {
+        cout << "Usage: pyrlk_optical_flow [options]" << endl;
+        cout << "Avaible options:" << endl;
+        cmd.printParams();
+        return 0;
+    }
+
+    bool defaultPicturesFail = false;
+    string fname0 = cmd.get<string>("l");
+    string fname1 = cmd.get<string>("r");
+    string vdofile = cmd.get<string>("v");
+    string outpath = cmd.get<string>("o");
+    bool useCPU = cmd.get<bool>("s");
+    bool useCamera = cmd.get<bool>("c");
+    int inputName = cmd.get<int>("c");
+
+    Mat frame0 = imread(fname0, cv::IMREAD_GRAYSCALE);
+    Mat frame1 = imread(fname1, cv::IMREAD_GRAYSCALE);
+    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+    cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
+
+
+    Mat flow, show_flow;
+    Mat flow_vec[2];
+    if (frame0.empty() || frame1.empty())
+    {
+        useCamera = true;
+        defaultPicturesFail = true;
+        CvCapture* capture = 0;
+        capture = cvCaptureFromCAM( inputName );
+        if (!capture)
+        {
+            cout << "Can't load input images" << endl;
+            return -1;
+        }
+    }
+
+
+    if (useCamera)
+    {
+        CvCapture* capture = 0;
+        Mat frame, frameCopy;
+        Mat frame0Gray, frame1Gray;
+        Mat ptr0, ptr1;
+
+        if(vdofile == "")
+            capture = cvCaptureFromCAM( inputName );
+        else
+            capture = cvCreateFileCapture(vdofile.c_str());
+
+        int c = inputName ;
+        if(!capture)
+        {
+            if(vdofile == "")
+                cout << "Capture from CAM " << c << " didn't work" << endl;
+            else
+                cout << "Capture from file " << vdofile << " failed" <<endl;
+            if (defaultPicturesFail)
+            {
+                return -1;
+            }
+            goto nocamera;
+        }
+
+        cout << "In capture ..." << endl;
+        for(int i = 0;; i++)
+        {
+            frame = cvQueryFrame( capture );
+            if( frame.empty() )
+                break;
+
+            if (i == 0)
+            {
+                frame.copyTo( frame0 );
+                cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+            }
+            else
+            {
+                if (i%2 == 1)
+                {
+                    frame.copyTo(frame1);
+                    cvtColor(frame1, frame1Gray, COLOR_BGR2GRAY);
+                    ptr0 = frame0Gray;
+                    ptr1 = frame1Gray;
+                }
+                else
+                {
+                    frame.copyTo(frame0);
+                    cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+                    ptr0 = frame1Gray;
+                    ptr1 = frame0Gray;
+                }
+
+                if (useCPU)
+                {
+                    alg->calc(ptr0, ptr1, flow);
+                    split(flow, flow_vec);
+                }
+                else
+                {
+                    oclMat d_flowx, d_flowy;
+                    d_alg(oclMat(ptr0), oclMat(ptr1), d_flowx, d_flowy);
+                    d_flowx.download(flow_vec[0]);
+                    d_flowy.download(flow_vec[1]);
+                }
+                if (i%2 == 1)
+                    frame1.copyTo(frameCopy);
+                else
+                    frame0.copyTo(frameCopy);
+                getFlowField(flow_vec[0], flow_vec[1], show_flow);
+                imshow("PyrLK [Sparse]", show_flow);
+            }
+
+            if( waitKey( 10 ) >= 0 )
+                goto _cleanup_;
+        }
+
+        waitKey(0);
+
+_cleanup_:
+        cvReleaseCapture( &capture );
+    }
+    else
+    {
+nocamera:
+        oclMat d_flowx, d_flowy;
+        for(int i = 0; i <= LOOP_NUM; i ++)
+        {
+            cout << "loop" << i << endl;
+
+            if (i > 0) workBegin();
+            if (useCPU)
+            {
+                alg->calc(frame0, frame1, flow);
+                split(flow, flow_vec);
+            }
+            else
+            {
+                d_alg(oclMat(frame0), oclMat(frame1), d_flowx, d_flowy);
+                d_flowx.download(flow_vec[0]);
+                d_flowy.download(flow_vec[1]);
+            }
+            if (i > 0 && i <= LOOP_NUM)
+                workEnd();
+
+            if (i == LOOP_NUM)
+            {
+                if (useCPU)
+                    cout << "average CPU time (noCamera) : ";
+                else
+                    cout << "average GPU time (noCamera) : ";
+                cout << getTime() / LOOP_NUM << " ms" << endl;
+
+                getFlowField(flow_vec[0], flow_vec[1], show_flow);
+                imshow("PyrLK [Sparse]", show_flow);
+                imwrite(outpath, show_flow);
+            }
+        }
+    }
+
+    waitKey();
+
+    return 0;
+}