Merge branch '2.4'

2025-08-06 14:36:36 +08:00 · 2013-02-28 11:10:40 +04:00 · 2013-02-28 11:10:40 +04:00 · 891d7da6ee
commit 891d7da6ee
parent c5e979ceb7 2be893a21e
39 changed files with 6898 additions and 386 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,27 +1,36 @@
 .git*       export-ignore

-*           text=auto
-*           whitespace=!indent,trail,space
+*           text=auto whitespace=trailing-space,space-before-tab,-indent-with-non-tab,tab-in-indent,tabwidth=4

-*.py        text whitespace=tab-in-indent,trail,space,fix
-*.cpp       text whitespace=tab-in-indent,trail,space,fix
-*.hpp       text whitespace=tab-in-indent,trail,space,fix
-*.cxx       text whitespace=tab-in-indent,trail,space,fix
-*.hxx       text whitespace=tab-in-indent,trail,space,fix
-*.mm        text whitespace=tab-in-indent,trail,space,fix
-*.c         text whitespace=tab-in-indent,trail,space,fix
-*.h         text whitespace=tab-in-indent,trail,space,fix
-*.i         text whitespace=tab-in-indent,trail,space,fix
-*.java      text whitespace=tab-in-indent,trail,space,fix
-*.cu        text whitespace=tab-in-indent,trail,space,fix
-*.cl        text whitespace=tab-in-indent,trail,space,fix
+*.py        text
+*.cpp       text
+*.hpp       text
+*.cxx       text
+*.hxx       text
+*.mm        text
+*.c         text
+*.h         text
+*.i         text
+*.js        text
+*.java      text
+*.scala     text
+*.cu        text
+*.cl        text
+*.css_t     text
+*.qrc       text
+*.qss       text
+*.S         text
+*.rst       text
+*.tex       text
+*.sty       text

-*.cmake     text whitespace=tab-in-indent,trail,space,fix
-*.cmakein   text whitespace=tab-in-indent,trail,space,fix
-*.in        text whitespace=tab-in-indent,trail,space,fix
-CMakeLists.txt  text whitespace=tab-in-indent,trail,space,fix
+*.aidl      text
+*.mk        text

-*.rst       text whitespace=tab-in-indent,trail,space,fix
+*.cmake         text whitespace=tabwidth=2
+*.cmakein       text whitespace=tabwidth=2
+*.in            text whitespace=tabwidth=2
+CMakeLists.txt  text whitespace=tabwidth=2

 *.png       binary
 *.jepg      binary
@ -32,22 +41,21 @@ CMakeLists.txt  text whitespace=tab-in-indent,trail,space,fix
 *.a         binary
 *.so        binary
 *.dll       binary
+*.jar       binary

 *.pdf       binary
 *.pbxproj   binary
 *.vec       binary
 *.doc       binary

-*.css_t     text
-*.qrc       text
-*.qss       text
-*.S         text
-
-*.xml       -text
-*.yml       -text
+*.xml                      -text whitespace=cr-at-eol
+*.yml                      -text whitespace=cr-at-eol
+.project                   -text whitespace=cr-at-eol merge=union
+.classpath                 -text whitespace=cr-at-eol merge=union
+.cproject                  -text whitespace=cr-at-eol merge=union
+org.eclipse.jdt.core.prefs -text whitespace=cr-at-eol merge=union

 *.vcproj    text eol=crlf merge=union
-*.cproject  text eol=crlf merge=union
 *.bat       text eol=crlf
 *.cmd       text eol=crlf
 *.cmd.tmpl  text eol=crlf
--- a/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
+++ b/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
@ -15,7 +15,7 @@ using namespace android;

 const int OpenCVEngine::Platform = DetectKnownPlatforms();
 const int OpenCVEngine::CpuID = GetCpuID();
-const int OpenCVEngine::KnownVersions[] = {2040000, 2040100, 2040200, 2040300, 2040301, 2040302};
+const int OpenCVEngine::KnownVersions[] = {2040000, 2040100, 2040200, 2040300, 2040301, 2040302, 2040400}; // numeric version codes; presumably the set ValidateVersion() checks against - TODO confirm

 bool OpenCVEngine::ValidateVersion(int version)
 {
--- a/android/service/engine/jni/Tests/OpenCVEngineTest.cpp
+++ b/android/service/engine/jni/Tests/OpenCVEngineTest.cpp
@ -218,6 +218,66 @@ TEST(OpenCVEngineTest, InstallAndGetVersion)
    #endif
    #endif
 }
+
+TEST(OpenCVEngineTest, GetPathFor2_4_2) // asking for "2.4.2" with package 2040200 installed must return the v24 armv7a lib path
+{
+    sp<IOpenCVEngine> Engine = InitConnect();
+    Starter.PackageManager->InstalledPackages.clear(); // start from an empty InstalledPackages container
+    Starter.PackageManager->InstallVersion(2040200, PLATFORM_UNKNOWN, ARCH_ARMv7); // presumably registers a single ARMv7 package with version code 2040200
+    EXPECT_FALSE(NULL == Engine.get()); // the engine handle must be non-null
+    String16 result = Engine->GetLibPathByVersion(String16("2.4.2"));
+    EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", String8(result).string());
+}
+
+TEST(OpenCVEngineTest, GetPathFor2_4_3) // asking for "2.4.3" with package 2040300 installed must return the v24 armv7a lib path
+{
+    sp<IOpenCVEngine> Engine = InitConnect();
+    Starter.PackageManager->InstalledPackages.clear(); // start from an empty InstalledPackages container
+    Starter.PackageManager->InstallVersion(2040300, PLATFORM_UNKNOWN, ARCH_ARMv7); // presumably registers a single ARMv7 package with version code 2040300
+    EXPECT_FALSE(NULL == Engine.get()); // the engine handle must be non-null
+    String16 result = Engine->GetLibPathByVersion(String16("2.4.3"));
+    EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", String8(result).string());
+}
+
+TEST(OpenCVEngineTest, GetPathFor2_4_3_1) // asking for "2.4.3.1" with package 2040301 installed must return the v24 armv7a lib path
+{
+    sp<IOpenCVEngine> Engine = InitConnect();
+    Starter.PackageManager->InstalledPackages.clear(); // start from an empty InstalledPackages container
+    Starter.PackageManager->InstallVersion(2040301, PLATFORM_UNKNOWN, ARCH_ARMv7); // presumably registers a single ARMv7 package with version code 2040301
+    EXPECT_FALSE(NULL == Engine.get()); // the engine handle must be non-null
+    String16 result = Engine->GetLibPathByVersion(String16("2.4.3.1"));
+    EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", String8(result).string());
+}
+
+TEST(OpenCVEngineTest, GetPathFor2_4_3_2) // asking for "2.4.3.2" with package 2040302 installed must return the v24 armv7a lib path
+{
+    sp<IOpenCVEngine> Engine = InitConnect();
+    Starter.PackageManager->InstalledPackages.clear(); // start from an empty InstalledPackages container
+    Starter.PackageManager->InstallVersion(2040302, PLATFORM_UNKNOWN, ARCH_ARMv7); // presumably registers a single ARMv7 package with version code 2040302
+    EXPECT_FALSE(NULL == Engine.get()); // the engine handle must be non-null
+    String16 result = Engine->GetLibPathByVersion(String16("2.4.3.2"));
+    EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", String8(result).string());
+}
+
+TEST(OpenCVEngineTest, GetPathFor2_4_4) // asking for "2.4.4" with package 2040400 installed must return the v24 armv7a lib path
+{
+    sp<IOpenCVEngine> Engine = InitConnect();
+    Starter.PackageManager->InstalledPackages.clear(); // start from an empty InstalledPackages container
+    Starter.PackageManager->InstallVersion(2040400, PLATFORM_UNKNOWN, ARCH_ARMv7); // presumably registers a single ARMv7 package with version code 2040400
+    EXPECT_FALSE(NULL == Engine.get()); // the engine handle must be non-null
+    String16 result = Engine->GetLibPathByVersion(String16("2.4.4"));
+    EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", String8(result).string());
+}
+
+TEST(OpenCVEngineTest, GetPathFor2_4_5) // negative case: asking for "2.4.5" must return an empty path even though package 2040500 is installed
+{
+    sp<IOpenCVEngine> Engine = InitConnect();
+    Starter.PackageManager->InstalledPackages.clear(); // start from an empty InstalledPackages container
+    Starter.PackageManager->InstallVersion(2040500, PLATFORM_UNKNOWN, ARCH_ARMv7); // presumably registers a single ARMv7 package with version code 2040500
+    EXPECT_FALSE(NULL == Engine.get()); // the engine handle must be non-null
+    String16 result = Engine->GetLibPathByVersion(String16("2.4.5"));
+    EXPECT_EQ(0, result.size()); // 2.4.5 is not published yet
+}
 #endif

 #ifndef __i386__
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@ -33,8 +33,48 @@ if(CUDA_FOUND)

  message(STATUS "CUDA detected: " ${CUDA_VERSION})

-  set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
-  set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+  set(_generations "Fermi" "Kepler") # generation names accepted for CUDA_GENERATION
+  if(NOT CMAKE_CROSSCOMPILING)
+    list(APPEND _generations "Auto") # "Auto" runs a detection program through nvcc (see the "Auto" branch below), so it is offered only when not cross-compiling
+  endif()
+  set(CUDA_GENERATION "" CACHE STRING "Build CUDA device code only for specific GPU architecture. Leave empty to build for all architectures.")
+  if( CMAKE_VERSION VERSION_GREATER "2.8" )
+    set_property( CACHE CUDA_GENERATION PROPERTY STRINGS "" ${_generations} ) # attach the allowed values (including empty) to the cache entry
+  endif()
+
+  if(CUDA_GENERATION) # a specific generation was requested
+    if(NOT ";${_generations};" MATCHES ";${CUDA_GENERATION};") # reject names that are not in the list of known generations
+      string(REPLACE ";" ", " _generations "${_generations}") # render the list as "A, B, C" for the message
+      message(FATAL_ERROR "ERROR: Only ${_generations} Generations are supported.")
+    endif()
+    unset(CUDA_ARCH_BIN CACHE) # drop the cached architecture lists so the set(... CACHE ...) calls below recompute them
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+
+  set(__cuda_arch_ptx "") # PTX list stays empty unless the all-architectures default below is taken
+  if(CUDA_GENERATION STREQUAL "Fermi")
+    set(__cuda_arch_bin "2.0 2.1(2.0)")
+  elseif(CUDA_GENERATION STREQUAL "Kepler")
+    set(__cuda_arch_bin "3.0")
+  elseif(CUDA_GENERATION STREQUAL "Auto") # let nvcc compile and run (--run) OpenCVDetectCudaArch.cu and use what it prints
+    execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" "${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectCudaArch.cu" "--run"
+                     WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
+                     RESULT_VARIABLE _nvcc_res OUTPUT_VARIABLE _nvcc_out
+                     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(NOT _nvcc_res EQUAL 0) # non-zero result: __cuda_arch_bin is left undefined, which selects the default list below
+      message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.")
+    else()
+      set(__cuda_arch_bin "${_nvcc_out}") # captured stdout of the detection program becomes the architecture list
+    endif()
+  endif()
+
+  if(NOT DEFINED __cuda_arch_bin) # no generation selected an architecture list above: use the full default lists
+    set(__cuda_arch_bin "1.1 1.2 1.3 2.0 2.1(2.0) 3.0")
+    set(__cuda_arch_ptx "2.0 3.0")
+  endif()
+
+  set(CUDA_ARCH_BIN ${__cuda_arch_bin} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") # publish the chosen lists as cache entries
+  set(CUDA_ARCH_PTX ${__cuda_arch_ptx} CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")

  string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
  string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")
--- a/cmake/OpenCVDetectCudaArch.cu
+++ b/cmake/OpenCVDetectCudaArch.cu
@ -0,0 +1,14 @@
+#include <stdio.h>
+int main() // prints "<major>.<minor> " for every CUDA device that can be queried; returns 0 only if at least one was printed, -1 otherwise
+{
+    int count = 0, reported = 0; // reported: number of devices whose compute capability was actually printed
+    if (cudaSuccess != cudaGetDeviceCount(&count)){return -1;}
+    if (count == 0) {return -1;}
+    for (int device = 0; device < count; ++device)
+    {
+        cudaDeviceProp prop;
+        if (cudaSuccess != cudaGetDeviceProperties(&prop, device)){ continue;} // skip a device whose properties cannot be read
+        printf("%d.%d ", prop.major, prop.minor); ++reported;
+    }
+    return reported > 0 ? 0 : -1; // an empty output must not be reported as success, otherwise the caller would accept an empty architecture list
+}
--- a/cmake/templates/OpenCV.mk.in
+++ b/cmake/templates/OpenCV.mk.in
@ -42,7 +42,7 @@ else
        OPENCV_EXTRA_COMPONENTS:=@OPENCV_EXTRA_COMPONENTS_CONFIGMAKE@
    endif
    ifeq ($(TARGET_ARCH_ABI),mips)
-        OPENCV_3RDPARTY_COMPONENTS:=@OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE_NO_TBB@
+        OPENCV_3RDPARTY_COMPONENTS:=@OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE@
        OPENCV_EXTRA_COMPONENTS:=@OPENCV_EXTRA_COMPONENTS_CONFIGMAKE@
    endif
 endif
--- a/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
+++ b/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
@ -48,10 +48,10 @@ The structure of package contents looks as follows:

 ::

-    OpenCV-2.4.3-android-sdk
+    OpenCV-2.4.4-android-sdk
    |_ apk
-    |   |_ OpenCV_2.4.3_binary_pack_armv7a.apk
-    |   |_ OpenCV_2.4.3_Manager_2.0_XXX.apk
+    |   |_ OpenCV_2.4.4_binary_pack_armv7a.apk
+    |   |_ OpenCV_2.4.4_Manager_2.6_XXX.apk
    |
    |_ doc
    |_ samples
@ -157,10 +157,10 @@ Get the OpenCV4Android SDK

   .. code-block:: bash

-      unzip ~/Downloads/OpenCV-2.4.3-android-sdk.zip
+      unzip ~/Downloads/OpenCV-2.4.4-android-sdk.zip

-.. |opencv_android_bin_pack| replace:: OpenCV-2.4.3.2-android-sdk.zip
-.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.3/OpenCV-2.4.3.2-android-sdk.zip/download
+.. |opencv_android_bin_pack| replace:: OpenCV-2.4.4-android-sdk.zip
+.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.4/OpenCV-2.4.4-android-sdk.zip/download
 .. |opencv_android_bin_pack_url| replace:: |opencv_android_bin_pack|
 .. |seven_zip| replace:: 7-Zip
 .. _seven_zip: http://www.7-zip.org/
@ -295,7 +295,7 @@ Well, running samples from Eclipse is very simple:
  .. code-block:: sh
    :linenos:

-    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.3_Manager_armv7a-neon.apk
+    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.4_Manager_armv7a-neon.apk

  .. note:: ``armeabi``, ``armv7a-neon``, ``arm7a-neon-android8``, ``mips`` and ``x86`` stand for
            platform targets:
--- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
+++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
@ -55,14 +55,14 @@ Manager to access OpenCV libraries externally installed in the target system.
   :guilabel:`File -> Import -> Existing project in your workspace`.

   Press :guilabel:`Browse`  button and locate OpenCV4Android SDK
-   (:file:`OpenCV-2.4.3-android-sdk/sdk`).
+   (:file:`OpenCV-2.4.4-android-sdk/sdk`).

   .. image:: images/eclipse_opencv_dependency0.png
        :alt: Add dependency from OpenCV library
        :align: center

 #. In application project add a reference to the OpenCV Java SDK in
-   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.3``.
+   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.4``.

   .. image:: images/eclipse_opencv_dependency1.png
        :alt: Add dependency from OpenCV library
@ -128,27 +128,27 @@ described above.
 #. Add the OpenCV library project to your workspace the same way as for the async initialization
   above. Use menu :guilabel:`File -> Import -> Existing project in your workspace`,
   press :guilabel:`Browse` button and select OpenCV SDK path
-   (:file:`OpenCV-2.4.3-android-sdk/sdk`).
+   (:file:`OpenCV-2.4.4-android-sdk/sdk`).

   .. image:: images/eclipse_opencv_dependency0.png
        :alt: Add dependency from OpenCV library
        :align: center

 #. In the application project add a reference to the OpenCV4Android SDK in
-   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.3``;
+   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.4``;

   .. image:: images/eclipse_opencv_dependency1.png
       :alt: Add dependency from OpenCV library
       :align: center

 #. If your application project **doesn't have a JNI part**, just copy the corresponding OpenCV
-   native libs from :file:`<OpenCV-2.4.3-android-sdk>/sdk/native/libs/<target_arch>` to your
+   native libs from :file:`<OpenCV-2.4.4-android-sdk>/sdk/native/libs/<target_arch>` to your
   project directory to folder :file:`libs/<target_arch>`.

   In case of the application project **with a JNI part**, instead of manual libraries copying you
   need to modify your ``Android.mk`` file:
   add the following two code lines after the ``"include $(CLEAR_VARS)"`` and before
-   ``"include path_to_OpenCV-2.4.3-android-sdk/sdk/native/jni/OpenCV.mk"``
+   ``"include path_to_OpenCV-2.4.4-android-sdk/sdk/native/jni/OpenCV.mk"``

   .. code-block:: make
      :linenos:
@ -221,7 +221,7 @@ taken:

   .. code-block:: make

-      include C:\Work\OpenCV4Android\OpenCV-2.4.3-android-sdk\sdk\native\jni\OpenCV.mk
+      include C:\Work\OpenCV4Android\OpenCV-2.4.4-android-sdk\sdk\native\jni\OpenCV.mk

   Should be inserted into the :file:`jni/Android.mk` file **after** this line:

@ -382,7 +382,7 @@ result.
           OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_2_4_3, this, mLoaderCallback);
       }

-#. Defines that your activity implements CvViewFrameListener interface and fix activity related
+#. Declare that your activity implements the ``CvViewFrameListener2`` interface and fix activity related
   errors by defining missed methods. For this activity define ``onCreate``, ``onDestroy`` and
   ``onPause`` and implement them according code snippet bellow. Fix errors by adding requited
   imports.
@ -423,8 +423,8 @@ result.
       public void onCameraViewStopped() {
       }

-       public Mat onCameraFrame(Mat inputFrame) {
-           return inputFrame;
+       public Mat onCameraFrame(CvCameraViewFrame inputFrame) {
+           return inputFrame.rgba();
       }

 #. Run your application on device or emulator.
@ -432,7 +432,7 @@ result.
 Lets discuss some most important steps. Every Android application with UI must implement Activity
 and View. By the first steps we create blank activity and default view layout. The simplest
 OpenCV-centric application must implement OpenCV initialization, create its own view to show
-preview from camera and implements ``CvViewFrameListener`` interface to get frames from camera and
+preview from camera and implement the ``CvViewFrameListener2`` interface to get frames from camera and
 process it.

 First of all we create our application view using xml layout. Our layout consists of the only
@ -448,8 +448,13 @@ After creating layout we need to implement ``Activity`` class. OpenCV initializa
 been already discussed above. In this sample we use asynchronous initialization. Implementation of
 ``CvCameraViewListener`` interface allows you to add processing steps after frame grabbing from
 camera and before its rendering on screen. The most important function is ``onCameraFrame``. It is
-callback function and it is called on retrieving frame from camera. The callback input is frame
-from camera. RGBA format is used by default. You can change this behavior by ``SetCaptureFormat``
-method of ``View`` class. ``Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA`` and
-``Highgui.CV_CAP_ANDROID_GREY_FRAME`` are supported. It expects that function returns RGBA frame
-that will be drawn on the screen.
+callback function and it is called on retrieving frame from camera. The callback input is an object
+of the ``CvCameraViewFrame`` class that represents a frame from the camera.
+
+.. note::
+    Do not save or use the ``CvCameraViewFrame`` object outside of the ``onCameraFrame`` callback. This object
+    does not have its own state and its behavior outside of the callback is unpredictable!
+
+The ``CvCameraViewFrame`` object has ``rgba()`` and ``gray()`` methods that return the frame as an RGBA and a
+single-channel grayscale ``Mat`` respectively. The ``onCameraFrame`` function is expected to return an RGBA
+frame that will be drawn on the screen.
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@ -4458,6 +4458,26 @@ public:
                  Ptr<Algorithm> (Algorithm::*getter)()=0,
                  void (Algorithm::*setter)(const Ptr<Algorithm>&)=0,
                  const std::string& help=std::string());
+    void addParam(Algorithm& algo, const char* name,
+                  float& value, bool readOnly=false,
+                  float (Algorithm::*getter)()=0,
+                  void (Algorithm::*setter)(float)=0,
+                  const std::string& help=std::string());
+    void addParam(Algorithm& algo, const char* name,
+                  unsigned int& value, bool readOnly=false,
+                  unsigned int (Algorithm::*getter)()=0,
+                  void (Algorithm::*setter)(unsigned int)=0,
+                  const std::string& help=std::string());
+    void addParam(Algorithm& algo, const char* name,
+                  uint64& value, bool readOnly=false,
+                  uint64 (Algorithm::*getter)()=0,
+                  void (Algorithm::*setter)(uint64)=0,
+                  const std::string& help=std::string());
+    void addParam(Algorithm& algo, const char* name,
+                  uchar& value, bool readOnly=false,
+                  uchar (Algorithm::*getter)()=0,
+                  void (Algorithm::*setter)(uchar)=0,
+                  const std::string& help=std::string());
    template<typename _Tp, typename _Base> void addParam(Algorithm& algo, const char* name,
                  Ptr<_Tp>& value, bool readOnly=false,
                  Ptr<_Tp> (Algorithm::*getter)()=0,
@ -4477,7 +4497,7 @@ protected:

 struct CV_EXPORTS Param
 {
-    enum { INT=0, BOOLEAN=1, REAL=2, STRING=3, MAT=4, MAT_VECTOR=5, ALGORITHM=6, FLOAT=7, UNSIGNED_INT=8, UINT64=9 };
+    enum { INT=0, BOOLEAN=1, REAL=2, STRING=3, MAT=4, MAT_VECTOR=5, ALGORITHM=6, FLOAT=7, UNSIGNED_INT=8, UINT64=9, UCHAR=11 };

    Param();
    Param(int _type, bool _readonly, int _offset,
@ -4572,6 +4592,13 @@ template<> struct ParamType<uint64>
    enum { type = Param::UINT64 };
 };

+template<> struct ParamType<uchar> // ParamType specialization that maps the C++ type uchar to the Param::UCHAR tag
+{
+    typedef uchar const_param_type; // plain value type, not a reference
+    typedef uchar member_type;
+
+    enum { type = Param::UCHAR };
+};

 // The CommandLineParser class is designed for command line arguments parsing

--- a/modules/core/src/algorithm.cpp
+++ b/modules/core/src/algorithm.cpp
@ -427,6 +427,14 @@ void AlgorithmInfo::write(const Algorithm* algo, FileStorage& fs) const
            Ptr<Algorithm> nestedAlgo = algo->get<Algorithm>(pname);
            nestedAlgo->write(fs);
        }
+        else if( p.type == Param::FLOAT)
+            cv::write(fs, pname, algo->getDouble(pname));
+        else if( p.type == Param::UNSIGNED_INT)
+            cv::write(fs, pname, algo->getInt(pname));//TODO: implement cv::write(, , unsigned int)
+        else if( p.type == Param::UINT64)
+            cv::write(fs, pname, algo->getInt(pname));//TODO: implement cv::write(, , uint64)
+        else if( p.type == Param::UCHAR)
+            cv::write(fs, pname, algo->getInt(pname));
        else
        {
            std::string msg = format("unknown/unsupported type of '%s' parameter == %d", pname.c_str(), p.type);
@ -486,6 +494,26 @@ void AlgorithmInfo::read(Algorithm* algo, const FileNode& fn) const
            nestedAlgo->read(n);
            info->set(algo, pname.c_str(), p.type, &nestedAlgo, true);
        }
+        else if( p.type == Param::FLOAT )
+        {
+            float val = (float)n;
+            info->set(algo, pname.c_str(), p.type, &val, true);
+        }
+        else if( p.type == Param::UNSIGNED_INT )
+        {
+            unsigned int val = (unsigned int)((int)n);//TODO: implement conversion (unsigned int)FileNode
+            info->set(algo, pname.c_str(), p.type, &val, true);
+        }
+        else if( p.type == Param::UINT64)
+        {
+            uint64 val = (uint64)((int)n);//TODO: implement conversion (uint64)FileNode
+            info->set(algo, pname.c_str(), p.type, &val, true);
+        }
+        else if( p.type == Param::UCHAR)
+        {
+            uchar val = (uchar)((int)n);
+            info->set(algo, pname.c_str(), p.type, &val, true);
+        }
        else
        {
            std::string msg = format("unknown/unsupported type of '%s' parameter == %d", pname.c_str(), p.type);
@ -508,6 +536,10 @@ union GetSetParam
    Mat (Algorithm::*get_mat)() const;
    std::vector<Mat> (Algorithm::*get_mat_vector)() const;
    Ptr<Algorithm> (Algorithm::*get_algo)() const;
+    float (Algorithm::*get_float)() const;
+    unsigned int (Algorithm::*get_uint)() const;
+    uint64 (Algorithm::*get_uint64)() const;
+    uchar (Algorithm::*get_uchar)() const;

    void (Algorithm::*set_int)(int);
    void (Algorithm::*set_bool)(bool);
@ -516,6 +548,10 @@ union GetSetParam
    void (Algorithm::*set_mat)(const Mat&);
    void (Algorithm::*set_mat_vector)(const std::vector<Mat>&);
    void (Algorithm::*set_algo)(const Ptr<Algorithm>&);
+    void (Algorithm::*set_float)(float);
+    void (Algorithm::*set_uint)(unsigned int);
+    void (Algorithm::*set_uint64)(uint64);
+    void (Algorithm::*set_uchar)(uchar);
 };

 static std::string getNameOfType(int argType);
@ -531,6 +567,10 @@ static std::string getNameOfType(int argType)
        case Param::MAT: return "cv::Mat";
        case Param::MAT_VECTOR: return "std::vector<cv::Mat>";
        case Param::ALGORITHM: return "algorithm";
+        case Param::FLOAT: return "float";
+        case Param::UNSIGNED_INT: return "unsigned int";
+        case Param::UINT64: return "unsigned int64";
+        case Param::UCHAR: return "unsigned char";
        default: CV_Error(CV_StsBadArg, "Wrong argument type");
    }
    return "";
@ -542,9 +582,10 @@ static std::string getErrorMessageForWrongArgumentInSetter(std::string algoName,
        + " method was called for the parameter '" + paramName + "' of the algorithm '" + algoName
        +"', the parameter has " + getNameOfType(paramType) + " type, ";

-    if (paramType == Param::INT || paramType == Param::BOOLEAN || paramType == Param::REAL)
+    if (paramType == Param::INT || paramType == Param::BOOLEAN || paramType == Param::REAL
+            || paramType == Param::FLOAT || paramType == Param::UNSIGNED_INT || paramType == Param::UINT64 || paramType == Param::UCHAR)
    {
-        message += "so it should be set by integer, boolean, or double value, ";
+        message += "so it should be set by integer, unsigned integer, uint64, unsigned char, boolean, float or double value, ";
    }
    message += "but the setter was called with " + getNameOfType(argType) + " value";

@ -559,11 +600,11 @@ static std::string getErrorMessageForWrongArgumentInGetter(std::string algoName,

    if (paramType == Param::BOOLEAN)
    {
-        message += "so it should be get as integer, boolean, or double value, ";
+        message += "so it should be get as integer, unsigned integer, uint64, boolean, unsigned char, float or double value, ";
    }
-    else if (paramType == Param::INT)
+    else if (paramType == Param::INT || paramType == Param::UNSIGNED_INT || paramType == Param::UINT64 || paramType == Param::UCHAR)
    {
-        message += "so it should be get as integer or double value, ";
+        message += "so it should be get as integer, unsigned integer, uint64, unsigned char, float or double value, ";
    }
    message += "but the getter was called to get a " + getNameOfType(argType) + " value";

@ -583,9 +624,11 @@ void AlgorithmInfo::set(Algorithm* algo, const char* parameter, int argType, con
    GetSetParam f;
    f.set_int = p->setter;

-    if( argType == Param::INT || argType == Param::BOOLEAN || argType == Param::REAL )
+    if( argType == Param::INT || argType == Param::BOOLEAN || argType == Param::REAL
+            || argType == Param::FLOAT || argType == Param::UNSIGNED_INT || argType == Param::UINT64 || argType == Param::UCHAR)
    {
-        if ( !( p->type == Param::INT || p->type == Param::REAL || p->type == Param::BOOLEAN) )
+        if ( !( p->type == Param::INT || p->type == Param::REAL || p->type == Param::BOOLEAN // the stored parameter itself must be one of the numeric kinds
+                || p->type == Param::UNSIGNED_INT || p->type == Param::UINT64 || p->type == Param::FLOAT || p->type == Param::UCHAR) ) // UCHAR is checked on p->type like the rest (argType was already checked by the enclosing if)
        {
            std::string message = getErrorMessageForWrongArgumentInSetter(algo->name(), parameter, p->type, argType);
            CV_Error(CV_StsBadArg, message);
@ -593,9 +636,21 @@ void AlgorithmInfo::set(Algorithm* algo, const char* parameter, int argType, con

        if( p->type == Param::INT )
        {
+            bool is_ok = true;
            int val = argType == Param::INT ? *(const int*)value :
            argType == Param::BOOLEAN ? (int)*(const bool*)value :
-                saturate_cast<int>(*(const double*)value);
+                argType == Param::REAL ? saturate_cast<int>(*(const double*)value) :
+                argType == Param::FLOAT ?  saturate_cast<int>(*(const float*)value) :
+                argType == Param::UNSIGNED_INT ? (int)*(const unsigned int*)value :
+                argType == Param::UINT64 ? (int)*(const uint64*)value :
+                argType == Param::UCHAR ? (int)*(const uchar*)value :
+                (int)(is_ok = false);
+
+            if (!is_ok)
+            {
+                CV_Error(CV_StsBadArg, "Wrong argument type in the setter");
+            }
+
            if( p->setter )
                (algo->*f.set_int)(val);
            else
@ -603,24 +658,133 @@ void AlgorithmInfo::set(Algorithm* algo, const char* parameter, int argType, con
        }
        else if( p->type == Param::BOOLEAN )
        {
+            bool is_ok = true;
            bool val = argType == Param::INT ? *(const int*)value != 0 :
                    argType == Param::BOOLEAN ? *(const bool*)value :
-                    *(const double*)value != 0;
+                    argType == Param::REAL ? (*(const double*)value != 0) :
+                    argType == Param::FLOAT ?  (*(const float*)value != 0) :
+                    argType == Param::UNSIGNED_INT ? (*(const unsigned int*)value != 0):
+                    argType == Param::UINT64 ? (*(const uint64*)value != 0):
+                    argType == Param::UCHAR ? (*(const uchar*)value != 0):
+                    (int)(is_ok = false);
+
+            if (!is_ok)
+            {
+                CV_Error(CV_StsBadArg, "Wrong argument type in the setter");
+            }
+
            if( p->setter )
                (algo->*f.set_bool)(val);
            else
                *(bool*)((uchar*)algo + p->offset) = val;
        }
-        else
+        else if( p->type == Param::REAL )
        {
+            bool is_ok = true;
            double val = argType == Param::INT ? (double)*(const int*)value :
                         argType == Param::BOOLEAN ? (double)*(const bool*)value :
-                        *(const double*)value;
+                         argType == Param::REAL ? (double)(*(const double*)value ) :
+                         argType == Param::FLOAT ?  (double)(*(const float*)value ) :
+                         argType == Param::UNSIGNED_INT ? (double)(*(const unsigned int*)value ) :
+                         argType == Param::UINT64 ? (double)(*(const uint64*)value ) :
+                         argType == Param::UCHAR ? (double)(*(const uchar*)value ) :
+                         (double)(is_ok = false);
+
+            if (!is_ok)
+            {
+                CV_Error(CV_StsBadArg, "Wrong argument type in the setter");
+            }
            if( p->setter )
                (algo->*f.set_double)(val);
            else
                *(double*)((uchar*)algo + p->offset) = val;
        }
+        else if( p->type == Param::FLOAT )
+        {
+            bool is_ok = true;
+            double val = argType == Param::INT ? (double)*(const int*)value :
+                         argType == Param::BOOLEAN ? (double)*(const bool*)value :
+                         argType == Param::REAL ? (double)(*(const double*)value ) :
+                         argType == Param::FLOAT ?  (double)(*(const float*)value ) :
+                         argType == Param::UNSIGNED_INT ? (double)(*(const unsigned int*)value ) :
+                         argType == Param::UINT64 ? (double)(*(const uint64*)value ) :
+                         argType == Param::UCHAR ? (double)(*(const uchar*)value ) :
+                         (double)(is_ok = false);
+
+            if (!is_ok)
+            {
+                CV_Error(CV_StsBadArg, "Wrong argument type in the setter");
+            }
+            if( p->setter )
+                (algo->*f.set_float)((float)val);
+            else
+                *(float*)((uchar*)algo + p->offset) = (float)val;
+        }
+        else if( p->type == Param::UNSIGNED_INT )
+        {
+            bool is_ok = true;
+            unsigned int val = argType == Param::INT ? (unsigned int)*(const int*)value :
+                         argType == Param::BOOLEAN ? (unsigned int)*(const bool*)value :
+                         argType == Param::REAL ? saturate_cast<unsigned int>(*(const double*)value ) :
+                         argType == Param::FLOAT ?  saturate_cast<unsigned int>(*(const float*)value ) :
+                         argType == Param::UNSIGNED_INT ? (unsigned int)(*(const unsigned int*)value ) :
+                         argType == Param::UINT64 ? (unsigned int)(*(const uint64*)value ) :
+                         argType == Param::UCHAR ? (unsigned int)(*(const uchar*)value ) :
+                         (int)(is_ok = false);
+
+            if (!is_ok)
+            {
+                CV_Error(CV_StsBadArg, "Wrong argument type in the setter");
+            }
+            if( p->setter )
+                (algo->*f.set_uint)(val);
+            else
+                *(unsigned int*)((uchar*)algo + p->offset) = val;
+        }
+        else if( p->type == Param::UINT64 )
+        {
+            bool is_ok = true;
+            uint64 val = argType == Param::INT ? (uint64)*(const int*)value :
+                         argType == Param::BOOLEAN ? (uint64)*(const bool*)value :
+                         argType == Param::REAL ? saturate_cast<uint64>(*(const double*)value ) :
+                         argType == Param::FLOAT ?  saturate_cast<uint64>(*(const float*)value ) :
+                         argType == Param::UNSIGNED_INT ? (uint64)(*(const unsigned int*)value ) :
+                         argType == Param::UINT64 ? (uint64)(*(const uint64*)value ) :
+                         argType == Param::UCHAR ? (uint64)(*(const uchar*)value ) :
+                         (int)(is_ok = false);
+
+            if (!is_ok)
+            {
+                CV_Error(CV_StsBadArg, "Wrong argument type in the setter");
+            }
+            if( p->setter )
+                (algo->*f.set_uint64)(val);
+            else
+                *(uint64*)((uchar*)algo + p->offset) = val;
+        }
+        else if( p->type == Param::UCHAR )
+        {
+            bool is_ok = true;
+            uchar val = argType == Param::INT ? (uchar)*(const int*)value :
+                         argType == Param::BOOLEAN ? (uchar)*(const bool*)value :
+                         argType == Param::REAL ? saturate_cast<uchar>(*(const double*)value ) :
+                         argType == Param::FLOAT ?  saturate_cast<uchar>(*(const float*)value ) :
+                         argType == Param::UNSIGNED_INT ? (uchar)(*(const unsigned int*)value ) :
+                         argType == Param::UINT64 ? (uchar)(*(const uint64*)value ) :
+                         argType == Param::UCHAR ? (uchar)(*(const uchar*)value ) :
+                         (int)(is_ok = false);
+
+            if (!is_ok)
+            {
+                CV_Error(CV_StsBadArg, "Wrong argument type in the setter");
+            }
+            if( p->setter )
+                (algo->*f.set_uchar)(val);
+            else
+                *(uchar*)((uchar*)algo + p->offset) = val;
+        }
+        else
+            CV_Error(CV_StsBadArg, "Wrong parameter type in the setter");
    }
    else if( argType == Param::STRING )
    {
@ -691,11 +855,12 @@ void AlgorithmInfo::get(const Algorithm* algo, const char* parameter, int argTyp
    GetSetParam f;
    f.get_int = p->getter;

-    if( argType == Param::INT || argType == Param::BOOLEAN || argType == Param::REAL )
+    if( argType == Param::INT || argType == Param::BOOLEAN || argType == Param::REAL
+            || argType == Param::FLOAT || argType == Param::UNSIGNED_INT || argType == Param::UINT64 || argType == Param::UCHAR)
    {
        if( p->type == Param::INT )
        {
-            if (!( argType == Param::INT || argType == Param::REAL ))
+            if (!( argType == Param::INT || argType == Param::REAL || argType == Param::FLOAT || argType == Param::UNSIGNED_INT || argType == Param::UINT64 || argType == Param::UCHAR))
            {
                std::string message = getErrorMessageForWrongArgumentInGetter(algo->name(), parameter, p->type, argType);
                CV_Error(CV_StsBadArg, message);
@ -703,13 +868,24 @@ void AlgorithmInfo::get(const Algorithm* algo, const char* parameter, int argTyp
            int val = p->getter ? (algo->*f.get_int)() : *(int*)((uchar*)algo + p->offset);

            if( argType == Param::INT )
-                *(int*)value = val;
+                *(int*)value = (int)val;
+            else if ( argType == Param::REAL )
+                *(double*)value = (double)val;
+            else if ( argType == Param::FLOAT)
+                *(float*)value = (float)val;
+            else if ( argType == Param::UNSIGNED_INT )
+                *(unsigned int*)value = (unsigned int)val;
+            else if ( argType == Param::UINT64 )
+                *(uint64*)value = (uint64)val;
+            else if ( argType == Param::UCHAR)
+                *(uchar*)value = (uchar)val;
            else
-                *(double*)value = val;
+                CV_Error(CV_StsBadArg, "Wrong argument type");
+
        }
        else if( p->type == Param::BOOLEAN )
        {
-            if (!( argType == Param::INT || argType == Param::BOOLEAN || argType == Param::REAL ))
+            if (!( argType == Param::INT || argType == Param::BOOLEAN || argType == Param::REAL || argType == Param::FLOAT || argType == Param::UNSIGNED_INT || argType == Param::UINT64 || argType == Param::UCHAR))
            {
                std::string message = getErrorMessageForWrongArgumentInGetter(algo->name(), parameter, p->type, argType);
                CV_Error(CV_StsBadArg, message);
@ -720,20 +896,126 @@ void AlgorithmInfo::get(const Algorithm* algo, const char* parameter, int argTyp
                *(int*)value = (int)val;
            else if( argType == Param::BOOLEAN )
                *(bool*)value = val;
-            else
+            else if ( argType == Param::REAL )
                *(double*)value = (int)val;
-        }
+            else if ( argType == Param::FLOAT)
+                *(float*)value = (float)((int)val);
+            else if ( argType == Param::UNSIGNED_INT )
+                *(unsigned int*)value = (unsigned int)val;
+            else if ( argType == Param::UINT64 )
+                *(uint64*)value = (int)val;
+            else if ( argType == Param::UCHAR)
+                *(uchar*)value = (uchar)val;
            else
+                CV_Error(CV_StsBadArg, "Wrong argument type");
+        }
+        else if( p->type == Param::REAL )
        {
-            if( argType != Param::REAL )
+            if(!( argType == Param::REAL || argType == Param::FLOAT))
            {
                std::string message = getErrorMessageForWrongArgumentInGetter(algo->name(), parameter, p->type, argType);
                CV_Error(CV_StsBadArg, message);
            }
            double val = p->getter ? (algo->*f.get_double)() : *(double*)((uchar*)algo + p->offset);

+            if ( argType == Param::REAL )
                *(double*)value = val;
+            else if ( argType == Param::FLOAT)
+                *(float*)value = (float)val;
+            else
+                CV_Error(CV_StsBadArg, "Wrong argument type");
        }
+        else if( p->type == Param::FLOAT )
+        {
+            if(!( argType == Param::REAL || argType == Param::FLOAT))
+            {
+                std::string message = getErrorMessageForWrongArgumentInGetter(algo->name(), parameter, p->type, argType);
+                CV_Error(CV_StsBadArg, message);
+            }
+            float val = p->getter ? (algo->*f.get_float)() : *(float*)((uchar*)algo + p->offset);
+
+            if ( argType == Param::REAL )
+                *(double*)value = (double)val;
+            else if ( argType == Param::FLOAT)
+                *(float*)value = (float)val;
+            else
+                CV_Error(CV_StsBadArg, "Wrong argument type");
+        }
+        else if( p->type == Param::UNSIGNED_INT )
+        {
+            if (!( argType == Param::INT || argType == Param::REAL || argType == Param::FLOAT || argType == Param::UNSIGNED_INT || argType == Param::UINT64 || argType == Param::UCHAR))
+            {
+                std::string message = getErrorMessageForWrongArgumentInGetter(algo->name(), parameter, p->type, argType);
+                CV_Error(CV_StsBadArg, message);
+            }
+            unsigned int val = p->getter ? (algo->*f.get_uint)() : *(unsigned int*)((uchar*)algo + p->offset);
+
+            if( argType == Param::INT )
+                *(int*)value = (int)val;
+            else if ( argType == Param::REAL )
+                *(double*)value = (double)val;
+            else if ( argType == Param::FLOAT)
+                *(float*)value = (float)val;
+            else if ( argType == Param::UNSIGNED_INT )
+                *(unsigned int*)value = (unsigned int)val;
+            else if ( argType == Param::UINT64 )
+                *(uint64*)value = (uint64)val;
+            else if ( argType == Param::UCHAR)
+                *(uchar*)value = (uchar)val;
+            else
+                CV_Error(CV_StsBadArg, "Wrong argument type");
+        }
+        else if( p->type == Param::UINT64 )
+        {
+            if (!( argType == Param::INT || argType == Param::REAL || argType == Param::FLOAT || argType == Param::UNSIGNED_INT || argType == Param::UINT64 || argType == Param::UCHAR))
+            {
+                std::string message = getErrorMessageForWrongArgumentInGetter(algo->name(), parameter, p->type, argType);
+                CV_Error(CV_StsBadArg, message);
+        }
+            uint64 val = p->getter ? (algo->*f.get_uint64)() : *(uint64*)((uchar*)algo + p->offset);
+
+            if( argType == Param::INT )
+                *(int*)value = (int)val;
+            else if ( argType == Param::REAL )
+                *(double*)value = (double)val;
+            else if ( argType == Param::FLOAT)
+                *(float*)value = (float)val;
+            else if ( argType == Param::UNSIGNED_INT )
+                *(unsigned int*)value = (unsigned int)val;
+            else if ( argType == Param::UINT64 )
+                *(uint64*)value = (uint64)val;
+            else if ( argType == Param::UCHAR)
+                *(uchar*)value = (uchar)val;
+        else
+                CV_Error(CV_StsBadArg, "Wrong argument type");
+        }
+        else if( p->type == Param::UCHAR )
+        {
+            if (!( argType == Param::INT || argType == Param::REAL || argType == Param::FLOAT || argType == Param::UNSIGNED_INT || argType == Param::UINT64 || argType == Param::UCHAR))
+            {
+                std::string message = getErrorMessageForWrongArgumentInGetter(algo->name(), parameter, p->type, argType);
+                CV_Error(CV_StsBadArg, message);
+            }
+            uchar val = p->getter ? (algo->*f.get_uchar)() : *(uchar*)((uchar*)algo + p->offset);
+
+            if( argType == Param::INT )
+                *(int*)value = val;
+            else if ( argType == Param::REAL )
+            *(double*)value = val;
+            else if ( argType == Param::FLOAT)
+                *(float*)value = val;
+            else if ( argType == Param::UNSIGNED_INT )
+                *(unsigned int*)value = val;
+            else if ( argType == Param::UINT64 )
+                *(uint64*)value = val;
+            else if ( argType == Param::UCHAR)
+                *(uchar*)value = val;
+            else
+                CV_Error(CV_StsBadArg, "Wrong argument type");
+
+        }
+        else
+            CV_Error(CV_StsBadArg, "Unknown/unsupported parameter type");
    }
    else if( argType == Param::STRING )
    {
@ -819,7 +1101,9 @@ void AlgorithmInfo::addParam_(Algorithm& algo, const char* parameter, int argTyp
    CV_Assert( argType == Param::INT || argType == Param::BOOLEAN ||
               argType == Param::REAL || argType == Param::STRING ||
               argType == Param::MAT || argType == Param::MAT_VECTOR ||
-               argType == Param::ALGORITHM );
+               argType == Param::ALGORITHM
+               || argType == Param::FLOAT || argType == Param::UNSIGNED_INT || argType == Param::UINT64
+               || argType == Param::UCHAR);
    data->params.add(std::string(parameter), Param(argType, readOnly,
                     (int)((size_t)value - (size_t)(void*)&algo),
                     getter, setter, help));
@ -896,6 +1180,46 @@ void AlgorithmInfo::addParam(Algorithm& algo, const char* parameter,
              (Algorithm::Getter)getter, (Algorithm::Setter)setter, help);
 }

+void AlgorithmInfo::addParam(Algorithm& algo, const char* parameter,
+                             float& value, bool readOnly,
+                             float (Algorithm::*getter)(),
+                             void (Algorithm::*setter)(float),
+                             const std::string& help)
+{
+    addParam_(algo, parameter, ParamType<float>::type, &value, readOnly,
+              (Algorithm::Getter)getter, (Algorithm::Setter)setter, help);
+}
+
+void AlgorithmInfo::addParam(Algorithm& algo, const char* parameter,
+                             unsigned int& value, bool readOnly,
+                             unsigned int (Algorithm::*getter)(),
+                             void (Algorithm::*setter)(unsigned int),
+                             const std::string& help)
+{
+    addParam_(algo, parameter, ParamType<unsigned int>::type, &value, readOnly,
+              (Algorithm::Getter)getter, (Algorithm::Setter)setter, help);
+}
+
+void AlgorithmInfo::addParam(Algorithm& algo, const char* parameter,
+                             uint64& value, bool readOnly,
+                             uint64 (Algorithm::*getter)(),
+                             void (Algorithm::*setter)(uint64),
+                             const std::string& help)
+{
+    addParam_(algo, parameter, ParamType<uint64>::type, &value, readOnly,
+              (Algorithm::Getter)getter, (Algorithm::Setter)setter, help);
+}
+
+void AlgorithmInfo::addParam(Algorithm& algo, const char* parameter,
+                             uchar& value, bool readOnly,
+                             uchar (Algorithm::*getter)(),
+                             void (Algorithm::*setter)(uchar),
+                             const std::string& help)
+{
+    addParam_(algo, parameter, ParamType<uchar>::type, &value, readOnly,
+              (Algorithm::Getter)getter, (Algorithm::Setter)setter, help);
+}
+
 }

 /* End of file. */
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@ -42,6 +42,10 @@

 #include "precomp.hpp"

+#if defined _M_IX86 && defined _MSC_VER && _MSC_VER < 1700
+#pragma float_control(precise, on)
+#endif
+
 namespace cv
 {

@ -1095,6 +1099,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
            if( type == CV_32FC1 )
            {
                double d = det3(Sf);
+
                if( d != 0. )
                {
                    double t[12];
--- a/modules/core/test/test_dxt.cpp
+++ b/modules/core/test/test_dxt.cpp
@ -419,7 +419,9 @@ static void fixCCS( Mat& mat, int cols, int flags )
    }
 }

-
+#if defined _MSC_VER &&  _MSC_VER >= 1700
+#pragma optimize("", off)
+#endif
 static void mulComplex( const Mat& src1, const Mat& src2, Mat& dst, int flags )
 {
    dst.create(src1.rows, src1.cols, src1.type());
@ -439,8 +441,8 @@ static void mulComplex( const Mat& src1, const Mat& src2, Mat& dst, int flags )
            if( !(flags & CV_DXT_MUL_CONJ) )
                for( j = 0; j < cols; j += 2 )
                {
-                    double re = (double)a[j]*b[j] - (double)a[j+1]*b[j+1];
-                    double im = (double)a[j+1]*b[j] + (double)a[j]*b[j+1];
+                    double re = (double)a[j]*(double)b[j] - (double)a[j+1]*(double)b[j+1];
+                    double im = (double)a[j+1]*(double)b[j] + (double)a[j]*(double)b[j+1];

                    c[j] = (float)re;
                    c[j+1] = (float)im;
@ -448,8 +450,8 @@ static void mulComplex( const Mat& src1, const Mat& src2, Mat& dst, int flags )
            else
                for( j = 0; j < cols; j += 2 )
                {
-                    double re = (double)a[j]*b[j] + (double)a[j+1]*b[j+1];
-                    double im = (double)a[j+1]*b[j] - (double)a[j]*b[j+1];
+                    double re = (double)a[j]*(double)b[j] + (double)a[j+1]*(double)b[j+1];
+                    double im = (double)a[j+1]*(double)b[j] - (double)a[j]*(double)b[j+1];

                    c[j] = (float)re;
                    c[j+1] = (float)im;
@ -482,6 +484,9 @@ static void mulComplex( const Mat& src1, const Mat& src2, Mat& dst, int flags )
        }
    }
 }
+#if defined _MSC_VER &&  _MSC_VER >= 1700
+#pragma optimize("", on)
+#endif

 }

--- a/modules/features2d/include/opencv2/features2d/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d/features2d.hpp
@ -659,6 +659,7 @@ protected:
  virtual void findBlobs(const Mat &image, const Mat &binaryImage, std::vector<Center> &centers) const;

  Params params;
+  AlgorithmInfo* info() const;
 };


--- a/modules/features2d/src/features2d_init.cpp
+++ b/modules/features2d/src/features2d_init.cpp
@ -125,6 +125,26 @@ CV_INIT_ALGORITHM(GFTTDetector, "Feature2D.GFTT",

 ///////////////////////////////////////////////////////////////////////////////////////////////////////////

+CV_INIT_ALGORITHM(SimpleBlobDetector, "Feature2D.SimpleBlob",
+                  obj.info()->addParam(obj, "thresholdStep",    obj.params.thresholdStep);
+                  obj.info()->addParam(obj, "minThreshold",     obj.params.minThreshold);
+                  obj.info()->addParam(obj, "maxThreshold",     obj.params.maxThreshold);
+                  obj.info()->addParam_(obj, "minRepeatability", (sizeof(size_t) == sizeof(uint64))?Param::UINT64 : Param::UNSIGNED_INT, &obj.params.minRepeatability, false, 0, 0);
+                  obj.info()->addParam(obj, "minDistBetweenBlobs", obj.params.minDistBetweenBlobs);
+                  obj.info()->addParam(obj, "filterByColor",    obj.params.filterByColor);
+                  obj.info()->addParam(obj, "blobColor",        obj.params.blobColor);
+                  obj.info()->addParam(obj, "filterByArea",     obj.params.filterByArea);
+                  obj.info()->addParam(obj, "maxArea",          obj.params.maxArea);
+                  obj.info()->addParam(obj, "filterByCircularity", obj.params.filterByCircularity);
+                  obj.info()->addParam(obj, "maxCircularity",   obj.params.maxCircularity);
+                  obj.info()->addParam(obj, "filterByInertia",  obj.params.filterByInertia);
+                  obj.info()->addParam(obj, "maxInertiaRatio",  obj.params.maxInertiaRatio);
+                  obj.info()->addParam(obj, "filterByConvexity", obj.params.filterByConvexity);
+                  obj.info()->addParam(obj, "maxConvexity",     obj.params.maxConvexity);
+                  );
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+
 class CV_EXPORTS HarrisDetector : public GFTTDetector
 {
 public:
--- a/modules/gpu/test/test_hough.cpp
+++ b/modules/gpu/test/test_hough.cpp
@ -130,7 +130,7 @@ GPU_TEST_P(HoughCircles, Accuracy)
    const bool useRoi = GET_PARAM(2);

    const float dp = 2.0f;
-    const float minDist = 10.0f;
+    const float minDist = 0.0f;
    const int minRadius = 10;
    const int maxRadius = 20;
    const int cannyThreshold = 100;
@ -163,7 +163,7 @@ GPU_TEST_P(HoughCircles, Accuracy)
        {
            cv::Vec3f gold = circles_gold[j];

-            if (std::fabs(cur[0] - gold[0]) < minDist && std::fabs(cur[1] - gold[1]) < minDist && std::fabs(cur[2] - gold[2]) < minDist)
+            if (std::fabs(cur[0] - gold[0]) < 5 && std::fabs(cur[1] - gold[1]) < 5 && std::fabs(cur[2] - gold[2]) < 5)
            {
                found = true;
                break;
--- a/modules/java/android_lib/lint.xml
+++ b/modules/java/android_lib/lint.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<lint>
+    <issue id="NewApi">
+        <ignore path="src\org\opencv\android\JavaCameraView.java" />
+    </issue>
+</lint>
--- a/modules/java/generator/src/java/android+CameraBridgeViewBase.java
+++ b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
@ -156,9 +156,21 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
        private CvCameraViewListener mOldStyleListener;
    };

+    /**
+     * This interface is an abstract representation of a single camera frame passed to the onCameraFrame callback.
+     * Attention: do not use objects that implement this interface outside of the onCameraFrame callback!
+     */
    public interface CvCameraViewFrame {
-        public abstract Mat rgba();
-        public abstract Mat gray();
+
+        /**
+         * This method returns RGBA Mat with frame
+         */
+        public Mat rgba();
+
+        /**
+         * This method returns single channel gray scale Mat with frame
+         */
+        public Mat gray();
    };

    public void surfaceChanged(SurfaceHolder arg0, int arg1, int arg2, int arg3) {
--- a/modules/java/generator/src/java/android+JavaCameraView.java
+++ b/modules/java/generator/src/java/android+JavaCameraView.java
@ -2,7 +2,6 @@ package org.opencv.android;

 import java.util.List;

-import android.annotation.TargetApi;
 import android.content.Context;
 import android.graphics.ImageFormat;
 import android.graphics.SurfaceTexture;
@ -11,7 +10,6 @@ import android.hardware.Camera.PreviewCallback;
 import android.os.Build;
 import android.util.AttributeSet;
 import android.util.Log;
-import android.view.SurfaceHolder;

 import org.opencv.core.CvType;
 import org.opencv.core.Mat;
@ -64,7 +62,6 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
        Log.d(TAG, "Java camera view ctor");
    }

-    @TargetApi(11)
    protected boolean initializeCamera(int width, int height) {
        Log.d(TAG, "Initialize java camera");
        boolean result = true;
@ -154,7 +151,6 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb

                    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.HONEYCOMB) {
                        mSurfaceTexture = new SurfaceTexture(MAGIC_TEXTURE_ID);
-                        getHolder().setType(SurfaceHolder.SURFACE_TYPE_PUSH_BUFFERS);
                        mCamera.setPreviewTexture(mSurfaceTexture);
                    } else
                       mCamera.setPreviewDisplay(null);
@ -234,7 +230,6 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
        releaseCamera();
    }

-    @TargetApi(Build.VERSION_CODES.FROYO)
    public void onPreviewFrame(byte[] frame, Camera arg1) {
        Log.i(TAG, "Preview Frame received. Need to create MAT and deliver it to clients");
        Log.i(TAG, "Frame size  is " + frame.length);
--- a/modules/ocl/doc/introduction.rst
+++ b/modules/ocl/doc/introduction.rst
@ -6,13 +6,23 @@ OpenCL Module Introduction
 General Information
 -------------------

-The OpenCV OCL module is a set of classes and functions to utilize OpenCL compatible device. In theroy, it supports any OpenCL 1.1 compatible device, but we only test it on AMD's, Intel's and NVIDIA's GPU at this stage. The compatibility, correctness and performance on CPU is not guaranteed. The OpenCV OCL module includes utility functions, low-level vision primitives, and high-level algorithms. The utility functions and low-level primitives provide a powerful infrastructure for developing fast vision algorithms taking advangtage of OCL whereas the high-level functionality includes some state-of-the-art algorithms(such as surf detector, face detector) ready to be used by the application developers.
+The OpenCV OCL module contains a set of classes and functions that implement and accelerate select OpenCV functionality on OpenCL compatible devices. OpenCL is a Khronos standard, implemented by a variety of devices (CPUs, GPUs, FPGAs, ARM), abstracting the exact hardware details, while enabling vendors to provide native implementation for maximal acceleration on their hardware. The standard enjoys wide industry support, and the end user of the module will enjoy the data parallelism benefits that the specific platform/hardware may be capable of, in a platform/hardware independent manner.
+
+While in the future we hope to validate (and enable) the OCL module on all OpenCL capable devices, we currently develop and test on GPU devices only. This includes both discrete GPUs (NVIDIA, AMD), as well as integrated chips (AMD APU and Intel HD devices). Performance of any particular algorithm will depend on the particular platform characteristics and capabilities. However, currently (as of 2.4.4), accuracy and mathematical correctness have been verified to be identical to that of the pure CPU implementation on all tested GPU devices and platforms (both Windows and Linux).
+
+
+The OpenCV OCL module includes utility functions, low-level vision primitives, and high-level algorithms. The utility functions and low-level primitives provide a powerful infrastructure for developing fast vision algorithms taking advantage of OCL, whereas the high-level functionality (samples) includes some state-of-the-art algorithms (including LK optical flow and face detection) ready to be used by application developers. The module is also accompanied by an extensive performance and accuracy test suite.
+
+The OpenCV OCL module is designed for ease of use and does not require any knowledge of OpenCL. At a minimum level, it can be viewed as a set of accelerators that can take advantage of the high compute throughput that GPU/APU devices can provide. However, it can also be viewed as a starting point to really integrate the built-in functionality with your own custom OpenCL kernels, with or without modifying the source of OpenCV-OCL. Of course, knowledge of OpenCL will certainly help; however, we hope that the OpenCV-OCL module, and the kernels it contains in source code, can be very useful as a means of actually learning OpenCL. Such knowledge would be necessary to further fine-tune any of the existing OpenCL kernels, or for extending the framework with new kernels. As of OpenCV 2.4.4, we introduce interoperability with OpenCL, enabling easy use of custom OpenCL kernels within the OpenCV framework.
+
+To use the OCL module, you need to make sure that you have the OpenCL SDK provided by your device vendor. To correctly run the OCL module, you need to have the OpenCL runtime provided by the device vendor, typically the device driver.
+
+To enable OCL support, configure OpenCV using CMake with WITH\_OPENCL=ON. When the flag is set and if OpenCL SDK is installed, the full-featured OpenCV OCL module is built. Otherwise, the module may not be built. If you have AMD's FFT and BLAS libraries, you can select them with WITH\_OPENCLAMDFFT=ON, WITH\_OPENCLAMDBLAS=ON.
+
+The ocl module can be found under the "modules" directory. In "modules/ocl/src" you can find the source code for the C++ classes that wrap the direct kernel invocation. The kernels themselves can be found in "modules/ocl/src/kernels". Samples can be found under "samples/ocl". Accuracy tests can be found in "modules/ocl/test", and performance tests under "modules/ocl/perf".

-The OpenCV OCL module is designed as a host-level API plus device-level kernels. The device-level kernels are collected as strings at OpenCV compile time and are compiled at runtime, so it need OpenCL runtime support. To correctly build the OpenCV OCL module, make sure you have OpenCL SDK provided your device vendor. To correctly run the OpenCV OCL module, make sure you have OpenCL runtime provided by your device vendor, which is device driver normally.

-The OpenCV OCL module is designed for ease of use and does not require any knowledge of OpenCL. Though, such a knowledge will certainly be useful to handle non-trivial cases or achieve the highest performance. It is helpful to understand the cost of various operations, what the OCL does, what the preferred data formats are, and so on. Since there is data transfer between OpenCL host and OpenCL device, for better performance it's recommended to copy data once to the OpenCL host memory (i.e. copy ``cv::Mat`` to ``cv::ocl::OclMat``), then call several ``cv::ocl`` functions and then copy the result back to CPU memory, rather than do forward and backward transfer for each OCL function.

-To enable OCL support, configure OpenCV using CMake with WIHT\_OPENCL=ON. When the flag is set and if OpenCL SDK is installed, the full-featured OpenCV OCL module is built. Otherwise, the module may be not built. If you have AMD'S FFT and BLAS library, you can select it with WITH\_OPENCLAMDFFT=ON, WIHT\_OPENCLAMDBLAS=ON.

 Right now, the user should define the cv::ocl::Info class in the application and call cv::ocl::getDevice before any cv::ocl::func. This operation initialize OpenCL runtime and set the first found device as computing device. If there are more than one device and you want to use undefault device, you can call cv::ocl::setDevice then.

@ -21,24 +31,28 @@ In the current version, all the thread share the same context and device so the
 Developer Notes
 -------------------

-This section descripe the design details of ocl module for who are interested in the detail of this module or want to contribute this module. User who isn't interested the details, can safely ignore it.
+In a heterogeneous device environment, there may be a cost associated with data transfer. This would be the case, for example, when data needs to be moved from host memory (accessible to the CPU) to device memory (accessible to a discrete GPU). In the case of integrated graphics chips, there may be performance issues relating to memory coherency between access from the GPU "part" of the integrated device and the CPU "part". For best performance, in either case, it is recommended that you do not introduce data transfers between the CPU and the discrete GPU, except at the beginning and the end of the algorithmic pipeline.
+
+Some tidbits:

 1. OpenCL version should be larger than 1.1 with FULL PROFILE.

-2. There's only one OpenCL context and commandqueue and generated as a singleton. So now it only support one device with single commandqueue.
+2. Currently (2.4.4) the user should call cv::ocl::getDevice before any other function in the ocl module. This will initialize the OpenCL runtime and set the first found device as the computing device. If there is more than one device and you want to use a non-default device, you can call cv::ocl::setDevice thereafter.

-3. All the functions use 256 as its workgroup size if possible, so the max work group size of the device must larger than 256.
+2. Currently there's only one OpenCL context and command queue. We hope to implement multi device and multi queue support in the future.

-4. If the device support double, we will use double in kernel if OpenCV cpu version use double, otherwise, we use float instead.
+3. Many kernels use 256 as their workgroup size if possible, so the max work group size of the device must be 256 or larger. All GPU devices we are aware of indeed support 256 workitems in a workgroup, however non-GPU devices may not. This will be improved in the future.

-5. The oclMat use buffer object, not image object.
+4. If the device does not support double arithmetic, we revert to float.

-6. All the 3-channel matrix(i.e. RGB image) are represented by 4-channel matrix in oclMat. It means 3-channel image have 4-channel space with the last channel unused. We provide a transparent interface to handle the difference between OpenCV Mat and oclMat.
+5. The oclMat uses buffer object, not image object.
+
+6. All the 3-channel matrices(i.e. RGB image) are represented by 4-channel matrices in oclMat, with the last channel unused. We provide a transparent interface to handle the difference between OpenCV Mat and oclMat.

 7. All the matrix in oclMat is aligned in column(now the alignment factor is 32 byte). It means, if a matrix is n columns m rows with the element size x byte, we will assign ALIGNMENT(x*n) bytes for each column with the last ALIGNMENT(x*n) - x*n bytes unused, so there's small holes after each column if its size is not the multiply of ALIGN.

-8. Data transfer between Mat and oclMat. If the CPU matrix is aligned in column, we will use faster API to transfer between Mat and oclMat, otherwise, we will use clEnqueueRead/WriteBufferRect to transfer data to guarantee the alignment. 3-channel matrix is an exception, it's directly transferred to a temp buffer and then padded to 4-channel matrix(also aligned) when uploading and do the reverse operation when downloading.
+8. Data transfer between Mat and oclMat: If the CPU matrix is aligned in column, we will use faster API to transfer between Mat and oclMat, otherwise, we will use clEnqueueRead/WriteBufferRect to transfer data to guarantee the alignment. 3-channel matrix is an exception, it's directly transferred to a temp buffer and then padded to 4-channel matrix(also aligned) when uploading and do the reverse operation when downloading.

-9. Data transfer between Mat and oclMat. ROI is a feature of OpenCV, which allow users process a sub rectangle of a matrix. When a CPU matrix which has ROI will be transfered to GPU, the whole matrix will be transfered and set ROI as CPU's. In a word, we always transfer the whole matrix despite whether it has ROI or not.
+9. Data transfer between Mat and oclMat: ROI is a feature of OpenCV, which allows users to process a sub rectangle of a matrix. When a CPU matrix which has ROI is transferred to GPU, the whole matrix will be transferred and the ROI set to match the CPU's. In a word, we always transfer the whole matrix regardless of whether it has ROI or not.

 10. All the kernel file should locate in ocl/src/kernels/ with the extension ".cl". ALL the kernel files are transformed to pure characters at compilation time in kernels.cpp, and the file name without extension is the name of the characters.
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@ -65,6 +65,32 @@ namespace cv
            //CVCL_DEVICE_TYPE_CUSTOM      = (1 << 4)
            CVCL_DEVICE_TYPE_ALL         = 0xFFFFFFFF
        };
+
+        enum DevMemRW
+        {
+            DEVICE_MEM_R_W = 0,
+            DEVICE_MEM_R_ONLY,
+            DEVICE_MEM_W_ONLY
+        };
+
+        enum DevMemType
+        {
+            DEVICE_MEM_DEFAULT = 0,
+            DEVICE_MEM_AHP,         //alloc host pointer
+            DEVICE_MEM_UHP,         //use host pointer
+            DEVICE_MEM_CHP,         //copy host pointer
+            DEVICE_MEM_PM           //persistent memory
+        };
+
+        //Get the global device memory and read/write type
+        //return 1 if unified memory system supported, otherwise return 0
+        CV_EXPORTS int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type);
+
+        //Set the global device memory and read/write type,
+        //the newly generated oclMat will all use this type
+        //return -1 if the target type is unsupported, otherwise return 0
+        CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT);
+
        //this class contains ocl runtime information
        class CV_EXPORTS Info
        {
@ -227,6 +253,11 @@ namespace cv
            // previous data is unreferenced if needed.
            void create(int rows, int cols, int type);
            void create(Size size, int type);
+
+            //! allocates new oclMatrix with specified device memory type.
+            void createEx(int rows, int cols, int type, DevMemRW rw_type, DevMemType mem_type);
+            void createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type);
+
            //! decreases reference counter;
            // deallocate the data when reference counter reaches 0.
            void release();
@ -1788,6 +1819,8 @@ namespace cv
                                          const oclMat &bu, const oclMat &bv,
                                          float pos, oclMat &newFrame, oclMat &buf);

+        //! computes moments of the rasterized shape or a vector of points
+        CV_EXPORTS Moments ocl_moments(InputArray _array, bool binaryImage);
    }
 }
 #if defined _MSC_VER && _MSC_VER >= 1200
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
@ -4317,11 +4317,11 @@ INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(
                            Values(false))); // Values(false) is the reserved parameter

 INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(
-                            Values(CV_32FC1, CV_64FC1),
+                            Values(CV_32FC1, CV_32FC1),
                            Values(false))); // Values(false) is the reserved parameter

 INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(
-                            Values(CV_32FC1, CV_64FC1),
+                            Values(CV_32FC1, CV_32FC1),
                            Values(false))); // Values(false) is the reserved parameter

 INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
--- a/modules/ocl/src/fft.cpp
+++ b/modules/ocl/src/fft.cpp
@ -53,6 +53,10 @@ void cv::ocl::dft(const oclMat&, oclMat&, Size, int)
 {
    CV_Error(CV_StsNotImplemented, "OpenCL DFT is not implemented");
 }
+namespace cv { namespace ocl {
+    // declared here because this teardown hook is called from initialization.cpp
+    void fft_teardown();
+}}
+// empty stub for the branch in which cv::ocl::dft is reported as not implemented
+void cv::ocl::fft_teardown(){}
 #else
 #include "clAmdFft.h"
 namespace cv
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@ -55,13 +55,16 @@ using namespace cv::ocl;

 //#define PRINT_KERNEL_RUN_TIME
 #define RUN_TIMES 100
-
+#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
+#define CL_MEM_USE_PERSISTENT_MEM_AMD 0
+#endif
 //#define AMD_DOUBLE_DIFFER

 namespace cv
 {
    namespace ocl
    {
+        extern void fft_teardown();
        /*
         * The binary caching system to eliminate redundant program source compilation.
         * Strictly, this is not a cache because we do not implement evictions right now.
@ -69,6 +72,15 @@ namespace cv
         */
        std::auto_ptr<ProgramCache> ProgramCache::programCache;
        ProgramCache *programCache = NULL;
+        // globally selected memory type / access mode; read by getDevMemType(),
+        // changed by setDevMemType() and passed on by openCLMallocPitch()
+        DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
+        DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
+        // OpenCL memory flag for each DevMemType enumerator, indexed by the enum value
+        int gDevMemTypeValueMap[5] = {0,
+                                      CL_MEM_ALLOC_HOST_PTR,
+                                      CL_MEM_USE_HOST_PTR,
+                                      CL_MEM_COPY_HOST_PTR,
+                                      CL_MEM_USE_PERSISTENT_MEM_AMD};
+        // OpenCL memory flag for each DevMemRW enumerator, indexed by the enum value
+        int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
+
        ProgramCache::ProgramCache()
        {
            codeCache.clear();
@ -110,29 +122,24 @@ namespace cv
        }

        ////////////////////////Common OpenCL specific calls///////////////
-        //Info::Info()
-        //{
-        //	oclplatform = 0;
-        //	oclcontext = 0;
-        //	devnum = 0;
-        //}
-        //Info::~Info()
-        //{
-        //	release();
-        //}
-        //void Info::release()
-        //{
-        //	if(oclplatform)
-        //	{
-        //		oclplatform = 0;
-        //	}
-        //	if(oclcontext)
-        //	{
-        //		openCLSafeCall(clReleaseContext(oclcontext));
-        //	}
-        //	devices.empty();
-        //	devName.empty();
-        //}
+        // Copies the globally selected access mode and memory type into the
+        // out-parameters and returns the unified_memory flag of the current context.
+        int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
+        {
+            mem_type = gDeviceMemType;
+            rw_type = gDeviceMemRW;
+
+            Context *currentCtx = Context::getContext();
+            return currentCtx->impl->unified_memory;
+        }
+
+        // Selects the access mode and memory type stored in the global settings.
+        // Returns -1 and leaves the current settings untouched when the request is
+        // not supported:
+        //  - rw_type or mem_type is outside the declared enumerators (the values are
+        //    later used as indices into gDevMemRWValueMap / gDevMemTypeValueMap),
+        //  - DEVICE_MEM_UHP or DEVICE_MEM_CHP is requested,
+        //  - DEVICE_MEM_PM is requested while the current context does not report
+        //    unified memory.
+        // Returns 0 on success.
+        int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
+        {
+            // reject values that would index past the end of the flag tables
+            if( rw_type < DEVICE_MEM_R_W || rw_type > DEVICE_MEM_W_ONLY ||
+                mem_type < DEVICE_MEM_DEFAULT || mem_type > DEVICE_MEM_PM )
+                return -1;
+            if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
+                 mem_type == DEVICE_MEM_UHP ||
+                 mem_type == DEVICE_MEM_CHP )
+                return -1;
+            gDeviceMemRW = rw_type;
+            gDeviceMemType = mem_type;
+            return 0;
+        }
+
       struct Info::Impl
        {
            cl_platform_id oclplatform;
@ -287,11 +294,8 @@ namespace cv
         }

        void *getoclContext()
-
        {
-
            return &(Context::getContext()->impl->clContext);
-
        }

        void *getoclCommandQueue()
@ -316,10 +320,16 @@ namespace cv

        void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
                               size_t widthInBytes, size_t height)
+        {
+            openCLMallocPitchEx(clCxt, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
+        }
+
+        void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
+                               size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
        {
            cl_int status;

-            *dev_ptr = clCreateBuffer(clCxt->impl->clContext, CL_MEM_READ_WRITE,
+            *dev_ptr = clCreateBuffer(clCxt->impl->clContext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
                                      widthInBytes * height, 0, &status);
            openCLVerifyCall(status);
            *pitch = widthInBytes;
@ -834,6 +844,11 @@ namespace cv
            clcxt->impl->double_support = oclinfo.impl->double_support;
            //extra options to recognize compiler options
            memcpy(clcxt->impl->extra_options, oclinfo.impl->extra_options, 512);
+            cl_bool unfymem = false;
+            openCLSafeCall(clGetDeviceInfo(clcxt->impl->devices, CL_DEVICE_HOST_UNIFIED_MEMORY,
+                                           sizeof(cl_bool), (void *)&unfymem, NULL));
+            if(unfymem)
+                clcxt->impl->unified_memory = 1;
        }
        Context::Context()
        {
@ -850,6 +865,7 @@ namespace cv
            impl->double_support = 0;
            //extra options to recognize vendor specific fp64 extensions
            memset(impl->extra_options, 0, 512);
+            impl->unified_memory = 0;
            programCache = ProgramCache::getProgramCache();
        }

@ -874,6 +890,7 @@ namespace cv
        }
        void Info::release()
        {
+            fft_teardown();
            if(impl->oclplatform)
            {
                impl->oclplatform = 0;
--- a/modules/ocl/src/interpolate_frames.cpp
+++ b/modules/ocl/src/interpolate_frames.cpp
@ -45,6 +45,7 @@

 #include <iomanip>
 #include "precomp.hpp"
+#include "mcwutil.hpp"

 using namespace cv;
 using namespace cv::ocl;
@ -230,73 +231,10 @@ void interpolate::blendFrames(const oclMat &frame0, const oclMat &/*frame1*/, co

 void interpolate::bindImgTex(const oclMat &img, cl_mem &texture)
 {
-    cl_image_format format;
-    int err;
-    int depth    = img.depth();
-    int channels = img.channels();
-
-    switch(depth)
-    {
-    case CV_8U:
-        format.image_channel_data_type = CL_UNSIGNED_INT8;
-        break;
-    case CV_32S:
-        format.image_channel_data_type = CL_UNSIGNED_INT32;
-        break;
-    case CV_32F:
-        format.image_channel_data_type = CL_FLOAT;
-        break;
-    default:
-        throw std::exception();
-        break;
-    }
-    switch(channels)
-    {
-    case 1:
-        format.image_channel_order     = CL_R;
-        break;
-    case 3:
-        format.image_channel_order     = CL_RGB;
-        break;
-    case 4:
-        format.image_channel_order     = CL_RGBA;
-        break;
-    default:
-        throw std::exception();
-        break;
-    }
    if(texture)
    {
        openCLFree(texture);
    }
-
-#ifdef CL_VERSION_1_2
-    cl_image_desc desc;
-    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
-    desc.image_width      = img.step / img.elemSize();
-    desc.image_height     = img.rows;
-    desc.image_depth      = 0;
-    desc.image_array_size = 1;
-    desc.image_row_pitch  = 0;
-    desc.image_slice_pitch = 0;
-    desc.buffer           = NULL;
-    desc.num_mip_levels   = 0;
-    desc.num_samples      = 0;
-    texture = clCreateImage(Context::getContext()->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
-#else
-    texture = clCreateImage2D(
-                  Context::getContext()->impl->clContext,
-                  CL_MEM_READ_WRITE,
-                  &format,
-                  img.step / img.elemSize(),
-                  img.rows,
-                  0,
-                  NULL,
-                  &err);
-#endif
-    size_t origin[] = { 0, 0, 0 };
-    size_t region[] = { img.step / img.elemSize(), img.rows, 1 };
-    clEnqueueCopyBufferToImage(img.clCxt->impl->clCmdQueue, (cl_mem)img.data, texture, 0, origin, region, 0, NULL, 0);
-    openCLSafeCall(err);
+    texture = bindTexture(img);
 }

--- a/modules/ocl/src/kernels/cvt_color.cl
+++ b/modules/ocl/src/kernels/cvt_color.cl
@ -203,8 +203,8 @@ __kernel void YUV2RGB(int cols,int rows,int src_step,int dst_step,int channels,

 __constant int ITUR_BT_601_CY = 1220542;
 __constant int ITUR_BT_601_CUB = 2116026;
-__constant int ITUR_BT_601_CUG = -409993;
-__constant int ITUR_BT_601_CVG = -852492;
+__constant int ITUR_BT_601_CUG = 409993;
+__constant int ITUR_BT_601_CVG = 852492;
 __constant int ITUR_BT_601_CVR = 1673527;
 __constant int ITUR_BT_601_SHIFT = 20;

@ -229,7 +229,7 @@ __kernel void YUV2RGBA_NV12(int cols,int rows,int src_step,int dst_step,
        int V  = usrc[1] - 128;

        int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V;
-        int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * V + ITUR_BT_601_CUG * U;
+        int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
        int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U;

        Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;
--- a/modules/ocl/src/kernels/moments.cl
+++ b/modules/ocl/src/kernels/moments.cl
@ -0,0 +1,938 @@
+// Precision selection for the kernels in this file: with DOUBLE_SUPPORT the
+// cl_khr_fp64 extension is enabled; otherwise the names double, double4 and
+// convert_double4 are redirected to their float counterparts.
+// NOTE(review): `double` is a reserved type name in OpenCL C, so the typedefs
+// in the #else branch may be rejected by some compilers - confirm on the
+// targeted drivers.
+#if defined (DOUBLE_SUPPORT)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#else
+typedef float double;
+typedef float4 double4;
+#define convert_double4 convert_float4
+#endif
+//#pragma OPENCL EXTENSION cl_amd_printf:enable
+//#if defined (DOUBLE_SUPPORT)
+// For contour vertex idx, pairs it with the following vertex (the last vertex
+// wraps around to vertex 0) and stores the ten per-edge terms a00..a03 at
+// position idx of the dst_* buffers. No summation over edges is done here.
+//   contour_total       number of vertices in the contour
+//   reader_oclmat_data  vertex coordinates read as interleaved (x, y) floats
+//   dst_a00 .. dst_a03  one output slot per vertex for each term
+__kernel void icvContourMoments(int contour_total,
+                                __global float* reader_oclmat_data,
+                                __global double* dst_a00,
+                                __global double* dst_a10,
+                                __global double* dst_a01,
+                                __global double* dst_a20,
+                                __global double* dst_a11,
+                                __global double* dst_a02,
+                                __global double* dst_a30,
+                                __global double* dst_a21,
+                                __global double* dst_a12,
+                                __global double* dst_a03)
+{
+    int idx = get_global_id(0);
+    // vertex that ends the edge starting at idx; the contour is closed
+    int nxt = (idx == contour_total - 1) ? 0 : idx + 1;
+
+    // edge start point and its squared coordinates
+    double x0 = reader_oclmat_data[2 * idx];
+    double y0 = reader_oclmat_data[2 * idx + 1];
+    double x0sq = x0 * x0;
+    double y0sq = y0 * y0;
+
+    // edge end point and its squared coordinates
+    double x1 = reader_oclmat_data[2 * nxt];
+    double y1 = reader_oclmat_data[2 * nxt + 1];
+    double x1sq = x1 * x1;
+    double y1sq = y1 * y1;
+
+    // cross product of the two points and the coordinate sums
+    double cross = x0 * y1 - x1 * y0;
+    double xsum = x0 + x1;
+    double ysum = y0 + y1;
+
+    dst_a00[idx] = cross;
+    dst_a10[idx] = cross * xsum;
+    dst_a01[idx] = cross * ysum;
+    dst_a20[idx] = cross * (x0 * xsum + x1sq);
+    dst_a11[idx] = cross * (x0 * (ysum + y0) + x1 * (ysum + y1));
+    dst_a02[idx] = cross * (y0 * ysum + y1sq);
+    dst_a30[idx] = cross * xsum * (x0sq + x1sq);
+    dst_a03[idx] = cross * ysum * (y0sq + y1sq);
+    dst_a21[idx] = cross * (x0sq * (3 * y0 + y1) + 2 * x1 * x0 * ysum +
+                            x1sq * (y0 + 3 * y1));
+    dst_a12[idx] = cross * (y0sq * (3 * x0 + x1) + 2 * y1 * y0 * xsum +
+                            y1sq * (x0 + 3 * x1));
+}
+//#endif
+
+//#if defined (DOUBLE_SUPPORT)
+// Computes the ten raw moments (m00 .. m03) of one TILE_SIZE x TILE_SIZE tile of
+// an 8-bit source and stores them at index wgidy*dst_cols+wgidx of the dst_m*
+// buffers. The work-group ids select the tile; each work-item (local id lidy)
+// accumulates one row of the tile, and the rows are then combined with a
+// tree reduction through local memory.
+//   src_data, src_step   source pixels; rows are addressed as src_step uchar elements apart
+//   src_rows, src_cols   used to clip the tile at the image border
+//   tileSize_width/height  overwritten below with the clipped tile size
+//   cn, coi              when coi > 0 only channel coi-1 is read (pixel stride 2 for cn==2, else 4)
+//   binary               non-zero pixels are treated as 255 and the result is scaled by 1/255
+//   dst_step, type, depth  not used by this kernel
+// Side effect: when the tile is narrower than TILE_SIZE, the source row is
+// zero-filled from the end of the tile up to min(src_step, TILE_SIZE).
+// The fixed array sizes (tmp[16] of uchar16, m[10][128]) assume TILE_SIZE <= 256;
+// the reduction assumes TILE_SIZE is a power of two - TODO confirm against the host code.
+// NOTE(review): the row sums are kept in 32-bit ints; verify that the third-order
+// terms cannot overflow for the TILE_SIZE used by the host.
+__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                           __global double* dst_m00,
+                           __global double* dst_m10,
+                           __global double* dst_m01,
+                           __global double* dst_m20,
+                           __global double* dst_m11,
+                           __global double* dst_m02,
+                           __global double* dst_m30,
+                           __global double* dst_m21,
+                           __global double* dst_m12,
+                           __global double* dst_m03,
+                           int dst_cols, int dst_step, int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
+{
+    uchar tmp_coi[16]; // get the coi data
+    uchar16 tmp[16];   // one tile row, 16 pixels per vector
+    int VLEN_C = 16;  // vector length of uchar
+
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // Y index of the first row of this tile
+    int x = wgidx*TILE_SIZE;  // X index of the first column of this tile
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE, src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    // zero the source pixels behind a clipped tile so the vector loads below add nothing
+    if( tileSize_width < TILE_SIZE )
+        for(int i = tileSize_width; i < rstep; i++ )
+            *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
+    if( coi > 0 )   //channel of interest
+        for(int i = 0; i < tileSize_width; i += VLEN_C)
+        {
+            for(int j=0; j<VLEN_C; j++)
+                tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
+                                      tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_C)
+            tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
+    uchar16 zero = (uchar16)(0);
+    uchar16 full = (uchar16)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_C)
+            tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
+    double mom[10];
+    __local int m[10][128];
+    if(lidy == 0)
+        for(int i=0; i<10; i++)
+            for(int j=0; j<128; j++)
+                m[i][j]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int lm[10] = {0};
+    int16 x0 = (int16)(0);
+    int16 x1 = (int16)(0);
+    int16 x2 = (int16)(0);
+    int16 x3 = (int16)(0);
+    // per-lane sums of p, x*p, x^2*p and x^3*p over this row of the tile
+    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) )
+    {
+        int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
+        int16 p = convert_int16(tmp[xt/VLEN_C]);
+        int16 xp = v_xt * p, xxp = xp *v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp * v_xt;
+    }
+    // horizontal add of the 16 lanes into lane 0
+    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;
+    int py = lidy * ((int)x0.s0);
+    int sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    // rows of the upper half park their sums in local memory, rows of the lower
+    // half keep them in the private array lm
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((int)py) * sy;  // m03
+        m[8][lidy-bheight] = ((int)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((int)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((int)py) * sy;  // m03
+        lm[8] = ((int)x1.s0) * sy;  // m12
+        lm[7] = ((int)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Tree reduction over the rows. The loop starts at TILE_SIZE/2 (as in
+    // CvMoments_D2 / CvMoments_D3) rather than at bheight: starting at a
+    // clipped, non-power-of-two bheight halves unevenly and drops rows
+    // (e.g. bheight == 3 never folds row 2 into work-item 0). Rows outside the
+    // tile contribute zeros because lm and m are zero-initialised.
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+        if(binary)
+        {
+            double s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+        double xm = x * mom[0], ym = y * mom[0];
+
+        // shift the tile-local moments to image coordinates and store them
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx]  = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx]  = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx]  = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]  = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx]  = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+// Adds up the first block_num entries of each of the ten dst_m* buffers and
+// writes the ten totals to sum[0..9]. block_num is derived from the image and
+// tile sizes below.
+// The local buffer has 128 slots per moment, so block_num must not exceed 128.
+// NOTE(review): local memory is indexed with the global id, which looks like
+// it expects a single work-group large enough to fill all 128 slots - confirm
+// against the host launch code.
+__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, __global double* sum, __global double* dst_m00,
+                      __global double* dst_m10,
+                      __global double* dst_m01,
+                      __global double* dst_m20,
+                      __global double* dst_m11,
+                      __global double* dst_m02,
+                      __global double* dst_m30,
+                      __global double* dst_m21,
+                      __global double* dst_m12,
+                      __global double* dst_m03)
+{
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int block_y = src_rows/tile_height;
+    int block_x = src_cols/tile_width;
+    int block_num;
+
+    // count the extra, partially filled tile at the bottom / right border
+    if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
+        block_y ++;
+    if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
+        block_x ++;
+    block_num = block_y * block_x;
+    __local double dst_sum[10][128];
+    // zero the slots behind the last used entry so the reduction can always run over 128 slots
+    if(gidy<128-block_num)
+        for(int i=0; i<10; i++)
+            dst_sum[i][gidy+block_num]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // load one entry per work-item into local memory
+    if(gidy<block_num)
+    {
+        dst_sum[0][gidy] = dst_m00[gidy];
+        dst_sum[1][gidy] = dst_m10[gidy];
+        dst_sum[2][gidy] = dst_m01[gidy];
+        dst_sum[3][gidy] = dst_m20[gidy];
+        dst_sum[4][gidy] = dst_m11[gidy];
+        dst_sum[5][gidy] = dst_m02[gidy];
+        dst_sum[6][gidy] = dst_m30[gidy];
+        dst_sum[7][gidy] = dst_m21[gidy];
+        dst_sum[8][gidy] = dst_m12[gidy];
+        dst_sum[9][gidy] = dst_m03[gidy];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // pairwise reduction: 128 -> 64 -> ... -> 1 slots
+    for(int lsize=64; lsize>0; lsize>>=1)
+    {
+        if(gidy<lsize)
+        {
+            int lsize2 = gidy + lsize;
+            for(int i=0; i<10; i++)
+                dst_sum[i][gidy] += dst_sum[i][lsize2];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    // slot 0 now holds the total of each moment
+    if(gidy==0)
+        for(int i=0; i<10; i++)
+            sum[i] = dst_sum[i][0];
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+// Computes the ten raw moments (m00 .. m03) of one TILE_SIZE x TILE_SIZE tile of
+// a source read as unsigned 16-bit pixels and stores them at index
+// wgidy*dst_cols+wgidx of the dst_m* buffers. The work-group ids select the
+// tile; each work-item (local id lidy) accumulates one row of the tile, and the
+// rows are then combined with a tree reduction through local memory.
+//   src_data, src_step   source pixels; rows are addressed as src_step/2 ushort elements apart
+//   src_rows, src_cols   used to clip the tile at the image border
+//   tileSize_width/height  overwritten below with the clipped tile size
+//   cn, coi              when coi > 0 only channel coi-1 is read (pixel stride 2 for cn==2, else 4)
+//   binary               non-zero pixels are treated as 255 and the result is scaled by 1/255
+//   dst_step, type, depth  not used by this kernel
+// Side effect: for a clipped tile (and src_cols > TILE_SIZE) the source row is
+// zero-filled from the end of the tile up to min(src_step/2, TILE_SIZE).
+// The fixed array sizes (tmp[32] of ushort8, m[10][128]) assume TILE_SIZE <= 256.
+__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                           __global double* dst_m00,
+                           __global double* dst_m10,
+                           __global double* dst_m01,
+                           __global double* dst_m20,
+                           __global double* dst_m11,
+                           __global double* dst_m02,
+                           __global double* dst_m30,
+                           __global double* dst_m21,
+                           __global double* dst_m12,
+                           __global double* dst_m03,
+                           int dst_cols, int dst_step,
+                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    ushort tmp_coi[8]; // get the coi data
+    ushort8 tmp[32];   // one tile row, 8 pixels per vector
+    int VLEN_US = 8; // vector length of ushort
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/2, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE, src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols -x);
+    // zero the source pixels behind a clipped tile so the vector loads below add nothing
+    if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
+        for(int i=tileSize_width; i < rstep; i++ )
+            *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+    if( coi > 0 )   // channel of interest: gather it pixel by pixel
+        for(int i=0; i < tileSize_width; i+=VLEN_US)
+        {
+            for(int j=0; j<VLEN_US; j++)
+                tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_US)
+            tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
+    ushort8 zero = (ushort8)(0);
+    ushort8 full = (ushort8)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_US)
+            tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
+    double mom[10];
+    __local long m[10][128];
+    if(lidy == 0)
+        for(int i=0; i<10; i++)
+            for(int j=0; j<128; j++)
+                m[i][j]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    long lm[10] = {0};
+    int8 x0 = (int8)(0);
+    int8 x1 = (int8)(0);
+    int8 x2 = (int8)(0);
+    long8 x3 = (long8)(0);
+    // per-lane sums of p, x*p, x^2*p and x^3*p over this row of the tile
+    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) )
+    {
+        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
+        int8 p = convert_int8(tmp[xt/VLEN_US]);
+        int8 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += convert_long8(xxp) *convert_long8(v_xt);
+    }
+    // horizontal add of the 8 lanes into lane 0
+    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
+
+    int py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    // rows of the upper half park their sums in local memory, rows of the lower
+    // half keep them in the private array lm
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((long)py) * sy;  // m03
+        m[8][lidy-bheight] = ((long)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((long)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((long)py) * sy;  // m03
+        lm[8] = ((long)x1.s0) * sy;  // m12
+        lm[7] = ((long)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // tree reduction over the rows: each pass folds the local-memory values into
+    // lm, then the upper half of the remaining rows hands its lm to the lower half
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    // work-item (0, 0) of the group holds the tile totals and writes them out
+    if(lidy == 0&&lidx == 0)
+    {
+        for(int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+
+        if(binary)
+        {
+            double s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        double xm = x  *mom[0], ym = y * mom[0];
+
+        // shift the tile-local moments to image coordinates and store them
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx]  = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx]  = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx]  = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]  = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx]  = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                           __global double* dst_m00,
+                           __global double* dst_m10,
+                           __global double* dst_m01,
+                           __global double* dst_m20,
+                           __global double* dst_m11,
+                           __global double* dst_m02,
+                           __global double* dst_m30,
+                           __global double* dst_m21,
+                           __global double* dst_m12,
+                           __global double* dst_m03,
+                           int dst_cols, int dst_step,
+                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    short tmp_coi[8]; // get the coi data
+    short8 tmp[32];
+    int VLEN_S =8; // vector length of short
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/2, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE, src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols -x);
+    if(tileSize_width < TILE_SIZE)
+        for(int i = tileSize_width; i < rstep; i++ )
+            *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+    if( coi > 0 )
+        for(int i=0; i < tileSize_width; i+=VLEN_S)
+        {
+            for(int j=0; j<VLEN_S; j++)
+                tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_S)
+            tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
+    short8 zero = (short8)(0);
+    short8 full = (short8)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=(VLEN_S))
+            tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
+
+    double mom[10];
+    __local long m[10][128];
+    if(lidy == 0)
+        for(int i=0; i<10; i++)
+            for(int j=0; j<128; j++)
+                m[i][j]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    long lm[10] = {0};
+    int8 x0 = (int8)(0);
+    int8 x1 = (int8)(0);
+    int8 x2 = (int8)(0);
+    long8 x3 = (long8)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
+    {
+        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
+        int8 p = convert_int8(tmp[xt/VLEN_S]);
+        int8 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += convert_long8(xxp) * convert_long8(v_xt);
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
+
+    int py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((long)py) * sy;  // m03
+        m[8][lidy-bheight] = ((long)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((long)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((long)py) * sy;  // m03
+        lm[8] = ((long)(x1.s0)) * sy;  // m12
+        lm[7] = ((long)(x2.s0)) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >=1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy ==0 &&lidx ==0)
+    {
+        for(int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+
+        if(binary)
+        {
+            double s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        double xm = x * mom[0], ym = y*mom[0];
+
+        // accumulate moments computed in each tile
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx]  = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx]  = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx]  = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx]   = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]   = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx]  = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx]  = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+// CvMoments_D5: per-tile spatial moments (m00 .. m03) of a single-precision image.
+//
+// The group ids select the tile (wgidy, wgidx), whose top-left pixel is
+// (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE). The work-item with local id lidy
+// accumulates image row y+lidy of that tile; the row results are then summed through
+// the __local scratch array m, and work-item (lidy == 0, lidx == 0) stores the ten
+// moments, shifted from tile-local to image coordinates, at dst_mXX[wgidy*dst_cols + wgidx].
+//
+//   src_data          input samples; NOTE: zeros are written into it past the last
+//                     column of a partial tile before the float4 groups are loaded
+//   src_rows/src_cols image size used to clip the tile
+//   src_step          row stride; divided by 4 (sizeof(float)) below, so presumably in bytes
+//   tileSize_width,
+//   tileSize_height   the values passed in are ignored; both are recomputed here
+//   dst_m00..dst_m03  one output value per tile for each moment
+//   dst_cols          row pitch (in elements) of the per-tile output arrays
+//   cn, coi           coi > 0 reads channel coi-1 of interleaved samples whose stride is
+//                     2 when cn == 2 and 4 otherwise; coi == 0 reads samples as they are
+//   binary            non-zero: every non-zero sample is counted as 1
+//   dst_step, type, depth   not used in this kernel
+//
+// NOTE(review): tmp[64] and m[10][128] look like they assume TILE_SIZE <= 256 -- confirm
+// against the host code.
+__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                            __global double* dst_m00,
+                            __global double* dst_m10,
+                            __global double* dst_m01,
+                            __global double* dst_m20,
+                            __global double* dst_m11,
+                            __global double* dst_m02,
+                            __global double* dst_m30,
+                            __global double* dst_m21,
+                            __global double* dst_m12,
+                            __global double* dst_m03,
+                            int dst_cols, int dst_step,
+                            int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    float tmp_coi[4]; // get the coi data
+    float4 tmp[64] ;
+    int VLEN_F = 4; // vector length of float
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/4, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE, src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols -x);
+    // Partial tile: store zeros after the last valid column (up to rstep) so that the
+    // float4 groups loaded below do not pick up stale data.
+    if(tileSize_width < TILE_SIZE)
+        for(int i = tileSize_width; i < rstep; i++ )
+            *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
+    // Load this work-item's row into tmp, four samples per entry.
+    if( coi > 0 )
+        for(int i=0; i < tileSize_width; i+=VLEN_F)
+        {
+            for(int j=0; j<4; j++)
+                tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_F)
+            tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
+    float4 zero = (float4)(0);
+    float4 full = (float4)(255);
+    // Binary mode: non-zero samples become 255; the 1/255 scale is applied before output.
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_F)
+            tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
+    double mom[10];
+    __local double m[10][128];
+    // Clear the shared scratch; the barrier makes the zeros visible to every work-item.
+    if(lidy == 0)
+        for(int i = 0; i < 10; i ++)
+            for(int j = 0; j < 128; j ++)
+                m[i][j] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    double lm[10] = {0};
+    // Per-lane sums of p, x*p, x^2*p and x^3*p along the row (x is tile-local).
+    double4 x0 = (double4)(0);
+    double4 x1 = (double4)(0);
+    double4 x2 = (double4)(0);
+    double4 x3 = (double4)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
+    {
+        double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
+        double4 p = convert_double4(tmp[xt/VLEN_F]);
+        double4 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp * v_xt;
+    }
+    // Horizontal add: lane 0 now holds the total for the row.
+    x0.s0 += x0.s1 + x0.s2 + x0.s3;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3;
+
+    // Row moments, with the tile-local row index lidy as the y coordinate.
+    // Rows bheight .. tileSize_height-1 park their values in the shared scratch,
+    // rows 0 .. bheight-1 keep them in lm; rows outside the tile contribute nothing.
+    double py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((double)py) * sy;  // m03
+        m[8][lidy-bheight] = ((double)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((double)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((double)py) * sy;  // m03
+        lm[8] = ((double)x1.s0) * sy;  // m12
+        lm[7] = ((double)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Halving reduction over the rows: work-items below j add the parked values, then
+    // the upper half of them (j/2 .. j-1) parks its sums for the next step. The barriers
+    // separate the read phase from the write phase of every step.
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    // A single work-item per group publishes the tile result.
+    if(lidy == 0&&lidx == 0)
+    {
+        for(int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+
+        if(binary)
+        {
+            double s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        double xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx]= mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]= mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]= mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx]= mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+__kernel void CvMoments_D6(__global double* src_data,  int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                           __global double* dst_m00,
+                           __global double* dst_m10,
+                           __global double* dst_m01,
+                           __global double* dst_m20,
+                           __global double* dst_m11,
+                           __global double* dst_m02,
+                           __global double* dst_m30,
+                           __global double* dst_m21,
+                           __global double* dst_m12,
+                           __global double* dst_m03,
+                           int dst_cols, int dst_step,
+                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{   /* Per-tile spatial moments (m00 .. m03) of a double-precision image.
+     *
+     * The group ids select the tile (wgidy, wgidx), whose top-left pixel is
+     * (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE); the work-item with local id lidy
+     * accumulates image row y+lidy. Row results are summed through the __local
+     * array m, and work-item (lidy == 0, lidx == 0) stores the ten moments, shifted
+     * from tile-local to image coordinates, at dst_mXX[wgidy*dst_cols + wgidx].
+     *
+     * Notes:
+     *  - tileSize_width and tileSize_height are recomputed below; the values passed in are ignored.
+     *  - src_step is divided by 8 (sizeof(double)), so it is presumably a byte stride.
+     *  - src_data is written to: zeros are stored past the last column of a partial tile.
+     *  - dst_step, type and depth are not used in this kernel.
+     *  - NOTE(review): tmp[64] and m[10][128] look like they assume TILE_SIZE <= 256;
+     *    confirm against the host code.
+     */
+    double tmp_coi[4]; // get the coi data
+    double4 tmp[64];
+    int VLEN_D = 4; // length of vector
+    int gidy = get_global_id(0); // not used below
+    int gidx = get_global_id(1); // not used below
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4; // element stride between samples when a channel of interest is read
+    int rstep = min(src_step/8, TILE_SIZE); // row stride in doubles, capped at the tile size
+    tileSize_height = min(TILE_SIZE,  src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if(tileSize_width < TILE_SIZE) // partial tile: store zeros after the last valid column, up to rstep
+        for(int i = tileSize_width; i < rstep; i++ )
+            *((__global double*)src_data+(y+lidy)*src_step/8+x+i) = 0;
+    if( coi > 0 ) // gather channel coi-1 of four interleaved samples per tmp entry
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+        {
+            for(int j=0; j<4; j++)
+                tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_D] = (double4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+        }
+    else // no channel selection: load four consecutive samples per tmp entry
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+            tmp[i/VLEN_D] = (double4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
+    double4 zero = (double4)(0);
+    double4 full = (double4)(255);
+    if( binary ) // binary mode: non-zero samples become 255; scaled by 1/255 before output
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
+    double mom[10];
+    __local double m[10][128]; // scratch shared by the work-group for the row reduction
+    if(lidy == 0) // clear the scratch before any row result is stored in it
+        for(int i=0; i<10; i++)
+            for(int j=0; j<128; j++)
+                m[i][j]=0;
+    barrier(CLK_LOCAL_MEM_FENCE); // the zeroed scratch must be visible to every work-item
+    double lm[10] = {0}; // this work-item's running sums of the ten moments
+    double4 x0 = (double4)(0); // per-lane sum of p
+    double4 x1 = (double4)(0); // per-lane sum of x*p
+    double4 x2 = (double4)(0); // per-lane sum of x*x*p
+    double4 x3 = (double4)(0); // per-lane sum of x*x*x*p
+    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
+    {
+        double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3); // tile-local x coordinates of the four samples
+        double4 p = tmp[xt/VLEN_D];
+        double4 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp *v_xt;
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3; // horizontal adds: lane 0 ends up with the row total
+    x1.s0 += x1.s1 + x1.s2 + x1.s3;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3;
+
+    double py = lidy * x0.s0, sy = lidy*lidy; // y*m00 and y*y of this row, y being the tile-local row index
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) // rows bheight .. tileSize_height-1 park their moments in m
+    {
+        m[9][lidy-bheight] = ((double)py) * sy;  // m03
+        m[8][lidy-bheight] = ((double)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((double)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+
+    else if(lidy < bheight) // rows 0 .. bheight-1 keep their moments in lm
+    {
+        lm[9] = ((double)py) * sy;  // m03
+        lm[8] = ((double)x1.s0) * sy;  // m12
+        lm[7] = ((double)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE); // parked row moments must be visible before they are summed
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) // halving reduction over the rows
+    {
+        if(lidy < j) // add the values parked for this step
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE); // finish all reads of m before it is overwritten
+        if(lidy >= j/2&&lidy < j) // upper half of the active rows parks its sums for the next step
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0) // a single work-item per group publishes the tile result
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+        if(binary)
+        {
+            double s = 1./255; // undo the 255 used for non-zero samples in binary mode
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        double xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx]  = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]  = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx]  = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx]  = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+//#endif
--- a/modules/ocl/src/kernels/nonfree_surf.cl
+++ b/modules/ocl/src/kernels/nonfree_surf.cl
@ -43,10 +43,39 @@
 //
 //M*/

-#pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable

+/* Fallback for platforms without image2d_t support (Intel HD4000, for example):
+ * when DISABLE_IMAGE2D is defined, image arguments are declared as plain
+ * __global buffers instead of image2d_t objects. */
+#ifdef DISABLE_IMAGE2D
+#define IMAGE_INT32 __global uint  * // buffer of uint elements
+#define IMAGE_INT8  __global uchar * // buffer of uchar elements
+#else
+#define IMAGE_INT32 image2d_t
+#define IMAGE_INT8  image2d_t
+#endif
+
+// Fetch one uint element of `img` at integer position `coord`.
+// With image2d_t support the lookup goes through `sam`; in the DISABLE_IMAGE2D
+// build the position is clamped to [0, cols] x [0, rows] and read from a buffer
+// with `elemPerRow` elements per row (`sam` is then unused).
+uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow)
+{
+#ifndef DISABLE_IMAGE2D
+    const uint4 texel = read_imageui(img, sam, coord);
+    return texel.x;
+#else
+    const int col = clamp(coord.x, 0, cols);
+    const int row = clamp(coord.y, 0, rows);
+    return img[elemPerRow * row + col];
+#endif
+}
+// Fetch one uchar element of `img` at floating-point position `coord`.
+// With image2d_t support the lookup goes through `sam`; in the DISABLE_IMAGE2D
+// build the position is rounded (convert_int_rte), clamped to
+// [0, cols-1] x [0, rows-1] and read from a buffer with `elemPerRow` elements
+// per row (`sam` is then unused).
+uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
+{
+#ifndef DISABLE_IMAGE2D
+    const uint4 texel = read_imageui(img, sam, coord);
+    return (uchar)texel.x;
+#else
+    const int col = clamp(convert_int_rte(coord.x), 0, cols - 1);
+    const int row = clamp(convert_int_rte(coord.y), 0, rows - 1);
+    return img[elemPerRow * row + col];
+#endif
+}
+
 // dynamically change the precision used for floating type

 #if defined (__ATI__) || defined (__NVIDIA__)
@ -58,14 +87,24 @@
 // Image read mode
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

+#ifndef FLT_EPSILON
 #define FLT_EPSILON (1e-15)
-#define CV_PI_F 3.14159265f
+#endif

+#ifndef CV_PI_F
+#define CV_PI_F 3.14159265f
+#endif

 // Use integral image to calculate haar wavelets.
 // N = 2
 // for simple haar paatern
-float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int oldSize, int newSize, int y, int x)
+float icvCalcHaarPatternSum_2(
+    IMAGE_INT32 sumTex,
+    __constant float src[2][5],
+    int oldSize,
+    int newSize,
+    int y, int x,
+    int rows, int cols, int elemPerRow)
 {

    float ratio = (float)newSize / oldSize;
@ -81,11 +120,10 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
        int dy2 = convert_int_rte(ratio * src[k][3]);

        F t = 0;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
-
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
        d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
    }

@ -93,7 +131,13 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
 }

 // N = 3
-float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int oldSize, int newSize, int y, int x)
+float icvCalcHaarPatternSum_3(
+    IMAGE_INT32 sumTex,
+    __constant float src[2][5],
+    int oldSize,
+    int newSize,
+    int y, int x,
+    int rows, int cols, int elemPerRow)
 {

    float ratio = (float)newSize / oldSize;
@ -109,11 +153,10 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
        int dy2 = convert_int_rte(ratio * src[k][3]);

        F t = 0;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
-
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
        d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
    }

@ -121,7 +164,13 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
 }

 // N = 4
-float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int oldSize, int newSize, int y, int x)
+float icvCalcHaarPatternSum_4(
+    IMAGE_INT32 sumTex,
+    __constant float src[2][5],
+    int oldSize,
+    int newSize,
+    int y, int x,
+    int rows, int cols, int elemPerRow)
 {

    float ratio = (float)newSize / oldSize;
@ -137,11 +186,10 @@ float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int
        int dy2 = convert_int_rte(ratio * src[k][3]);

        F t = 0;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
-
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
        d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
    }

@ -172,7 +220,7 @@ __inline int calcSize(int octave, int layer)

 //calculate targeted layer per-pixel determinant and trace with an integral image
 __kernel void icvCalcLayerDetAndTrace(
-    image2d_t sumTex, // input integral image
+    IMAGE_INT32 sumTex, // input integral image
    __global float * det,      // output Determinant
    __global float * trace,    // output trace
    int det_step,     // the step of det in bytes
@ -181,11 +229,13 @@ __kernel void icvCalcLayerDetAndTrace(
    int c_img_cols,
    int c_nOctaveLayers,
    int c_octave,
-    int c_layer_rows
+    int c_layer_rows,
+    int sumTex_step
    )
 {
    det_step   /= sizeof(*det);
    trace_step /= sizeof(*trace);
+    sumTex_step/= sizeof(uint);
    // Determine the indices
    const int gridDim_y  = get_num_groups(1) / (c_nOctaveLayers + 2);
    const int blockIdx_y = get_group_id(1) % gridDim_y;
@ -205,9 +255,9 @@ __kernel void icvCalcLayerDetAndTrace(

    if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
    {
-        const float dx  = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
-        const float dy  = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
-        const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
+        const float dx  = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        const float dy  = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);

        det  [j + margin + det_step   * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
        trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
@ -220,7 +270,7 @@ __kernel void icvCalcLayerDetAndTrace(

 __constant float c_DM[5] = {0, 0, 9, 9, 1};

-bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
+bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
 {
    float ratio = (float)size / 9.0f;

@ -233,10 +283,10 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)

    float t = 0;

-    t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1)).x;
-    t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2)).x;
-    t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1)).x;
-    t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2)).x;
+    t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
+    t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
+    t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
+    t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);

    d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));

@ -261,7 +311,8 @@ __kernel
    int c_layer_cols,
    int c_max_candidates,
    float c_hessianThreshold,
-    image2d_t maskSumTex
+    IMAGE_INT32 maskSumTex,
+    int mask_step
    )
 {
    volatile __local  float N9[768]; // threads.x * threads.y * 3
@ -269,6 +320,7 @@ __kernel
    det_step   /= sizeof(*det);
    trace_step /= sizeof(*trace);
    maxCounter += counter_offset;
+    mask_step  /= sizeof(uint);

    // Determine the indices
    const int gridDim_y  = get_num_groups(1) / c_nOctaveLayers;
@ -321,7 +373,7 @@ __kernel
            const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
            const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;

-            if (within_check(maskSumTex, sum_i, sum_j, size))
+            if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
            {
                // Check to see if we have a max (in its 26 neighbours)
                const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
@ -704,14 +756,16 @@ void reduce_32_sum(volatile __local  float * data, float partial_reduction, int

 __kernel
    void icvCalcOrientation(
-    image2d_t sumTex,
+    IMAGE_INT32 sumTex,
    __global float * keypoints,
    int keypoints_step,
    int c_img_rows,
-    int c_img_cols
+    int c_img_cols,
+    int sum_step
    )
 {
    keypoints_step /= sizeof(*keypoints);
+    sum_step       /= sizeof(uint);
    __global float* featureX    = keypoints + X_ROW * keypoints_step;
    __global float* featureY    = keypoints + Y_ROW * keypoints_step;
    __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
@ -754,8 +808,8 @@ __kernel
        if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
            x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
        {
-            X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x);
-            Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
+            X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
+            Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);

            angle = atan2(Y, X);

@ -881,20 +935,20 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =

 // utility for linear filter
 inline uchar readerGet(
-    image2d_t src,
+    IMAGE_INT8 src,
    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
-    int i, int j
+    int i, int j, int rows, int cols, int elemPerRow
    )
 {
    float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
    float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
-    return (uchar)read_imageui(src, sampler, (float2)(pixel_x, pixel_y)).x;
+    return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
 }

 inline float linearFilter(
-    image2d_t src,
+    IMAGE_INT8 src,
    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
-    float y, float x
+    float y, float x, int rows, int cols, int elemPerRow
    )
 {
    x -= 0.5f;
@ -907,30 +961,33 @@ inline float linearFilter(
    const int x2 = x1 + 1;
    const int y2 = y1 + 1;

-    uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1);
+    uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow);
    out = out + src_reg * ((x2 - x) * (y2 - y));

-    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2);
+    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow);
    out = out + src_reg * ((x - x1) * (y2 - y));

-    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1);
+    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow);
    out = out + src_reg * ((x2 - x) * (y - y1));

-    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2);
+    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow);
    out = out + src_reg * ((x - x1) * (y - y1));

    return out;
 }

 void calc_dx_dy(
-    image2d_t imgTex,
+    IMAGE_INT8 imgTex,
    volatile __local  float s_dx_bin[25],
    volatile __local  float s_dy_bin[25],
    volatile __local  float s_PATCH[6][6],
    __global const float* featureX,
    __global const float* featureY,
    __global const float* featureSize,
-    __global const float* featureDir
+    __global const float* featureDir,
+    int rows,
+    int cols,
+    int elemPerRow
    )
 {
    const float centerX = featureX[get_group_id(0)];
@ -965,7 +1022,7 @@ void calc_dx_dy(
    const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
    const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;

-    s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo);
+    s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);

    barrier(CLK_LOCAL_MEM_FENCE);

@ -1035,16 +1092,18 @@ void reduce_sum25(

 __kernel
    void compute_descriptors64(
-    image2d_t imgTex,
+    IMAGE_INT8 imgTex,
    volatile __global float * descriptors,
    __global const float * keypoints,
    int descriptors_step,
-    int keypoints_step
+    int keypoints_step,
+    int rows,
+    int cols,
+    int img_step
    )
 {
    descriptors_step /= sizeof(float);
    keypoints_step   /= sizeof(float);
-
    __global const float * featureX    = keypoints + X_ROW * keypoints_step;
    __global const float * featureY    = keypoints + Y_ROW * keypoints_step;
    __global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
@ -1057,7 +1116,7 @@ __kernel
    volatile __local  float sdyabs[25];
    volatile __local  float s_PATCH[6][6];

-    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
+    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
    barrier(CLK_LOCAL_MEM_FENCE);

    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@ -1066,10 +1125,10 @@ __kernel
    {
        sdxabs[tid] = fabs(sdx[tid]); // |dx| array
        sdyabs[tid] = fabs(sdy[tid]); // |dy| array
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);

@ -1085,11 +1144,14 @@ __kernel
 }
 __kernel
    void compute_descriptors128(
-    image2d_t imgTex,
+    IMAGE_INT8 imgTex,
    __global volatile float * descriptors,
    __global float * keypoints,
    int descriptors_step,
-    int keypoints_step
+    int keypoints_step,
+    int rows,
+    int cols,
+    int img_step
    )
 {
    descriptors_step /= sizeof(*descriptors);
@ -1111,7 +1173,7 @@ __kernel
    volatile __local  float sdabs2[25];
    volatile __local  float s_PATCH[6][6];

-    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
+    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
    barrier(CLK_LOCAL_MEM_FENCE);

    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@ -1132,10 +1194,10 @@ __kernel
            sd2[tid] = sdx[tid];
            sdabs2[tid] = fabs(sdx[tid]);
        }
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);

@ -1162,10 +1224,10 @@ __kernel
            sd2[tid] = sdy[tid];
            sdabs2[tid] = fabs(sdy[tid]);
        }
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
        if (tid == 0)
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@ -68,6 +68,8 @@ namespace cv
        extern const char *operator_setTo;
        extern const char *operator_setToM;
        extern const char *convertC3C4;
+        extern DevMemType gDeviceMemType;
+        extern DevMemRW gDeviceMemRW;
    }
 }

@ -911,7 +913,17 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const

 }

+// Size-based overload: unpacks the Size into (rows, cols) and forwards to the
+// (rows, cols, type, rw_type, mem_type) overload of createEx.
+void cv::ocl::oclMat::createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type)
+{
+    const int rowCount = size.height;
+    const int colCount = size.width;
+    createEx(rowCount, colCount, type, rw_type, mem_type);
+}
+
 void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
+{
+    // Delegate to createEx, supplying the module-wide gDeviceMemRW and
+    // gDeviceMemType settings for the two extra arguments.
+    this->createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
+}
+
+void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, DevMemRW rw_type, DevMemType mem_type)
 {
    clCxt = Context::getContext();
    /* core logic */
@ -936,7 +948,7 @@ void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
        size_t esz = elemSize();

        void *dev_ptr;
-        openCLMallocPitch(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows);
+        openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type);
        //openCLMallocPitch(clCxt,&dev_ptr, &step, esz * cols, rows);

        if (esz * cols == step)
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@ -217,6 +217,36 @@ namespace cv
        {
            openCLFree(texture);
        }
+
+        // Reports whether a kernel taking an image2d_t argument can be built for
+        // the given context. The probe kernel is compiled on the first successful
+        // call only; the outcome is cached in function-local statics, so later
+        // calls return the cached answer whatever context they pass in.
+        //
+        // Exceptions raised by the probe with any code other than -217 are
+        // propagated to the caller, and the result is not cached in that case.
+        bool support_image2d(Context *clCxt)
+        {
+            static const char * _kernel_string = "__kernel void test_func(image2d_t img) {}";
+            static bool _isTested = false;
+            static bool _support = false;
+            if(_isTested)
+            {
+                return _support;
+            }
+            try
+            {
+                // NOTE(review): the value returned by openCLGetKernelFromSource is
+                // discarded here - confirm whether it is a kernel handle that
+                // should be released.
+                cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
+                _support = true;
+            }
+            catch (const cv::Exception& e)
+            {
+                // -217 is the error code this probe interprets as "image2d_t is
+                // not available" (NOTE(review): presumably CV_GpuApiCallError -
+                // confirm and replace the literal with the named constant).
+                if(e.code != -217)
+                {
+                    // Unrelated failure: re-raise the exception being handled.
+                    // A bare `throw;` preserves its dynamic type, whereas
+                    // `throw e;` would throw a copy sliced to cv::Exception.
+                    throw;
+                }
+                _support = false;
+            }
+            _isTested = true;
+            return _support;
+        }
    }//namespace ocl

 }//namespace cv
--- a/modules/ocl/src/mcwutil.hpp
+++ b/modules/ocl/src/mcwutil.hpp
@ -69,6 +69,10 @@ namespace cv
        //   2. for faster clamping, there is no buffer padding for the constructed texture
        cl_mem bindTexture(const oclMat &mat);
        void releaseTexture(cl_mem& texture);
+
+        // returns whether the current context supports image2d_t format or not
+        bool support_image2d(Context *clCxt = Context::getContext());
+
    }//namespace ocl

 }//namespace cv
--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@ -0,0 +1,379 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Sen Liu, sen@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+#include <iostream>
+namespace cv
+{
+namespace ocl
+{
+extern const char *moments;
+
+// Derives the centre of gravity, inv_sqrt_m00 and the central moments
+// (mu20 ... mu03) from the spatial moments already stored in *moments.
+static void icvCompleteMomentState( CvMoments* moments )
+{
+    assert( moments != 0 );
+
+    // Centre of gravity; both coordinates stay 0 when m00 is numerically zero.
+    double centerX = 0, centerY = 0;
+    moments->inv_sqrt_m00 = 0;
+    if( fabs(moments->m00) > DBL_EPSILON )
+    {
+        const double invM00 = 1. / moments->m00;
+        centerX = moments->m10 * invM00;
+        centerY = moments->m01 * invM00;
+        moments->inv_sqrt_m00 = std::sqrt( fabs(invM00) );
+    }
+
+    // Second-order central moments:
+    //   mu20 = m20 - m10*cx,  mu11 = m11 - m10*cy,  mu02 = m02 - m01*cy
+    const double c20 = moments->m20 - moments->m10 * centerX;
+    const double c11 = moments->m11 - moments->m10 * centerY;
+    const double c02 = moments->m02 - moments->m01 * centerY;
+    moments->mu20 = c20;
+    moments->mu11 = c11;
+    moments->mu02 = c02;
+
+    // The mixed third-order moments use 2*mu11.
+    const double twiceC11 = c11 + c11;
+
+    // Third-order central moments:
+    //   mu30 = m30 - cx*(3*mu20 + cx*m10)
+    //   mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
+    //   mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
+    //   mu03 = m03 - cy*(3*mu02 + cy*m01)
+    moments->mu30 = moments->m30 - centerX * (3 * c20 + centerX * moments->m10);
+    moments->mu21 = moments->m21 - centerX * (twiceC11 + centerX * moments->m01) - centerY * c20;
+    moments->mu12 = moments->m12 - centerY * (twiceC11 + centerY * moments->m10) - centerX * c02;
+    moments->mu03 = moments->m03 - centerY * (3 * c02 + centerY * moments->m01);
+}
+
+
+// Computes the moments of a point contour with the "icvContourMoments" kernel.
+//
+// The contour's points are flattened into a 1 x (2 * total) float row
+// (x0, y0, x1, y1, ...), uploaded, and passed to the kernel together with ten
+// 1 x total output buffers. Each buffer is downloaded and summed on the host;
+// the sums are scaled into the spatial moments of *mom and
+// icvCompleteMomentState() then fills in the derived fields.
+//
+// *mom is left untouched when the contour has no points or when
+// |a00| <= FLT_EPSILON.
+static void icvContourMoments( CvSeq* contour, CvMoments* mom )
+{
+    if( !contour->total )
+        return;
+
+    CvSeqReader reader;
+    const int lpt = contour->total;
+    double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03;
+    // Element type of the output buffers follows the context's double_support flag.
+    int dst_type = cv::ocl::Context::getContext()->impl->double_support ? CV_64FC1 : CV_32FC1;
+
+    cvStartReadSeq( contour, &reader, 0 );
+
+    cv::ocl::oclMat dst_a00(1,lpt,dst_type);
+    cv::ocl::oclMat dst_a10(1,lpt,dst_type);
+    cv::ocl::oclMat dst_a01(1,lpt,dst_type);
+    cv::ocl::oclMat dst_a20(1,lpt,dst_type);
+    cv::ocl::oclMat dst_a11(1,lpt,dst_type);
+    cv::ocl::oclMat dst_a02(1,lpt,dst_type);
+    cv::ocl::oclMat dst_a30(1,lpt,dst_type);
+    cv::ocl::oclMat dst_a21(1,lpt,dst_type);
+    cv::ocl::oclMat dst_a12(1,lpt,dst_type);
+    cv::ocl::oclMat dst_a03(1,lpt,dst_type);
+
+    // Host staging row: two floats (x, y) per contour point.
+    const int reader_size = lpt << 1;
+    cv::Mat reader_mat(1,reader_size,CV_32FC1);
+
+    const bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
+
+    if( is_float )
+    {
+        for( int k = 0; k < lpt; ++k )
+        {
+            const CvPoint2D32f* pt = (const CvPoint2D32f*)(reader.ptr);
+            reader_mat.at<float>(0, 2 * k)     = pt->x;
+            reader_mat.at<float>(0, 2 * k + 1) = pt->y;
+            CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
+        }
+    }
+    else
+    {
+        for( int k = 0; k < lpt; ++k )
+        {
+            // Integer coordinates are converted to float for the kernel.
+            const CvPoint* pt = (const CvPoint*)(reader.ptr);
+            reader_mat.at<float>(0, 2 * k)     = static_cast<float>(pt->x);
+            reader_mat.at<float>(0, 2 * k + 1) = static_cast<float>(pt->y);
+            CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
+        }
+    }
+
+    cv::ocl::oclMat reader_oclmat(reader_mat);
+    const int llength = std::min(lpt,128);
+    // Explicit conversions: int -> size_t inside a brace initializer is a
+    // narrowing conversion and is rejected by C++11 and later.
+    size_t localThreads[3]  = { static_cast<size_t>(llength), 1, 1};
+    size_t globalThreads[3] = { static_cast<size_t>(lpt), 1, 1};
+    std::vector<std::pair<size_t , const void *> > args;
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a00.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a10.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a01.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a20.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a11.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a02.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a30.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a21.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a12.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a03.data ));
+    openCLExecuteKernel(dst_a00.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1);
+
+    // Download each output buffer into a host Mat and reduce it with cv::sum.
+    cv::Mat dst(dst_a00);
+    cv::Scalar s = cv::sum(dst);
+    a00 = s[0];
+    dst = dst_a10;
+    s = cv::sum(dst);
+    a10 = s[0];
+    dst = dst_a01;
+    s = cv::sum(dst);
+    a01 = s[0];
+    dst = dst_a20;
+    s = cv::sum(dst);
+    a20 = s[0];
+    dst = dst_a11;
+    s = cv::sum(dst);
+    a11 = s[0];
+    dst = dst_a02;
+    s = cv::sum(dst);
+    a02 = s[0];
+    dst = dst_a30;
+    s = cv::sum(dst);
+    a30 = s[0];
+    dst = dst_a21;
+    s = cv::sum(dst);
+    a21 = s[0];
+    dst = dst_a12;
+    s = cv::sum(dst);
+    a12 = s[0];
+    dst = dst_a03;
+    s = cv::sum(dst);
+    a03 = s[0];
+
+    if( fabs(a00) > FLT_EPSILON )
+    {
+        // All scale factors carry the sign of a00, so m00 = |a00| / 2.
+        const double sign   = a00 > 0 ? 1.0 : -1.0;
+        const double db1_2  = sign * 0.5;
+        const double db1_6  = sign * 0.16666666666666666666666666666667;
+        const double db1_12 = sign * 0.083333333333333333333333333333333;
+        const double db1_24 = sign * 0.041666666666666666666666666666667;
+        const double db1_20 = sign * 0.05;
+        const double db1_60 = sign * 0.016666666666666666666666666666667;
+
+        // spatial moments
+        mom->m00 = a00 * db1_2;
+        mom->m10 = a10 * db1_6;
+        mom->m01 = a01 * db1_6;
+        mom->m20 = a20 * db1_12;
+        mom->m11 = a11 * db1_24;
+        mom->m02 = a02 * db1_12;
+        mom->m30 = a30 * db1_20;
+        mom->m21 = a21 * db1_60;
+        mom->m12 = a12 * db1_60;
+        mom->m03 = a03 * db1_20;
+
+        icvCompleteMomentState( mom );
+    }
+}
+
+// Computes the moments of `array` into *mom using the OpenCL "moments" program.
+//
+// array  - a CvSeq point set, or anything cvGetMat() accepts. Point sets and
+//          CV_32SC2 / CV_32FC2 matrices are treated as contours and handled by
+//          icvContourMoments(); everything else is treated as an image.
+// mom    - output; must not be null. It is zeroed before anything is computed.
+// binary - forwarded unchanged to the "CvMoments" kernel.
+//
+// Image path: the "CvMoments" kernel fills ten blocky x blockx buffers (one
+// per spatial moment, one cell per TILE_SIZE x TILE_SIZE tile), the "dst_sum"
+// kernel writes ten totals into a small device buffer, and those totals are
+// read back into *mom before icvCompleteMomentState() fills in the rest.
+//
+// Raises CV_StsNullPtr for a null mom, CV_StsBadArg for a sequence that is not
+// a point set or for a multi-channel image without a COI. Returns with *mom
+// zeroed when the image has a non-positive width or height.
+static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
+{
+    const int TILE_SIZE = 256;
+    int type, depth, cn, coi = 0;
+    CvMat stub, *mat = (CvMat*)array;
+    CvContour contourHeader;
+    CvSeq* contour = 0;
+    CvSeqBlock block;
+    if( CV_IS_SEQ( array ))
+    {
+        contour = (CvSeq*)array;
+        if( !CV_IS_SEQ_POINT_SET( contour ))
+            CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
+    }
+
+    // Check the output pointer itself. The name `moments` in this scope is the
+    // kernel source string declared at the top of the file, not the result.
+    if( !mom )
+        CV_Error( CV_StsNullPtr, "" );
+
+    memset( mom, 0, sizeof(*mom));
+
+    if( !contour )
+    {
+
+        mat = cvGetMat( mat, &stub, &coi );
+        type = CV_MAT_TYPE( mat->type );
+
+        if( type == CV_32SC2 || type == CV_32FC2 )
+        {
+            contour = cvPointSeqFromMat(
+                          CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
+                          mat, &contourHeader, &block );
+        }
+    }
+    if( contour )
+    {
+        icvContourMoments( contour, mom );
+        return;
+    }
+
+    type = CV_MAT_TYPE( mat->type );
+    depth = CV_MAT_DEPTH( type );
+    cn = CV_MAT_CN( type );
+
+    cv::Size size = cvGetMatSize( mat );
+    if( cn > 1 && coi == 0 )
+        CV_Error( CV_StsBadArg, "Invalid image type" );
+
+    if( size.width <= 0 || size.height <= 0 )
+        return;
+
+    cv::Mat src0(mat);
+    cv::ocl::oclMat src(src0);
+    // NOTE(review): tileSize is never assigned after default construction, yet
+    // its fields are passed to the kernel below - confirm the kernel ignores them.
+    cv::Size tileSize;
+    // Number of TILE_SIZE-wide / TILE_SIZE-high tiles, rounded up.
+    const int blockx = (size.width  + TILE_SIZE - 1) / TILE_SIZE;
+    const int blocky = (size.height + TILE_SIZE - 1) / TILE_SIZE;
+    // NOTE(review): these buffers are CV_64FC1 unconditionally, whereas
+    // icvContourMoments consults double_support - confirm this is intended.
+    cv::ocl::oclMat dst_m00(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m10(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m01(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m20(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m11(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m02(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m30(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m21(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m12(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m03(blocky, blockx, CV_64FC1);
+    // Device buffer receiving the ten totals; released after the read-back.
+    // If a call in between throws, the buffer is not released.
+    cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
+    int tile_width  = std::min(size.width,TILE_SIZE);
+    int tile_height = std::min(size.height,TILE_SIZE);
+    // Explicit conversions: int -> size_t inside a brace initializer is a
+    // narrowing conversion and is rejected by C++11 and later.
+    size_t localThreads[3]  = { static_cast<size_t>(tile_height), 1, 1};
+    size_t globalThreads[3] = { static_cast<size_t>(size.height), static_cast<size_t>(blockx), 1};
+    std::vector<std::pair<size_t , const void *> > args,args_sum;
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m00.cols ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m00.step ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&type ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&depth ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cn ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&coi ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
+    openCLExecuteKernel(dst_m00.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
+
+    size_t localThreadss[3]  = { 128, 1, 1};
+    size_t globalThreadss[3] = { 128, 1, 1};
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_height ));
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_width ));
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
+    openCLExecuteKernel(dst_m00.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
+
+    // Ten totals fit on the stack; no heap allocation to leak.
+    double dstsum[10];
+    memset(dstsum,0,sizeof(dstsum));
+    openCLReadBuffer(dst_m00.clCxt,sum,(void *)dstsum,10*sizeof(double));
+    // The device buffer is no longer needed once the totals are on the host.
+    openCLFree(sum);
+
+    mom->m00 = dstsum[0];
+    mom->m10 = dstsum[1];
+    mom->m01 = dstsum[2];
+    mom->m20 = dstsum[3];
+    mom->m11 = dstsum[4];
+    mom->m02 = dstsum[5];
+    mom->m30 = dstsum[6];
+    mom->m21 = dstsum[7];
+    mom->m12 = dstsum[8];
+    mom->m03 = dstsum[9];
+
+    icvCompleteMomentState( mom );
+}
+
+// C++ entry point: wraps the input in a CvMat header, runs ocl_cvMoments on
+// it, and returns the resulting CvMoments converted to Moments.
+Moments ocl_moments( InputArray _array, bool binaryImage )
+{
+    Mat hostArray = _array.getMat();
+    CvMat legacyHeader = hostArray;
+
+    CvMoments result;
+    ocl_cvMoments(&legacyHeader, &result, binaryImage);
+    return result;
+}
+
+}
+
+}
--- a/modules/ocl/src/precomp.hpp
+++ b/modules/ocl/src/precomp.hpp
@ -93,6 +93,8 @@ namespace cv
        ///////////////////////////OpenCL call wrappers////////////////////////////
        void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
                               size_t widthInBytes, size_t height);
+        void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
+                               size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
        void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
                            const void *src, size_t spitch,
                            size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1);
@ -141,6 +143,7 @@ namespace cv
            //extra options to recognize vendor specific fp64 extensions
            char extra_options[512];
            std::string Binpath;
+            int unified_memory; //1 means integrated GPU, otherwise this value is 0
        };
    }
 }
--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@ -573,8 +573,9 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    Context  *clCxt = I.clCxt;
    int elemCntPerRow = I.step / I.elemSize();
    std::string kernelName = "lkSparse";
-    size_t localThreads[3]  = { 8, 8, 1 };
-    size_t globalThreads[3] = { 8 * ptcount, 8, 1};
+    bool isImageSupported = support_image2d();
+    size_t localThreads[3]  = { 8, isImageSupported ? 8 : 32, 1 };
+    size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
    int cn = I.oclchannels();
    char calcErr;
    if (level == 0)
@ -587,8 +588,9 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    }

    std::vector<std::pair<size_t , const void *> > args;
-    cl_mem ITex = bindTexture(I);
-    cl_mem JTex = bindTexture(J);
+
+    cl_mem ITex = isImageSupported ? bindTexture(I) : (cl_mem)I.data;
+    cl_mem JTex = isImageSupported ? bindTexture(J) : (cl_mem)J.data;

    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ITex ));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&JTex ));
@ -601,6 +603,8 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&level ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.rows ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.cols ));
+    if (!isImageSupported)
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.x ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.y ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cn ));
@ -609,19 +613,14 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&iters ));
    args.push_back( std::make_pair( sizeof(cl_char), (void *)&calcErr ));

-    try
+    if(isImageSupported)
    {
        openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
-    }
-    catch(Exception&)
-    {
-        printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n");
        releaseTexture(ITex);
        releaseTexture(JTex);
-        ITex = (cl_mem)I.data;
-        JTex = (cl_mem)J.data;
-        localThreads[1] = globalThreads[1] = 32;
-        args.insert( args.begin()+11, std::make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
+    }
+    else
+    {
        openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
    }
 }
@ -723,7 +722,7 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v,
                 oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters)
 {
    Context  *clCxt = I.clCxt;
-    bool isImageSupported = clCxt->impl->devName.find("Intel(R) HD Graphics") == std::string::npos;
+    bool isImageSupported = support_image2d();
    int elemCntPerRow = I.step / I.elemSize();

    std::string kernelName = "lkDense";
--- a/modules/ocl/src/surf.cpp
+++ b/modules/ocl/src/surf.cpp
@ -1,4 +1,4 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
+/*M/////////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@ -44,6 +44,7 @@
 //M*/
 #include <iomanip>
 #include "precomp.hpp"
+#include "mcwutil.hpp"
 //#include "opencv2/highgui/highgui.hpp"

 using namespace cv;
@ -70,7 +71,7 @@ static inline int calcSize(int octave, int layer)

    /* Wavelet size increment between layers. This should be an even number,
    such that the wavelet sizes in an octave are either all even or all odd.
-    This ensures that when looking for the neighbours of a sample, the layers
+    This ensures that when looking for the neighbors of a sample, the layers

    above and below are aligned correctly. */
    const int HAAR_SIZE_INC = 6;
@ -78,6 +79,11 @@ static inline int calcSize(int octave, int layer)
    return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
 }

+namespace
+{
+    // Extra build option passed to openCLExecuteKernel on the code paths taken
+    // when support_image2d() reports false. Both the characters and the
+    // pointer are const so the option cannot be reseated at runtime.
+    const char* const noImage2dOption = "-D DISABLE_IMAGE2D";
+}
+
 class SURF_OCL_Invoker
 {
 public:
@ -87,7 +93,7 @@ public:
    //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
    //void loadOctaveConstants(int octave, int layer_rows, int layer_cols);

-    // kernel callers declearations
+    // kernel callers declarations
    void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);

    void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
@ -99,14 +105,14 @@ public:
    void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);

    void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
-    // end of kernel callers declearations
+    // end of kernel callers declarations


    SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
        surf_(surf),
        img_cols(img.cols), img_rows(img.rows),
-        use_mask(!mask.empty()),
-        imgTex(NULL), sumTex(NULL), maskSumTex(NULL)
+        use_mask(!mask.empty()), counters(oclMat()),
+        imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
    {
        CV_Assert(!img.empty() && img.type() == CV_8UC1);
        CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@ -130,12 +136,13 @@ public:
        counters.create(1, surf_.nOctaves + 1, CV_32SC1);
        counters.setTo(Scalar::all(0));

-        //loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
-
+        integral(img, surf_.sum);
+        if(support_image2d())
+        {
        bindImgTex(img, imgTex);
-        integral(img, surf_.sum); // the two argumented integral version is incorrect
-
            bindImgTex(surf_.sum, sumTex);
+        }
+
        maskSumTex = 0;

        if (use_mask)
@ -154,7 +161,7 @@ public:
    void detectKeypoints(oclMat &keypoints)
    {
        // create image pyramid buffers
-        // different layers have same sized buffers, but they are sampled from gaussin kernel.
+        // different layers have same sized buffers, but they are sampled from Gaussian kernel.
        ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
        ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);

@ -221,7 +228,6 @@ public:
            openCLFree(sumTex);
        if(maskSumTex)
            openCLFree(maskSumTex);
-        additioalParamBuffer.release();
    }

 private:
@ -241,7 +247,7 @@ private:
    cl_mem sumTex;
    cl_mem maskSumTex;

-    oclMat additioalParamBuffer;
+    const oclMat _img; // make a copy for non-image2d_t supported platform

    SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
    {
@ -361,11 +367,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
 {
    if (!img.empty())
    {
-        if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != std::string::npos)
-        {
-            std::cout << " Intel HD GPU device unsupported " << std::endl;
-            return;
-        }
        SURF_OCL_Invoker surf(*this, img, mask);

        surf.detectKeypoints(keypoints);
@ -377,11 +378,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
 {
    if (!img.empty())
    {
-        if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != std::string::npos)
-        {
-            std::cout << " Intel HD GPU device unsupported " << std::endl;
-            return;
-        }
        SURF_OCL_Invoker surf(*this, img, mask);

        if (!useProvidedKeypoints)
@ -442,74 +438,11 @@ void cv::ocl::SURF_OCL::releaseMemory()
 // bind source buffer to image oject.
 void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
 {
-    cl_image_format format;
-    int err;
-    int depth    = img.depth();
-    int channels = img.channels();
-
-    switch(depth)
-    {
-    case CV_8U:
-        format.image_channel_data_type = CL_UNSIGNED_INT8;
-        break;
-    case CV_32S:
-        format.image_channel_data_type = CL_UNSIGNED_INT32;
-        break;
-    case CV_32F:
-        format.image_channel_data_type = CL_FLOAT;
-        break;
-    default:
-        throw std::exception();
-        break;
-    }
-    switch(channels)
-    {
-    case 1:
-        format.image_channel_order     = CL_R;
-        break;
-    case 3:
-        format.image_channel_order     = CL_RGB;
-        break;
-    case 4:
-        format.image_channel_order     = CL_RGBA;
-        break;
-    default:
-        throw std::exception();
-        break;
-    }
    if(texture)
    {
        openCLFree(texture);
    }
-
-#ifdef CL_VERSION_1_2
-    cl_image_desc desc;
-    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
-    desc.image_width      = img.step / img.elemSize();
-    desc.image_height     = img.rows;
-    desc.image_depth      = 0;
-    desc.image_array_size = 1;
-    desc.image_row_pitch  = 0;
-    desc.image_slice_pitch = 0;
-    desc.buffer           = NULL;
-    desc.num_mip_levels   = 0;
-    desc.num_samples      = 0;
-    texture = clCreateImage(Context::getContext()->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
-#else
-    texture = clCreateImage2D(
-                  Context::getContext()->impl->clContext,
-                  CL_MEM_READ_WRITE,
-                  &format,
-                  img.step / img.elemSize(),
-                  img.rows,
-                  0,
-                  NULL,
-                  &err);
-#endif
-    size_t origin[] = { 0, 0, 0 };
-    size_t region[] = { img.step / img.elemSize(), img.rows, 1 };
-    clEnqueueCopyBufferToImage(img.clCxt->impl->clCmdQueue, (cl_mem)img.data, texture, 0, origin, region, 0, NULL, 0);
-    openCLSafeCall(err);
+    texture = bindTexture(img);
 }

 ////////////////////////////
@ -524,7 +457,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
    std::string kernelName = "icvCalcLayerDetAndTrace";
    std::vector< std::pair<size_t, const void *> > args;

+    if(sumTex)
+    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
+    }
+    else
+    {
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
+    }
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
@ -534,6 +474,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&c_layer_rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));

    size_t localThreads[3]  = {16, 16, 1};
    size_t globalThreads[3] =
@ -542,8 +483,15 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
        divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
        1
    };
+    if(support_image2d())
+    {
    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
+    else
+    {
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+    }
+}

 void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
        int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
@ -571,18 +519,32 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
    args.push_back( std::make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));

    if(use_mask)
+    {
+        // Bind the mask integral image: the texture object when one was created,
+        // otherwise the plain buffer behind surf_.maskSum.
+        if(maskSumTex)
        {
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex));
        }
-
+        else
+        {
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
+        }
+        // The step is a scalar argument, not a memory object: pass it with
+        // sizeof(cl_int) like every other step argument in this file.
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.maskSum.step));
+    }
    size_t localThreads[3]  = {16, 16, 1};
    size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
                               divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
                               1
                              };

+    if(support_image2d())
+    {
    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
+    else
+    {
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+    }
+}

 void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter,
        oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
@ -606,8 +568,15 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
    size_t localThreads[3]  = {3, 3, 3};
    size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};

+    if(support_image2d())
+    {
    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
+    else
+    {
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+    }
+}

 void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
 {
@ -616,17 +585,32 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat

    std::vector< std::pair<size_t, const void *> > args;

+    if(sumTex)
+    {
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
+    }
+    else
+    {
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
+    }
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));

    size_t localThreads[3]  = {32, 4, 1};
    size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};

+    if(support_image2d())
+    {
    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
+    else
+    {
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+    }
+}

 void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
 {
@ -648,12 +632,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        globalThreads[1] = 16 * localThreads[1];

        args.clear();
+        if(imgTex)
+        {
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
+        }
+        else
+        {
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
+        }
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
+        if(support_image2d())
+        {
            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+        }
+        else
+        {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+        }

        kernelName = "normalize_descriptors64";

@ -666,9 +667,16 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.clear();
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        if(support_image2d())
+        {
        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
    }
    else
+    {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+        }
+    }
+    else
    {
        kernelName = "compute_descriptors128";

@ -679,12 +687,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        globalThreads[1] = 16 * localThreads[1];

        args.clear();
+        if(imgTex)
+        {
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
+        }
+        else
+        {
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
+        }
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
+        if(support_image2d())
+        {
            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+        }
+        else
+        {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+        }

        kernelName = "normalize_descriptors128";

@ -697,7 +722,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.clear();
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        if(support_image2d())
+        {
            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
        }
+        else
+        {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+        }
+    }
 }

--- a/modules/ocl/test/test_moments.cpp
+++ b/modules/ocl/test/test_moments.cpp
@ -0,0 +1,72 @@
+#include "precomp.hpp"
+#include <iomanip>
+#include "opencv2/imgproc/imgproc_c.h"
+
+#ifdef HAVE_OPENCL
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+extern string workdir;
+// Parameterised fixture for the moments tests. Parameter 0 is the matrix
+// element type, parameter 1 enables the additional contour-based checks.
+PARAM_TEST_CASE(MomentsTestBase, MatType, bool)
+{
+    int type;
+    cv::Mat mat1;
+    bool test_contours;
+
+    // Reads both test parameters and builds mat1 through randomMat().
+    virtual void SetUp()
+    {
+        type          = GET_PARAM(0);
+        test_contours = GET_PARAM(1);
+
+        cv::RNG &generator = TS::ptr()->get_rng();
+        const cv::Size matSize(10 * MWIDTH, 10 * MHEIGHT);
+        mat1 = randomMat(generator, matSize, type, 5, 16, false);
+    }
+
+    // Derives the Hu moments of both results and expects them to match
+    // within a tolerance of 0.5.
+    void Compare(Moments& cpu, Moments& gpu)
+    {
+        Mat huFromGpu, huFromCpu;
+        HuMoments(cpu, huFromCpu);
+        HuMoments(gpu, huFromGpu);
+        EXPECT_MAT_NEAR(huFromGpu, huFromCpu, .5, "");
+    }
+
+};
+struct ocl_Moments : MomentsTestBase {};
+
+// Checks that cv::ocl::ocl_moments agrees with cv::moments by comparing the
+// Hu moments derived from both results (see Compare in the fixture).
+TEST_P(ocl_Moments, Mat)
+{
+    const bool binaryImage = false;
+    // No explicit SetUp() call: the fixture's virtual SetUp() is already run
+    // by the test framework before this body executes.
+
+    for(int j = 0; j < LOOP_TIMES; j++)
+    {
+        if(test_contours)
+        {
+            const string imagePath = workdir + "../cpp/pic3.png";
+            Mat src = imread( imagePath, 1 );
+            // Report a missing sample image as a test failure instead of
+            // letting cvtColor throw on an empty matrix.
+            ASSERT_FALSE(src.empty()) << "Cannot read test image: " << imagePath;
+            Mat src_gray, canny_output;
+            cvtColor( src, src_gray, CV_BGR2GRAY );
+            vector<vector<Point> > contours;
+            vector<Vec4i> hierarchy;
+            Canny( src_gray, canny_output, 100, 200, 3 );
+            findContours( canny_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) );
+            // Compare the moments of every detected contour.
+            for( size_t i = 0; i < contours.size(); i++ )
+            {
+                Moments m = moments( contours[i], false );
+                Moments dm = ocl::ocl_moments( contours[i], false );
+                Compare(m, dm);
+            }
+        }
+        // Compare the moments of the random matrix prepared by SetUp().
+        cv::_InputArray _array(mat1);
+        cv::Moments CvMom = cv::moments(_array, binaryImage);
+        cv::Moments oclMom = cv::ocl::ocl_moments(_array, binaryImage);
+
+        Compare(CvMom, oclMom);
+
+    }
+}
+INSTANTIATE_TEST_CASE_P(Moments, ocl_Moments, Combine(
+                            Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_64FC1), Values(true,false)));
+#endif // HAVE_OPENCL
--- a/modules/ts/misc/summary.py
+++ b/modules/ts/misc/summary.py
@ -37,10 +37,12 @@ if __name__ == "__main__":
    parser.add_option("", "--module", dest="module", default=None, metavar="NAME", help="module prefix for test names")
    parser.add_option("", "--columns", dest="columns", default=None, metavar="NAMES", help="comma-separated list of column aliases")
    parser.add_option("", "--no-relatives", action="store_false", dest="calc_relatives", default=True, help="do not output relative values")
-    parser.add_option("", "--with-cycles-reduction", action="store_true", dest="calc_cr", default=False, help="alos output cycle reduction percentages")
+    parser.add_option("", "--with-cycles-reduction", action="store_true", dest="calc_cr", default=False, help="output cycle reduction percentages")
+    parser.add_option("", "--with-score", action="store_true", dest="calc_score", default=False, help="output automatic classification of speedups")
    parser.add_option("", "--show-all", action="store_true", dest="showall", default=False, help="also include empty and \"notrun\" lines")
    parser.add_option("", "--match", dest="match", default=None)
    parser.add_option("", "--match-replace", dest="match_replace", default="")
+    # Rows are kept only when one of their trailing cells is below the given X-FACTOR threshold.
+    parser.add_option("", "--regressions-only", dest="regressionsOnly", default=None, metavar="X-FACTOR", help="show only tests with performance regressions (value below X-FACTOR)")
    (options, args) = parser.parse_args()

    options.generateHtml = detectHtmlOutputType(options.format)
@ -106,6 +108,7 @@ if __name__ == "__main__":

    # build table
    getter = metrix_table[options.metric][1]
+    getter_score = metrix_table["score"][1]
    if options.calc_relatives:
        getter_p = metrix_table[options.metric + "%"][1]
    if options.calc_cr:
@ -129,6 +132,11 @@ if __name__ == "__main__":
        for set in metric_sets:
            tbl.newColumn(str(i) + "%", getSetName(set, i, options.columns) + "\nvs\n" + getSetName(test_sets[0], 0, options.columns) + "\n(x-factor)", align = "center", cssclass = "col_rel")
            i += 1
+    if options.calc_score:
+        i = 1
+        for set in metric_sets:
+            tbl.newColumn(str(i) + "S", getSetName(set, i, options.columns) + "\nvs\n" + getSetName(test_sets[0], 0, options.columns) + "\n(score)", align = "center", cssclass = "col_name")
+            i += 1

    # rows
    prevGroupName = None
@ -157,6 +165,8 @@ if __name__ == "__main__":
                    tbl.newCell(str(i) + "%", "-")
                if options.calc_cr and i > 0:
                    tbl.newCell(str(i) + "$", "-")
+                if options.calc_score and i > 0:
+                    # Score columns are registered under str(i) + "S"; "$" is the cycle-reduction column.
+                    tbl.newCell(str(i) + "S", "-")
            else:
                status = case.get("status")
                if status != "run":
@ -167,6 +177,8 @@ if __name__ == "__main__":
                        tbl.newCell(str(i) + "%", "-", color = "red")
                    if options.calc_cr and i > 0:
                        tbl.newCell(str(i) + "$", "-", color = "red")
+                    if options.calc_score and i > 0:
+                        tbl.newCell(str(i) + "S", "-", color = "red")
                else:
                    val = getter(case, cases[0], options.units)
                    if options.calc_relatives and i > 0 and val:
@ -177,6 +189,10 @@ if __name__ == "__main__":
                        valcr = getter_cr(case, cases[0], options.units)
                    else:
                        valcr = None
+                    if options.calc_score and i > 0 and val:
+                        val_score = getter_score(case, cases[0], options.units)
+                    else:
+                        val_score = None
                    if not valp or i == 0:
                        color = None
                    elif valp > 1.05:
@ -192,9 +208,23 @@ if __name__ == "__main__":
                        tbl.newCell(str(i) + "%", formatValue(valp, "%"), valp, color = color, bold = color)
                    if options.calc_cr and i > 0:
                        tbl.newCell(str(i) + "$", formatValue(valcr, "$"), valcr, color = color, bold = color)
+                    if options.calc_score and i > 0:
+                        tbl.newCell(str(i) + "S", formatValue(val_score, "S"), val_score, color = color, bold = color)
    if not needNewRow:
        tbl.trimLastRow()

+    if options.regressionsOnly:
+        # Parse the threshold once, so an invalid value fails before any row is inspected.
+        threshold = float(options.regressionsOnly)
+        # Walk the rows backwards so that popping one does not shift the rows still to visit.
+        for r in reversed(range(len(tbl.rows))):
+            cells = tbl.rows[r].cells
+            # A row is kept when any of its trailing cells (one per metric set)
+            # holds a value below the threshold.
+            keep = False
+            for offset, _unused in enumerate(metric_sets, 1):
+                val = cells[len(cells) - offset].value
+                if val is not None and val < threshold:
+                    keep = True
+            if not keep:
+                tbl.rows.pop(r)
+
+
    # output table
    if options.generateHtml:
        if options.format == "moinwiki":
@ -205,3 +235,6 @@ if __name__ == "__main__":
            htmlPrintFooter(sys.stdout)
    else:
        tbl.consolePrintTable(sys.stdout)
+
+    if options.regressionsOnly:
+        # The exit status reports how many rows remain. It is clamped to 255 because
+        # POSIX keeps only the low 8 bits, so 256 rows would otherwise read as success.
+        sys.exit(min(len(tbl.rows), 255))
--- a/modules/ts/misc/table_formatter.py
+++ b/modules/ts/misc/table_formatter.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python

-import sys, re, os.path, cgi, stat
+import sys, re, os.path, cgi, stat, math
 from optparse import OptionParser
 from color import getColorizer

@ -627,6 +627,21 @@ def getCycleReduction(test, test0, metric):
        return None
    return (1.0-float(val)/val0)*100

+def getScore(test, test0, metric):
+    """Return the difference of the log-means of two tests in units of their combined deviation.
+
+    test, test0 -- objects exposing get(name, units); "gmean" and "gstddev" are read from both.
+    metric      -- unused, kept for signature compatibility with the other getters.
+
+    Returns (ln(gmean(test)) - ln(gmean(test0))) / sqrt(gstddev(test)^2 + gstddev(test0)^2),
+    or None when either test is missing, a required value is absent, a mean is not
+    positive, or the combined deviation is zero.
+    """
+    if not test or not test0:
+        return None
+    m0 = test.get("gmean", None)
+    m1 = test0.get("gmean", None)
+    s0 = test.get("gstddev", None)
+    s1 = test0.get("gstddev", None)
+    # A missing metric cannot be scored; float(None) would raise TypeError.
+    if m0 is None or m1 is None or s0 is None or s1 is None:
+        return None
+    m0 = float(m0)
+    m1 = float(m1)
+    # The logarithm is only defined for positive means.
+    if m0 <= 0 or m1 <= 0:
+        return None
+    s0 = float(s0)
+    s1 = float(s1)
+    s = math.sqrt(s0*s0 + s1*s1)
+    if s == 0:
+        return None
+    return (math.log(m0) - math.log(m1))/s

 metrix_table = \
 {
@ -655,6 +670,8 @@ metrix_table = \
    "median$": ("Median (cycle reduction)", lambda test,test0,units: getCycleReduction(test, test0, "median")),
    "stddev$": ("Standard deviation (cycle reduction)", lambda test,test0,units: getCycleReduction(test, test0, "stddev")),
    "gstddev$": ("Standard deviation of Ln(time) (cycle reduction)", lambda test,test0,units: getCycleReduction(test, test0, "gstddev")),
+
+    "score": ("SCORE", lambda test,test0,units: getScore(test, test0, "gstddev")),
 }

 def formatValue(val, metric, units = None):
@ -664,6 +681,18 @@ def formatValue(val, metric, units = None):
        return "%.2f" % val
    if metric.endswith("$"):
        return "%.2f%%" % val
+    if metric.endswith("S"):
+        if val > 3.5:
+            return "SLOWER"
+        if val < -3.5:
+            return "FASTER"
+        if val > -1.5 and val < 1.5:
+            return " "
+        if val < 0:
+            return "faster"
+        if val > 0:
+            return "slower"
+        #return "%.4f" % val
    return "%.3f %s" % (val, units)

 if __name__ == "__main__":
--- a/samples/android/image-manipulations/src/org/opencv/samples/imagemanipulations/ImageManipulationsActivity.java
+++ b/samples/android/image-manipulations/src/org/opencv/samples/imagemanipulations/ImageManipulationsActivity.java
@ -68,7 +68,6 @@ public class ImageManipulationsActivity extends Activity implements CvCameraView
    private float                mBuff[];
    private Mat                  mRgbaInnerWindow;
    private Mat                  mGrayInnerWindow;
-    private Mat                  mBlurWindow;
    private Mat                  mZoomWindow;
    private Mat                  mZoomCorner;
    private Mat                  mSepiaKernel;
@ -220,9 +219,6 @@ public class ImageManipulationsActivity extends Activity implements CvCameraView
        if (mGrayInnerWindow == null && !mGray.empty())
            mGrayInnerWindow = mGray.submat(top, top + height, left, left + width);

-        if (mBlurWindow == null)
-            mBlurWindow = mRgba.submat(0, rows, cols / 3, cols * 2 / 3);
-
        if (mZoomCorner == null)
            mZoomCorner = mRgba.submat(0, rows / 2 - rows / 10, 0, cols / 2 - cols / 10);

@ -236,8 +232,6 @@ public class ImageManipulationsActivity extends Activity implements CvCameraView
            mZoomWindow.release();
        if (mZoomCorner != null)
            mZoomCorner.release();
-        if (mBlurWindow != null)
-            mBlurWindow.release();
        if (mGrayInnerWindow != null)
            mGrayInnerWindow.release();
        if (mRgbaInnerWindow != null)
@ -254,7 +248,6 @@ public class ImageManipulationsActivity extends Activity implements CvCameraView
        mIntermediateMat = null;
        mRgbaInnerWindow = null;
        mGrayInnerWindow = null;
-        mBlurWindow = null;
        mZoomCorner = null;
        mZoomWindow = null;
    }
@ -327,7 +320,9 @@ public class ImageManipulationsActivity extends Activity implements CvCameraView
            break;

        case ImageManipulationsActivity.VIEW_MODE_SEPIA:
-            Core.transform(mRgba, mRgba, mSepiaKernel);
+            if ((mRgbaInnerWindow == null) || (mRgba.cols() != mSizeRgba.width) || (mRgba.height() != mSizeRgba.height))
+                CreateAuxiliaryMats();
+            Core.transform(mRgbaInnerWindow, mRgbaInnerWindow, mSepiaKernel);
            break;

        case ImageManipulationsActivity.VIEW_MODE_ZOOM:
--- a/samples/ocl/performance.cpp
+++ b/samples/ocl/performance.cpp