diff --git a/3rdparty/tbb/.gitignore b/3rdparty/tbb/.gitignore
new file mode 100644
index 0000000000..601e1b265e
--- /dev/null
+++ b/3rdparty/tbb/.gitignore
@@ -0,0 +1 @@
+tbb*.tgz
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62709b8050..e048156a64 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,6 +140,9 @@ OCV_OPTION(WITH_XIMEA          "Include XIMEA cameras support"               OFF
 OCV_OPTION(WITH_XINE           "Include Xine support (GPL)"                  OFF  IF (UNIX AND NOT APPLE AND NOT ANDROID) )
 OCV_OPTION(WITH_CLP            "Include Clp support (EPL)"                   OFF)
 OCV_OPTION(WITH_OPENCL         "Include OpenCL Runtime support"              OFF  IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_OPENCLAMDFFT   "Include AMD OpenCL FFT library support"      OFF  IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_OPENCLAMDBLAS  "Include AMD OpenCL BLAS library support"     OFF  IF (NOT ANDROID AND NOT IOS) )
+
 
 # OpenCV build components
 # ===================================================
@@ -396,6 +399,12 @@ if(WITH_OPENCL)
   if(OPENCL_FOUND)
     set(HAVE_OPENCL 1)
   endif()
+  if(WITH_OPENCLAMDFFT)
+    set(HAVE_CLAMDFFT 1)
+  endif()
+  if(WITH_OPENCLAMDBLAS)
+    set(HAVE_CLAMDBLAS 1)
+  endif()
 endif()
 
 # ----------------------------------------------------------------------------
diff --git a/android/android.toolchain.cmake b/android/android.toolchain.cmake
index 324074c8f2..3c89806ac0 100644
--- a/android/android.toolchain.cmake
+++ b/android/android.toolchain.cmake
@@ -4,7 +4,7 @@
 #  See home page: http://code.google.com/p/android-cmake/
 #
 #  The file is mantained by the OpenCV project. And also can be found at
-#  http://code.opencv.org/svn/opencv/trunk/opencv/android/android.toolchain.cmake
+#  http://code.opencv.org/projects/opencv/repository/revisions/master/changes/android/android.toolchain.cmake
 #
 #  Usage Linux:
 #   $ export ANDROID_NDK=/absolute/path/to/the/android-ndk
@@ -182,6 +182,7 @@
 #     [+] added mips architecture support
 #   - modified August 2012
 #     [+] updated for NDK r8b
+#     [~] all intermediate files generated by toolchain are moved into CMakeFiles
 # ------------------------------------------------------------------------------
 
 cmake_minimum_required( VERSION 2.6.3 )
@@ -854,45 +855,48 @@ elseif( X86 )
 endif()
 
 #linker flags
-list( APPEND ANDROID_SYSTEM_LIB_DIRS "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}" "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
+if( NOT DEFINED __ndklibspath )
+ set( __ndklibspath "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/ndklibs/${ANDROID_NDK_ABI_NAME}" )
+endif()
+list( APPEND ANDROID_SYSTEM_LIB_DIRS "${__ndklibspath}" "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
 set( ANDROID_LINKER_FLAGS "" )
 #STL
 if( ANDROID_USE_STLPORT )
  if( EXISTS "${__stlLibPath}/libstlport_static.a" )
-  __COPY_IF_DIFFERENT( "${__stlLibPath}/libstlport_static.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstlport_static.a" )
+  __COPY_IF_DIFFERENT( "${__stlLibPath}/libstlport_static.a" "${__ndklibspath}/libstlport_static.a" )
  endif()
- if( EXISTS "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstlport_static.a" )
+ if( EXISTS "${__ndklibspath}/libstlport_static.a" )
   set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--start-group -lstlport_static" )
  endif()
 else( ANDROID_USE_STLPORT )
  if( EXISTS "${__stlLibPath}/libgnustl_static.a" )
-  __COPY_IF_DIFFERENT( "${__stlLibPath}/libgnustl_static.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" )
+  __COPY_IF_DIFFERENT( "${__stlLibPath}/libgnustl_static.a" "${__ndklibspath}/libstdc++.a" )
  elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/thumb/libstdc++.a" )
-  __COPY_IF_DIFFERENT( "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/thumb/libstdc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" )
+  __COPY_IF_DIFFERENT( "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/thumb/libstdc++.a" "${__ndklibspath}/libstdc++.a" )
  elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/libstdc++.a" )
-  __COPY_IF_DIFFERENT( "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/libstdc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" )
+  __COPY_IF_DIFFERENT( "${__stlLibPath}/${CMAKE_SYSTEM_PROCESSOR}/libstdc++.a" "${__ndklibspath}/libstdc++.a" )
  elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${__stlLibPath}/thumb/libstdc++.a" )
-  __COPY_IF_DIFFERENT( "${__stlLibPath}/thumb/libstdc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" )
+  __COPY_IF_DIFFERENT( "${__stlLibPath}/thumb/libstdc++.a" "${__ndklibspath}/libstdc++.a" )
  elseif( EXISTS "${__stlLibPath}/libstdc++.a" )
-  __COPY_IF_DIFFERENT( "${__stlLibPath}/libstdc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" )
+  __COPY_IF_DIFFERENT( "${__stlLibPath}/libstdc++.a" "${__ndklibspath}/libstdc++.a" )
  endif()
- if( EXISTS "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" )
+ if( EXISTS "${__ndklibspath}/libstdc++.a" )
   set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lstdc++" )
  endif()
 
  #gcc exception & rtti support
  if( EXISTS "${__stlLibPath}/libsupc++.a" )
-  __COPY_IF_DIFFERENT( "${__stlLibPath}/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
+  __COPY_IF_DIFFERENT( "${__stlLibPath}/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
  elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" )
-  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
+  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
  elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" )
-  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
+  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
  elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libsupc++.a" )
-  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
+  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
  elseif( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" )
-  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
+  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
  endif()
- if( EXISTS "${CMAKE_BINARY_DIR}/systemlibs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
+ if( EXISTS "${__ndklibspath}/libsupc++.a" )
   set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lsupc++" )
  endif()
 endif( ANDROID_USE_STLPORT )
@@ -1038,13 +1042,14 @@ endmacro()
 # export toolchain settings for the try_compile() command
 if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
  set( __toolchain_config "")
- foreach( __var ANDROID_ABI ANDROID_FORCE_ARM_BUILD ANDROID_NATIVE_API_LEVEL ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_SET_OBSOLETE_VARIABLES LIBRARY_OUTPUT_PATH_ROOT ANDROID_USE_STLPORT ANDROID_FORBID_SYGWIN ANDROID_NDK ANDROID_STANDALONE_TOOLCHAIN ANDROID_FUNCTION_LEVEL_LINKING )
+ foreach( __var ANDROID_ABI ANDROID_FORCE_ARM_BUILD ANDROID_NATIVE_API_LEVEL ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_SET_OBSOLETE_VARIABLES LIBRARY_OUTPUT_PATH_ROOT ANDROID_USE_STLPORT ANDROID_FORBID_SYGWIN ANDROID_NDK ANDROID_STANDALONE_TOOLCHAIN ANDROID_FUNCTION_LEVEL_LINKING __ndklibspath )
   if( DEFINED ${__var} )
    set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" )\n" )
   endif()
  endforeach()
- file( WRITE "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/android.toolchain.config.cmake" "${__toolchain_config}" )
+ file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/android.toolchain.config.cmake" "${__toolchain_config}" )
  unset( __toolchain_config )
+ unset( __ndklibspath )
 endif()
 
 
@@ -1073,6 +1078,7 @@ endif()
 # Can be set only at the first run:
 #   ANDROID_NDK
 #   ANDROID_STANDALONE_TOOLCHAIN
+#   ANDROID_TOOLCHAIN_NAME : "arm-linux-androideabi-4.4.3" or "arm-linux-androideabi-4.6" or "mipsel-linux-android-4.4.3" or "mipsel-linux-android-4.6" or "x86-4.4.3" or "x86-4.6"
 # Obsolete:
 #   ANDROID_API_LEVEL : superseded by ANDROID_NATIVE_API_LEVEL
 #   ARM_TARGET : superseded by ANDROID_ABI
@@ -1105,7 +1111,6 @@ endif()
 #   ANDROID_COMPILER_VERSION : GCC version used
 #   ANDROID_CXX_FLAGS : C/C++ compiler flags required by Android platform
 #   ANDROID_SUPPORTED_ABIS : list of currently allowed values for ANDROID_ABI
-#   ANDROID_TOOLCHAIN_NAME : "standalone", "arm-linux-androideabi-4.4.3" or "x86-4.4.3" or something similar.
 #   ANDROID_TOOLCHAIN_MACHINE_NAME : "arm-linux-androideabi", "arm-eabi" or "i686-android-linux"
 #   ANDROID_TOOLCHAIN_ROOT : path to the top level of toolchain (standalone or placed inside NDK)
 #   ANDROID_SUPPORTED_NATIVE_API_LEVELS : list of native API levels found inside NDK
diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake
index 903b55b652..96473a5003 100644
--- a/cmake/OpenCVDetectOpenCL.cmake
+++ b/cmake/OpenCVDetectOpenCL.cmake
@@ -2,8 +2,19 @@ if(APPLE)
     set(OPENCL_FOUND YES)
     set(OPENCL_LIBRARIES "-framework OpenCL")
 else()
-    find_package(OpenCL QUIET)
-
+    #find_package(OpenCL QUIET)
+	if(WITH_OPENCLAMDFFT)
+            find_path(CLAMDFFT_INCLUDE_DIR
+                NAMES clAmdFft.h)
+            find_library(CLAMDFFT_LIBRARIES
+                NAMES clAmdFft.Runtime)
+	endif()
+	if(WITH_OPENCLAMDBLAS)
+            find_path(CLAMDBLAS_INCLUDE_DIR
+                NAMES clAmdBlas.h)
+            find_library(CLAMDBLAS_LIBRARIES
+                NAMES clAmdBlas)
+	endif()
     # Try AMD/ATI Stream SDK
     if (NOT OPENCL_FOUND)
         set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT})
diff --git a/cmake/templates/cvconfig.h.cmake b/cmake/templates/cvconfig.h.cmake
index 1012008059..fb779c887d 100644
--- a/cmake/templates/cvconfig.h.cmake
+++ b/cmake/templates/cvconfig.h.cmake
@@ -175,6 +175,12 @@
 /* OpenCL Support */
 #cmakedefine HAVE_OPENCL
 
+/* AMD's OpenCL Fast Fourier Transform Library*/ 
+#cmakedefine HAVE_CLAMDFFT
+
+/* AMD's Basic Linear Algebra Subprograms Library*/
+#cmakedefine HAVE_CLAMDBLAS
+
 /* NVidia Cuda Fast Fourier Transform (FFT) API*/
 #cmakedefine HAVE_CUFFT
 
diff --git a/modules/contrib/src/stereovar.cpp b/modules/contrib/src/stereovar.cpp
index 88640d86b2..1b542bbf52 100755
--- a/modules/contrib/src/stereovar.cpp
+++ b/modules/contrib/src/stereovar.cpp
@@ -67,11 +67,12 @@ StereoVar::~StereoVar()
 
 static Mat diffX(Mat &src)
 {
-    register int x, y, cols = src.cols - 1;
+    int cols = src.cols - 1;
     Mat dst(src.size(), src.type());
-    for(y = 0; y < src.rows; y++){
+    for(int y = 0; y < src.rows; y++){
         const float* pSrc = src.ptr<float>(y);
         float* pDst = dst.ptr<float>(y);
+        int x = 0;
 #if CV_SSE2
         for (x = 0; x <= cols - 8; x += 8) {
             __m128 a0 = _mm_loadu_ps(pSrc + x);
diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp
index f496de3d51..fe496762ed 100644
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -9,16 +9,16 @@ Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 
-	*Redistributions of source code must retain the above copyright
-	 notice, this list of conditions and the following disclaimer.
+    *Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
 
-	*Redistributions in binary form must reproduce the above copyright
-	 notice, this list of conditions and the following disclaimer in the
-	 documentation and/or other materials provided with the distribution.
+    *Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
 
-	*Neither the name of the University of Cambridge nor the names of
-	 its contributors may be used to endorse or promote products derived
-	 from this software without specific prior written permission.
+    *Neither the name of the University of Cambridge nor the names of
+     its contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -350,7 +350,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
     }
 
     int b0 = -a0;
-    for( k = 0; k < 12; k += 2 )
+    for( k = 0; k < 8; k += 2 )
     {
         int b = std::max((int)d[k+1], (int)d[k+2]);
         b = std::max(b, (int)d[k+3]);
@@ -375,7 +375,10 @@ template<int patternSize>
 void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
 {
     Mat img = _img.getMat();
-    const int K = patternSize/2, N = patternSize + K + 1, quarterPatternSize = patternSize/4;
+    const int K = patternSize/2, N = patternSize + K + 1;
+#if CV_SSE2
+    const int quarterPatternSize = patternSize/4;
+#endif
     int i, j, k, pixel[25];
     makeOffsets(pixel, (int)img.step, patternSize);
     for(k = patternSize; k < 25; k++)
@@ -585,7 +588,7 @@ FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppressio
 FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppression, int _type )
 : threshold(_threshold), nonmaxSuppression(_nonmaxSuppression), type(_type)
 {}
-    
+
 void FastFeatureDetector::detectImpl( const Mat& image, vector<KeyPoint>& keypoints, const Mat& mask ) const
 {
     Mat grayImage = image;
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index 2165673c84..91a004a866 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -1298,17 +1298,17 @@ public:
         maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
     {
     }
-    
+
     virtual void operator() (const Range& range) const
     {
         int i, j, cn = dest->channels(), k;
         Size size = dest->size();
-        
+
         for( i = range.start; i < range.end; i++ )
         {
             const uchar* sptr = temp->ptr(i+radius) + radius*cn;
             uchar* dptr = dest->ptr(i);
-            
+
             if( cn == 1 )
             {
                 for( j = 0; j < size.width; j++ )
@@ -1351,10 +1351,10 @@ public:
             }
         }
     }
-    
+
 private:
-    const Mat *temp;
     Mat *dest;
+    const Mat *temp;
     int radius, maxk, *space_ofs;
     float *space_weight, *color_weight;
 };
@@ -1367,40 +1367,40 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
     int cn = src.channels();
     int i, j, maxk, radius;
     Size size = src.size();
-    
+
     CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) &&
               src.type() == dst.type() && src.size() == dst.size() &&
               src.data != dst.data );
-    
+
     if( sigma_color <= 0 )
         sigma_color = 1;
     if( sigma_space <= 0 )
         sigma_space = 1;
-    
+
     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
-    
+
     if( d <= 0 )
         radius = cvRound(sigma_space*1.5);
     else
         radius = d/2;
     radius = MAX(radius, 1);
     d = radius*2 + 1;
-    
+
     Mat temp;
     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-    
+
     vector<float> _color_weight(cn*256);
     vector<float> _space_weight(d*d);
     vector<int> _space_ofs(d*d);
     float* color_weight = &_color_weight[0];
     float* space_weight = &_space_weight[0];
     int* space_ofs = &_space_ofs[0];
-    
+
     // initialize color-related bilateral filter coefficients
     for( i = 0; i < 256*cn; i++ )
         color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
-    
+
     // initialize space-related bilateral filter coefficients
     for( i = -radius, maxk = 0; i <= radius; i++ )
         for( j = -radius; j <= radius; j++ )
@@ -1411,7 +1411,7 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
             space_ofs[maxk++] = (int)(i*temp.step + j*cn);
         }
-    
+
     BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
     parallel_for_(Range(0, size.height), body);
 }
diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt
index b331e9f1e6..24595770df 100644
--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@@ -29,6 +29,14 @@ if (HAVE_OPENCL)
   if(OPENCL_INCLUDE_DIR)
       ocv_include_directories(${OPENCL_INCLUDE_DIR})
   endif()
+  if (HAVE_CLAMDFFT)
+	set(ocl_link_libs ${ocl_link_libs} ${CLAMDFFT_LIBRARIES})
+	ocv_include_directories(${CLAMDFFT_INCLUDE_DIR})
+  endif()
+  if (HAVE_CLAMDBLAS)
+	set(ocl_link_libs ${ocl_link_libs} ${CLAMDBLAS_LIBRARIES})
+	ocv_include_directories(${CLAMDBLAS_INCLUDE_DIR})
+  endif()
 endif()
 
 ocv_set_module_sources(
diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index 0efc72283a..3bc6729e48 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -858,7 +858,72 @@ namespace cv
         void benchmark_copy_vectorize(const oclMat &src, oclMat &dst);
         void benchmark_copy_offset_stride(const oclMat &src, oclMat &dst);
         void benchmark_ILP();
-        
+
+		//! computes vertical sum, supports only CV_32FC1 images
+		CV_EXPORTS void columnSum(const oclMat& src, oclMat& sum);
+
+		//! performs linear blending of two images
+		//! to avoid accuracy errors sum of weigths shouldn't be very close to zero
+		// supports only CV_8UC1 source type
+		CV_EXPORTS void blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2, oclMat& result);
+
+		/////////////////////////////// Pyramid /////////////////////////////////////
+		CV_EXPORTS void pyrDown(const oclMat& src, oclMat& dst);
+
+		//! upsamples the source image and then smoothes it
+		CV_EXPORTS void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst);
+
+		///////////////////////////////////////// match_template /////////////////////////////////////////////////////////////
+		struct CV_EXPORTS MatchTemplateBuf
+		{
+			Size user_block_size;
+			oclMat imagef, templf;
+			std::vector<oclMat> images;
+			std::vector<oclMat> image_sums;
+			std::vector<oclMat> image_sqsums;
+		};
+
+
+		//! computes the proximity map for the raster template and the image where the template is searched for
+		// Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
+		// Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
+		CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method);
+		
+		//! computes the proximity map for the raster template and the image where the template is searched for
+		// Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
+		// Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
+		CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf);
+
+#ifdef HAVE_CLAMDFFT
+            ///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
+            // the two functions must be called before/after run any fft library functions.
+            CV_EXPORTS void fft_setup();    // this will be implicitly invoked
+            CV_EXPORTS void fft_teardown(); // you need to teardown fft library manually
+
+		    /////////////////////////////////////// DFT /////////////////////////////////////////////////////
+		    //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
+		    //! Param dft_size is the size of DFT transform.
+		    //!
+		    //! For complex-to-real transform it is assumed that the source matrix is packed in CLFFT's format.
+		    // support src type of CV32FC1, CV32FC2
+		    // support flags: DFT_INVERSE, DFT_REAL_OUTPUT, DFT_COMPLEX_OUTPUT, DFT_ROWS
+		    // dft_size is the size of original input, which is used for transformation from complex to real.
+		    // dft_size must be powers of 2, 3 and 5
+		    // real to complex dft requires at least v1.8 clAmdFft
+		    // real to complex dft output is not the same with cpu version
+		    // real to complex and complex to real does not support DFT_ROWS
+		    CV_EXPORTS void dft(const oclMat& src, oclMat& dst, Size dft_size = Size(0, 0), int flags = 0);
+#endif // HAVE_CLAMDFFT
+
+#ifdef HAVE_CLAMDBLAS
+		//! implements generalized matrix product algorithm GEMM from BLAS
+		// The functionality requires clAmdBlas library
+		// only support type CV_32FC1
+		// flag GEMM_3_T is not supported
+		CV_EXPORTS void gemm(const oclMat& src1, const oclMat& src2, double alpha,
+		const oclMat& src3, double beta, oclMat& dst, int flags = 0);
+#endif
+
     }
 }
 #include "opencv2/ocl/matrix_operations.hpp"
diff --git a/modules/ocl/src/blend.cpp b/modules/ocl/src/blend.cpp
new file mode 100644
index 0000000000..a9df907d3c
--- /dev/null
+++ b/modules/ocl/src/blend.cpp
@@ -0,0 +1,98 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Nathan, liujun@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#if !defined (HAVE_OPENCL)
+void cv::ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2,
+                            oclMat& result){throw_nogpu();}
+#else
+namespace cv 
+{
+	namespace ocl 
+	{
+        ////////////////////////////////////OpenCL kernel strings//////////////////////////
+        extern const char *blend_linear;
+	}
+}
+
+void cv::ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2,
+                            oclMat& result)
+{
+	cv::ocl::Context *ctx = img1.clCxt;
+	assert(ctx == img2.clCxt && ctx == weights1.clCxt && ctx == weights2.clCxt);
+	int channels = img1.channels();
+	int depth = img1.depth();
+	int rows = img1.rows;
+	int cols = img1.cols;
+	int istep = img1.step;
+	int wstep = weights1.step;
+	size_t globalSize[] = {cols * channels, rows, 1};
+	size_t localSize[] = {16, 16, 1};
+
+	vector< pair<size_t, const void *> > args;
+
+	if(globalSize[0]!=0)
+	{
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data ));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&img1.data ));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&img2.data ));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&weights1.data ));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&weights2.data ));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&rows ));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&istep ));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&wstep ));
+		std::string kernelName = "BlendLinear";
+
+		openCLExecuteKernel(ctx, &blend_linear, kernelName, globalSize, localSize, args, channels, depth);
+	}
+}
+#endif
\ No newline at end of file
diff --git a/modules/ocl/src/columnsum.cpp b/modules/ocl/src/columnsum.cpp
new file mode 100644
index 0000000000..e789d38b09
--- /dev/null
+++ b/modules/ocl/src/columnsum.cpp
@@ -0,0 +1,91 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Chunpeng Zhang, chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <iomanip>
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+
+#if !defined(HAVE_OPENCL)
+
+void cv::ocl::columnSum(const oclMat& src,oclMat& dst){ throw_nogpu(); }
+
+#else /*!HAVE_OPENCL */
+
+namespace cv 
+{ 
+	namespace ocl
+	{
+		extern const char* imgproc_columnsum;
+	}
+}
+
+void cv::ocl::columnSum(const oclMat& src,oclMat& dst)
+{
+	CV_Assert(src.type() == CV_32FC1 && dst.type() == CV_32FC1 && src.size() == dst.size());
+
+	Context *clCxt = src.clCxt;                                        
+		       
+	const std::string kernelName = "columnSum";
+		
+	std::vector< pair<size_t, const void *> > args;
+
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));		
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));			
+	args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));		
+	args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));			
+	args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));		
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));		
+
+	size_t globalThreads[3] = {dst.cols, dst.rows, 1};					
+	size_t localThreads[3]  = {16, 16, 1};		
+
+	openCLExecuteKernel(clCxt, &imgproc_columnsum, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
+
+}
+#endif 
\ No newline at end of file
diff --git a/modules/ocl/src/fft.cpp b/modules/ocl/src/fft.cpp
new file mode 100644
index 0000000000..b3eda35c18
--- /dev/null
+++ b/modules/ocl/src/fft.cpp
@@ -0,0 +1,302 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include <iomanip>
+#include "precomp.hpp"
+
+#ifdef HAVE_CLAMDFFT
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#if !defined (HAVE_OPENCL)
+void cv::ocl::dft(const oclMat& src, oclMat& dst, int flags) { throw_nogpu(); }
+#else
+
+#include <clAmdFft.h>
+
+namespace cv{ namespace ocl {
+    enum FftType
+    {
+        C2R = 1, // complex to complex
+        R2C = 2, // real to opencl HERMITIAN_INTERLEAVED
+        C2C = 3  // opencl HERMITIAN_INTERLEAVED to real
+    };
+    struct FftPlan
+    {
+        friend void fft_setup();
+        friend void fft_teardown();
+        ~FftPlan();
+    protected:
+        FftPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type);
+        const Size dft_size;
+        const int src_step, dst_step;
+        const int flags;
+        const FftType type;
+        clAmdFftPlanHandle plHandle;
+        static vector<FftPlan*> planStore;
+        static bool started;
+        static clAmdFftSetupData * setupData;
+    public:
+        // return a baked plan-> 
+        // if there is one matched plan, return it
+        // if not, bake a new one, put it into the planStore and return it.
+        static clAmdFftPlanHandle getPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type);
+    };
+}}
+bool cv::ocl::FftPlan::started = false;
+vector<cv::ocl::FftPlan*> cv::ocl::FftPlan::planStore = vector<cv::ocl::FftPlan*>();
+clAmdFftSetupData * cv::ocl::FftPlan::setupData = 0;
+
+void cv::ocl::fft_setup()
+{
+    if(FftPlan::started)
+    {
+        return;
+    }
+    FftPlan::setupData = new clAmdFftSetupData;
+    openCLSafeCall(clAmdFftInitSetupData( FftPlan::setupData ));
+    FftPlan::started = true;
+}
+void cv::ocl::fft_teardown()
+{
+    if(!FftPlan::started)
+    {
+        return;
+    }
+    delete FftPlan::setupData;
+    for(int i = 0; i < FftPlan::planStore.size(); i ++)
+    {
+        delete FftPlan::planStore[i];
+    }
+    FftPlan::planStore.clear();
+    openCLSafeCall( clAmdFftTeardown( ) );
+    FftPlan::started = false;
+}
+
+// bake a new plan
+cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type)
+    : dft_size(_dft_size), src_step(_src_step), dst_step(_dst_step), flags(_flags), type(_type), plHandle(0)
+{
+    if(!FftPlan::started)
+    {
+        // implicitly do fft setup
+        fft_setup();
+    }
+
+    bool is_1d_input	= (_dft_size.height == 1);
+    int is_row_dft		= flags & DFT_ROWS;
+    int is_scaled_dft		= flags & DFT_SCALE;
+    int is_inverse			= flags & DFT_INVERSE;
+
+    clAmdFftResultLocation	place;
+    clAmdFftLayout			inLayout;
+    clAmdFftLayout			outLayout;
+    clAmdFftDim				dim = is_1d_input||is_row_dft ? CLFFT_1D : CLFFT_2D;
+
+    size_t batchSize		 = is_row_dft?dft_size.height : 1;
+    size_t clLengthsIn[ 3 ]  = {1, 1, 1};
+    size_t clStridesIn[ 3 ]  = {1, 1, 1};
+    size_t clLengthsOut[ 3 ] = {1, 1, 1};
+    size_t clStridesOut[ 3 ] = {1, 1, 1};
+    clLengthsIn[0]			 = dft_size.width;
+    clLengthsIn[1]			 = is_row_dft ? 1 : dft_size.height;
+    clStridesIn[0]			 = 1;
+    clStridesOut[0]			 = 1;
+
+    switch(_type)
+    {
+    case C2C:
+        inLayout        = CLFFT_COMPLEX_INTERLEAVED;
+        outLayout       = CLFFT_COMPLEX_INTERLEAVED;
+        clStridesIn[1]  = src_step / sizeof(std::complex<float>);
+        clStridesOut[1] = clStridesIn[1];
+        break;
+    case R2C:
+        CV_Assert(!is_row_dft); // this is not supported yet
+        inLayout        = CLFFT_REAL;
+        outLayout       = CLFFT_HERMITIAN_INTERLEAVED;
+        clStridesIn[1]  = src_step / sizeof(float);
+        clStridesOut[1] = dst_step / sizeof(std::complex<float>);
+        break;
+    case C2R:
+        CV_Assert(!is_row_dft); // this is not supported yet
+        inLayout        = CLFFT_HERMITIAN_INTERLEAVED;
+        outLayout       = CLFFT_REAL;
+        clStridesIn[1]  = src_step / sizeof(std::complex<float>);
+        clStridesOut[1] = dst_step / sizeof(float);
+        break;
+    default:
+        //std::runtime_error("does not support this convertion!");
+        cout << "Does not support this convertion!" << endl;
+        throw exception();
+        break;
+    }
+
+    clStridesIn[2]  = is_row_dft ? clStridesIn[1]  : dft_size.width * clStridesIn[1];
+    clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];
+
+    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, Context::getContext()->impl->clContext, dim, clLengthsIn ) );
+
+    openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
+    openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
+    openCLSafeCall( clAmdFftSetPlanBatchSize( plHandle, batchSize ) );
+
+    openCLSafeCall( clAmdFftSetPlanInStride  ( plHandle, dim, clStridesIn ) );
+    openCLSafeCall( clAmdFftSetPlanOutStride ( plHandle, dim, clStridesOut ) );
+    openCLSafeCall( clAmdFftSetPlanDistance  ( plHandle, clStridesIn[ dim ], clStridesIn[ dim ]) );
+    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &(Context::getContext()->impl->clCmdQueue), NULL, NULL ) );
+}
+cv::ocl::FftPlan::~FftPlan()
+{
+    for(int i = 0; i < planStore.size(); i ++)
+    {
+        if(planStore[i]->plHandle == plHandle)
+        {
+            planStore.erase(planStore.begin()+ i);
+        }
+    }
+    openCLSafeCall( clAmdFftDestroyPlan( &plHandle ) );
+}
+
+clAmdFftPlanHandle cv::ocl::FftPlan::getPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type)
+{
+    // go through search
+    for(int i = 0; i < planStore.size(); i ++)
+    {
+        FftPlan * plan = planStore[i];
+        if(
+            plan->dft_size.width == _dft_size.width && 
+            plan->dft_size.height == _dft_size.height &&
+            plan->flags == _flags &&
+            plan->src_step == _src_step &&
+            plan->dst_step == _dst_step &&
+            plan->type == _type
+            )
+        {
+            return plan->plHandle;
+        }
+    }
+    // no baked plan is found
+    FftPlan *newPlan = new FftPlan(_dft_size, _src_step, _dst_step, _flags, _type);
+    planStore.push_back(newPlan);
+    return newPlan->plHandle;
+}
+
+void cv::ocl::dft(const oclMat& src, oclMat& dst, Size dft_size, int flags) 
+{
+    if(dft_size == Size(0,0))
+    {
+        dft_size = src.size();
+    }
+    // check if the given dft size is of optimal dft size
+    CV_Assert(dft_size.area() == getOptimalDFTSize(dft_size.area()));
+
+    // similar assertions with cuda module
+    CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);
+
+    // we don't support DFT_SCALE flag
+    CV_Assert(!(DFT_SCALE & flags));
+
+    bool is_1d_input	= (src.rows == 1);
+    int is_row_dft		= flags & DFT_ROWS;
+    int is_scaled_dft		= flags & DFT_SCALE;
+    int is_inverse			= flags & DFT_INVERSE;
+    bool is_complex_input	= src.channels() == 2;
+    bool is_complex_output	= !(flags & DFT_REAL_OUTPUT);
+
+    // We don't support real-to-real transform
+    CV_Assert(is_complex_input || is_complex_output);
+    FftType type = (FftType)(is_complex_input << 0 | is_complex_output << 1);
+
+    switch(type)
+    {
+    case C2C:
+        dst.create(src.rows, src.cols, CV_32FC2);
+        break;
+    case R2C:
+        CV_Assert(!is_row_dft); // this is not supported yet
+        dst.create(src.rows, src.cols/2 + 1, CV_32FC2);
+        break;
+    case C2R:
+        CV_Assert(dft_size.width / 2 + 1 == src.cols && dft_size.height == src.rows);
+        CV_Assert(!is_row_dft); // this is not supported yet
+        dst.create(src.rows, dft_size.width, CV_32FC1);
+        break;
+    default:
+        //std::runtime_error("does not support this convertion!");
+        cout << "Does not support this convertion!" << endl;
+        throw exception();
+        break;
+    }
+    clAmdFftPlanHandle plHandle = FftPlan::getPlan(dft_size, src.step, dst.step, flags, type);
+
+    //get the buffersize
+    size_t buffersize=0;
+    openCLSafeCall( clAmdFftGetTmpBufSize(plHandle, &buffersize ) );
+
+    //allocate the intermediate buffer	
+    cl_mem clMedBuffer=NULL;
+    if (buffersize)
+    {
+        cl_int medstatus;
+        clMedBuffer = clCreateBuffer ( src.clCxt->impl->clContext, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+        openCLSafeCall( medstatus );
+    }
+    openCLSafeCall( clAmdFftEnqueueTransform( plHandle, 
+        is_inverse?CLFFT_BACKWARD:CLFFT_FORWARD, 
+        1, 
+        &src.clCxt->impl->clCmdQueue, 
+        0, NULL, NULL, 
+        (cl_mem*)&src.data, (cl_mem*)&dst.data, clMedBuffer ) );
+    openCLSafeCall( clFinish(src.clCxt->impl->clCmdQueue) );
+    if(clMedBuffer)
+    {
+        openCLFree(clMedBuffer);
+    }
+}
+
+#endif
+#endif //HAVE_CLAMDFFT
diff --git a/modules/ocl/src/gemm.cpp b/modules/ocl/src/gemm.cpp
new file mode 100644
index 0000000000..c35e061826
--- /dev/null
+++ b/modules/ocl/src/gemm.cpp
@@ -0,0 +1,161 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <iomanip>
+#include "precomp.hpp"
+
+#ifdef HAVE_CLAMDBLAS
+
+#include "clAmdBlas.h"
+
+#if !defined (HAVE_OPENCL)
+void cv::ocl::dft(const oclMat& src, oclMat& dst, int flags) { throw_nogpu(); }
+#else
+
+using namespace cv;
+
+	void cv::ocl::gemm(const oclMat& src1, const oclMat& src2, double alpha,
+		const oclMat& src3, double beta, oclMat& dst, int flags)
+	{
+		CV_Assert(src1.cols == src2.rows && 
+			(src3.empty() || src1.rows == src3.rows && src2.cols == src3.cols));
+		CV_Assert(!(cv::GEMM_3_T & flags)); // cv::GEMM_3_T is not supported
+		if(!src3.empty())
+		{
+			src3.copyTo(dst);
+		}
+		else
+		{
+			dst.create(src1.rows, src2.cols, src1.type());
+			dst.setTo(Scalar::all(0));
+		}
+		openCLSafeCall( clAmdBlasSetup() );
+		
+		const clAmdBlasTranspose transA = (cv::GEMM_1_T & flags)?clAmdBlasTrans:clAmdBlasNoTrans;
+		const clAmdBlasTranspose transB = (cv::GEMM_2_T & flags)?clAmdBlasTrans:clAmdBlasNoTrans;
+		const clAmdBlasOrder     order  = clAmdBlasRowMajor;
+
+		const int M = src1.rows;
+		const int N = src2.cols;
+		const int K = src1.cols;
+		int lda     = src1.step;
+		int ldb     = src2.step;
+		int ldc     = dst.step;
+		int offa    = src1.offset;
+		int offb    = src2.offset;
+		int offc    = dst.offset;
+
+
+		switch(src1.type())
+		{
+		case CV_32FC1:
+			lda  /= sizeof(float);
+			ldb  /= sizeof(float);
+			ldc  /= sizeof(float);
+			offa /= sizeof(float);
+			offb /= sizeof(float);
+			offc /= sizeof(float);
+			openCLSafeCall
+			(
+				clAmdBlasSgemmEx(order, transA, transB, M, N, K,
+					alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, 
+					beta, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL)
+			);
+			break;
+		case CV_64FC1:
+			lda  /= sizeof(double);
+			ldb  /= sizeof(double);
+			ldc  /= sizeof(double);
+			offa /= sizeof(double);
+			offb /= sizeof(double);
+			offc /= sizeof(double);
+			openCLSafeCall
+			(
+				clAmdBlasDgemmEx(order, transA, transB, M, N, K,
+					alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, 
+					beta, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL)
+			);
+			break;
+		case CV_32FC2:
+			{
+				lda  /= sizeof(std::complex<float>);
+				ldb  /= sizeof(std::complex<float>);
+				ldc  /= sizeof(std::complex<float>);
+				offa /= sizeof(std::complex<float>);
+				offb /= sizeof(std::complex<float>);
+				offc /= sizeof(std::complex<float>);
+				cl_float2 alpha_2 = {{alpha, 0}};
+				cl_float2 beta_2  = {{beta, 0}};
+				openCLSafeCall
+				(
+					clAmdBlasCgemmEx(order, transA, transB, M, N, K,
+						alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, 
+						beta_2, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL)
+				);
+			}
+			break;
+		case CV_64FC2:
+			{
+				lda  /= sizeof(std::complex<double>);
+				ldb  /= sizeof(std::complex<double>);
+				ldc  /= sizeof(std::complex<double>);
+				offa /= sizeof(std::complex<double>);
+				offb /= sizeof(std::complex<double>);
+				offc /= sizeof(std::complex<double>);
+				cl_double2 alpha_2 = {{alpha, 0}};
+				cl_double2 beta_2  = {{beta, 0}};
+				openCLSafeCall
+				(
+					clAmdBlasZgemmEx(order, transA, transB, M, N, K,
+						alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, 
+						beta_2, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL)
+				);
+			}
+			break;
+		}
+		clAmdBlasTeardown();
+	}
+#endif
+#endif
diff --git a/modules/ocl/src/kernels/blend_linear.cl b/modules/ocl/src/kernels/blend_linear.cl
new file mode 100644
index 0000000000..bf733576c0
--- /dev/null
+++ b/modules/ocl/src/kernels/blend_linear.cl
@@ -0,0 +1,196 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, MulticoreWare Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Liu Liujun, liujun@multicorewareinc.com 
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other GpuMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+__kernel void BlendLinear_C1_D0(
+	__global uchar *dst,
+	__global uchar *img1,
+	__global uchar *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	if (idx < cols && idy < rows)
+	{
+		int pos = idy * istep + idx;
+		int wpos = idy * (wstep /sizeof(float)) + idx;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+
+	}
+}
+
+__kernel void BlendLinear_C3_D0(
+	__global uchar *dst,
+	__global uchar *img1,
+	__global uchar *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	int x = idx / 3;
+	int y = idy;
+	if (x < cols && y < rows)
+	{
+		int pos = idy * istep + idx;
+		int wpos = idy * (wstep /sizeof(float)) + x;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
+
+__kernel void BlendLinear_C4_D0(
+	__global uchar *dst,
+	__global uchar *img1,
+	__global uchar *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	int x = idx / 4;
+	int y = idy;
+	if (x < cols && y < rows)
+	{
+		int pos = idy * istep + idx;
+		int wpos = idy * (wstep /sizeof(float)) + x;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
+
+__kernel void BlendLinear_C1_D5(
+	__global float *dst,
+	__global float *img1,
+	__global float *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	if (idx < cols && idy < rows)
+	{
+		int pos = idy * (istep / sizeof(float)) + idx;
+		int wpos = idy * (wstep /sizeof(float)) + idx;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
+
+__kernel void BlendLinear_C3_D5(
+	__global float *dst,
+	__global float *img1,
+	__global float *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	int x = idx / 3;
+	int y = idy;
+	if (x < cols && y < rows)
+	{
+		int pos = idy * (istep / sizeof(float)) + idx;
+		int wpos = idy * (wstep /sizeof(float)) + x;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
+
+__kernel void BlendLinear_C4_D5(
+	__global float *dst,
+	__global float *img1,
+	__global float *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	int idx = get_global_id(0);
+	int idy = get_global_id(1);
+	int x = idx / 4;
+	int y = idy;
+	if (x < cols && y < rows)
+	{
+		int pos = idy * (istep / sizeof(float)) + idx;
+		int wpos = idy * (wstep /sizeof(float)) + x;
+		float w1 = weight1[wpos];
+		float w2 = weight2[wpos];
+		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
+	}
+}
diff --git a/modules/ocl/src/kernels/imgproc_columnsum.cl b/modules/ocl/src/kernels/imgproc_columnsum.cl
new file mode 100644
index 0000000000..913b417d15
--- /dev/null
+++ b/modules/ocl/src/kernels/imgproc_columnsum.cl
@@ -0,0 +1,80 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Chunpeng Zhang chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#if defined (__ATI__)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+
+#elif defined (__NVIDIA__)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+
+////////////////////////////////////////////////////////////////////
+///////////////////////// columnSum ////////////////////////////////
+////////////////////////////////////////////////////////////////////
+/// CV_32FC1
+__kernel void columnSum_C1_D5(__global float* src,__global float* dst,int srcCols,int srcRows,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	
+	srcStep >>= 2;
+	dstStep >>= 2;
+
+	if (x < srcCols)
+    {
+		int srcIdx = x ;
+		int dstIdx = x ;
+
+        float sum = 0;
+		
+        for (int y = 0; y < srcRows; ++y)
+        {
+			sum += src[srcIdx];
+            dst[dstIdx] = sum;
+			srcIdx += srcStep;
+			dstIdx += dstStep;	
+        }
+	}
+}
diff --git a/modules/ocl/src/kernels/match_template.cl b/modules/ocl/src/kernels/match_template.cl
new file mode 100644
index 0000000000..4c5a4fc9ca
--- /dev/null
+++ b/modules/ocl/src/kernels/match_template.cl
@@ -0,0 +1,824 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+#if defined (__ATI__)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+
+#elif defined (__NVIDIA__)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+
+#if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__))
+#define TYPE_IMAGE_SQSUM double
+#else
+#define TYPE_IMAGE_SQSUM ulong
+#endif
+
+//////////////////////////////////////////////////
+// utilities
+#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox)
+#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
+// normAcc* are accurate normalization routines which make GPU matchTemplate
+// consistent with CPU one
+float normAcc(float num, float denum)
+{
+	if(fabs(num) < denum)
+	{
+		return num / denum;
+	}
+	if(fabs(num) < denum * 1.125f)
+	{
+		return num > 0 ? 1 : -1;
+	}
+	return 0;
+}
+
+float normAcc_SQDIFF(float num, float denum)
+{
+	if(fabs(num) < denum)
+	{
+		return num / denum;
+	}
+	if(fabs(num) < denum * 1.125f)
+	{
+		return num > 0 ? 1 : -1;
+	}
+	return 1;
+}
+//////////////////////////////////////////////////////////////////////
+// normalize
+
+__kernel 
+void normalizeKernel_C1_D0
+(
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	__global float * res,
+	ulong tpl_sqsum,
+	int res_rows,
+	int res_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	int res_offset,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	img_sqsums_step /= sizeof(*img_sqsums);
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sqsum_ = (float)(
+			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+		res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum));
+	}
+}
+
+__kernel 
+void matchTemplate_Prepared_SQDIFF_C1_D0
+(
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	__global float * res,
+	ulong tpl_sqsum,
+	int res_rows,
+	int res_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	int res_offset,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	img_sqsums_step /= sizeof(*img_sqsums);
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sqsum_ = (float)(
+			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+		res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum;
+	}
+}
+
+__kernel 
+void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
+(
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	__global float * res,
+	ulong tpl_sqsum,
+	int res_rows,
+	int res_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	int res_offset,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	img_sqsums_step /= sizeof(*img_sqsums);
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sqsum_ = (float)(
+			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+		res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum,
+										sqrt(image_sqsum_ * tpl_sqsum));
+	}
+}
+
+//////////////////////////////////////////////////
+// SQDIFF
+__kernel 
+void matchTemplate_Naive_SQDIFF_C1_D0
+(
+	__global const uchar * img,
+	__global const uchar * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	int delta;
+	int sum = 0;
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				delta = img_ptr[j] - tpl_ptr[j];
+				sum   = mad24(delta, delta, sum);
+			}
+		}
+		res[res_idx] = sum;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_SQDIFF_C1_D5
+(
+	__global const float * img,
+	__global const float * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	float delta;
+	float sum = 0;
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				delta = img_ptr[j] - tpl_ptr[j];
+				sum   = mad(delta, delta, sum);
+			}
+		}
+		res[res_idx] = sum;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_SQDIFF_C4_D0
+(
+	__global const uchar4 * img,
+	__global const uchar4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	int4 delta;
+	int4 sum = (int4)(0, 0, 0, 0);
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				//delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect
+				delta.x = img_ptr[j].x - tpl_ptr[j].x;
+				delta.y = img_ptr[j].y - tpl_ptr[j].y;
+				delta.z = img_ptr[j].z - tpl_ptr[j].z;
+				delta.w = img_ptr[j].w - tpl_ptr[j].w;
+				sum   = mad24(delta, delta, sum);
+			}
+		}
+		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_SQDIFF_C4_D5
+(
+	__global const float4 * img,
+	__global const float4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	float4 delta;
+	float4 sum = (float4)(0, 0, 0, 0);
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				//delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect
+				delta.x = img_ptr[j].x - tpl_ptr[j].x;
+				delta.y = img_ptr[j].y - tpl_ptr[j].y;
+				delta.z = img_ptr[j].z - tpl_ptr[j].z;
+				delta.w = img_ptr[j].w - tpl_ptr[j].w;
+				sum   = mad(delta, delta, sum);
+			}
+		}
+		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
+	}
+}
+
+//////////////////////////////////////////////////
+// CCORR
+__kernel 
+void matchTemplate_Naive_CCORR_C1_D0
+(
+	__global const uchar * img,
+	__global const uchar * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	int sum = 0;
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				sum = mad24(img_ptr[j], tpl_ptr[j], sum);
+			}
+		}
+		res[res_idx] = sum;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_CCORR_C1_D5
+(
+	__global const float * img,
+	__global const float * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	float sum = 0;
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				sum = mad(img_ptr[j], tpl_ptr[j], sum);
+			}
+		}
+		res[res_idx] = sum;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_CCORR_C4_D0
+(
+	__global const uchar4 * img,
+	__global const uchar4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	int4 sum = (int4)(0, 0, 0, 0);
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				sum   = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
+			}
+		}
+		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
+	}
+}
+
+__kernel 
+void matchTemplate_Naive_CCORR_C4_D5
+(
+	__global const float4 * img,
+	__global const float4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int i,j;
+	float4 sum = (float4)(0, 0, 0, 0);
+	img_step   /= sizeof(*img);
+	img_offset /= sizeof(*img);
+	tpl_step   /= sizeof(*tpl);
+	tpl_offset /= sizeof(*tpl);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		for(i = 0; i < tpl_rows; i ++)
+		{
+			// get specific rows of img data
+			__global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
+			__global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
+			for(j = 0; j < tpl_cols; j ++)
+			{
+				sum = mad(convert_float4(img_ptr[j]), convert_float4(tpl_ptr[j]), sum);
+			}
+		}
+		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
+	}
+}
+
+//////////////////////////////////////////////////
+// CCOFF
+__kernel 
+void matchTemplate_Prepared_CCOFF_C1_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	__global const uint * img_sums,
+	int img_sums_offset,
+	int img_sums_step,
+	float tpl_sum
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	
+	img_sums_offset   /= sizeof(*img_sums);
+	img_sums_step     /= sizeof(*img_sums);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float sum = (float)(
+			(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+		res[res_idx] -= sum * tpl_sum;
+	}
+}
+__kernel 
+void matchTemplate_Prepared_CCOFF_C4_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	__global const uint * img_sums_c0,
+	__global const uint * img_sums_c1,
+	__global const uint * img_sums_c2,
+	__global const uint * img_sums_c3,
+	int img_sums_offset,
+	int img_sums_step,
+	float tpl_sum_c0,
+	float tpl_sum_c1,
+	float tpl_sum_c2,
+	float tpl_sum_c3
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	
+	img_sums_offset   /= sizeof(*img_sums_c0);
+	img_sums_step     /= sizeof(*img_sums_c0);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float ccorr = res[res_idx];
+		ccorr -= tpl_sum_c0*(float)(
+			(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+		ccorr -= tpl_sum_c1*(float)(
+			(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+		ccorr -= tpl_sum_c2*(float)(
+			(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+		ccorr -= tpl_sum_c3*(float)(
+			(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+		res[res_idx] = ccorr;
+	}
+}
+
+__kernel
+void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	float weight,
+	__global const uint * img_sums,
+	int img_sums_offset,
+	int img_sums_step,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	float tpl_sum,
+	float tpl_sqsum
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	
+	img_sqsums_step   /= sizeof(*img_sqsums);
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	img_sums_offset   /= sizeof(*img_sums);
+	img_sums_step     /= sizeof(*img_sums);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sum_ =  (float)(
+			(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+
+		float image_sqsum_ = (float)(
+			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+		res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum,
+							   sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
+	}
+}
+__kernel
+void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	float weight,
+	__global const uint * img_sums_c0,
+	__global const uint * img_sums_c1,
+	__global const uint * img_sums_c2,
+	__global const uint * img_sums_c3,
+	int img_sums_offset,
+	int img_sums_step,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c0,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c1,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c2,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c3,
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	float tpl_sum_c0,
+	float tpl_sum_c1,
+	float tpl_sum_c2,
+	float tpl_sum_c3,
+	float tpl_sqsum
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	
+	img_sqsums_step   /= sizeof(*img_sqsums_c0);
+	img_sqsums_offset /= sizeof(*img_sqsums_c0);
+	img_sums_offset   /= sizeof(*img_sums_c0);
+	img_sums_step     /= sizeof(*img_sums_c0);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sum_c0 =  (float)(
+			(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+		float image_sum_c1 =  (float)(
+			(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+		float image_sum_c2 =  (float)(
+			(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+		float image_sum_c3 =  (float)(
+			(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+
+		float image_sqsum_c0 = (float)(
+			(img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
+		float image_sqsum_c1 = (float)(
+			(img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
+		float image_sqsum_c2 = (float)(
+			(img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
+		float image_sqsum_c3 = (float)(
+			(img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
+
+		float num = res[res_idx] - 
+			image_sum_c0 * tpl_sum_c0 -
+			image_sum_c1 * tpl_sum_c1 -
+			image_sum_c2 * tpl_sum_c2 -
+			image_sum_c3 * tpl_sum_c3;
+		float denum = sqrt( tpl_sqsum * (
+			image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
+			image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
+			image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
+			image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3)
+			);
+		res[res_idx] = normAcc(num, denum);
+	}
+}
+
diff --git a/modules/ocl/src/kernels/pyr_down.cl b/modules/ocl/src/kernels/pyr_down.cl
new file mode 100644
index 0000000000..38b4ec7c7f
--- /dev/null
+++ b/modules/ocl/src/kernels/pyr_down.cl
@@ -0,0 +1,500 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Dachuan Zhao, dachuan@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+
+uchar round_uchar_uchar(uchar v)
+{ 
+	return v;
+}
+
+uchar round_uchar_int(int v)
+{ 
+    return (uchar)((uint)v <= 255 ? v : v > 0 ? 255 : 0); 
+}
+
+uchar round_uchar_float(float v)
+{ 
+	if(v - convert_int_sat_rte(v) > 1e-6 || v - convert_int_sat_rte(v) < -1e-6)
+	{
+		if(((int)v + 1) - (v + 0.5f) < 1e-6 && ((int)v + 1) - (v + 0.5f) > -1e-6)
+		{
+			v = (int)v + 0.51f;
+		}
+	}
+    int iv = convert_int_sat_rte(v);
+    return round_uchar_int(iv); 
+}
+
+uchar4 round_uchar4_uchar4(uchar4 v)
+{ 
+	return v;
+}
+
+uchar4 round_uchar4_int4(int4 v)
+{ 
+	uchar4 result;
+	result.x = (uchar)(v.x <= 255 ? v.x : v.x > 0 ? 255 : 0); 
+	result.y = (uchar)(v.y <= 255 ? v.y : v.y > 0 ? 255 : 0); 
+	result.z = (uchar)(v.z <= 255 ? v.z : v.z > 0 ? 255 : 0); 
+	result.w = (uchar)(v.w <= 255 ? v.w : v.w > 0 ? 255 : 0); 
+    return result; 
+}
+
+uchar4 round_uchar4_float4(float4 v)
+{ 
+	if(v.x - convert_int_sat_rte(v.x) > 1e-6 || v.x - convert_int_sat_rte(v.x) < -1e-6)
+	{
+		if(((int)(v.x) + 1) - (v.x + 0.5f) < 1e-6 && ((int)(v.x) + 1) - (v.x + 0.5f) > -1e-6)
+		{
+			v.x = (int)(v.x) + 0.51f;
+		}
+	}
+	if(v.y - convert_int_sat_rte(v.y) > 1e-6 || v.y - convert_int_sat_rte(v.y) < -1e-6)
+	{
+		if(((int)(v.y) + 1) - (v.y + 0.5f) < 1e-6 && ((int)(v.y) + 1) - (v.y + 0.5f) > -1e-6)
+		{
+			v.y = (int)(v.y) + 0.51f;
+		}
+	}
+	if(v.z - convert_int_sat_rte(v.z) > 1e-6 || v.z - convert_int_sat_rte(v.z) < -1e-6)
+	{
+		if(((int)(v.z) + 1) - (v.z + 0.5f) < 1e-6 && ((int)(v.z) + 1) - (v.z + 0.5f) > -1e-6)
+		{
+			v.z = (int)(v.z) + 0.51f;
+		}
+	}
+	if(v.w - convert_int_sat_rte(v.w) > 1e-6 || v.w - convert_int_sat_rte(v.w) < -1e-6)
+	{
+		if(((int)(v.w) + 1) - (v.w + 0.5f) < 1e-6 && ((int)(v.w) + 1) - (v.w + 0.5f) > -1e-6)
+		{
+			v.w = (int)(v.w) + 0.51f;
+		}
+	}
+    int4 iv = convert_int4_sat_rte(v);
+    return round_uchar4_int4(iv); 
+}
+
+
+
+
+int idx_row_low(int y, int last_row)
+{
+	if(y < 0)
+	{
+		y = -y;
+	}
+    return y % (last_row + 1);
+}
+
+int idx_row_high(int y, int last_row) 
+{
+	int i;
+	int j;
+	if(last_row - y < 0)
+	{
+		i = (y - last_row);
+	}
+	else
+	{
+		i = (last_row - y);
+	}
+	if(last_row - i < 0)
+	{
+		j = i - last_row;
+	}
+	else
+	{
+		j = last_row - i;
+	}
+    return j % (last_row + 1);
+}
+
+int idx_row(int y, int last_row)
+{
+    return idx_row_low(idx_row_high(y, last_row), last_row);
+}
+
+int idx_col_low(int x, int last_col)
+{
+	if(x < 0)
+	{
+		x = -x;
+	}
+    return x % (last_col + 1);
+}
+
+int idx_col_high(int x, int last_col) 
+{
+	int i;
+	int j;
+	if(last_col - x < 0)
+	{
+		i = (x - last_col);
+	}
+	else
+	{
+		i = (last_col - x);
+	}
+	if(last_col - i < 0)
+	{
+		j = i - last_col;
+	}
+	else
+	{
+		j = last_col - i;
+	}
+    return j % (last_col + 1);
+}
+
+int idx_col(int x, int last_col)
+{
+    return idx_col_low(idx_col_high(x, last_col), last_col);
+}
+
+__kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstOffset, int dstCols)
+{
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+
+    __local float smem[256 + 4];
+
+    float sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+
+    sum = 0;
+
+    sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)]);
+
+    smem[2 + get_local_id(0)] = sum;
+
+    if (get_local_id(0) < 2)
+    {
+        const int left_x = x - 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
+
+        smem[get_local_id(0)] = sum;
+    }
+
+    if (get_local_id(0) > 253)
+    {
+        const int right_x = x + 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
+
+        smem[4 + get_local_id(0)] = sum;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * smem[2 + tid2 - 2];
+        sum = sum + 0.25f   * smem[2 + tid2 - 1];
+        sum = sum + 0.375f  * smem[2 + tid2    ];
+        sum = sum + 0.25f   * smem[2 + tid2 + 1];
+        sum = sum + 0.0625f * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep + dst_x] = round_uchar_float(sum);
+    }
+}
+
+__kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstOffset, int dstCols)
+{
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+
+    __local float4 smem[256 + 4];
+
+    float4 sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+    sum = 0;
+
+	sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+
+	smem[2 + get_local_id(0)] = sum;
+
+	if (get_local_id(0) < 2)
+	{
+		const int left_x = x - 2;
+
+		sum = 0;
+
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+
+		smem[get_local_id(0)] = sum;
+	}
+
+	if (get_local_id(0) > 253)
+	{
+		const int right_x = x + 2;
+
+		sum = 0;
+
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+
+		smem[4 + get_local_id(0)] = sum;
+	}
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + co3 * smem[2 + tid2 - 2];
+        sum = sum + co2   * smem[2 + tid2 - 1];
+        sum = sum + co1  * smem[2 + tid2    ];
+        sum = sum + co2   * smem[2 + tid2 + 1];
+        sum = sum + co3 * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep / 4 + dst_x] = round_uchar4_float4(sum);
+    }
+}
+
+__kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float *dst, int dstStep, int dstOffset, int dstCols)
+{
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+
+    __local float smem[256 + 4];
+
+    float sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+
+    sum = 0;
+
+    sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)];
+
+    smem[2 + get_local_id(0)] = sum;
+
+    if (get_local_id(0) < 2)
+    {
+        const int left_x = x - 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)];
+
+        smem[get_local_id(0)] = sum;
+    }
+
+    if (get_local_id(0) > 253)
+    {
+        const int right_x = x + 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)];
+
+        smem[4 + get_local_id(0)] = sum;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * smem[2 + tid2 - 2];
+        sum = sum + 0.25f   * smem[2 + tid2 - 1];
+        sum = sum + 0.375f  * smem[2 + tid2    ];
+        sum = sum + 0.25f   * smem[2 + tid2 + 1];
+        sum = sum + 0.0625f * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep / 4 + dst_x] = sum;
+    }
+}
+
+__kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstOffset, int dstCols)
+{
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+
+    __local float4 smem[256 + 4];
+
+    float4 sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+    sum = 0;
+
+	sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)];
+
+	smem[2 + get_local_id(0)] = sum;
+
+	if (get_local_id(0) < 2)
+	{
+		const int left_x = x - 2;
+
+		sum = 0;
+
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+
+		smem[get_local_id(0)] = sum;
+	}
+
+	if (get_local_id(0) > 253)
+	{
+		const int right_x = x + 2;
+
+		sum = 0;
+
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+
+		smem[4 + get_local_id(0)] = sum;
+	}
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + co3 * smem[2 + tid2 - 2];
+        sum = sum + co2   * smem[2 + tid2 - 1];
+        sum = sum + co1  * smem[2 + tid2    ];
+        sum = sum + co2   * smem[2 + tid2 + 1];
+        sum = sum + co3 * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep / 16 + dst_x] = sum;
+    }
+}
diff --git a/modules/ocl/src/kernels/pyr_up.cl b/modules/ocl/src/kernels/pyr_up.cl
new file mode 100644
index 0000000000..dd3ba43d1b
--- /dev/null
+++ b/modules/ocl/src/kernels/pyr_up.cl
@@ -0,0 +1,750 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Chunpeng	chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+uchar get_valid_uchar(uchar data)
+{
+	return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0);
+}
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_8UC1  //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float s_srcPatch[10][10];
+	__local float s_dstPatch[20][16];
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+		
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float sum = 0;
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	if(eveny)
+	{
+		sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];	
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+			sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
+    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+
+    if ((x < dstCols) && (y < dstRows))
+		dst[x + y * dstStep] = (float)(4.0f * sum);
+
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16UC1  /////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float s_srcPatch[10][10];
+	__local float s_dstPatch[20][16];
+	
+	srcStep = srcStep >> 1;
+	dstStep = dstStep >> 1;
+	srcOffset = srcOffset >> 1;
+	dstOffset = dstOffset >> 1;
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+		
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float sum = 0;
+	
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	if(eveny)
+	{
+		sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];	
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+			sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
+    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+
+    if ((x < dstCols) && (y < dstRows))
+		dst[x + y * dstStep] = (float)(4.0f * sum);
+
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_32FC1  /////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float s_srcPatch[10][10];
+	__local float s_dstPatch[20][16];
+	
+	srcOffset = srcOffset >> 2;
+	dstOffset = dstOffset >> 2;
+	srcStep = srcStep >> 2;
+	dstStep = dstStep >> 2;
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+		
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float sum = 0;
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	if(eveny)
+	{
+		sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];	
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+			sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
+			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
+    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+
+    if ((x < dstCols) && (y < dstRows))
+		dst[x + y * dstStep] = (float)(4.0f * sum);
+
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_8UC4  //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+float4 covert_uchar4_to_float4(uchar4 data)
+{
+	float4 f4Data = {0,0,0,0};
+	
+	f4Data.x = (float)data.x;
+	f4Data.y = (float)data.y;
+	f4Data.z = (float)data.z;
+	f4Data.w = (float)data.w;
+	
+	return f4Data;
+}
+
+
+uchar4 convert_float4_to_uchar4(float4 data)
+{
+	uchar4 u4Data;
+	
+	u4Data.x = get_valid_uchar(data.x);
+	u4Data.y = get_valid_uchar(data.y);
+	u4Data.z = get_valid_uchar(data.z);
+	u4Data.w = get_valid_uchar(data.w);
+	
+	return u4Data;
+}
+
+float4 int_x_float4(int leftOpr,float4 rightOpr)
+{
+	float4 result = {0,0,0,0};
+	
+	result.x = rightOpr.x * leftOpr;
+	result.y = rightOpr.y * leftOpr;
+	result.z = rightOpr.z * leftOpr;
+	result.w = rightOpr.w * leftOpr;
+	
+	return result;
+}
+
+float4 float4_x_float4(float4 leftOpr,float4 rightOpr)
+{
+	float4 result;
+	
+	result.x = leftOpr.x * rightOpr.x;
+	result.y = leftOpr.y * rightOpr.y;
+	result.z = leftOpr.z * rightOpr.z;
+	result.w = leftOpr.w * rightOpr.w;
+	
+	return result;
+}
+
+__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float4 s_srcPatch[10][10];
+	__local float4 s_dstPatch[20][16];
+	
+	srcOffset >>= 2;
+	dstOffset >>= 2;
+	srcStep >>= 2;
+	dstStep >>= 2;
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float4 sum = (float4)(0,0,0,0);
+	
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+	
+	if(eveny)
+	{
+		sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);	
+                
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+			
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+
+    if ((x < dstCols) && (y < dstRows))
+    {
+		dst[x + y * dstStep] = convert_float4_to_uchar4(int_x_float4(4.0f,sum));
+	}
+} 
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16UC4 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+float4 covert_ushort4_to_float4(ushort4 data)
+{
+	float4 f4Data = {0,0,0,0};
+	
+	f4Data.x = (float)data.x;
+	f4Data.y = (float)data.y;
+	f4Data.z = (float)data.z;
+	f4Data.w = (float)data.w;
+	
+	return f4Data;
+}
+
+
+ushort4 convert_float4_to_ushort4(float4 data)
+{
+	ushort4 u4Data;
+	
+	u4Data.x = (float)data.x;
+	u4Data.y = (float)data.y;
+	u4Data.z = (float)data.z;
+	u4Data.w = (float)data.w;
+	
+	return u4Data;
+}
+
+
+__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float4 s_srcPatch[10][10];
+	__local float4 s_dstPatch[20][16];
+	
+	srcOffset >>= 3;
+	dstOffset >>= 3;
+	srcStep >>= 3;
+	dstStep >>= 3;
+	
+
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_ushort4_to_float4(src[srcx + srcy * srcStep]);
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float4 sum = (float4)(0,0,0,0);
+	
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+	
+	if(eveny)
+	{
+		sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);	
+                
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+			
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+
+    if ((x < dstCols) && (y < dstRows))
+    {
+		dst[x + y * dstStep] = convert_float4_to_ushort4(int_x_float4(4.0f,sum));
+	}
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_32FC4 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	
+	__local float4 s_srcPatch[10][10];
+	__local float4 s_dstPatch[20][16];
+	
+	srcOffset >>= 4;
+	dstOffset >>= 4;
+	srcStep >>= 4;
+	dstStep >>= 4;
+	
+	
+	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+	{
+		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
+		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+		
+		srcx = abs(srcx);
+		srcx = min(srcCols - 1,srcx);
+		
+		srcy = abs(srcy);
+		srcy = min(srcRows -1 ,srcy);
+		
+		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float4)(src[srcx + srcy * srcStep]);
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	float4 sum = (float4)(0,0,0,0);
+	
+	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
+	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+	const bool  eveny = ((get_local_id(1) & 1) == 0);
+	const int tidx = get_local_id(0);
+	
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+
+	
+	if(eveny)
+	{
+		sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
+        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);	
+                
+  	}
+	
+	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+	
+	if (get_local_id(1) < 2)
+    {
+		sum = 0;
+
+        if (eveny)
+        {
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
+            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
+         }
+
+         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+     }
+     
+     if (get_local_id(1) > 13)
+     {
+		sum = 0;
+
+		if (eveny)
+		{
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
+			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+			
+		}
+		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
+     }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	sum = 0;
+
+    const int tidy = get_local_id(1);
+
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
+    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
+    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+
+    if ((x < dstCols) && (y < dstRows))
+    {
+		dst[x + y * dstStep] = 4.0f * sum;
+	}
+}
\ No newline at end of file
diff --git a/modules/ocl/src/match_template.cpp b/modules/ocl/src/match_template.cpp
new file mode 100644
index 0000000000..ad31b00c68
--- /dev/null
+++ b/modules/ocl/src/match_template.cpp
@@ -0,0 +1,560 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include <iomanip>
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#define EXT_FP64 0
+
+#if !defined (HAVE_OPENCL)
+void cv::ocl::matchTemplate(const oclMat&, const oclMat&, oclMat&) { throw_nogpu(); }
+#else
+//helper routines
+namespace cv
+{
+	namespace ocl
+	{
+		///////////////////////////OpenCL kernel strings///////////////////////////
+		extern const char *match_template;
+	}
+}
+
+namespace cv { namespace ocl
+{
+	void matchTemplate_SQDIFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_SQDIFF_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_CCORR(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_CCORR_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_CCOFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_CCOFF_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+
+	void matchTemplateNaive_SQDIFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, int cn);
+
+	void matchTemplateNaive_CCORR(
+		const oclMat& image, const oclMat& templ, oclMat& result, int cn);
+
+	// Evaluates optimal template's area threshold. If 
+	// template's area is less  than the threshold, we use naive match 
+	// template version, otherwise FFT-based (if available)
+	int getTemplateThreshold(int method, int depth)
+	{
+		switch (method)
+		{
+		case CV_TM_CCORR: 
+			if (depth == CV_32F) return 250;
+			if (depth == CV_8U) return 300;
+			break;
+		case CV_TM_SQDIFF:
+			if (depth == CV_32F) return MAXSHORT; // do naive SQDIFF for CV_32F
+			if (depth == CV_8U) return 300;
+			break;
+		}
+		CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
+		return 0;
+	}
+
+
+	//////////////////////////////////////////////////////////////////////
+	// SQDIFF
+	void matchTemplate_SQDIFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+		if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
+		{
+			matchTemplateNaive_SQDIFF(image, templ, result, image.channels());
+			return;
+		}
+		else
+		{
+			// TODO
+			CV_Error(CV_StsBadArg, "Not supported yet for this size template");
+		}
+	}
+
+	void matchTemplate_SQDIFF_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		matchTemplate_CCORR(image,templ,result,buf);
+		buf.image_sums.resize(1);
+		buf.image_sqsums.resize(1);
+
+		integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
+
+#if EXT_FP64 && SQRSUM_FIXED
+		unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
+#else
+		Mat sqr_mat = templ.reshape(1);
+		unsigned long long templ_sqsum = (unsigned long long)sum(sqr_mat.mul(sqr_mat))[0];
+#endif
+
+		Context *clCxt = image.clCxt;
+		string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
+		vector< pair<size_t, const void *> > args;
+
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+		args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
+	}
+
+	void matchTemplateNaive_SQDIFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, int cn)
+	{
+		CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
+			|| (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
+		CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
+		CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
+
+		Context *clCxt = image.clCxt;
+		string kernelName = "matchTemplate_Naive_SQDIFF";
+
+		vector< pair<size_t, const void *> > args;
+
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
+	}
+
+	//////////////////////////////////////////////////////////////////////
+	// CCORR
+	void matchTemplate_CCORR(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+		if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
+		{
+			matchTemplateNaive_CCORR(image, templ, result, image.channels());
+			return;
+		}
+		else
+		{
+			CV_Error(CV_StsBadArg, "Not supported yet for this size template");
+			if(image.depth() == CV_8U && templ.depth() == CV_8U)
+			{
+				image.convertTo(buf.imagef, CV_32F);
+				templ.convertTo(buf.templf, CV_32F);
+			}
+			CV_Assert(image.channels() == 1);
+			oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels()));
+			filter2D(buf.imagef,o_result,CV_32F,buf.templf, Point(0,0));
+			result = o_result(Rect(0,0,image.rows - templ.rows + 1, image.cols - templ.cols + 1));
+		}
+	}
+
+	void matchTemplate_CCORR_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		matchTemplate_CCORR(image,templ,result,buf);
+		buf.image_sums.resize(1);
+		buf.image_sqsums.resize(1);
+
+		integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
+#if EXT_FP64 && SQRSUM_FIXED
+		unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
+#elif EXT_FP64
+		oclMat templ_c1 = templ.reshape(1);
+		multiply(templ_c1, templ_c1, templ_c1);
+		unsigned long long templ_sqsum = (unsigned long long)sum(templ_c1)[0];
+#else
+		Mat m_templ_c1 = templ.reshape(1);
+		multiply(m_templ_c1, m_templ_c1, m_templ_c1);
+		unsigned long long templ_sqsum = (unsigned long long)sum(m_templ_c1)[0];
+#endif
+		Context *clCxt = image.clCxt;
+		string kernelName = "normalizeKernel";
+		vector< pair<size_t, const void *> > args;
+
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+		args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
+	}
+
+	void matchTemplateNaive_CCORR(
+		const oclMat& image, const oclMat& templ, oclMat& result, int cn)
+	{
+		CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
+			|| (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
+		CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
+		CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
+
+		Context *clCxt = image.clCxt;
+		string kernelName = "matchTemplate_Naive_CCORR";
+
+		vector< pair<size_t, const void *> > args;
+
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
+	}
+	//////////////////////////////////////////////////////////////////////
+	// CCOFF
+	void matchTemplate_CCOFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
+
+		matchTemplate_CCORR(image,templ,result,buf);
+
+		Context *clCxt = image.clCxt;
+		string kernelName;
+
+		kernelName = "matchTemplate_Prepared_CCOFF";
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+
+		vector< pair<size_t, const void *> > args;
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+		// to be continued in the following section
+		if(image.channels() == 1)
+		{
+			buf.image_sums.resize(1);
+			// FIXME: temp fix for incorrect integral kernel
+			oclMat tmp_oclmat;
+			integral(image, buf.image_sums[0], tmp_oclmat);
+
+			float templ_sum = 0;
+#if EXT_FP64
+			templ_sum = (float)sum(templ)[0] / templ.size().area();
+#else
+			Mat o_templ = templ;
+			templ_sum = (float)sum(o_templ)[0] / o_templ.size().area(); // temp fix for non-double supported machine
+#endif
+			args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+			args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
+		}
+		else
+		{
+			Vec4f templ_sum = Vec4f::all(0);
+#if EXT_FP64
+			split(image,buf.images);
+			templ_sum = sum(templ) / templ.size().area();
+#else 
+			// temp fix for non-double supported machine
+			Mat o_templ = templ, o_image = image;
+			vector<Mat> o_mat_vector;
+			o_mat_vector.resize(image.channels());
+			buf.images.resize(image.channels());
+			split(o_image, o_mat_vector);
+			for(int i = 0; i < o_mat_vector.size(); i ++)
+			{
+				buf.images[i] = oclMat(o_mat_vector[i]);
+			}
+			templ_sum = sum(o_templ) / templ.size().area();
+#endif
+			buf.image_sums.resize(buf.images.size());
+
+			for(int i = 0; i < image.channels(); i ++)
+			{
+				// FIXME: temp fix for incorrect integral kernel
+				oclMat omat_temp;
+				integral(buf.images[i], buf.image_sums[i], omat_temp);
+			}
+			switch(image.channels())
+			{
+			case 4:
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
+				break;
+			default:
+				CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
+				break;
+			}
+		}
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
+	}
+
+	void matchTemplate_CCOFF_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		image.convertTo(buf.imagef, CV_32F);
+		templ.convertTo(buf.templf, CV_32F);
+
+		matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
+		float scale = 1.f/templ.size().area();
+
+		Context *clCxt = image.clCxt;
+		string kernelName;
+
+		kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+
+		vector< pair<size_t, const void *> > args;
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+		args.push_back( make_pair( sizeof(cl_float),(void *)&scale) );
+		// to be continued in the following section
+		if(image.channels() == 1)
+		{
+			buf.image_sums.resize(1);
+			buf.image_sqsums.resize(1);
+			integral(image, buf.image_sums[0], buf.image_sqsums[0]);
+			float templ_sum = 0;
+			float templ_sqsum = 0;
+#if EXT_FP64
+			templ_sum   = (float)sum(templ)[0];
+#if SQRSUM_FIXED
+			templ_sqsum = sqrSum(templ);
+#else
+			oclMat templ_sqr = templ;
+			multiply(templ,templ, templ_sqr);
+			templ_sqsum  = sum(templ_sqr)[0];
+#endif //SQRSUM_FIXED
+			templ_sqsum -= scale * templ_sum * templ_sum;
+			templ_sum   *= scale;
+#else
+			// temp fix for non-double supported machine
+			Mat o_templ = templ;
+			templ_sum   = (float)sum(o_templ)[0]; 
+			templ_sqsum = sum(o_templ.mul(o_templ))[0] - scale * templ_sum * templ_sum;
+			templ_sum  *= scale;
+#endif
+			args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+			args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
+			args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
+			args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum) );
+		}
+		else
+		{
+			Vec4f templ_sum   = Vec4f::all(0);
+			Vec4f templ_sqsum = Vec4f::all(0);
+#if EXT_FP64
+			split(image,buf.images);
+			templ_sum   = sum(templ);
+#if SQRSUM_FIXED
+			templ_sqsum = sqrSum(templ);
+#else
+			oclMat templ_sqr = templ;
+			multiply(templ,templ, templ_sqr);
+			templ_sqsum  = sum(templ_sqr);
+#endif //SQRSUM_FIXED
+			templ_sqsum -= scale * templ_sum * templ_sum;
+			
+#else 
+			// temp fix for non-double supported machine
+			Mat o_templ = templ, o_image = image;
+			
+			vector<Mat> o_mat_vector;
+			o_mat_vector.resize(image.channels());
+			buf.images.resize(image.channels());
+			split(o_image, o_mat_vector);
+			for(int i = 0; i < o_mat_vector.size(); i ++)
+			{
+				buf.images[i] = oclMat(o_mat_vector[i]);
+			}
+			templ_sum    = sum(o_templ);
+			templ_sqsum  = sum(o_templ.mul(o_templ));
+#endif
+			float templ_sqsum_sum = 0;
+			for(int i = 0; i < image.channels(); i ++)
+			{
+				templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
+			}
+			templ_sum   *= scale;
+			buf.image_sums.resize(buf.images.size());
+			buf.image_sqsums.resize(buf.images.size());
+			
+			for(int i = 0; i < image.channels(); i ++)
+			{
+				integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
+			}
+			
+			switch(image.channels())
+			{
+			case 4:
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum_sum) );
+				break;
+			default:
+				CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
+				break;
+			}
+		}
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
+	}
+
+}/*ocl*/} /*cv*/
+
+void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method)
+{
+	MatchTemplateBuf buf;
+	matchTemplate(image,templ, result, method,buf);
+}
+void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf)
+{
+	CV_Assert(image.type() == templ.type());
+	CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
+
+	typedef void (*Caller)(const oclMat&, const oclMat&, oclMat&, MatchTemplateBuf&);
+
+	const Caller callers[] = { 
+		::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED, 
+		::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED, 
+		::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
+	};
+
+	Caller caller = callers[method];
+	CV_Assert(caller);
+	caller(image, templ, result, buf);
+}
+#endif //
diff --git a/modules/ocl/src/pyrdown.cpp b/modules/ocl/src/pyrdown.cpp
new file mode 100644
index 0000000000..3f0a241cf7
--- /dev/null
+++ b/modules/ocl/src/pyrdown.cpp
@@ -0,0 +1,115 @@
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+using std::cout;
+using std::endl;
+
+namespace cv
+{
+    namespace ocl
+    {
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        extern const char *pyr_down;
+
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+/////////////////////// add subtract multiply divide /////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+template<typename T>
+void pyrdown_run(const oclMat &src, const oclMat &dst)
+{
+    CV_Assert(src.cols / 2 == dst.cols && src.rows / 2 == dst.rows);
+
+    CV_Assert(src.type() == dst.type());
+    CV_Assert(src.depth() != CV_8S);
+
+    Context  *clCxt = src.clCxt;
+    //int channels = dst.channels();
+    //int depth = dst.depth();
+
+    string kernelName = "pyrDown";
+
+    //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
+    //    {4, 0, 4, 4, 1, 1, 1},
+    //    {4, 0, 4, 4, 1, 1, 1},
+    //    {4, 0, 4, 4, 1, 1, 1}
+    //};
+
+    //size_t vector_length = vector_lengths[channels-1][depth];
+    //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
+
+    size_t localThreads[3]  = { 256, 1, 1 };
+    size_t globalThreads[3] = { src.cols, dst.rows, 1};
+
+    //int dst_step1 = dst.cols * dst.elemSize();
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
+
+    openCLExecuteKernel(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
+}
+void pyrdown_run(const oclMat &src, const oclMat &dst)
+{
+	switch(src.depth())
+	{
+	case 0:
+	    pyrdown_run<unsigned char>(src, dst);
+		break;
+
+	case 1:
+	    pyrdown_run<char>(src, dst);
+		break;
+
+	case 2:
+	    pyrdown_run<unsigned short>(src, dst);
+		break;
+
+	case 3:
+	    pyrdown_run<short>(src, dst);
+		break;
+
+	case 4:
+	    pyrdown_run<int>(src, dst);
+		break;
+
+	case 5:
+	    pyrdown_run<float>(src, dst);
+		break;
+
+	case 6:
+	    pyrdown_run<double>(src, dst);
+		break;
+
+	default:
+		break;
+	}
+}
+//////////////////////////////////////////////////////////////////////////////
+// pyrDown
+
+void cv::ocl::pyrDown(const oclMat& src, oclMat& dst)
+{
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+
+	//src.step = src.rows;
+
+    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
+
+	//dst.step = dst.rows;
+
+    pyrdown_run(src, dst);
+}
+
diff --git a/modules/ocl/src/pyrup.cpp b/modules/ocl/src/pyrup.cpp
new file mode 100644
index 0000000000..ee0dfe382d
--- /dev/null
+++ b/modules/ocl/src/pyrup.cpp
@@ -0,0 +1,88 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//		Zhang Chunpeng chunpeng@multicorewareinc.com
+//    
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/* Haar features calculation */
+//#define EMU
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#ifndef HAVE_OPENCL
+void cv::ocl::pyrUp(const oclMat&, GpuMat&, oclMat&) { throw_nogpu(); }
+#else
+
+namespace cv { namespace ocl 
+{ 
+	extern const char *pyr_up;
+	void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst)
+	{
+		dst.create(src.rows * 2, src.cols * 2, src.type());
+		Context *clCxt = src.clCxt;
+		
+		const std::string kernelName = "pyrUp";
+  
+		std::vector< pair<size_t, const void *> > args;
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));
+		
+		size_t globalThreads[3] = {dst.cols, dst.rows, 1};
+		size_t localThreads[3]  = {16, 16, 1};
+	    
+		openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
+	}
+}};
+#endif // HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/ocl/test/test_blend.cpp b/modules/ocl/test/test_blend.cpp
new file mode 100644
index 0000000000..a0391b1bb0
--- /dev/null
+++ b/modules/ocl/test/test_blend.cpp
@@ -0,0 +1,83 @@
+#include "precomp.hpp"
+#include <iomanip>
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+template <typename T>
+void blendLinearGold(const cv::Mat& img1, const cv::Mat& img2, const cv::Mat& weights1, const cv::Mat& weights2, cv::Mat& result_gold)
+{
+    result_gold.create(img1.size(), img1.type());
+
+    int cn = img1.channels();
+
+    for (int y = 0; y < img1.rows; ++y)
+    {
+        const float* weights1_row = weights1.ptr<float>(y);
+        const float* weights2_row = weights2.ptr<float>(y);
+        const T* img1_row = img1.ptr<T>(y);
+        const T* img2_row = img2.ptr<T>(y);
+        T* result_gold_row = result_gold.ptr<T>(y);
+
+        for (int x = 0; x < img1.cols * cn; ++x)
+        {
+            float w1 = weights1_row[x / cn];
+            float w2 = weights2_row[x / cn];
+            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));
+        }
+    }
+}
+
+PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/)
+{
+    std::vector<cv::ocl::Info> oclinfo;
+    cv::Size size;
+    int type;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        //devInfo = GET_PARAM(0);
+        size = GET_PARAM(0);
+        type = GET_PARAM(1);
+        /*useRoi = GET_PARAM(3);*/
+
+        int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+        CV_Assert(devnums > 0);
+    }
+};
+
+TEST_P(Blend, Accuracy)
+{
+    int depth = CV_MAT_DEPTH(type);
+
+    cv::Mat img1 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0);
+    cv::Mat img2 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0);
+    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
+    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
+
+	cv::ocl::oclMat gimg1(size, type), gimg2(size, type), gweights1(size, CV_32F), gweights2(size, CV_32F);
+	cv::ocl::oclMat dst(size, type);
+	gimg1.upload(img1);
+	gimg2.upload(img2);
+	gweights1.upload(weights1);
+	gweights2.upload(weights2);
+	cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst);
+	cv::Mat result;
+    cv::Mat result_gold;
+	dst.download(result);
+    if (depth == CV_8U)
+        blendLinearGold<uchar>(img1, img2, weights1, weights2, result_gold);
+    else
+        blendLinearGold<float>(img1, img2, weights1, weights2, result_gold);
+
+    EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1 : 1e-5f, NULL)
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
+	DIFFERENT_SIZES,
+	testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4))
+));
\ No newline at end of file
diff --git a/modules/ocl/test/test_columnsum.cpp b/modules/ocl/test/test_columnsum.cpp
new file mode 100644
index 0000000000..94e109d200
--- /dev/null
+++ b/modules/ocl/test/test_columnsum.cpp
@@ -0,0 +1,108 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//	   Chunpeng Zhang chunpeng@multicorewareinc.com
+//    
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+
+///////////////////////////////////////////////////////////////////////////////
+/// ColumnSum
+
+#ifdef HAVE_OPENCL
+
+////////////////////////////////////////////////////////////////////////
+// ColumnSum
+
+PARAM_TEST_CASE(ColumnSum, cv::Size, bool )
+{
+    cv::Size size;
+    cv::Mat src;
+	bool useRoi;
+	std::vector<cv::ocl::Info> oclinfo;
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+		useRoi = GET_PARAM(1);
+        int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+        CV_Assert(devnums > 0);
+    }
+};
+
+TEST_P(ColumnSum, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_32FC1);
+	//cv::Mat src(size,CV_32FC1);
+
+	//cv::ocl::oclMat d_dst = ::createMat(size,src.type(),useRoi);
+	cv::ocl::oclMat d_dst = loadMat(src,useRoi);
+
+    cv::ocl::columnSum(loadMat(src,useRoi),d_dst);
+
+    cv::Mat dst(d_dst);
+
+    for (int j = 0; j < src.cols; ++j)
+    {
+        float gold = src.at<float>(0, j);
+        float res = dst.at<float>(0, j);
+        ASSERT_NEAR(res, gold, 1e-5);
+    }
+
+    for (int i = 1; i < src.rows; ++i)
+    {
+        for (int j = 0; j < src.cols; ++j)
+        {
+            float gold = src.at<float>(i, j) += src.at<float>(i - 1, j);
+            float res = dst.at<float>(i, j);
+            ASSERT_NEAR(res, gold, 1e-5);
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ColumnSum, testing::Combine(
+						DIFFERENT_SIZES,testing::Values(Inverse(false),Inverse(true))));
+
+
+#endif 
diff --git a/modules/ocl/test/test_fft.cpp b/modules/ocl/test/test_fft.cpp
new file mode 100644
index 0000000000..4b51d4feca
--- /dev/null
+++ b/modules/ocl/test/test_fft.cpp
@@ -0,0 +1,97 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+using namespace std;
+#ifdef HAVE_CLAMDFFT
+////////////////////////////////////////////////////////////////////////////
+// Dft
+PARAM_TEST_CASE(Dft, cv::Size, bool) 
+{
+	cv::Size dft_size;
+	bool	 dft_rows;
+	std::vector<cv::ocl::Info> oclinfo;
+    virtual void SetUp()
+    {
+	    int devnums = getDevice(oclinfo);
+        CV_Assert(devnums > 0);
+		dft_size = GET_PARAM(0);
+		dft_rows = GET_PARAM(1);
+    }
+};
+
+TEST_P(Dft, C2C)
+{
+	cv::Mat a = randomMat(dft_size, CV_32FC2, 0.0, 10.0);
+	cv::Mat b_gold;
+	int flags = 0;
+	flags |= dft_rows ? cv::DFT_ROWS : 0;
+
+	cv::ocl::oclMat d_b;
+	
+	cv::dft(a, b_gold, flags);
+	cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), flags);
+	EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4, "");
+}
+
+
+TEST_P(Dft, R2CthenC2R)
+{
+	cv::Mat a = randomMat(dft_size, CV_32FC1, 0.0, 10.0);
+	
+	int flags = 0;
+	//flags |= dft_rows ? cv::DFT_ROWS : 0; // not supported yet
+
+	cv::ocl::oclMat d_b, d_c;
+	cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), flags);
+	cv::ocl::dft(d_b, d_c, a.size(), flags + cv::DFT_INVERSE + cv::DFT_REAL_OUTPUT);
+	EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, "");
+}
+
+INSTANTIATE_TEST_CASE_P(ocl_DFT, Dft, testing::Combine(
+    testing::Values(cv::Size(5, 4), cv::Size(20, 20)),
+    testing::Values(false, true)));
+
+#endif // HAVE_CLAMDFFT
diff --git a/modules/ocl/test/test_gemm.cpp b/modules/ocl/test/test_gemm.cpp
new file mode 100644
index 0000000000..a836149cb0
--- /dev/null
+++ b/modules/ocl/test/test_gemm.cpp
@@ -0,0 +1,85 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "precomp.hpp"
+using namespace std;
+#ifdef HAVE_CLAMDBLAS
+////////////////////////////////////////////////////////////////////////////
+// GEMM
+PARAM_TEST_CASE(Gemm, int, cv::Size, int) 
+{
+	int      type;
+	cv::Size mat_size;
+	int		 flags;
+	vector<cv::ocl::Info> info;
+    virtual void SetUp()
+    {
+		type     = GET_PARAM(0);
+		mat_size = GET_PARAM(1);
+		flags    = GET_PARAM(2);
+		cv::ocl::getDevice(info);
+    }
+};
+
+TEST_P(Gemm, Accuracy)
+{
+	cv::Mat a = randomMat(mat_size, type, 0.0, 10.0);
+	cv::Mat b = randomMat(mat_size, type, 0.0, 10.0);
+	cv::Mat c = randomMat(mat_size, type, 0.0, 10.0);
+
+	cv::Mat dst;
+	cv::ocl::oclMat ocl_dst;
+
+	cv::gemm(a, b, 1.0, c, 1.0, dst, flags);
+	cv::ocl::gemm(cv::ocl::oclMat(a), cv::ocl::oclMat(b), 1.0, cv::ocl::oclMat(c), 1.0, ocl_dst, flags);
+
+	EXPECT_MAT_NEAR(dst, ocl_dst, mat_size.area() * 1e-4, "");
+}
+
+INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
+	testing::Values(CV_32FC1, CV_32FC2/*, CV_64FC1, CV_64FC2*/),
+    testing::Values(cv::Size(20, 20), cv::Size(300, 300)),
+    testing::Values(0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_1_T + cv::GEMM_2_T)));
+#endif
diff --git a/modules/ocl/test/test_match_template.cpp b/modules/ocl/test/test_match_template.cpp
new file mode 100644
index 0000000000..7d599a6152
--- /dev/null
+++ b/modules/ocl/test/test_match_template.cpp
@@ -0,0 +1,172 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "precomp.hpp"
+#define PERF_TEST 0
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate
+#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED))
+
+IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
+
+const char* TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
+
+PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMethod)
+{
+    cv::Size size;
+    cv::Size templ_size;
+    int cn;
+    int method;
+	std::vector<cv::ocl::Info> oclinfo;
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+        templ_size = GET_PARAM(1);
+        cn = GET_PARAM(2);
+        method = GET_PARAM(3);
+        int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+        CV_Assert(devnums > 0);
+    }
+};
+
+TEST_P(MatchTemplate8U, Accuracy)
+{
+
+	std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
+	std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
+	std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
+	std::cout << "Channels: " << cn << std::endl;
+
+	cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
+    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
+
+    cv::ocl::oclMat dst, ocl_image(image), ocl_templ(templ);
+	cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
+
+    cv::Mat dst_gold;
+    cv::matchTemplate(image, templ, dst_gold, method);
+
+	char sss [100] = "";
+
+	cv::Mat mat_dst;
+	dst.download(mat_dst);
+
+
+    EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss);
+
+#if PERF_TEST
+	{
+		P_TEST_FULL({}, {cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);}, {});
+		P_TEST_FULL({}, {cv::matchTemplate(image, templ, dst_gold, method);}, {});
+	}
+#endif // PERF_TEST
+}
+
+PARAM_TEST_CASE(MatchTemplate32F, cv::Size, TemplateSize, Channels, TemplateMethod)
+{
+    cv::Size size;
+    cv::Size templ_size;
+    int cn;
+    int method;
+	std::vector<cv::ocl::Info> oclinfo;
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+        templ_size = GET_PARAM(1);
+        cn = GET_PARAM(2);
+        method = GET_PARAM(3);
+        int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+        CV_Assert(devnums > 0);
+    }
+};
+
+TEST_P(MatchTemplate32F, Accuracy)
+{
+    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
+    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
+
+    cv::ocl::oclMat dst, ocl_image(image), ocl_templ(templ);
+	cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
+
+    cv::Mat dst_gold;
+    cv::matchTemplate(image, templ, dst_gold, method);
+
+	char sss [100] = "";
+
+	cv::Mat mat_dst;
+	dst.download(mat_dst);
+
+    EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss);
+
+#if PERF_TEST
+	{
+		std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
+		std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
+		std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
+		std::cout << "Channels: " << cn << std::endl;
+		P_TEST_FULL({}, {cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);}, {});
+		P_TEST_FULL({}, {cv::matchTemplate(image, templ, dst_gold, method);}, {});
+	}
+#endif // PERF_TEST
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, 
+	testing::Combine(
+    DIFFERENT_SIZES,
+    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+    testing::Values(Channels(1), Channels(4)),
+	ALL_TEMPLATE_METHODS
+	)
+);
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
+    DIFFERENT_SIZES,
+    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+    testing::Values(Channels(1), Channels(4)),
+    testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
+
diff --git a/modules/ocl/test/test_pyrdown.cpp b/modules/ocl/test/test_pyrdown.cpp
new file mode 100644
index 0000000000..f2270b4a8c
--- /dev/null
+++ b/modules/ocl/test/test_pyrdown.cpp
@@ -0,0 +1,295 @@
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Dachuan Zhao, dachuan@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+//#define PRINT_CPU_TIME 1000
+//#define PRINT_TIME
+
+
+#include "precomp.hpp"
+#include <iomanip>
+
+#ifdef HAVE_OPENCL
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+PARAM_TEST_CASE(PyrDown, MatType, bool)
+{
+    int type;
+    cv::Scalar val;
+
+    //src mat
+    cv::Mat mat1;
+    cv::Mat mat2;
+    cv::Mat mask;
+    cv::Mat dst;
+    cv::Mat dst1; //bak, for two outputs
+
+    // set up roi
+    int roicols;
+    int roirows;
+    int src1x;
+    int src1y;
+    int src2x;
+    int src2y;
+    int dstx;
+    int dsty;
+    int maskx;
+    int masky;
+
+
+    //src mat with roi
+    cv::Mat mat1_roi;
+    cv::Mat mat2_roi;
+    cv::Mat mask_roi;
+    cv::Mat dst_roi;
+    cv::Mat dst1_roi; //bak
+    std::vector<cv::ocl::Info> oclinfo;
+    //ocl dst mat for testing
+    cv::ocl::oclMat gdst_whole;
+    cv::ocl::oclMat gdst1_whole; //bak
+
+    //ocl mat with roi
+    cv::ocl::oclMat gmat1;
+    cv::ocl::oclMat gmat2;
+    cv::ocl::oclMat gdst;
+    cv::ocl::oclMat gdst1;   //bak
+    cv::ocl::oclMat gmask;
+
+    virtual void SetUp()
+    {
+        type = GET_PARAM(0);
+
+        cv::RNG &rng = TS::ptr()->get_rng();
+
+        cv::Size size(MWIDTH, MHEIGHT);
+
+        mat1 = randomMat(rng, size, type, 5, 16, false);
+        mat2 = randomMat(rng, size, type, 5, 16, false);
+        dst  = randomMat(rng, size, type, 5, 16, false);
+        dst1  = randomMat(rng, size, type, 5, 16, false);
+        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+
+        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
+
+        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+
+        int devnums = getDevice(oclinfo);
+        CV_Assert(devnums > 0);
+        //if you want to use undefault device, set it here
+        //setDevice(oclinfo[0]);
+    }
+
+	void Cleanup()
+	{
+		mat1.release();
+		mat2.release();
+		mask.release();
+		dst.release();
+		dst1.release();
+		mat1_roi.release();
+		mat2_roi.release();
+		mask_roi.release();
+		dst_roi.release();
+		dst1_roi.release();
+
+		gdst_whole.release();
+		gdst1_whole.release();
+		gmat1.release();
+		gmat2.release();
+		gdst.release();
+		gdst1.release();
+		gmask.release();
+	}
+
+    void random_roi()
+    {
+        cv::RNG &rng = TS::ptr()->get_rng();
+
+#ifdef RANDOMROI
+        //randomize ROI
+        roicols = rng.uniform(1, mat1.cols);
+        roirows = rng.uniform(1, mat1.rows);
+        src1x   = rng.uniform(0, mat1.cols - roicols);
+        src1y   = rng.uniform(0, mat1.rows - roirows);
+        dstx    = rng.uniform(0, dst.cols  - roicols);
+        dsty    = rng.uniform(0, dst.rows  - roirows);
+#else
+        roicols = mat1.cols;
+        roirows = mat1.rows;
+        src1x = 0;
+        src1y = 0;
+        dstx = 0;
+        dsty = 0;
+#endif
+        maskx   = rng.uniform(0, mask.cols - roicols);
+        masky   = rng.uniform(0, mask.rows - roirows);
+        src2x   = rng.uniform(0, mat2.cols - roicols);
+        src2y   = rng.uniform(0, mat2.rows - roirows);
+        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
+        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
+        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
+        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
+
+        gdst_whole = dst;
+        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+        gdst1_whole = dst1;
+        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
+
+        gmat1 = mat1_roi;
+        gmat2 = mat2_roi;
+        gmask = mask_roi; //end
+    }
+
+};
+
+#define VARNAME(A) string(#A);
+
+
+void PrePrint()
+{
+		//for(int i = 0; i < MHEIGHT; i++)
+		//{
+		//	printf("(%d) ", i);
+		//	for(int k = 0; k < MWIDTH; k++)
+		//	{
+		//		printf("%d ", mat1_roi.data[i * MHEIGHT + k]);
+		//	}
+		//	printf("\n");
+		//}
+}
+
+void PostPrint()
+{
+		//dst_roi.convertTo(dst_roi,CV_32S);
+		//cpu_dst.convertTo(cpu_dst,CV_32S);
+		//dst_roi -= cpu_dst;
+		//cpu_dst -= dst_roi;
+		//for(int i = 0; i < MHEIGHT / 2; i++)
+		//{
+		//	printf("(%d) ", i);
+		//	for(int k = 0; k < MWIDTH / 2; k++)
+		//	{
+		//		if(gmat1.depth() == 0)
+		//		{
+		//			if(gmat1.channels() == 1)
+		//			{
+		//				printf("%d ", dst_roi.data[i * MHEIGHT / 2 + k]);
+		//			}
+		//			else
+		//			{
+		//				printf("%d ", ((unsigned*)dst_roi.data)[i * MHEIGHT / 2 + k]);
+		//			}
+		//		}
+		//		else if(gmat1.depth() == 5)
+		//		{
+		//			printf("%.6f ", ((float*)dst_roi.data)[i * MHEIGHT / 2 + k]);
+		//		}
+		//	}
+		//	printf("\n");
+		//}
+		//for(int i = 0; i < MHEIGHT / 2; i++)
+		//{
+		//	printf("(%d) ", i);
+		//	for(int k = 0; k < MWIDTH / 2; k++)
+		//	{
+		//		if(gmat1.depth() == 0)
+		//		{
+		//			if(gmat1.channels() == 1)
+		//			{
+		//				printf("%d ", cpu_dst.data[i * MHEIGHT / 2 + k]);
+		//			}
+		//			else
+		//			{
+		//				printf("%d ", ((unsigned*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
+		//			}
+		//		}
+		//		else if(gmat1.depth() == 5)
+		//		{
+		//			printf("%.6f ", ((float*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
+		//		}
+		//	}
+		//	printf("\n");
+		//}
+}
+
+////////////////////////////////PyrDown/////////////////////////////////////////////////
+//struct PyrDown : ArithmTestBase {};
+
+TEST_P(PyrDown, Mat)
+{
+    for(int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+
+		cv::pyrDown(mat1_roi, dst_roi);
+		cv::ocl::pyrDown(gmat1, gdst);
+
+        cv::Mat cpu_dst;
+        gdst.download(cpu_dst);
+        char s[1024];
+        sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
+
+		EXPECT_MAT_NEAR(dst_roi, cpu_dst, dst_roi.depth() == CV_32F ? 1e-5f : 1.0f, s);
+
+		Cleanup();
+    }
+}
+
+
+
+
+//********test****************
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
+                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+                            Values(false))); // Values(false) is the reserved parameter
+
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_pyrup.cpp b/modules/ocl/test/test_pyrup.cpp
new file mode 100644
index 0000000000..c6c5b9c10c
--- /dev/null
+++ b/modules/ocl/test/test_pyrup.cpp
@@ -0,0 +1,91 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Chunpeng chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include "opencv2/core/core.hpp"
+
+#ifdef HAVE_OPENCL
+
+
+PARAM_TEST_CASE(PyrUp,cv::Size,int)
+{
+	cv::Size size;
+	int type;
+	std::vector<cv::ocl::Info> oclinfo;
+
+	virtual void SetUp()
+	{
+		int devnums = cv::ocl::getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+		CV_Assert(devnums > 0);
+		size = GET_PARAM(0);
+		type = GET_PARAM(1);
+	}
+};
+
+TEST_P(PyrUp,Accuracy)
+{
+	cv::Mat src = randomMat(size,type);
+	
+
+	cv::Mat dst_gold;
+	cv::pyrUp(src,dst_gold);
+
+	cv::ocl::oclMat dst;
+	cv::ocl::oclMat srcMat(src);
+	cv::ocl::pyrUp(srcMat,dst);
+	char s[100]={0};
+
+	EXPECT_MAT_NEAR(dst_gold, dst, (src.depth() == CV_32F ? 1e-4f : 1.0),s);	
+	
+}
+
+#if 1
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
+    testing::Values(cv::Size(32, 32)),
+    testing::Values(MatType(CV_8UC1),MatType(CV_16UC1),MatType(CV_32FC1),MatType(CV_8UC4),
+	MatType(CV_16UC4),MatType(CV_32FC4))));
+#endif
+
+#endif // HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/ts/misc/run.py b/modules/ts/misc/run.py
index 79dad8d8fd..737a2eea35 100644
--- a/modules/ts/misc/run.py
+++ b/modules/ts/misc/run.py
@@ -337,10 +337,9 @@ class RunInfo(object):
 
     def getSvnVersion(self, path, name):
         if not path:
-            setattr(self, name, None)
-            return
-        if not self.svnversion_path and hostos == 'nt':
-            self.tryGetSvnVersionWithTortoise(path, name)
+            val = None
+        elif not self.svnversion_path and hostos == 'nt':
+            val = self.tryGetSvnVersionWithTortoise(path, name)
         else:
             svnversion = self.svnversion_path
             if not svnversion:
@@ -348,11 +347,14 @@ class RunInfo(object):
             try:
                 output = Popen([svnversion, "-n", path], stdout=PIPE, stderr=PIPE).communicate()
                 if not output[1]:
-                    setattr(self, name, output[0])
+                    val = output[0]
                 else:
-                    setattr(self, name, None)
+                    val = None
             except OSError:
-                setattr(self, name, None)
+                val = None
+        if val:
+            val = val.replace(" ", "_")
+        setattr(self, name, val)
 
     def tryGetSvnVersionWithTortoise(self, path, name):
         try:
@@ -371,9 +373,9 @@ class RunInfo(object):
                 tmpfile = open(tmpfilename2, "r")
                 version = tmpfile.read()
                 tmpfile.close()
-            setattr(self, name, version)
+            return version
         except:
-            setattr(self, name, None)
+            return None
         finally:
             if dir:
                 shutil.rmtree(dir)
diff --git a/modules/video/src/bgfg_gmg.cpp b/modules/video/src/bgfg_gmg.cpp
index 163445a45b..b4e9824d69 100644
--- a/modules/video/src/bgfg_gmg.cpp
+++ b/modules/video/src/bgfg_gmg.cpp
@@ -440,8 +440,7 @@ bool BackgroundSubtractorGMG::HistogramFeatureGMG::operator ==(HistogramFeatureG
     std::vector<size_t>::iterator color_a;
     std::vector<size_t>::iterator color_b;
     std::vector<size_t>::iterator color_a_end = this->color.end();
-    std::vector<size_t>::iterator color_b_end = rhs.color.end();
-    for (color_a = color.begin(),color_b =rhs.color.begin();color_a!=color_a_end;++color_a,++color_b)
+    for (color_a = color.begin(), color_b = rhs.color.begin(); color_a != color_a_end; ++color_a, ++color_b)
     {
         if (*color_a != *color_b)
         {