diff --git a/.gitattributes b/.gitattributes
index af704cdf0c..cd4359ba34 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,7 +33,7 @@
 CMakeLists.txt  text whitespace=tabwidth=2
 
 *.png       binary
-*.jepg      binary
+*.jpeg      binary
 *.jpg       binary
 *.exr       binary
 *.ico       binary
diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt
index af1581349e..03183d1c2a 100644
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@@ -1,12 +1,30 @@
 #Cross compile TBB from source
 project(tbb)
 
-# 4.1 update 2 - works fine
-set(tbb_ver "tbb41_20130116oss")
-set(tbb_url "http://threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb41_20130116oss_src.tgz")
-set(tbb_md5 "3809790e1001a1b32d59c9fee590ee85")
+if (WIN32 AND NOT ARM)
+  message(FATAL_ERROR "BUILD_TBB option supports Windows on ARM only!\nUse regular official TBB build instead of the BUILD_TBB option!")
+endif()
+
+# 4.1 update 4 - works fine
+set(tbb_ver "tbb41_20130613oss")
+set(tbb_url "http://threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb41_20130613oss_src.tgz")
+set(tbb_md5 "108c8c1e481b0aaea61878289eb28b6a")
 set(tbb_version_file "version_string.ver")
-ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow -Wunused-parameter)
+
+# 4.1 update 3 dev - works fine
+#set(tbb_ver "tbb41_20130401oss")
+#set(tbb_url "http://threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb41_20130401oss_src.tgz")
+#set(tbb_md5 "f2f591a0d2ca8f801e221ce7d9ea84bb")
+#set(tbb_version_file "version_string.ver")
+#ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
+
+# 4.1 update 2 - works fine
+#set(tbb_ver "tbb41_20130116oss")
+#set(tbb_url "http://threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb41_20130116oss_src.tgz")
+#set(tbb_md5 "3809790e1001a1b32d59c9fee590ee85")
+#set(tbb_version_file "version_string.ver")
+#ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
 
 # 4.1 update 1 - works fine
 #set(tbb_ver "tbb41_20121003oss")
@@ -107,7 +125,7 @@ if(NOT EXISTS "${tbb_src_dir}")
               RESULT_VARIABLE tbb_untar_RESULT)
 
   if(NOT tbb_untar_RESULT EQUAL 0 OR NOT EXISTS "${tbb_src_dir}")
-    message(FATAL_ERROR "Failed to unpack TBB sources")
+    message(FATAL_ERROR "Failed to unpack TBB sources from ${tbb_tarball} to ${tbb_src_dir} with error ${tbb_untar_RESULT}")
   endif()
 endif()
 
@@ -123,13 +141,22 @@ file(GLOB lib_hdrs "${tbb_src_dir}/src/tbb/*.h")
 list(APPEND lib_srcs "${tbb_src_dir}/src/rml/client/rml_tbb.cpp")
 
 if (WIN32)
-  add_definitions(-D__TBB_DYNAMIC_LOAD_ENABLED=0
-                -D__TBB_BUILD=1
-                -D_UNICODE
-                -DUNICODE
-                -DWINAPI_FAMILY=WINAPI_FAMILY_APP
-                -DDO_ITT_NOTIFY=0
+  add_definitions(/D__TBB_DYNAMIC_LOAD_ENABLED=0
+                  /D__TBB_BUILD=1
+                  /DTBB_NO_LEGACY=1
+                  /D_UNICODE
+                  /DUNICODE
+                  /DWINAPI_FAMILY=WINAPI_FAMILY_APP
+                  /DDO_ITT_NOTIFY=0
+                  /DUSE_WINTHREAD
                ) # defines were copied from windows.cl.inc
+
+  if (ARM)
+    add_definitions(/D_WIN32_WINNT=0x0602
+                    /D__TBB_WIN32_USE_CL_BUILTINS
+                   )
+  endif()
+
 set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} /APPCONTAINER")
 else()
   add_definitions(-D__TBB_DYNAMIC_LOAD_ENABLED=0         #required
@@ -173,7 +200,23 @@ endif()
 set(TBB_SOURCE_FILES ${TBB_SOURCE_FILES} "${CMAKE_CURRENT_SOURCE_DIR}/${tbb_version_file}")
 
 add_library(tbb ${TBB_SOURCE_FILES})
-target_link_libraries(tbb c m dl)
+
+if (WIN32)
+  if (ARM)
+    set(platform_macro /D_M_ARM=1)
+  endif()
+
+  add_custom_command(TARGET tbb
+                     PRE_BUILD
+                     COMMAND ${CMAKE_C_COMPILER} /nologo /TC /EP ${tbb_src_dir}\\src\\tbb\\win32-tbb-export.def /DTBB_NO_LEGACY=1 /D_CRT_SECURE_NO_DEPRECATE /D__TBB_BUILD=1 ${platform_macro} /I${tbb_src_dir}\\src /I${tbb_src_dir}\\include > "${tbb_src_dir}\\src\\tbb\\tbb.def"
+                     WORKING_DIRECTORY ${tbb_src_dir}\\src\\tbb
+                     COMMENT "Generating tbb.def file" VERBATIM
+                    )
+
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEF:${tbb_src_dir}/src/tbb/tbb.def /DLL /MAP /fixed:no /INCREMENTAL:NO")
+else()
+  target_link_libraries(tbb c m dl)
+endif()
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations)
 string(REPLACE "-Werror=non-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
@@ -182,6 +225,7 @@ set_target_properties(tbb
   PROPERTIES OUTPUT_NAME tbb
   DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
   ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}
+  RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
   )
 
 if(ENABLE_SOLUTION_FOLDERS)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93549c9430..f464b2263c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,19 @@ if(UNIX AND NOT ANDROID)
   endif()
 endif()
 
+# Add these standard paths to the search paths for FIND_PATH
+# to find include files from these locations first
+if(MINGW)
+  if(EXISTS /mingw)
+      list(APPEND CMAKE_INCLUDE_PATH /mingw)
+  endif()
+  if(EXISTS /mingw32)
+      list(APPEND CMAKE_INCLUDE_PATH /mingw32)
+  endif()
+  if(EXISTS /mingw64)
+      list(APPEND CMAKE_INCLUDE_PATH /mingw64)
+  endif()
+endif()
 
 # ----------------------------------------------------------------------------
 # OpenCV cmake options
@@ -110,7 +123,7 @@ endif()
 
 # Optional 3rd party components
 # ===================================================
-OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (UNIX AND NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_AVFOUNDATION   "Use AVFoundation for Video I/O"              ON   IF IOS)
 OCV_OPTION(WITH_CARBON         "Use Carbon for UI instead of Cocoa"          OFF  IF APPLE )
 OCV_OPTION(WITH_CUDA           "Include NVidia Cuda Runtime support"         ON   IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
@@ -286,6 +299,10 @@ set(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to c
 add_definitions(-DHAVE_CVCONFIG_H)
 ocv_include_directories(${OPENCV_CONFIG_FILE_INCLUDE_DIR})
 
+# ----------------------------------------------------------------------------
+#  Path for additional modules
+# ----------------------------------------------------------------------------
+set(OPENCV_EXTRA_MODULES_PATH "" CACHE PATH "Where to look for additional OpenCV modules")
 
 # ----------------------------------------------------------------------------
 #  Autodetect if we are in a GIT repository
@@ -402,7 +419,7 @@ if(ANDROID)
   if(NOT ANDROID_TOOLS_Pkg_Revision GREATER 13)
     message(WARNING "OpenCV requires Android SDK tools revision 14 or newer. Otherwise tests and samples will no be compiled.")
   endif()
-elseif(ANT_EXECUTABLE)
+else()
   find_package(JNI)
 endif()
 
@@ -456,15 +473,15 @@ if(BUILD_EXAMPLES OR BUILD_ANDROID_EXAMPLES OR INSTALL_PYTHON_EXAMPLES)
 endif()
 
 if(ANDROID)
-  add_subdirectory(android/service)
+  add_subdirectory(platforms/android/service)
 endif()
 
 if(BUILD_ANDROID_PACKAGE)
-  add_subdirectory(android/package)
+  add_subdirectory(platforms/android/package)
 endif()
 
 if (ANDROID)
-  add_subdirectory(android/libinfo)
+  add_subdirectory(platforms/android/libinfo)
 endif()
 
 # ----------------------------------------------------------------------------
@@ -830,7 +847,7 @@ status("    ant:"           ANT_EXECUTABLE      THEN "${ANT_EXECUTABLE} (ver ${A
 if(NOT ANDROID)
   status("    JNI:"         JNI_INCLUDE_DIRS    THEN "${JNI_INCLUDE_DIRS}"                                       ELSE NO)
 endif()
-status("    Java tests:"    BUILD_TESTS AND (NOT ANDROID OR CAN_BUILD_ANDROID_PROJECTS)                 THEN YES ELSE NO)
+status("    Java tests:"    BUILD_TESTS AND (CAN_BUILD_ANDROID_PROJECTS OR HAVE_opencv_java)            THEN YES ELSE NO)
 
 # ========================== documentation ==========================
 if(BUILD_DOCS)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
deleted file mode 100644
index 8fc54b1b8e..0000000000
--- a/CONTRIBUTING.md
+++ /dev/null
@@ -1,11 +0,0 @@
-We greatly appreciate your support and contributions and they are always welcomed!
-
-Github pull requests are the convenient way to contribute to OpenCV project. Good pull requests have all of these attributes:
-
-* Are scoped to one specific issue
-* Include a test to demonstrate the correctness
-* Update the docs if relevant
-* Match the [coding style guidelines](http://code.opencv.org/projects/opencv/wiki/CodingStyleGuide)
-* Don't messed by "oops" commits
-
-You can find more detailes about contributing process on http://opencv.org/contribute.html
\ No newline at end of file
diff --git a/README b/README
index 9dd45a230b..0799dff89f 100644
--- a/README
+++ b/README
@@ -4,3 +4,14 @@ Homepage:    http://opencv.org
 Online docs: http://docs.opencv.org
 Q&A forum:   http://answers.opencv.org
 Dev zone:    http://code.opencv.org
+
+Please read before starting work on a pull request:
+  http://code.opencv.org/projects/opencv/wiki/How_to_contribute
+
+Summary of guidelines:
+
+* One pull request per issue;
+* Choose the right base branch;
+* Include tests and documentation;
+* Clean up "oops" commits before submitting;
+* Follow the coding style guide.
diff --git a/android/scripts/build.cmd b/android/scripts/build.cmd
deleted file mode 100644
index 3e0f1666b6..0000000000
--- a/android/scripts/build.cmd
+++ /dev/null
@@ -1,90 +0,0 @@
-@ECHO OFF
-
-:: enable command extensions
-VERIFY BADVALUE 2>NUL
-SETLOCAL ENABLEEXTENSIONS || (ECHO Unable to enable command extensions. & EXIT \B)
-
-:: build environment
-SET SOURCE_DIR=%cd%
-IF EXIST .\android.toolchain.cmake (SET BUILD_OPENCV=1) ELSE (SET BUILD_OPENCV=0)
-IF EXIST .\jni\nul (SET BUILD_JAVA_PART=1) ELSE (SET BUILD_JAVA_PART=0)
-
-:: load configuration
-PUSHD %~dp0
-SET SCRIPTS_DIR=%cd%
-IF EXIST .\wincfg.cmd CALL .\wincfg.cmd
-POPD
-
-:: inherit old names
-IF NOT DEFINED CMAKE SET CMAKE=%CMAKE_EXE%
-IF NOT DEFINED MAKE SET MAKE=%MAKE_EXE%
-
-:: defaults
-IF NOT DEFINED BUILD_DIR SET BUILD_DIR=build
-IF NOT DEFINED ANDROID_ABI SET ANDROID_ABI=armeabi-v7a
-SET OPENCV_BUILD_DIR=%SCRIPTS_DIR%\..\%BUILD_DIR%
-
-:: check that all required variables defined
-PUSHD .
-IF NOT DEFINED ANDROID_NDK (ECHO. & ECHO You should set an environment variable ANDROID_NDK to the full path to your copy of Android NDK & GOTO end)
-(CD "%ANDROID_NDK%") || (ECHO. & ECHO Directory "%ANDROID_NDK%" specified by ANDROID_NDK variable does not exist & GOTO end)
-
-IF NOT EXIST "%CMAKE%" (ECHO. & ECHO You should set an environment variable CMAKE to the full path to cmake executable & GOTO end)
-IF NOT EXIST "%MAKE%" (ECHO. & ECHO You should set an environment variable MAKE to the full path to native port of make executable & GOTO end)
-
-IF NOT %BUILD_JAVA_PART%==1 GOTO required_variables_checked
-
-IF NOT DEFINED ANDROID_SDK (ECHO. & ECHO You should set an environment variable ANDROID_SDK to the full path to your copy of Android SDK & GOTO end)
-(CD "%ANDROID_SDK%" 2>NUL) || (ECHO. & ECHO Directory "%ANDROID_SDK%" specified by ANDROID_SDK variable does not exist & GOTO end)
-
-IF NOT DEFINED ANT_DIR (ECHO. & ECHO You should set an environment variable ANT_DIR to the full path to Apache Ant root & GOTO end)
-(CD "%ANT_DIR%" 2>NUL) || (ECHO. & ECHO Directory "%ANT_DIR%" specified by ANT_DIR variable does not exist & GOTO end)
-
-IF NOT DEFINED JAVA_HOME (ECHO. & ECHO You should set an environment variable JAVA_HOME to the full path to JDK & GOTO end)
-(CD "%JAVA_HOME%" 2>NUL) || (ECHO. & ECHO Directory "%JAVA_HOME%" specified by JAVA_HOME variable does not exist & GOTO end)
-
-:required_variables_checked
-POPD
-
-:: check for ninja
-echo "%MAKE%"|findstr /i ninja >nul:
-IF %errorlevel%==1 (SET BUILD_WITH_NINJA=0) ELSE (SET BUILD_WITH_NINJA=1)
-IF %BUILD_WITH_NINJA%==1 (SET CMAKE_GENERATOR=Ninja) ELSE (SET CMAKE_GENERATOR=MinGW Makefiles)
-
-:: create build dir
-IF DEFINED REBUILD rmdir /S /Q "%BUILD_DIR%" 2>NUL
-MKDIR "%BUILD_DIR%" 2>NUL
-PUSHD "%BUILD_DIR%" || (ECHO. & ECHO Directory "%BUILD_DIR%" is not found & GOTO end)
-
-:: run cmake
-ECHO. & ECHO Runnning cmake...
-ECHO ANDROID_ABI=%ANDROID_ABI%
-ECHO.
-IF NOT %BUILD_OPENCV%==1 GOTO other-cmake
-:opencv-cmake
-("%CMAKE%" -G"%CMAKE_GENERATOR%" -DANDROID_ABI="%ANDROID_ABI%" -DCMAKE_TOOLCHAIN_FILE="%SOURCE_DIR%"\android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%MAKE%" %* "%SOURCE_DIR%\..") && GOTO cmakefin
-ECHO. & ECHO cmake failed &	GOTO end
-:other-cmake
-("%CMAKE%" -G"%CMAKE_GENERATOR%" -DANDROID_ABI="%ANDROID_ABI%" -DOpenCV_DIR="%OPENCV_BUILD_DIR%" -DCMAKE_TOOLCHAIN_FILE="%OPENCV_BUILD_DIR%\..\android.toolchain.cmake" -DCMAKE_MAKE_PROGRAM="%MAKE%" %* "%SOURCE_DIR%") && GOTO cmakefin
-ECHO. & ECHO cmake failed &	GOTO end
-:cmakefin
-
-:: run make
-ECHO. & ECHO Building native libs...
-IF %BUILD_WITH_NINJA%==0 ("%MAKE%" -j %NUMBER_OF_PROCESSORS% VERBOSE=%VERBOSE%) || (ECHO. & ECHO make failed & GOTO end)
-IF %BUILD_WITH_NINJA%==1 ("%MAKE%") || (ECHO. & ECHO ninja failed & GOTO end)
-
-IF NOT %BUILD_JAVA_PART%==1 GOTO end
-POPD && PUSHD %SOURCE_DIR%
-
-:: configure java part
-ECHO. & ECHO Updating Android project...
-(CALL "%ANDROID_SDK%\tools\android" update project --name %PROJECT_NAME% --path .) || (ECHO. & ECHO failed to update android project & GOTO end)
-
-:: compile java part
-ECHO. & ECHO Compiling Android project...
-(CALL "%ANT_DIR%\bin\ant" debug) || (ECHO. & ECHO failed to compile android project & GOTO end)
-
-:end
-POPD
-ENDLOCAL
diff --git a/android/scripts/cmake_android.cmd b/android/scripts/cmake_android.cmd
deleted file mode 100644
index 212c04b47e..0000000000
--- a/android/scripts/cmake_android.cmd
+++ /dev/null
@@ -1,5 +0,0 @@
-@ECHO OFF
-
-PUSHD %~dp0..
-CALL .\scripts\build.cmd %* -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
-POPD
\ No newline at end of file
diff --git a/android/scripts/cmake_android_armeabi.sh b/android/scripts/cmake_android_armeabi.sh
deleted file mode 100755
index 9c711d8855..0000000000
--- a/android/scripts/cmake_android_armeabi.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-cd `dirname $0`/..
-
-mkdir -p build_armeabi
-cd build_armeabi
-
-cmake -DANDROID_ABI=armeabi -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake $@ ../..
-
diff --git a/android/scripts/cmake_android_mips.sh b/android/scripts/cmake_android_mips.sh
deleted file mode 100755
index 17d2ff937e..0000000000
--- a/android/scripts/cmake_android_mips.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-cd `dirname $0`/..
-
-mkdir -p build_mips
-cd build_mips
-
-cmake -DANDROID_ABI=mips -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake $@ ../..
-
diff --git a/android/scripts/cmake_android_neon.sh b/android/scripts/cmake_android_neon.sh
deleted file mode 100755
index 5e85605b56..0000000000
--- a/android/scripts/cmake_android_neon.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-cd `dirname $0`/..
-
-mkdir -p build_neon
-cd build_neon
-
-cmake -DANDROID_ABI="armeabi-v7a with NEON" -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake $@ ../..
-
diff --git a/android/scripts/cmake_android_service.sh b/android/scripts/cmake_android_service.sh
deleted file mode 100755
index 0dbd482520..0000000000
--- a/android/scripts/cmake_android_service.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/sh
-cd `dirname $0`/..
-
-mkdir -p build_service
-cd build_service
-
-cmake -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake -DANDROID_TOOLCHAIN_NAME="arm-linux-androideabi-4.4.3" -DANDROID_STL=stlport_static -DANDROID_STL_FORCE_FEATURES=OFF -DBUILD_ANDROID_SERVICE=ON -DANDROID_SOURCE_TREE=~/Projects/AndroidSource/ServiceStub/ $@ ../..
diff --git a/android/scripts/cmake_android_x86.sh b/android/scripts/cmake_android_x86.sh
deleted file mode 100755
index a01df2e668..0000000000
--- a/android/scripts/cmake_android_x86.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/sh
-
-cd `dirname $0`/..
-
-mkdir -p build_x86
-cd build_x86
-
-cmake -DANDROID_ABI=x86 -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake $@ ../..
-
diff --git a/android/scripts/wincfg.cmd.tmpl b/android/scripts/wincfg.cmd.tmpl
deleted file mode 100644
index 166a5e7b02..0000000000
--- a/android/scripts/wincfg.cmd.tmpl
+++ /dev/null
@@ -1,30 +0,0 @@
-:: variables required for OpenCV build ::
-:: Note: all pathes should be specified without tailing slashes!
-SET ANDROID_NDK=C:\full\path\to\your\copy\of\android\NDK\android-ndk-r7b
-SET CMAKE_EXE=C:\full\path\to\cmake\utility\cmake.exe
-SET MAKE_EXE=%ANDROID_NDK%\prebuilt\windows\bin\make.exe
-
-:: variables required for android-opencv build ::
-SET ANDROID_SDK=C:\full\path\to\your\copy\of\android\SDK\android-sdk-windows
-SET ANT_DIR=C:\full\path\to\ant\directory\apache-ant-1.8.2
-SET JAVA_HOME=C:\full\path\to\JDK\jdk1.6.0_25
-
-:: configuration options ::
-:::: general ARM-V7 settings
-SET ANDROID_ABI=armeabi-v7a
-SET BUILD_DIR=build
-
-:::: uncomment following lines to compile for old emulator or old device
-::SET ANDROID_ABI=armeabi
-::SET BUILD_DIR=build_armeabi
-
-:::: uncomment following lines to compile for ARM-V7 with NEON support
-::SET ANDROID_ABI=armeabi-v7a with NEON
-::SET BUILD_DIR=build_neon
-
-:::: uncomment following lines to compile for x86
-::SET ANDROID_ABI=x86
-::SET BUILD_DIR=build_x86
-
-:::: other options
-::SET ANDROID_NATIVE_API_LEVEL=8   &:: android-3 is enough for native part of OpenCV but android-8 is required for Java API
diff --git a/android/service/doc/Makefile b/android/service/doc/Makefile
deleted file mode 100644
index b8e7bba113..0000000000
--- a/android/service/doc/Makefile
+++ /dev/null
@@ -1,89 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-PAPER         =
-BUILDDIR      = _build
-
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html      to make standalone HTML files"
-	@echo "  dirhtml   to make HTML files named index.html in directories"
-	@echo "  pickle    to make pickle files"
-	@echo "  json      to make JSON files"
-	@echo "  htmlhelp  to make HTML files and a HTML help project"
-	@echo "  qthelp    to make HTML files and a qthelp project"
-	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  changes   to make an overview of all changed/added/deprecated items"
-	@echo "  linkcheck to check all external links for integrity"
-	@echo "  doctest   to run all doctests embedded in the documentation (if enabled)"
-
-clean:
-	-rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/OpenCVEngine.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/OpenCVEngine.qhc"
-
-latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo
-	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
-	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
-	      "run these through (pdf)latex."
-
-changes:
-	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
-	@echo
-	@echo "The overview file is in $(BUILDDIR)/changes."
-
-linkcheck:
-	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
-	@echo
-	@echo "Link check complete; look for any errors in the above output " \
-	      "or in $(BUILDDIR)/linkcheck/output.txt."
-
-doctest:
-	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
-	@echo "Testing of doctests in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/apps/traincascade/boost.cpp b/apps/traincascade/boost.cpp
index 2d29f338b0..4f91d5a29d 100644
--- a/apps/traincascade/boost.cpp
+++ b/apps/traincascade/boost.cpp
@@ -766,7 +766,7 @@ float CvCascadeBoostTrainData::getVarValue( int vi, int si )
 }
 
 
-struct FeatureIdxOnlyPrecalc
+struct FeatureIdxOnlyPrecalc : ParallelLoopBody
 {
     FeatureIdxOnlyPrecalc( const CvFeatureEvaluator* _featureEvaluator, CvMat* _buf, int _sample_count, bool _is_buf_16u )
     {
@@ -776,11 +776,11 @@ struct FeatureIdxOnlyPrecalc
         idst = _buf->data.i;
         is_buf_16u = _is_buf_16u;
     }
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
         cv::AutoBuffer<float> valCache(sample_count);
         float* valCachePtr = (float*)valCache;
-        for ( int fi = range.begin(); fi < range.end(); fi++)
+        for ( int fi = range.start; fi < range.end; fi++)
         {
             for( int si = 0; si < sample_count; si++ )
             {
@@ -803,7 +803,7 @@ struct FeatureIdxOnlyPrecalc
     bool is_buf_16u;
 };
 
-struct FeatureValAndIdxPrecalc
+struct FeatureValAndIdxPrecalc : ParallelLoopBody
 {
     FeatureValAndIdxPrecalc( const CvFeatureEvaluator* _featureEvaluator, CvMat* _buf, Mat* _valCache, int _sample_count, bool _is_buf_16u )
     {
@@ -814,9 +814,9 @@ struct FeatureValAndIdxPrecalc
         idst = _buf->data.i;
         is_buf_16u = _is_buf_16u;
     }
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
-        for ( int fi = range.begin(); fi < range.end(); fi++)
+        for ( int fi = range.start; fi < range.end; fi++)
         {
             for( int si = 0; si < sample_count; si++ )
             {
@@ -840,7 +840,7 @@ struct FeatureValAndIdxPrecalc
     bool is_buf_16u;
 };
 
-struct FeatureValOnlyPrecalc
+struct FeatureValOnlyPrecalc : ParallelLoopBody
 {
     FeatureValOnlyPrecalc( const CvFeatureEvaluator* _featureEvaluator, Mat* _valCache, int _sample_count )
     {
@@ -848,9 +848,9 @@ struct FeatureValOnlyPrecalc
         valCache = _valCache;
         sample_count = _sample_count;
     }
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
-        for ( int fi = range.begin(); fi < range.end(); fi++)
+        for ( int fi = range.start; fi < range.end; fi++)
             for( int si = 0; si < sample_count; si++ )
                 valCache->at<float>(fi,si) = (*featureEvaluator)( fi, si );
     }
@@ -864,12 +864,12 @@ void CvCascadeBoostTrainData::precalculate()
     int minNum = MIN( numPrecalcVal, numPrecalcIdx);
 
     double proctime = -TIME( 0 );
-    parallel_for( BlockedRange(numPrecalcVal, numPrecalcIdx),
-                  FeatureIdxOnlyPrecalc(featureEvaluator, buf, sample_count, is_buf_16u!=0) );
-    parallel_for( BlockedRange(0, minNum),
-                  FeatureValAndIdxPrecalc(featureEvaluator, buf, &valCache, sample_count, is_buf_16u!=0) );
-    parallel_for( BlockedRange(minNum, numPrecalcVal),
-                  FeatureValOnlyPrecalc(featureEvaluator, &valCache, sample_count) );
+    parallel_for_( Range(numPrecalcVal, numPrecalcIdx),
+                   FeatureIdxOnlyPrecalc(featureEvaluator, buf, sample_count, is_buf_16u!=0) );
+    parallel_for_( Range(0, minNum),
+                   FeatureValAndIdxPrecalc(featureEvaluator, buf, &valCache, sample_count, is_buf_16u!=0) );
+    parallel_for_( Range(minNum, numPrecalcVal),
+                   FeatureValOnlyPrecalc(featureEvaluator, &valCache, sample_count) );
     cout << "Precalculation time: " << (proctime + TIME( 0 )) << endl;
 }
 
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index aeed112ae0..7a91b188ae 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -47,6 +47,9 @@ macro(add_extra_compiler_option option)
   endif()
 endmacro()
 
+# OpenCV fails some tests when 'char' is 'unsigned' by default
+add_extra_compiler_option(-fsigned-char)
+
 if(MINGW)
   # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
   # here we are trying to workaround the problem
diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index f3d101ab21..8db667762e 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -26,6 +26,15 @@ if(CUDA_FOUND)
     set(HAVE_CUBLAS 1)
   endif()
 
+  if(${CUDA_VERSION} VERSION_LESS "5.5")
+    find_cuda_helper_libs(npp)
+  else()
+    find_cuda_helper_libs(nppc)
+    find_cuda_helper_libs(nppi)
+    find_cuda_helper_libs(npps)
+    set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY})
+  endif()
+
   if(WITH_NVCUVID)
     find_cuda_helper_libs(nvcuvid)
     set(HAVE_NVCUVID 1)
@@ -136,8 +145,6 @@ if(CUDA_FOUND)
 
   mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)
 
-  find_cuda_helper_libs(npp)
-
   macro(ocv_cuda_compile VAR)
     foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
       set(${var}_backup_in_cuda_compile_ "${${var}}")
diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake
index 014066bc7e..2c96274a8c 100644
--- a/cmake/OpenCVDetectOpenCL.cmake
+++ b/cmake/OpenCVDetectOpenCL.cmake
@@ -44,12 +44,18 @@ if(OPENCL_FOUND)
   set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
   set(OPENCL_LIBRARIES    ${OPENCL_LIBRARY})
 
-  if (X86_64)
+  if(WIN32 AND X86_64)
     set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64/import)
-  elseif (X86)
+  elseif(WIN32)
     set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import)
   endif()
 
+  if(X86_64 AND UNIX)
+    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64)
+  elseif(X86 AND UNIX)
+    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32)
+  endif()
+
   if(WITH_OPENCLAMDFFT)
     find_path(CLAMDFFT_ROOT_DIR
               NAMES include/clAmdFft.h
@@ -80,7 +86,7 @@ if(OPENCL_FOUND)
   if(WITH_OPENCLAMDBLAS)
     find_path(CLAMDBLAS_ROOT_DIR
               NAMES include/clAmdBlas.h
-              PATHS ENV CLAMDFFT_PATH ENV ProgramFiles
+              PATHS ENV CLAMDBLAS_PATH ENV ProgramFiles
               PATH_SUFFIXES clAmdBlas AMD/clAmdBlas
               DOC "AMD FFT root directory"
               NO_DEFAULT_PATH)
diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake
index 2ea864c16f..d685d23feb 100644
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@@ -24,7 +24,6 @@ if(WITH_QT)
     if(Qt5Core_FOUND AND Qt5Gui_FOUND AND Qt5Widgets_FOUND AND Qt5Test_FOUND AND Qt5Concurrent_FOUND)
       set(HAVE_QT5 ON)
       set(HAVE_QT  ON)
-      add_definitions(-DHAVE_QT)
       find_package(Qt5OpenGL)
       if(Qt5OpenGL_FOUND)
         set(QT_QTOPENGL_FOUND ON)
@@ -33,10 +32,9 @@ if(WITH_QT)
   endif()
 
   if(NOT HAVE_QT)
-    find_package(Qt4)
+    find_package(Qt4 REQUIRED QtCore QtGui QtTest)
     if(QT4_FOUND)
       set(HAVE_QT TRUE)
-      add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work
     endif()
   endif()
 endif()
@@ -61,7 +59,6 @@ if(WITH_OPENGL)
       list(APPEND OPENCV_LINKER_LIBS ${OPENGL_LIBRARIES})
       if(QT_QTOPENGL_FOUND)
         set(HAVE_QT_OPENGL TRUE)
-        add_definitions(-DHAVE_QT_OPENGL)
       else()
         ocv_include_directories(${OPENGL_INCLUDE_DIR})
       endif()
diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake
index fbb47d4861..0ca4828fe6 100644
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@@ -81,10 +81,33 @@ endif(WITH_GIGEAPI)
 # --- Dc1394 ---
 ocv_clear_vars(HAVE_DC1394 HAVE_DC1394_2)
 if(WITH_1394)
-  CHECK_MODULE(libdc1394-2 HAVE_DC1394_2)
-  if(NOT HAVE_DC1394_2)
-    CHECK_MODULE(libdc1394 HAVE_DC1394)
-  endif()
+  if(WIN32 AND MINGW)
+      find_path(CMU1394_INCLUDE_PATH "/1394common.h"
+                PATH_SUFFIXES include
+                DOC "The path to cmu1394 headers")
+      find_path(DC1394_2_INCLUDE_PATH "/dc1394/dc1394.h"
+                PATH_SUFFIXES include
+                DOC "The path to DC1394 2.x headers")
+      if(CMU1394_INCLUDE_PATH AND DC1394_2_INCLUDE_PATH)
+        set(CMU1394_LIB_DIR  "${CMU1394_INCLUDE_PATH}/../lib"  CACHE PATH "Full path of CMU1394 library directory")
+        set(DC1394_2_LIB_DIR "${DC1394_2_INCLUDE_PATH}/../lib" CACHE PATH "Full path of DC1394 2.x library directory")
+        if(EXISTS "${CMU1394_LIB_DIR}/lib1394camera.a" AND EXISTS "${DC1394_2_LIB_DIR}/libdc1394.a")
+          set(HAVE_DC1394_2 TRUE)
+        endif()
+      endif()
+      if(HAVE_DC1394_2)
+        ocv_parse_pkg("libdc1394-2" "${DC1394_2_LIB_DIR}/pkgconfig" "")
+        ocv_include_directories(${DC1394_2_INCLUDE_PATH})
+        set(HIGHGUI_LIBRARIES ${HIGHGUI_LIBRARIES}
+            "${DC1394_2_LIB_DIR}/libdc1394.a"
+            "${CMU1394_LIB_DIR}/lib1394camera.a")
+      endif(HAVE_DC1394_2)
+  else(WIN32 AND MINGW)
+    CHECK_MODULE(libdc1394-2 HAVE_DC1394_2)
+    if(NOT HAVE_DC1394_2)
+      CHECK_MODULE(libdc1394 HAVE_DC1394)
+    endif()
+  endif(WIN32 AND MINGW)
 endif(WITH_1394)
 
 # --- xine ---
@@ -197,7 +220,7 @@ endif(WITH_MSMF)
 
 # --- Extra HighGUI libs on Windows ---
 if(WIN32)
-  list(APPEND HIGHGUI_LIBRARIES comctl32 gdi32 ole32 vfw32)
+  list(APPEND HIGHGUI_LIBRARIES comctl32 gdi32 ole32 setupapi ws2_32 vfw32)
   if(MINGW64)
     list(APPEND HIGHGUI_LIBRARIES avifil32 avicap32 winmm msvfw32)
     list(REMOVE_ITEM HIGHGUI_LIBRARIES vfw32)
diff --git a/cmake/OpenCVFindXimea.cmake b/cmake/OpenCVFindXimea.cmake
index 5600275f47..27e2a78ad4 100644
--- a/cmake/OpenCVFindXimea.cmake
+++ b/cmake/OpenCVFindXimea.cmake
@@ -9,6 +9,7 @@
 #
 # Created: 5 Aug 2011 by Marian Zajko (marian.zajko@ximea.com)
 # Updated: 25 June 2012 by Igor Kuzmin (parafin@ximea.com)
+# Updated: 22 October 2012 by Marian Zajko (marian.zajko@ximea.com)
 #
 
 set(XIMEA_FOUND)
@@ -18,11 +19,15 @@ set(XIMEA_LIBRARY_DIR)
 if(WIN32)
   # Try to find the XIMEA API path in registry.
   GET_FILENAME_COMPONENT(XIMEA_PATH "[HKEY_CURRENT_USER\\Software\\XIMEA\\CamSupport\\API;Path]" ABSOLUTE)
-
-  if(EXISTS XIMEA_PATH)
+ 
+  if(EXISTS ${XIMEA_PATH})
     set(XIMEA_FOUND 1)
     # set LIB folders
-    set(XIMEA_LIBRARY_DIR "${XIMEA_PATH}/x86")
+    if(CMAKE_CL_64)
+      set(XIMEA_LIBRARY_DIR "${XIMEA_PATH}/x64")
+    else()
+      set(XIMEA_LIBRARY_DIR "${XIMEA_PATH}/x86")
+    endif()
   else()
     set(XIMEA_FOUND 0)
   endif()
@@ -38,5 +43,4 @@ endif()
 
 mark_as_advanced(FORCE XIMEA_FOUND)
 mark_as_advanced(FORCE XIMEA_PATH)
-mark_as_advanced(FORCE XIMEA_LIBRARY_DIR)
-
+mark_as_advanced(FORCE XIMEA_LIBRARY_DIR)
\ No newline at end of file
diff --git a/cmake/OpenCVGenConfig.cmake b/cmake/OpenCVGenConfig.cmake
index 705ccc8df1..c99cae7883 100644
--- a/cmake/OpenCVGenConfig.cmake
+++ b/cmake/OpenCVGenConfig.cmake
@@ -162,7 +162,7 @@ if(UNIX)
 endif()
 
 if(ANDROID)
-  install(FILES "${OpenCV_SOURCE_DIR}/android/android.toolchain.cmake" DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/)
+  install(FILES "${OpenCV_SOURCE_DIR}/platforms/android/android.toolchain.cmake" DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/)
 endif()
 
 # --------------------------------------------------------------------------------------------
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index 8312845fe0..81340bd0eb 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -303,7 +303,7 @@ macro(ocv_glob_modules)
   # collect modules
   set(OPENCV_INITIAL_PASS ON)
   foreach(__path ${ARGN})
-    ocv_get_real_path(__path "${__path}")
+    get_filename_component(__path "${__path}" ABSOLUTE)
 
     list(FIND __directories_observed "${__path}" __pathIdx)
     if(__pathIdx GREATER -1)
@@ -315,7 +315,7 @@ macro(ocv_glob_modules)
     if(__ocvmodules)
       list(SORT __ocvmodules)
       foreach(mod ${__ocvmodules})
-        ocv_get_real_path(__modpath "${__path}/${mod}")
+        get_filename_component(__modpath "${__path}/${mod}" ABSOLUTE)
         if(EXISTS "${__modpath}/CMakeLists.txt")
 
           list(FIND __directories_observed "${__modpath}" __pathIdx)
@@ -470,7 +470,8 @@ endmacro()
 #   ocv_create_module(<extra link dependencies>)
 #   ocv_create_module(SKIP_LINK)
 macro(ocv_create_module)
-  add_library(${the_module} ${OPENCV_MODULE_TYPE} ${OPENCV_MODULE_${the_module}_HEADERS} ${OPENCV_MODULE_${the_module}_SOURCES})
+  add_library(${the_module} ${OPENCV_MODULE_TYPE} ${OPENCV_MODULE_${the_module}_HEADERS} ${OPENCV_MODULE_${the_module}_SOURCES}
+    "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp")
 
   if(NOT "${ARGN}" STREQUAL "SKIP_LINK")
     target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index db24c99708..59366eb03b 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -411,16 +411,6 @@ macro(ocv_regex_escape var regex)
 endmacro()
 
 
-# get absolute path with symlinks resolved
-macro(ocv_get_real_path VAR PATHSTR)
-  if(CMAKE_VERSION VERSION_LESS 2.8)
-    get_filename_component(${VAR} "${PATHSTR}" ABSOLUTE)
-  else()
-    get_filename_component(${VAR} "${PATHSTR}" REALPATH)
-  endif()
-endmacro()
-
-
 # convert list of paths to full paths
 macro(ocv_convert_to_full_paths VAR)
   if(${VAR})
@@ -511,6 +501,13 @@ macro(ocv_parse_header2 LIBNAME HDR_PATH VARNAME)
   endif()
 endmacro()
 
+# read single version info from the pkg file
+macro(ocv_parse_pkg LIBNAME PKG_PATH SCOPE)
+  if(EXISTS "${PKG_PATH}/${LIBNAME}.pc")
+    file(STRINGS "${PKG_PATH}/${LIBNAME}.pc" line_to_parse REGEX "^Version:[ \t]+[0-9.]*.*$" LIMIT_COUNT 1)
+    STRING(REGEX REPLACE ".*Version: ([^ ]+).*" "\\1" ALIASOF_${LIBNAME}_VERSION "${line_to_parse}" )
+  endif()
+endmacro()
 
 ################################################################################################
 # short command to setup source group
diff --git a/cmake/templates/cvconfig.h.cmake b/cmake/templates/cvconfig.h.cmake
index db46af4b6d..f12730988d 100644
--- a/cmake/templates/cvconfig.h.cmake
+++ b/cmake/templates/cvconfig.h.cmake
@@ -228,3 +228,9 @@
 
 /* Clp support */
 #cmakedefine HAVE_CLP
+
+/* Qt support */
+#cmakedefine HAVE_QT
+
+/* Qt OpenGL support */
+#cmakedefine HAVE_QT_OPENGL
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 0f2695fc9a..70f4809d22 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -53,8 +53,8 @@ if(BUILD_DOCS AND HAVE_SPHINX)
     endif()
   endforeach()
 
-  file(GLOB_RECURSE _OPENCV_FILES_REF "${OpenCV_SOURCE_DIR}/android/service/doc/*.rst")
-  file(GLOB_RECURSE _OPENCV_FILES_REF_PICT "${OpenCV_SOURCE_DIR}/android/service/doc/*.png" "${OpenCV_SOURCE_DIR}/android/service/doc/*.jpg")
+  file(GLOB_RECURSE _OPENCV_FILES_REF "${OpenCV_SOURCE_DIR}/platforms/android/service/doc/*.rst")
+  file(GLOB_RECURSE _OPENCV_FILES_REF_PICT "${OpenCV_SOURCE_DIR}/platforms/android/service/doc/*.png" "${OpenCV_SOURCE_DIR}/platforms/android/service/doc/*.jpg")
   list(APPEND OPENCV_FILES_REF ${_OPENCV_FILES_REF})
   list(APPEND OPENCV_FILES_REF_PICT ${_OPENCV_FILES_REF_PICT})
 
diff --git a/doc/conf.py b/doc/conf.py
index 4c7a15c891..f3f7aec58a 100755
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -239,7 +239,7 @@ latex_documents = [
    u'', 'manual'),
   ('doc/tutorials/tutorials', 'opencv_tutorials.tex', u'The OpenCV Tutorials',
    u'', 'manual'),
-  ('android/refman', 'opencv2manager.tex', u'The OpenCV Manager Manual',
+  ('platforms/android/refman', 'opencv2manager.tex', u'The OpenCV Manager Manual',
    u'', 'manual'),
 ]
 
diff --git a/doc/tutorials/calib3d/camera_calibration/camera_calibration.rst b/doc/tutorials/calib3d/camera_calibration/camera_calibration.rst
index 9196c87d6a..6637e2590c 100644
--- a/doc/tutorials/calib3d/camera_calibration/camera_calibration.rst
+++ b/doc/tutorials/calib3d/camera_calibration/camera_calibration.rst
@@ -12,8 +12,8 @@ For the distortion OpenCV takes into account the radial and tangential factors.
 
 .. math:: 
 
-   x_{corrected} = x( 1 + k_1 r^2 + k_2 r^4 + k^3 r^6) \\
-   y_{corrected} = y( 1 + k_1 r^2 + k_2 r^4 + k^3 r^6)
+   x_{corrected} = x( 1 + k_1 r^2 + k_2 r^4 + k_3 r^6) \\
+   y_{corrected} = y( 1 + k_1 r^2 + k_2 r^4 + k_3 r^6)
 
 So for an old pixel point at :math:`(x,y)` coordinate in the input image, for a corrected output image its position will be :math:`(x_{corrected} y_{corrected})` . The presence of the radial distortion manifests in form of the "barrel" or "fish-eye" effect. 
 
diff --git a/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst b/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst
index 47eafedbc7..54d28890ab 100644
--- a/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst
+++ b/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst
@@ -85,7 +85,7 @@ This tutorial code's is shown lines below. You can also download it from `here <
      std::vector< DMatch > good_matches;
 
      for( int i = 0; i < descriptors_1.rows; i++ )
-     { if( matches[i].distance < 2*min_dist )
+     { if( matches[i].distance <= 2*min_dist )
        { good_matches.push_back( matches[i]); }
      }
 
@@ -127,6 +127,3 @@ Result
    .. image:: images/Feature_FlannMatcher_Keypoints_Result.jpg
       :align: center
       :height: 250pt
-
-
-
diff --git a/doc/tutorials/introduction/ios_install/ios_install.rst b/doc/tutorials/introduction/ios_install/ios_install.rst
index ace657b21c..8d117a0b42 100644
--- a/doc/tutorials/introduction/ios_install/ios_install.rst
+++ b/doc/tutorials/introduction/ios_install/ios_install.rst
@@ -37,7 +37,7 @@ Building OpenCV from Source, using CMake and Command Line
     .. code-block:: bash
 
        cd ~/<my_working_directory>
-       python opencv/ios/build_framework.py ios
+       python opencv/platforms/ios/build_framework.py ios
 
 If everything's fine, a few minutes later you will get ~/<my_working_directory>/ios/opencv2.framework. You can add this framework to your Xcode projects.
 
diff --git a/index.rst b/index.rst
index 909bf908b8..5f50b66d0f 100644
--- a/index.rst
+++ b/index.rst
@@ -10,7 +10,7 @@ Welcome to opencv documentation!
    :maxdepth: 2
 
    modules/refman.rst
-   android/refman.rst
+   platforms/android/refman.rst
    doc/user_guide/user_guide.rst
    doc/tutorials/tutorials.rst
 
diff --git a/ios/configure-device_xcode.sh b/ios/configure-device_xcode.sh
deleted file mode 100755
index 8c28a3e909..0000000000
--- a/ios/configure-device_xcode.sh
+++ /dev/null
@@ -1 +0,0 @@
-cmake -GXcode -DCMAKE_TOOLCHAIN_FILE=../opencv/ios/cmake/Toolchains/Toolchain-iPhoneOS_Xcode.cmake -DCMAKE_INSTALL_PREFIX=../OpenCV_iPhoneOS ../opencv 
diff --git a/ios/configure-simulator_xcode.sh b/ios/configure-simulator_xcode.sh
deleted file mode 100755
index 50e00261db..0000000000
--- a/ios/configure-simulator_xcode.sh
+++ /dev/null
@@ -1 +0,0 @@
-cmake -GXcode -DCMAKE_TOOLCHAIN_FILE=../opencv/ios/cmake/Toolchains/Toolchain-iPhoneSimulator_Xcode.cmake -DCMAKE_INSTALL_PREFIX=../OpenCV_iPhoneSimulator ../opencv 
diff --git a/ios/readme.txt b/ios/readme.txt
deleted file mode 100644
index 1441b241b7..0000000000
--- a/ios/readme.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-Assuming that your build directory is on the same level that opencv source,
-From the build directory run
-  ../opencv/ios/configure-device_xcode.sh
-or
-  ../opencv/ios/configure-simulator_xcode.sh
-
-Then from the same folder invoke
-
-xcodebuild -sdk iphoneos -configuration Release -target ALL_BUILD
-xcodebuild -sdk iphoneos -configuration Release -target install install
-
-or
-
-xcodebuild -sdk iphonesimulator -configuration Release -target ALL_BUILD
-xcodebuild -sdk iphonesimulator -configuration Release -target install install
\ No newline at end of file
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
index 4a6ed6d11e..3e1ad708e6 100644
--- a/modules/CMakeLists.txt
+++ b/modules/CMakeLists.txt
@@ -2,4 +2,4 @@ if(NOT OPENCV_MODULES_PATH)
   set(OPENCV_MODULES_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
 endif()
 
-ocv_glob_modules(${OPENCV_MODULES_PATH})
+ocv_glob_modules(${OPENCV_MODULES_PATH} ${OPENCV_EXTRA_MODULES_PATH})
diff --git a/modules/androidcamera/CMakeLists.txt b/modules/androidcamera/CMakeLists.txt
index d54dd5d208..8ac8ced88e 100644
--- a/modules/androidcamera/CMakeLists.txt
+++ b/modules/androidcamera/CMakeLists.txt
@@ -6,7 +6,7 @@ set(the_description "Auxiliary module for Android native camera support")
 set(OPENCV_MODULE_TYPE STATIC)
 
 ocv_define_module(androidcamera INTERNAL opencv_core log dl)
-ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/camera_wrapper" "${OpenCV_SOURCE_DIR}/android/service/engine/jni/include")
+ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/camera_wrapper" "${OpenCV_SOURCE_DIR}/platforms/android/service/engine/jni/include")
 
 # Android source tree for native camera
 SET (ANDROID_SOURCE_TREE "ANDROID_SOURCE_TREE-NOTFOUND" CACHE PATH
diff --git a/modules/calib3d/include/opencv2/calib3d/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d/calib3d.hpp
index 0d1cc46915..f213a114f4 100644
--- a/modules/calib3d/include/opencv2/calib3d/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d/calib3d.hpp
@@ -639,9 +639,9 @@ CV_EXPORTS Mat findFundamentalMat( InputArray points1, InputArray points2,
                                    double param1=3., double param2=0.99);
 
 //! finds coordinates of epipolar lines corresponding the specified points
-CV_EXPORTS void computeCorrespondEpilines( InputArray points,
-                                           int whichImage, InputArray F,
-                                           OutputArray lines );
+CV_EXPORTS_W void computeCorrespondEpilines( InputArray points,
+                                             int whichImage, InputArray F,
+                                             OutputArray lines );
 
 CV_EXPORTS_W void triangulatePoints( InputArray projMatr1, InputArray projMatr2,
                                      InputArray projPoints1, InputArray projPoints2,
diff --git a/modules/calib3d/src/solvepnp.cpp b/modules/calib3d/src/solvepnp.cpp
index 25988be48a..3d2c0c2c47 100644
--- a/modules/calib3d/src/solvepnp.cpp
+++ b/modules/calib3d/src/solvepnp.cpp
@@ -115,31 +115,6 @@ namespace cv
             transform(points, modif_points, transformation);
         }
 
-        class Mutex
-        {
-        public:
-            Mutex() {
-            }
-            void lock()
-            {
-#ifdef HAVE_TBB
-                resultsMutex.lock();
-#endif
-            }
-
-            void unlock()
-            {
-#ifdef HAVE_TBB
-                resultsMutex.unlock();
-#endif
-            }
-
-        private:
-#ifdef HAVE_TBB
-            tbb::mutex resultsMutex;
-#endif
-        };
-
         struct CameraParameters
         {
             void init(Mat _intrinsics, Mat _distCoeffs)
diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp
index 32514276b5..623883df74 100644
--- a/modules/calib3d/src/stereobm.cpp
+++ b/modules/calib3d/src/stereobm.cpp
@@ -699,7 +699,7 @@ struct PrefilterInvoker
 };
 
 
-struct FindStereoCorrespInvoker
+struct FindStereoCorrespInvoker : ParallelLoopBody
 {
     FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,
                               Mat& _disp, CvStereoBMState* _state,
@@ -713,12 +713,12 @@ struct FindStereoCorrespInvoker
         validDisparityRect = _validDisparityRect;
     }
 
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
         int cols = left->cols, rows = left->rows;
-        int _row0 = min(cvRound(range.begin() * rows / nstripes), rows);
-        int _row1 = min(cvRound(range.end() * rows / nstripes), rows);
-        uchar *ptr = state->slidingSumBuf->data.ptr + range.begin() * stripeBufSize;
+        int _row0 = min(cvRound(range.start * rows / nstripes), rows);
+        int _row1 = min(cvRound(range.end * rows / nstripes), rows);
+        uchar *ptr = state->slidingSumBuf->data.ptr + range.start * stripeBufSize;
         int FILTERED = (state->minDisparity - 1)*16;
 
         Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
@@ -871,14 +871,10 @@ static void findStereoCorrespondenceBM( const Mat& left0, const Mat& right0, Mat
     const bool useShorts = false;
 #endif
 
-#ifdef HAVE_TBB
     const double SAD_overhead_coeff = 10.0;
     double N0 = 8000000 / (useShorts ? 1 : 4);  // approx tbb's min number instructions reasonable for one thread
     double maxStripeSize = min(max(N0 / (width * ndisp), (wsz-1) * SAD_overhead_coeff), (double)height);
     int nstripes = cvCeil(height / maxStripeSize);
-#else
-    const int nstripes = 1;
-#endif
 
     int bufSize = max(bufSize0 * nstripes, max(bufSize1 * 2, bufSize2));
 
@@ -898,9 +894,9 @@ static void findStereoCorrespondenceBM( const Mat& left0, const Mat& right0, Mat
                                               state->minDisparity, state->numberOfDisparities,
                                               state->SADWindowSize);
 
-    parallel_for(BlockedRange(0, nstripes),
-                 FindStereoCorrespInvoker(left, right, disp, state, nstripes,
-                                          bufSize0, useShorts, validDisparityRect));
+    parallel_for_(Range(0, nstripes),
+                  FindStereoCorrespInvoker(left, right, disp, state, nstripes,
+                                           bufSize0, useShorts, validDisparityRect));
 
     if( state->speckleRange >= 0 && state->speckleWindowSize > 0 )
     {
diff --git a/modules/contrib/src/inputoutput.cpp b/modules/contrib/src/inputoutput.cpp
index d10d884c83..a711f242ad 100644
--- a/modules/contrib/src/inputoutput.cpp
+++ b/modules/contrib/src/inputoutput.cpp
@@ -1,7 +1,7 @@
 
 #include "opencv2/contrib/contrib.hpp"
 
-#ifdef WIN32
+#if defined(WIN32) || defined(_WIN32)
     #include <windows.h>
     #include <tchar.h>
 #else
diff --git a/modules/core/doc/basic_structures.rst b/modules/core/doc/basic_structures.rst
index ca9f5e21a2..3705879228 100644
--- a/modules/core/doc/basic_structures.rst
+++ b/modules/core/doc/basic_structures.rst
@@ -489,6 +489,9 @@ Various Ptr constructors.
 .. ocv:function:: Ptr::Ptr(_Tp* _obj)
 .. ocv:function:: Ptr::Ptr(const Ptr& ptr)
 
+    :param _obj: Object for copy.
+    :param ptr: Object for copy.
+
 Ptr::~Ptr
 ---------
 The Ptr destructor.
@@ -501,6 +504,8 @@ Assignment operator.
 
 .. ocv:function:: Ptr& Ptr::operator = (const Ptr& ptr)
 
+    :param ptr: Object for assignment.
+
 Decrements own reference counter (with ``release()``) and increments ptr's reference counter.
 
 Ptr::addref
@@ -1465,6 +1470,7 @@ Adds elements to the bottom of the matrix.
 .. ocv:function:: void Mat::push_back( const Mat& m )
 
     :param elem: Added element(s).
+    :param m: Added line(s).
 
 The methods add one or more elements to the bottom of the matrix. They emulate the corresponding method of the STL vector class. When ``elem`` is ``Mat`` , its type and the number of columns must be the same as in the container matrix.
 
@@ -1691,7 +1697,7 @@ Returns the depth of a matrix element.
 
 .. ocv:function:: int Mat::depth() const
 
-The method returns the identifier of the matrix element depth (the type of each individual channel). For example, for a 16-bit signed 3-channel array, the method returns ``CV_16S`` . A complete list of matrix types contains the following values:
+The method returns the identifier of the matrix element depth (the type of each individual channel). For example, for a 16-bit signed element array, the method returns ``CV_16S`` . A complete list of matrix types contains the following values:
 
 * ``CV_8U``     - 8-bit unsigned integers ( ``0..255``     )
 
@@ -2160,7 +2166,6 @@ Various SparseMat constructors.
     :param dims: Array dimensionality.
     :param _sizes: Sparce matrix size on all dementions.
     :param _type: Sparse matrix data type.
-    :param try1d: if try1d is true and matrix is a single-column matrix (Nx1), then the sparse matrix will be 1-dimensional.
 
 SparseMat::~SparseMat
 ---------------------
@@ -2175,6 +2180,8 @@ Provides sparse matrix assignment operators.
 .. ocv:function:: SparseMat& SparseMat::operator = (const SparseMat& m)
 .. ocv:function:: SparseMat& SparseMat::operator = (const Mat& m)
 
+    :param m: Matrix for assignment.
+
 The last variant is equivalent to the corresponding constructor with try1d=false.
 
 
@@ -2202,6 +2209,10 @@ Convert sparse matrix with possible type change and scaling.
 .. ocv:function:: void SparseMat::convertTo( SparseMat& m, int rtype, double alpha=1 ) const
 .. ocv:function:: void SparseMat::convertTo( Mat& m, int rtype, double alpha=1, double beta=0 ) const
 
+    :param m: Destination matrix.
+    :param rtype: Destination matrix type.
+    :param alpha: Conversion multiplier.
+
 The first version converts arbitrary sparse matrix to dense matrix and multiplies all the matrix elements by the specified scalar.
 The second versiob converts sparse matrix to dense matrix with optional type conversion and scaling.
 When rtype=-1, the destination element type will be the same as the sparse matrix element type.
@@ -2294,7 +2305,7 @@ The method returns the number of matrix channels.
 
 SparseMat::size
 ---------------
-Returns the array of sizes or matrix size by i dimention and 0 if the matrix is not allocated.
+Returns the array of sizes or matrix size by i dimension and 0 if the matrix is not allocated.
 
 .. ocv:function:: const int* SparseMat::size() const
 .. ocv:function:: int SparseMat::size(int i) const
@@ -2322,6 +2333,11 @@ Compute element hash value from the element indices.
 .. ocv:function:: size_t SparseMat::hash(int i0, int i1, int i2) const
 .. ocv:function:: size_t SparseMat::hash(const int* idx) const
 
+    :param i0: The first dimension index.
+    :param i1: The second dimension index.
+    :param i2: The third dimension index.
+    :param idx: Array of element indices for multidimensional matices.
+
 SparseMat::ptr
 --------------
 Low-level element-access functions, special variants for 1D, 2D, 3D cases, and the generic one for n-D case.
@@ -2331,6 +2347,12 @@ Low-level element-access functions, special variants for 1D, 2D, 3D cases, and t
 .. ocv:function:: uchar* SparseMat::ptr(int i0, int i1, int i2, bool createMissing, size_t* hashval=0)
 .. ocv:function:: uchar* SparseMat::ptr(const int* idx, bool createMissing, size_t* hashval=0)
 
+    :param i0: The first dimension index.
+    :param i1: The second dimension index.
+    :param i2: The third dimension index.
+    :param idx: Array of element indices for multidimensional matices.
+    :param createMissing: Create new element with 0 value if it does not exist in SparseMat.
+
 Return pointer to the matrix element. If the element is there (it is non-zero), the pointer to it is returned.
 If it is not there and ``createMissing=false``, NULL pointer is returned. If it is not there and ``createMissing=true``,
 the new elementis created and initialized with 0. Pointer to it is returned. If the optional hashval pointer is not ``NULL``,
@@ -2344,6 +2366,11 @@ Erase the specified matrix element. When there is no such an element, the method
 .. ocv:function:: void SparseMat::erase(int i0, int i1, int i2, size_t* hashval=0)
 .. ocv:function:: void SparseMat::erase(const int* idx, size_t* hashval=0)
 
+    :param i0: The first dimension index.
+    :param i1: The second dimension index.
+    :param i2: The third dimension index.
+    :param idx: Array of element indices for multidimensional matices.
+
 SparseMat\_
 -----------
 .. ocv:class:: SparseMat_
diff --git a/modules/core/doc/clustering.rst b/modules/core/doc/clustering.rst
index 46130bc8fd..f58e99ce2c 100644
--- a/modules/core/doc/clustering.rst
+++ b/modules/core/doc/clustering.rst
@@ -17,12 +17,18 @@ Finds centers of clusters and groups input samples around the clusters.
 
     :param samples: Floating-point matrix of input samples, one row per sample.
 
+    :param data: Data for clustering.
+
     :param cluster_count: Number of clusters to split the set by.
 
+    :param K: Number of clusters to split the set by.
+
     :param labels: Input/output integer array that stores the cluster indices for every sample.
 
     :param criteria: The algorithm termination criteria, that is, the maximum number of iterations and/or the desired accuracy. The accuracy is specified as ``criteria.epsilon``. As soon as each of the cluster centers moves by less than ``criteria.epsilon`` on some iteration, the algorithm stops.
 
+    :param termcrit: The algorithm termination criteria, that is, the maximum number of iterations and/or the desired accuracy.
+
     :param attempts: Flag to specify the number of times the algorithm is executed using different initial labellings. The algorithm returns the labels that yield the best compactness (see the last function parameter).
 
     :param rng: CvRNG state initialized by RNG().
@@ -37,6 +43,8 @@ Finds centers of clusters and groups input samples around the clusters.
 
     :param centers: Output matrix of the cluster centers, one row per each cluster center.
 
+    :param _centers: Output matrix of the cluster centers, one row per each cluster center.
+
     :param compactness: The returned value that is described below.
 
 The function ``kmeans`` implements a k-means algorithm that finds the
diff --git a/modules/core/doc/drawing_functions.rst b/modules/core/doc/drawing_functions.rst
index 24328f9a54..342301db97 100644
--- a/modules/core/doc/drawing_functions.rst
+++ b/modules/core/doc/drawing_functions.rst
@@ -234,6 +234,8 @@ Calculates the width and height of a text string.
 
     :param text: Input text string.
 
+    :param text_string: Input text string in C format.
+
     :param fontFace: Font to use. See the  :ocv:func:`putText` for details.
 
     :param fontScale: Font scale. See the  :ocv:func:`putText`  for details.
@@ -242,6 +244,12 @@ Calculates the width and height of a text string.
 
     :param baseLine: Output parameter - y-coordinate of the baseline relative to the bottom-most text point.
 
+    :param baseline: Output parameter - y-coordinate of the baseline relative to the bottom-most text point.
+
+    :param font: Font description in terms of old C API.
+
+    :param text_size: Output parameter - The size of a box that contains the specified text.
+
 The function ``getTextSize`` calculates and returns the size of a box that contains the specified text.
 That is, the following code renders some text, the tight box surrounding it, and the baseline: ::
 
diff --git a/modules/core/doc/operations_on_arrays.rst b/modules/core/doc/operations_on_arrays.rst
index d338444760..bd55993afe 100644
--- a/modules/core/doc/operations_on_arrays.rst
+++ b/modules/core/doc/operations_on_arrays.rst
@@ -1062,6 +1062,8 @@ Returns the determinant of a square floating-point matrix.
 
     :param mtx: input matrix that must have ``CV_32FC1`` or ``CV_64FC1`` type and square size.
 
+    :param mat: input matrix that must have ``CV_32FC1`` or ``CV_64FC1`` type and square size.
+
 The function ``determinant`` calculates and returns the determinant of the specified matrix. For small matrices ( ``mtx.cols=mtx.rows<=3`` ),
 the direct method is used. For larger matrices, the function uses LU factorization with partial pivoting.
 
diff --git a/modules/core/doc/utility_and_system_functions_and_macros.rst b/modules/core/doc/utility_and_system_functions_and_macros.rst
index 54198b058a..41cf7e1b72 100644
--- a/modules/core/doc/utility_and_system_functions_and_macros.rst
+++ b/modules/core/doc/utility_and_system_functions_and_macros.rst
@@ -173,6 +173,8 @@ Checks a condition at runtime and throws exception if it fails
 
 .. ocv:function:: CV_Assert(expr)
 
+    :param expr: Expression for check.
+
 The macros ``CV_Assert`` (and ``CV_DbgAssert``) evaluate the specified expression. If it is 0, the macros raise an error (see :ocv:func:`error` ). The macro ``CV_Assert`` checks the condition in both Debug and Release configurations while ``CV_DbgAssert`` is only retained in the Debug configuration.
 
 
@@ -188,8 +190,14 @@ Signals an error and raises an exception.
 
     :param status: Error code. Normally, it is a negative value. The list of pre-defined error codes can be found in  ``cxerror.h`` .
 
+    :param func_name: The function name where error occurs.
+
     :param err_msg: Text of the error message.
 
+    :param file_name: The file name where error occurs.
+
+    :param line: The line number where error occurs.
+
     :param args: ``printf`` -like formatted error message in parentheses.
 
 The function and the helper macros ``CV_Error`` and ``CV_Error_``: ::
@@ -249,6 +257,7 @@ Allocates an aligned memory buffer.
 .. ocv:cfunction:: void* cvAlloc( size_t size )
 
     :param size: Allocated buffer size.
+    :param bufSize: Allocated buffer size.
 
 The function allocates the buffer of the specified size and returns it. When the buffer size is 16 bytes or more, the returned buffer is aligned to 16 bytes.
 
diff --git a/modules/core/doc/xml_yaml_persistence.rst b/modules/core/doc/xml_yaml_persistence.rst
index c7d55d01f5..28bae24508 100644
--- a/modules/core/doc/xml_yaml_persistence.rst
+++ b/modules/core/doc/xml_yaml_persistence.rst
@@ -181,6 +181,17 @@ Opens a file.
 
 .. ocv:function:: bool FileStorage::open(const string& filename, int flags, const string& encoding=string())
 
+    :param filename: Name of the file to open or the text string to read the data from.
+                     Extension of the file (``.xml`` or ``.yml``/``.yaml``) determines its format (XML or YAML respectively).
+                     Also you can append ``.gz`` to work with compressed files, for example ``myHugeMatrix.xml.gz``.
+                     If both ``FileStorage::WRITE`` and ``FileStorage::MEMORY`` flags are specified, ``source``
+                     is used just to specify the output file format (e.g. ``mydata.xml``, ``.yml`` etc.).
+
+    :param flags: Mode of operation. See FileStorage constructor for more details.
+
+    :param encoding: Encoding of the file. Note that UTF-16 XML encoding is not supported currently and you should use 8-bit encoding instead of it.
+
+
 See description of parameters in :ocv:func:`FileStorage::FileStorage`. The method calls :ocv:func:`FileStorage::release` before opening the file.
 
 
diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp
index 1c8e0e2cac..bc1a68fb77 100644
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -1322,7 +1322,8 @@ public:
         EXPR              = 6 << KIND_SHIFT,
         OPENGL_BUFFER     = 7 << KIND_SHIFT,
         OPENGL_TEXTURE    = 8 << KIND_SHIFT,
-        GPU_MAT           = 9 << KIND_SHIFT
+        GPU_MAT           = 9 << KIND_SHIFT,
+        OCL_MAT           =10 << KIND_SHIFT
     };
     _InputArray();
 
@@ -3409,8 +3410,6 @@ public:
     //! converts dense 2d matrix to the sparse form
     /*!
      \param m the input matrix
-     \param try1d if true and m is a single-column matrix (Nx1),
-            then the sparse matrix will be 1-dimensional.
     */
     explicit SparseMat(const Mat& m);
     //! converts old-style sparse matrix to the new-style. All the data is copied
@@ -4813,6 +4812,9 @@ public:
     ~AutoLock() { mutex->unlock(); }
 protected:
     Mutex* mutex;
+private:
+    AutoLock(const AutoLock&);
+    AutoLock& operator = (const AutoLock&);
 };
 
 }
diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp
index 5335fa01f8..606c62f8f5 100644
--- a/modules/core/include/opencv2/core/internal.hpp
+++ b/modules/core/include/opencv2/core/internal.hpp
@@ -50,6 +50,9 @@
 
 #include <vector>
 
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/types_c.h"
+
 #if defined WIN32 || defined _WIN32
 #  ifndef WIN32
 #    define WIN32
@@ -251,6 +254,10 @@ namespace cv
         body(range);
     }
 #endif
+
+    // Returns a static string if there is a parallel framework,
+    // NULL otherwise.
+    CV_EXPORTS const char* currentParallelFramework();
 } //namespace cv
 
 #define CV_INIT_ALGORITHM(classname, algname, memberinit) \
diff --git a/modules/core/perf/perf_reduce.cpp b/modules/core/perf/perf_reduce.cpp
index 93d3a14166..7b74b0e7e3 100644
--- a/modules/core/perf/perf_reduce.cpp
+++ b/modules/core/perf/perf_reduce.cpp
@@ -34,7 +34,8 @@ PERF_TEST_P(Size_MatType_ROp, reduceR,
     declare.in(src, WARMUP_RNG).out(vec);
     declare.time(100);
 
-    TEST_CYCLE() reduce(src, vec, 0, reduceOp, ddepth);
+    int runs = 15;
+    TEST_CYCLE_MULTIRUN(runs) reduce(src, vec, 0, reduceOp, ddepth);
 
     SANITY_CHECK(vec, 1);
 }
@@ -65,4 +66,3 @@ PERF_TEST_P(Size_MatType_ROp, reduceC,
 
     SANITY_CHECK(vec, 1);
 }
-
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index 5988363d3c..05a0c55524 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -2855,9 +2855,9 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp
 
     if( _mean.data )
     {
-        CV_Assert( _mean.size() == mean_sz );        
+        CV_Assert( _mean.size() == mean_sz );
         _mean.convertTo(mean, ctype);
-        covar_flags |= CV_COVAR_USE_AVG; 
+        covar_flags |= CV_COVAR_USE_AVG;
     }
 
     calcCovarMatrix( data, covar, mean, covar_flags, ctype );
@@ -2901,6 +2901,36 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp
     return *this;
 }
 
+template <typename T>
+int computeCumulativeEnergy(const Mat& eigenvalues, double retainedVariance)
+{
+    CV_DbgAssert( eigenvalues.type() == DataType<T>::type );
+
+    Mat g(eigenvalues.size(), DataType<T>::type);
+
+    for(int ig = 0; ig < g.rows; ig++)
+    {
+        g.at<T>(ig, 0) = 0;
+        for(int im = 0; im <= ig; im++)
+        {
+            g.at<T>(ig,0) += eigenvalues.at<T>(im,0);
+        }
+    }
+
+    int L;
+
+    for(L = 0; L < eigenvalues.rows; L++)
+    {
+        double energy = g.at<T>(L, 0) / g.at<T>(g.rows - 1, 0);
+        if(energy > retainedVariance)
+            break;
+    }
+
+    L = std::max(2, L);
+
+    return L;
+}
+
 PCA& PCA::computeVar(InputArray _data, InputArray __mean, int flags, double retainedVariance)
 {
     Mat data = _data.getMat(), _mean = __mean.getMat();
@@ -2977,26 +3007,11 @@ PCA& PCA::computeVar(InputArray _data, InputArray __mean, int flags, double reta
     }
 
     // compute the cumulative energy content for each eigenvector
-    Mat g(eigenvalues.size(), ctype);
-
-    for(int ig = 0; ig < g.rows; ig++)
-    {
-        g.at<float>(ig,0) = 0;
-        for(int im = 0; im <= ig; im++)
-        {
-            g.at<float>(ig,0) += eigenvalues.at<float>(im,0);
-        }
-    }
-
     int L;
-    for(L = 0; L < eigenvalues.rows; L++)
-    {
-        double energy = g.at<float>(L, 0) / g.at<float>(g.rows - 1, 0);
-        if(energy > retainedVariance)
-            break;
-    }
-
-    L = std::max(2, L);
+    if (ctype == CV_32F)
+        L = computeCumulativeEnergy<float>(eigenvalues, retainedVariance);
+    else
+        L = computeCumulativeEnergy<double>(eigenvalues, retainedVariance);
 
     // use clone() to physically copy the data and thus deallocate the original matrices
     eigenvalues = eigenvalues.rowRange(0,L).clone();
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 7acb0e0dbd..5a3600b9b3 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -980,6 +980,11 @@ Mat _InputArray::getMat(int i) const
         return !v.empty() ? Mat(size(i), t, (void*)&v[0]) : Mat();
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     CV_Assert( k == STD_VECTOR_MAT );
     //if( k == STD_VECTOR_MAT )
     {
@@ -1062,6 +1067,11 @@ void _InputArray::getMatVector(vector<Mat>& mv) const
         return;
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     CV_Assert( k == STD_VECTOR_MAT );
     //if( k == STD_VECTOR_MAT )
     {
@@ -1189,6 +1199,11 @@ Size _InputArray::size(int i) const
         return tex->size();
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     CV_Assert( k == GPU_MAT );
     //if( k == GPU_MAT )
     {
@@ -1303,6 +1318,11 @@ bool _InputArray::empty() const
     if( k == OPENGL_TEXTURE )
         return ((const ogl::Texture2D*)obj)->empty();
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     CV_Assert( k == GPU_MAT );
     //if( k == GPU_MAT )
         return ((const gpu::GpuMat*)obj)->empty();
@@ -1523,6 +1543,11 @@ void _OutputArray::create(int dims, const int* sizes, int mtype, int i, bool all
         return;
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     if( k == NONE )
     {
         CV_Error(CV_StsNullPtr, "create() called for the missing output array" );
@@ -1634,6 +1659,11 @@ void _OutputArray::release() const
         return;
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     CV_Assert( k == STD_VECTOR_MAT );
     //if( k == STD_VECTOR_MAT )
     {
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 0b2a845ac1..0a9ed09871 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -110,8 +110,16 @@
     #endif
 #endif
 
-#if defined HAVE_TBB || defined HAVE_CSTRIPES || defined HAVE_OPENMP || defined HAVE_GCD || defined HAVE_CONCURRENCY
-   #define HAVE_PARALLEL_FRAMEWORK
+#if defined HAVE_TBB && TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202
+#  define CV_PARALLEL_FRAMEWORK "tbb"
+#elif defined HAVE_CSTRIPES
+#  define CV_PARALLEL_FRAMEWORK "cstripes"
+#elif defined HAVE_OPENMP
+#  define CV_PARALLEL_FRAMEWORK "openmp"
+#elif defined HAVE_GCD
+#  define CV_PARALLEL_FRAMEWORK "gcd"
+#elif defined HAVE_CONCURRENCY
+#  define CV_PARALLEL_FRAMEWORK "ms-concurrency"
 #endif
 
 namespace cv
@@ -121,7 +129,7 @@ namespace cv
 
 namespace
 {
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
     class ParallelLoopBodyWrapper
     {
     public:
@@ -218,7 +226,7 @@ public:
 static SchedPtr pplScheduler;
 #endif
 
-#endif // HAVE_PARALLEL_FRAMEWORK
+#endif // CV_PARALLEL_FRAMEWORK
 
 } //namespace
 
@@ -226,7 +234,7 @@ static SchedPtr pplScheduler;
 
 void cv::parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes)
 {
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
 
     if(numThreads != 0)
     {
@@ -281,7 +289,7 @@ void cv::parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body,
     }
     else
 
-#endif // HAVE_PARALLEL_FRAMEWORK
+#endif // CV_PARALLEL_FRAMEWORK
     {
         (void)nstripes;
         body(range);
@@ -290,7 +298,7 @@ void cv::parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body,
 
 int cv::getNumThreads(void)
 {
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
 
     if(numThreads == 0)
         return 1;
@@ -333,7 +341,7 @@ int cv::getNumThreads(void)
 void cv::setNumThreads( int threads )
 {
     (void)threads;
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
     numThreads = threads;
 #endif
 
@@ -480,6 +488,14 @@ int cv::getNumberOfCPUs(void)
 #endif
 }
 
+const char* cv::currentParallelFramework() {
+#ifdef CV_PARALLEL_FRAMEWORK
+    return CV_PARALLEL_FRAMEWORK;
+#else
+    return NULL;
+#endif
+}
+
 CV_IMPL void cvSetNumThreads(int nt)
 {
     cv::setNumThreads(nt);
diff --git a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
index 8596ae43db..d7e5eb4c29 100644
--- a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
+++ b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
@@ -217,7 +217,7 @@ For each query descriptor, finds the training descriptors not farther than the s
 
     :param compactResult: Parameter used when the mask (or masks) is not empty. If  ``compactResult``  is false, the  ``matches``  vector has the same size as  ``queryDescriptors``  rows. If  ``compactResult``  is true, the  ``matches``  vector does not contain matches for fully masked-out query descriptors.
 
-    :param maxDistance: Threshold for the distance between matched descriptors.
+    :param maxDistance: Threshold for the distance between matched descriptors. Distance means here metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured in Pixels)!
 
 For each query descriptor, the methods find such training descriptors that the distance between the query descriptor and the training descriptor is equal or smaller than ``maxDistance``. Found matches are returned in the distance increasing order.
 
diff --git a/modules/features2d/src/detectors.cpp b/modules/features2d/src/detectors.cpp
index 2efd5a652a..a1e389a435 100644
--- a/modules/features2d/src/detectors.cpp
+++ b/modules/features2d/src/detectors.cpp
@@ -214,7 +214,7 @@ static void keepStrongest( int N, vector<KeyPoint>& keypoints )
 }
 
 namespace {
-class GridAdaptedFeatureDetectorInvoker
+class GridAdaptedFeatureDetectorInvoker : public ParallelLoopBody
 {
 private:
     int gridRows_, gridCols_;
@@ -223,29 +223,24 @@ private:
     const Mat& image_;
     const Mat& mask_;
     const Ptr<FeatureDetector>& detector_;
-#ifdef HAVE_TBB
-    tbb::mutex* kptLock_;
-#endif
+    Mutex* kptLock_;
 
     GridAdaptedFeatureDetectorInvoker& operator=(const GridAdaptedFeatureDetectorInvoker&); // to quiet MSVC
 
 public:
 
-    GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask, vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols
-#ifdef HAVE_TBB
-        , tbb::mutex* kptLock
-#endif
-        ) : gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
-            keypoints_(keypoints), image_(image), mask_(mask), detector_(detector)
-#ifdef HAVE_TBB
-            , kptLock_(kptLock)
-#endif
+    GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask,
+                                      vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols,
+                                      cv::Mutex* kptLock)
+        : gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
+          keypoints_(keypoints), image_(image), mask_(mask), detector_(detector),
+          kptLock_(kptLock)
     {
     }
 
-    void operator() (const BlockedRange& range) const
+    void operator() (const Range& range) const
     {
-        for (int i = range.begin(); i < range.end(); ++i)
+        for (int i = range.start; i < range.end; ++i)
         {
             int celly = i / gridCols_;
             int cellx = i - celly * gridCols_;
@@ -270,9 +265,8 @@ public:
                 it->pt.x += col_range.start;
                 it->pt.y += row_range.start;
             }
-#ifdef HAVE_TBB
-            tbb::mutex::scoped_lock join_keypoints(*kptLock_);
-#endif
+
+            cv::AutoLock join_keypoints(*kptLock_);
             keypoints_.insert( keypoints_.end(), sub_keypoints.begin(), sub_keypoints.end() );
         }
     }
@@ -289,13 +283,9 @@ void GridAdaptedFeatureDetector::detectImpl( const Mat& image, vector<KeyPoint>&
     keypoints.reserve(maxTotalKeypoints);
     int maxPerCell = maxTotalKeypoints / (gridRows * gridCols);
 
-#ifdef HAVE_TBB
-    tbb::mutex kptLock;
-    cv::parallel_for(cv::BlockedRange(0, gridRows * gridCols),
+    cv::Mutex kptLock;
+    cv::parallel_for_(cv::Range(0, gridRows * gridCols),
         GridAdaptedFeatureDetectorInvoker(detector, image, mask, keypoints, maxPerCell, gridRows, gridCols, &kptLock));
-#else
-    GridAdaptedFeatureDetectorInvoker(detector, image, mask, keypoints, maxPerCell, gridRows, gridCols)(cv::BlockedRange(0, gridRows * gridCols));
-#endif
 }
 
 /*
diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index 5509226419..0062944bab 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -45,16 +45,16 @@ if(HAVE_CUDA)
   set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 
   if(WITH_NVCUVID)
-    set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
-  endif()
+    set(cuda_link_libs ${cuda_link_libs} ${CUDA_CUDA_LIBRARY} ${CUDA_nvcuvid_LIBRARY})
 
-  if(WIN32)
-    find_cuda_helper_libs(nvcuvenc)
-    set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
-  endif()
+    if(WIN32)
+      find_cuda_helper_libs(nvcuvenc)
+      set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
+    endif()
 
-  if(WITH_FFMPEG)
-    set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES})
+    if(WITH_FFMPEG)
+      set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES})
+    endif()
   endif()
 else()
   set(lib_cuda "")
diff --git a/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp b/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp
index d02027f244..5b422849bd 100644
--- a/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp
@@ -120,11 +120,8 @@ namespace cv { namespace gpu { namespace device
                 return dst;
             }
 
-            __device__ __forceinline__ RGB2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-
-            __device__ __forceinline__ RGB2RGB(const RGB2RGB& other_)
-                :unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2RGB() {}
+            __host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
         };
 
         template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>
@@ -141,8 +138,8 @@ namespace cv { namespace gpu { namespace device
                 return dst;
             }
 
-            __device__ __forceinline__ RGB2RGB():unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2RGB(const RGB2RGB& other_):unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2RGB() {}
+            __host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
         };
     }
 
@@ -203,8 +200,8 @@ namespace cv { namespace gpu { namespace device
                 return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);
             }
 
-            __device__ __forceinline__ RGB2RGB5x5():unary_function<uchar3, ushort>(){}
-            __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5& other_):unary_function<uchar3, ushort>(){}
+            __host__ __device__ __forceinline__ RGB2RGB5x5() {}
+            __host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
         };
 
         template<int bidx, int green_bits> struct RGB2RGB5x5<4, bidx,green_bits> : unary_function<uint, ushort>
@@ -214,8 +211,8 @@ namespace cv { namespace gpu { namespace device
                 return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);
             }
 
-            __device__ __forceinline__ RGB2RGB5x5():unary_function<uint, ushort>(){}
-            __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5& other_):unary_function<uint, ushort>(){}
+            __host__ __device__ __forceinline__ RGB2RGB5x5() {}
+            __host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
         };
     }
 
@@ -282,8 +279,8 @@ namespace cv { namespace gpu { namespace device
                 RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);
                 return dst;
             }
-            __device__ __forceinline__ RGB5x52RGB():unary_function<ushort, uchar3>(){}
-            __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB& other_):unary_function<ushort, uchar3>(){}
+            __host__ __device__ __forceinline__ RGB5x52RGB() {}
+            __host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
 
         };
 
@@ -295,8 +292,8 @@ namespace cv { namespace gpu { namespace device
                 RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);
                 return dst;
             }
-            __device__ __forceinline__ RGB5x52RGB():unary_function<ushort, uint>(){}
-            __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB& other_):unary_function<ushort, uint>(){}
+            __host__ __device__ __forceinline__ RGB5x52RGB() {}
+            __host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
         };
     }
 
@@ -325,9 +322,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ Gray2RGB():unary_function<T, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ Gray2RGB(const Gray2RGB& other_)
-                : unary_function<T, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ Gray2RGB() {}
+            __host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
         };
 
         template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>
@@ -342,8 +338,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ Gray2RGB():unary_function<uchar, uint>(){}
-            __device__ __forceinline__ Gray2RGB(const Gray2RGB& other_):unary_function<uchar, uint>(){}
+            __host__ __device__ __forceinline__ Gray2RGB() {}
+            __host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
         };
     }
 
@@ -384,8 +380,8 @@ namespace cv { namespace gpu { namespace device
                 return Gray2RGB5x5Converter<green_bits>::cvt(src);
             }
 
-            __device__ __forceinline__ Gray2RGB5x5():unary_function<uchar, ushort>(){}
-            __device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5& other_):unary_function<uchar, ushort>(){}
+            __host__ __device__ __forceinline__ Gray2RGB5x5() {}
+            __host__ __device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5&) {}
         };
     }
 
@@ -426,8 +422,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return RGB5x52GrayConverter<green_bits>::cvt(src);
             }
-            __device__ __forceinline__ RGB5x52Gray() : unary_function<ushort, uchar>(){}
-            __device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray& other_) : unary_function<ushort, uchar>(){}
+            __host__ __device__ __forceinline__ RGB5x52Gray() {}
+            __host__ __device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray&) {}
         };
     }
 
@@ -467,9 +463,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return RGB2GrayConvert<bidx>(&src.x);
             }
-            __device__ __forceinline__ RGB2Gray() : unary_function<typename TypeVec<T, scn>::vec_type, T>(){}
-            __device__ __forceinline__ RGB2Gray(const RGB2Gray& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, T>(){}
+            __host__ __device__ __forceinline__ RGB2Gray() {}
+            __host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
         };
 
         template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar>
@@ -478,8 +473,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return RGB2GrayConvert<bidx>(src);
             }
-            __device__ __forceinline__ RGB2Gray() : unary_function<uint, uchar>(){}
-            __device__ __forceinline__ RGB2Gray(const RGB2Gray& other_) : unary_function<uint, uchar>(){}
+            __host__ __device__ __forceinline__ RGB2Gray() {}
+            __host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
         };
     }
 
@@ -529,10 +524,8 @@ namespace cv { namespace gpu { namespace device
                 RGB2YUVConvert<bidx>(&src.x, dst);
                 return dst;
             }
-            __device__ __forceinline__ RGB2YUV()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2YUV(const RGB2YUV& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2YUV() {}
+            __host__ __device__ __forceinline__ RGB2YUV(const RGB2YUV&) {}
         };
     }
 
@@ -609,10 +602,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ YUV2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ YUV2RGB(const YUV2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ YUV2RGB() {}
+            __host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
         };
 
         template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -621,8 +612,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return YUV2RGBConvert<bidx>(src);
             }
-            __device__ __forceinline__ YUV2RGB() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ YUV2RGB(const YUV2RGB& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ YUV2RGB() {}
+            __host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
         };
     }
 
@@ -689,10 +680,8 @@ namespace cv { namespace gpu { namespace device
                 RGB2YCrCbConvert<bidx>(&src.x, dst);
                 return dst;
             }
-            __device__ __forceinline__ RGB2YCrCb()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2YCrCb() {}
+            __host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
         };
 
         template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -702,8 +691,8 @@ namespace cv { namespace gpu { namespace device
                 return RGB2YCrCbConvert<bidx>(src);
             }
 
-            __device__ __forceinline__ RGB2YCrCb() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2YCrCb() {}
+            __host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
         };
     }
 
@@ -771,10 +760,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ YCrCb2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ YCrCb2RGB() {}
+            __host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
         };
 
         template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -783,8 +770,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return YCrCb2RGBConvert<bidx>(src);
             }
-            __device__ __forceinline__ YCrCb2RGB() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ YCrCb2RGB() {}
+            __host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
         };
     }
 
@@ -849,10 +836,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ RGB2XYZ()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2XYZ(const RGB2XYZ& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2XYZ() {}
+            __host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
         };
 
         template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -861,8 +846,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return RGB2XYZConvert<bidx>(src);
             }
-            __device__ __forceinline__ RGB2XYZ() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2XYZ(const RGB2XYZ& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2XYZ() {}
+            __host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
         };
     }
 
@@ -926,10 +911,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ XYZ2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ XYZ2RGB(const XYZ2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ XYZ2RGB() {}
+            __host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
         };
 
         template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -938,8 +921,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return XYZ2RGBConvert<bidx>(src);
             }
-            __device__ __forceinline__ XYZ2RGB() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ XYZ2RGB(const XYZ2RGB& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ XYZ2RGB() {}
+            __host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
         };
     }
 
@@ -1066,10 +1049,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ RGB2HSV()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2HSV(const RGB2HSV& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2HSV() {}
+            __host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
         };
 
         template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1078,8 +1059,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return RGB2HSVConvert<bidx, hr>(src);
             }
-            __device__ __forceinline__ RGB2HSV():unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2HSV(const RGB2HSV& other_):unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2HSV() {}
+            __host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
         };
     }
 
@@ -1208,10 +1189,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ HSV2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ HSV2RGB(const HSV2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ HSV2RGB() {}
+            __host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
         };
 
         template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1220,8 +1199,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return HSV2RGBConvert<bidx, hr>(src);
             }
-            __device__ __forceinline__ HSV2RGB():unary_function<uint, uint>(){}
-            __device__ __forceinline__ HSV2RGB(const HSV2RGB& other_):unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ HSV2RGB() {}
+            __host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
         };
     }
 
@@ -1343,10 +1322,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ RGB2HLS()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2HLS(const RGB2HLS& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2HLS() {}
+            __host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
         };
 
         template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1355,8 +1332,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return RGB2HLSConvert<bidx, hr>(src);
             }
-            __device__ __forceinline__ RGB2HLS() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2HLS(const RGB2HLS& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2HLS() {}
+            __host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
         };
     }
 
@@ -1485,10 +1462,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ HLS2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ HLS2RGB(const HLS2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ HLS2RGB() {}
+            __host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
         };
 
         template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1497,8 +1472,8 @@ namespace cv { namespace gpu { namespace device
             {
                 return HLS2RGBConvert<bidx, hr>(src);
             }
-            __device__ __forceinline__ HLS2RGB() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ HLS2RGB(const HLS2RGB& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ HLS2RGB() {}
+            __host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
         };
     }
 
@@ -1651,8 +1626,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ RGB2Lab() {}
-            __device__ __forceinline__ RGB2Lab(const RGB2Lab& other_) {}
+            __host__ __device__ __forceinline__ RGB2Lab() {}
+            __host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
         };
         template <int scn, int dcn, bool srgb, int blueIdx>
         struct RGB2Lab<float, scn, dcn, srgb, blueIdx>
@@ -1666,8 +1641,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ RGB2Lab() {}
-            __device__ __forceinline__ RGB2Lab(const RGB2Lab& other_) {}
+            __host__ __device__ __forceinline__ RGB2Lab() {}
+            __host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
         };
     }
 
@@ -1764,8 +1739,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ Lab2RGB() {}
-            __device__ __forceinline__ Lab2RGB(const Lab2RGB& other_) {}
+            __host__ __device__ __forceinline__ Lab2RGB() {}
+            __host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
         };
         template <int scn, int dcn, bool srgb, int blueIdx>
         struct Lab2RGB<float, scn, dcn, srgb, blueIdx>
@@ -1779,8 +1754,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ Lab2RGB() {}
-            __device__ __forceinline__ Lab2RGB(const Lab2RGB& other_) {}
+            __host__ __device__ __forceinline__ Lab2RGB() {}
+            __host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
         };
     }
 
@@ -1863,8 +1838,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ RGB2Luv() {}
-            __device__ __forceinline__ RGB2Luv(const RGB2Luv& other_) {}
+            __host__ __device__ __forceinline__ RGB2Luv() {}
+            __host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
         };
         template <int scn, int dcn, bool srgb, int blueIdx>
         struct RGB2Luv<float, scn, dcn, srgb, blueIdx>
@@ -1878,8 +1853,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ RGB2Luv() {}
-            __device__ __forceinline__ RGB2Luv(const RGB2Luv& other_) {}
+            __host__ __device__ __forceinline__ RGB2Luv() {}
+            __host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
         };
     }
 
@@ -1964,8 +1939,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ Luv2RGB() {}
-            __device__ __forceinline__ Luv2RGB(const Luv2RGB& other_) {}
+            __host__ __device__ __forceinline__ Luv2RGB() {}
+            __host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
         };
         template <int scn, int dcn, bool srgb, int blueIdx>
         struct Luv2RGB<float, scn, dcn, srgb, blueIdx>
@@ -1979,8 +1954,8 @@ namespace cv { namespace gpu { namespace device
 
                 return dst;
             }
-            __device__ __forceinline__ Luv2RGB() {}
-            __device__ __forceinline__ Luv2RGB(const Luv2RGB& other_) {}
+            __host__ __device__ __forceinline__ Luv2RGB() {}
+            __host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
         };
     }
 
diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp
index 6064e8e99c..db264735e3 100644
--- a/modules/gpu/include/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp
@@ -63,8 +63,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a + b;
         }
-        __device__ __forceinline__ plus(const plus& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ plus():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ plus() {}
+        __host__ __device__ __forceinline__ plus(const plus&) {}
     };
 
     template <typename T> struct minus : binary_function<T, T, T>
@@ -74,8 +74,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a - b;
         }
-        __device__ __forceinline__ minus(const minus& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ minus():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ minus() {}
+        __host__ __device__ __forceinline__ minus(const minus&) {}
     };
 
     template <typename T> struct multiplies : binary_function<T, T, T>
@@ -85,8 +85,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a * b;
         }
-        __device__ __forceinline__ multiplies(const multiplies& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ multiplies():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ multiplies() {}
+        __host__ __device__ __forceinline__ multiplies(const multiplies&) {}
     };
 
     template <typename T> struct divides : binary_function<T, T, T>
@@ -96,8 +96,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a / b;
         }
-        __device__ __forceinline__ divides(const divides& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ divides():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ divides() {}
+        __host__ __device__ __forceinline__ divides(const divides&) {}
     };
 
     template <typename T> struct modulus : binary_function<T, T, T>
@@ -107,8 +107,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a % b;
         }
-        __device__ __forceinline__ modulus(const modulus& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ modulus():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ modulus() {}
+        __host__ __device__ __forceinline__ modulus(const modulus&) {}
     };
 
     template <typename T> struct negate : unary_function<T, T>
@@ -117,8 +117,8 @@ namespace cv { namespace gpu { namespace device
         {
             return -a;
         }
-        __device__ __forceinline__ negate(const negate& other):unary_function<T,T>(){}
-        __device__ __forceinline__ negate():unary_function<T,T>(){}
+        __host__ __device__ __forceinline__ negate() {}
+        __host__ __device__ __forceinline__ negate(const negate&) {}
     };
 
     // Comparison Operations
@@ -129,8 +129,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a == b;
         }
-        __device__ __forceinline__ equal_to(const equal_to& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ equal_to():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ equal_to() {}
+        __host__ __device__ __forceinline__ equal_to(const equal_to&) {}
     };
 
     template <typename T> struct not_equal_to : binary_function<T, T, bool>
@@ -140,8 +140,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a != b;
         }
-        __device__ __forceinline__ not_equal_to(const not_equal_to& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ not_equal_to():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ not_equal_to() {}
+        __host__ __device__ __forceinline__ not_equal_to(const not_equal_to&) {}
     };
 
     template <typename T> struct greater : binary_function<T, T, bool>
@@ -151,8 +151,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a > b;
         }
-        __device__ __forceinline__ greater(const greater& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ greater():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ greater() {}
+        __host__ __device__ __forceinline__ greater(const greater&) {}
     };
 
     template <typename T> struct less : binary_function<T, T, bool>
@@ -162,8 +162,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a < b;
         }
-        __device__ __forceinline__ less(const less& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ less():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ less() {}
+        __host__ __device__ __forceinline__ less(const less&) {}
     };
 
     template <typename T> struct greater_equal : binary_function<T, T, bool>
@@ -173,8 +173,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a >= b;
         }
-        __device__ __forceinline__ greater_equal(const greater_equal& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ greater_equal():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ greater_equal() {}
+        __host__ __device__ __forceinline__ greater_equal(const greater_equal&) {}
     };
 
     template <typename T> struct less_equal : binary_function<T, T, bool>
@@ -184,8 +184,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a <= b;
         }
-        __device__ __forceinline__ less_equal(const less_equal& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ less_equal():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ less_equal() {}
+        __host__ __device__ __forceinline__ less_equal(const less_equal&) {}
     };
 
     // Logical Operations
@@ -196,8 +196,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a && b;
         }
-        __device__ __forceinline__ logical_and(const logical_and& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ logical_and():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ logical_and() {}
+        __host__ __device__ __forceinline__ logical_and(const logical_and&) {}
     };
 
     template <typename T> struct logical_or : binary_function<T, T, bool>
@@ -207,8 +207,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a || b;
         }
-        __device__ __forceinline__ logical_or(const logical_or& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ logical_or():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ logical_or() {}
+        __host__ __device__ __forceinline__ logical_or(const logical_or&) {}
     };
 
     template <typename T> struct logical_not : unary_function<T, bool>
@@ -217,8 +217,8 @@ namespace cv { namespace gpu { namespace device
         {
             return !a;
         }
-        __device__ __forceinline__ logical_not(const logical_not& other):unary_function<T,bool>(){}
-        __device__ __forceinline__ logical_not():unary_function<T,bool>(){}
+        __host__ __device__ __forceinline__ logical_not() {}
+        __host__ __device__ __forceinline__ logical_not(const logical_not&) {}
     };
 
     // Bitwise Operations
@@ -229,8 +229,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a & b;
         }
-        __device__ __forceinline__ bit_and(const bit_and& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ bit_and():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ bit_and() {}
+        __host__ __device__ __forceinline__ bit_and(const bit_and&) {}
     };
 
     template <typename T> struct bit_or : binary_function<T, T, T>
@@ -240,8 +240,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a | b;
         }
-        __device__ __forceinline__ bit_or(const bit_or& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ bit_or():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ bit_or() {}
+        __host__ __device__ __forceinline__ bit_or(const bit_or&) {}
     };
 
     template <typename T> struct bit_xor : binary_function<T, T, T>
@@ -251,8 +251,8 @@ namespace cv { namespace gpu { namespace device
         {
             return a ^ b;
         }
-        __device__ __forceinline__ bit_xor(const bit_xor& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ bit_xor():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ bit_xor() {}
+        __host__ __device__ __forceinline__ bit_xor(const bit_xor&) {}
     };
 
     template <typename T> struct bit_not : unary_function<T, T>
@@ -261,8 +261,8 @@ namespace cv { namespace gpu { namespace device
         {
             return ~v;
         }
-        __device__ __forceinline__ bit_not(const bit_not& other):unary_function<T,T>(){}
-        __device__ __forceinline__ bit_not():unary_function<T,T>(){}
+        __host__ __device__ __forceinline__ bit_not() {}
+        __host__ __device__ __forceinline__ bit_not(const bit_not&) {}
     };
 
     // Generalized Identity Operations
@@ -272,8 +272,8 @@ namespace cv { namespace gpu { namespace device
         {
             return x;
         }
-        __device__ __forceinline__ identity(const identity& other):unary_function<T,T>(){}
-        __device__ __forceinline__ identity():unary_function<T,T>(){}
+        __host__ __device__ __forceinline__ identity() {}
+        __host__ __device__ __forceinline__ identity(const identity&) {}
     };
 
     template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
@@ -282,8 +282,8 @@ namespace cv { namespace gpu { namespace device
         {
             return lhs;
         }
-        __device__ __forceinline__ project1st(const project1st& other):binary_function<T1,T2,T1>(){}
-        __device__ __forceinline__ project1st():binary_function<T1,T2,T1>(){}
+        __host__ __device__ __forceinline__ project1st() {}
+        __host__ __device__ __forceinline__ project1st(const project1st&) {}
     };
 
     template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
@@ -292,8 +292,8 @@ namespace cv { namespace gpu { namespace device
         {
             return rhs;
         }
-        __device__ __forceinline__ project2nd(const project2nd& other):binary_function<T1,T2,T2>(){}
-        __device__ __forceinline__ project2nd():binary_function<T1,T2,T2>(){}
+        __host__ __device__ __forceinline__ project2nd() {}
+        __host__ __device__ __forceinline__ project2nd(const project2nd&) {}
     };
 
     // Min/Max Operations
@@ -302,8 +302,8 @@ namespace cv { namespace gpu { namespace device
     template <> struct name<type> : binary_function<type, type, type> \
     { \
         __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
-        __device__ __forceinline__ name() {}\
-        __device__ __forceinline__ name(const name&) {}\
+        __host__ __device__ __forceinline__ name() {}\
+        __host__ __device__ __forceinline__ name(const name&) {}\
     };
 
     template <typename T> struct maximum : binary_function<T, T, T>
@@ -312,8 +312,8 @@ namespace cv { namespace gpu { namespace device
         {
             return max(lhs, rhs);
         }
-        __device__ __forceinline__ maximum() {}
-        __device__ __forceinline__ maximum(const maximum&) {}
+        __host__ __device__ __forceinline__ maximum() {}
+        __host__ __device__ __forceinline__ maximum(const maximum&) {}
     };
 
     OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
@@ -332,8 +332,8 @@ namespace cv { namespace gpu { namespace device
         {
             return min(lhs, rhs);
         }
-        __device__ __forceinline__ minimum() {}
-        __device__ __forceinline__ minimum(const minimum&) {}
+        __host__ __device__ __forceinline__ minimum() {}
+        __host__ __device__ __forceinline__ minimum(const minimum&) {}
     };
 
     OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
@@ -349,7 +349,6 @@ namespace cv { namespace gpu { namespace device
 #undef OPENCV_GPU_IMPLEMENT_MINMAX
 
     // Math functions
-///bound=========================================
 
     template <typename T> struct abs_func : unary_function<T, T>
     {
@@ -358,8 +357,8 @@ namespace cv { namespace gpu { namespace device
             return abs(x);
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
     {
@@ -368,8 +367,8 @@ namespace cv { namespace gpu { namespace device
             return x;
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<signed char> : unary_function<signed char, signed char>
     {
@@ -378,8 +377,8 @@ namespace cv { namespace gpu { namespace device
             return ::abs((int)x);
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<char> : unary_function<char, char>
     {
@@ -388,8 +387,8 @@ namespace cv { namespace gpu { namespace device
             return ::abs((int)x);
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
     {
@@ -398,8 +397,8 @@ namespace cv { namespace gpu { namespace device
             return x;
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<short> : unary_function<short, short>
     {
@@ -408,8 +407,8 @@ namespace cv { namespace gpu { namespace device
             return ::abs((int)x);
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
     {
@@ -418,8 +417,8 @@ namespace cv { namespace gpu { namespace device
             return x;
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<int> : unary_function<int, int>
     {
@@ -428,8 +427,8 @@ namespace cv { namespace gpu { namespace device
             return ::abs(x);
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<float> : unary_function<float, float>
     {
@@ -438,8 +437,8 @@ namespace cv { namespace gpu { namespace device
             return ::fabsf(x);
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<double> : unary_function<double, double>
     {
@@ -448,8 +447,8 @@ namespace cv { namespace gpu { namespace device
             return ::fabs(x);
         }
 
-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
     };
 
 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
@@ -459,8 +458,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func ## f(v); \
         } \
-        __device__ __forceinline__ name ## _func() {} \
-        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     }; \
     template <> struct name ## _func<double> : unary_function<double, double> \
     { \
@@ -468,8 +467,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func(v); \
         } \
-        __device__ __forceinline__ name ## _func() {} \
-        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     };
 
 #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
@@ -479,6 +478,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func ## f(v1, v2); \
         } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     }; \
     template <> struct name ## _func<double> : binary_function<double, double, double> \
     { \
@@ -486,6 +487,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func(v1, v2); \
         } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     };
 
     OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
@@ -522,8 +525,8 @@ namespace cv { namespace gpu { namespace device
         {
             return src1 * src1 + src2 * src2;
         }
-        __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
-        __device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
+        __host__ __device__ __forceinline__ hypot_sqr_func() {}
+        __host__ __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func&) {}
     };
 
     // Saturate Cast Functor
@@ -533,8 +536,8 @@ namespace cv { namespace gpu { namespace device
         {
             return saturate_cast<D>(v);
         }
-        __device__ __forceinline__ saturate_cast_func(const saturate_cast_func& other):unary_function<T, D>(){}
-        __device__ __forceinline__ saturate_cast_func():unary_function<T, D>(){}
+        __host__ __device__ __forceinline__ saturate_cast_func() {}
+        __host__ __device__ __forceinline__ saturate_cast_func(const saturate_cast_func&) {}
     };
 
     // Threshold Functors
@@ -547,10 +550,9 @@ namespace cv { namespace gpu { namespace device
             return (src > thresh) * maxVal;
         }
 
-        __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
-            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
-
-        __device__ __forceinline__ thresh_binary_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_binary_func() {}
+        __host__ __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
+            : thresh(other.thresh), maxVal(other.maxVal) {}
 
         const T thresh;
         const T maxVal;
@@ -565,10 +567,9 @@ namespace cv { namespace gpu { namespace device
             return (src <= thresh) * maxVal;
         }
 
-        __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
-            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
-
-        __device__ __forceinline__ thresh_binary_inv_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_binary_inv_func() {}
+        __host__ __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
+            : thresh(other.thresh), maxVal(other.maxVal) {}
 
         const T thresh;
         const T maxVal;
@@ -583,10 +584,9 @@ namespace cv { namespace gpu { namespace device
             return minimum<T>()(src, thresh);
         }
 
-        __device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
-            : unary_function<T, T>(), thresh(other.thresh){}
-
-        __device__ __forceinline__ thresh_trunc_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_trunc_func() {}
+        __host__ __device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
+            : thresh(other.thresh) {}
 
         const T thresh;
     };
@@ -599,10 +599,10 @@ namespace cv { namespace gpu { namespace device
         {
             return (src > thresh) * src;
         }
-        __device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
-            : unary_function<T, T>(), thresh(other.thresh){}
 
-        __device__ __forceinline__ thresh_to_zero_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_to_zero_func() {}
+       __host__  __device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
+            : thresh(other.thresh) {}
 
         const T thresh;
     };
@@ -615,14 +615,14 @@ namespace cv { namespace gpu { namespace device
         {
             return (src <= thresh) * src;
         }
-        __device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
-            : unary_function<T, T>(), thresh(other.thresh){}
 
-        __device__ __forceinline__ thresh_to_zero_inv_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_to_zero_inv_func() {}
+        __host__ __device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
+            : thresh(other.thresh) {}
 
         const T thresh;
     };
-//bound!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ============>
+
     // Function Object Adaptors
     template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
     {
@@ -633,8 +633,8 @@ namespace cv { namespace gpu { namespace device
           return !pred(x);
       }
 
-        __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
-        __device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}
+      __host__ __device__ __forceinline__ unary_negate() {}
+      __host__ __device__ __forceinline__ unary_negate(const unary_negate& other) : pred(other.pred) {}
 
       const Predicate pred;
     };
@@ -653,11 +653,9 @@ namespace cv { namespace gpu { namespace device
         {
             return !pred(x,y);
         }
-        __device__ __forceinline__ binary_negate(const binary_negate& other)
-        : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
 
-        __device__ __forceinline__ binary_negate() :
-        binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
+        __host__ __device__ __forceinline__ binary_negate() {}
+        __host__ __device__ __forceinline__ binary_negate(const binary_negate& other) : pred(other.pred) {}
 
         const Predicate pred;
     };
@@ -676,8 +674,8 @@ namespace cv { namespace gpu { namespace device
             return op(arg1, a);
         }
 
-        __device__ __forceinline__ binder1st(const binder1st& other) :
-        unary_function<typename Op::second_argument_type, typename Op::result_type>(){}
+        __host__ __device__ __forceinline__ binder1st() {}
+        __host__ __device__ __forceinline__ binder1st(const binder1st& other) : op(other.op), arg1(other.arg1) {}
 
         const Op op;
         const typename Op::first_argument_type arg1;
@@ -697,8 +695,8 @@ namespace cv { namespace gpu { namespace device
             return op(a, arg2);
         }
 
-         __device__ __forceinline__ binder2nd(const binder2nd& other) :
-        unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}
+        __host__ __device__ __forceinline__ binder2nd() {}
+        __host__ __device__ __forceinline__ binder2nd(const binder2nd& other) : op(other.op), arg2(other.arg2) {}
 
         const Op op;
         const typename Op::second_argument_type arg2;
diff --git a/modules/gpu/include/opencv2/gpu/device/limits.hpp b/modules/gpu/include/opencv2/gpu/device/limits.hpp
index b040f199d6..595978006c 100644
--- a/modules/gpu/include/opencv2/gpu/device/limits.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/limits.hpp
@@ -43,193 +43,80 @@
 #ifndef __OPENCV_GPU_LIMITS_GPU_HPP__
 #define __OPENCV_GPU_LIMITS_GPU_HPP__
 
-#include <limits>
+#include <limits.h>
+#include <float.h>
 #include "common.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
-    template<class T> struct numeric_limits
-    {
-        typedef T type;
-        __device__ __forceinline__ static type min()  { return type(); };
-        __device__ __forceinline__ static type max() { return type(); };
-        __device__ __forceinline__ static type epsilon() { return type(); }
-        __device__ __forceinline__ static type round_error() { return type(); }
-        __device__ __forceinline__ static type denorm_min()  { return type(); }
-        __device__ __forceinline__ static type infinity() { return type(); }
-        __device__ __forceinline__ static type quiet_NaN() { return type(); }
-        __device__ __forceinline__ static type signaling_NaN() { return T(); }
-        static const bool is_signed;
-    };
 
-    template<> struct numeric_limits<bool>
-    {
-        typedef bool type;
-        __device__ __forceinline__ static type min() { return false; };
-        __device__ __forceinline__ static type max() { return true;  };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <class T> struct numeric_limits;
 
-    template<> struct numeric_limits<char>
-    {
-        typedef char type;
-        __device__ __forceinline__ static type min() { return CHAR_MIN; };
-        __device__ __forceinline__ static type max() { return CHAR_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = (char)-1 == -1;
-    };
+template <> struct numeric_limits<bool>
+{
+    __device__ __forceinline__ static bool min() { return false; }
+    __device__ __forceinline__ static bool max() { return true;  }
+    static const bool is_signed = false;
+};
 
-    template<> struct numeric_limits<signed char>
-    {
-        typedef char type;
-        __device__ __forceinline__ static type min() { return SCHAR_MIN; };
-        __device__ __forceinline__ static type max() { return SCHAR_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = (signed char)-1 == -1;
-    };
+template <> struct numeric_limits<signed char>
+{
+    __device__ __forceinline__ static signed char min() { return SCHAR_MIN; }
+    __device__ __forceinline__ static signed char max() { return SCHAR_MAX; }
+    static const bool is_signed = true;
+};
 
-    template<> struct numeric_limits<unsigned char>
-    {
-        typedef unsigned char type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return UCHAR_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <> struct numeric_limits<unsigned char>
+{
+    __device__ __forceinline__ static unsigned char min() { return 0; }
+    __device__ __forceinline__ static unsigned char max() { return UCHAR_MAX; }
+    static const bool is_signed = false;
+};
 
-    template<> struct numeric_limits<short>
-    {
-        typedef short type;
-        __device__ __forceinline__ static type min() { return SHRT_MIN; };
-        __device__ __forceinline__ static type max() { return SHRT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
+template <> struct numeric_limits<short>
+{
+    __device__ __forceinline__ static short min() { return SHRT_MIN; }
+    __device__ __forceinline__ static short max() { return SHRT_MAX; }
+    static const bool is_signed = true;
+};
 
-    template<> struct numeric_limits<unsigned short>
-    {
-        typedef unsigned short type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return USHRT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <> struct numeric_limits<unsigned short>
+{
+    __device__ __forceinline__ static unsigned short min() { return 0; }
+    __device__ __forceinline__ static unsigned short max() { return USHRT_MAX; }
+    static const bool is_signed = false;
+};
 
-    template<> struct numeric_limits<int>
-    {
-        typedef int type;
-        __device__ __forceinline__ static type min() { return INT_MIN; };
-        __device__ __forceinline__ static type max() { return INT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
+template <> struct numeric_limits<int>
+{
+    __device__ __forceinline__ static int min() { return INT_MIN; }
+    __device__ __forceinline__ static int max() { return INT_MAX; }
+    static const bool is_signed = true;
+};
 
+template <> struct numeric_limits<unsigned int>
+{
+    __device__ __forceinline__ static unsigned int min() { return 0; }
+    __device__ __forceinline__ static unsigned int max() { return UINT_MAX; }
+    static const bool is_signed = false;
+};
 
-    template<> struct numeric_limits<unsigned int>
-    {
-        typedef unsigned int type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return UINT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <> struct numeric_limits<float>
+{
+    __device__ __forceinline__ static float min() { return FLT_MIN; }
+    __device__ __forceinline__ static float max() { return FLT_MAX; }
+    __device__ __forceinline__ static float epsilon() { return FLT_EPSILON; }
+    static const bool is_signed = true;
+};
 
-    template<> struct numeric_limits<long>
-    {
-        typedef long type;
-        __device__ __forceinline__ static type min() { return LONG_MIN; };
-        __device__ __forceinline__ static type max() { return LONG_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
+template <> struct numeric_limits<double>
+{
+    __device__ __forceinline__ static double min() { return DBL_MIN; }
+    __device__ __forceinline__ static double max() { return DBL_MAX; }
+    __device__ __forceinline__ static double epsilon() { return DBL_EPSILON; }
+    static const bool is_signed = true;
+};
 
-    template<> struct numeric_limits<unsigned long>
-    {
-        typedef unsigned long type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return ULONG_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
-
-    template<> struct numeric_limits<float>
-    {
-        typedef float type;
-        __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
-        __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
-        __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
-
-    template<> struct numeric_limits<double>
-    {
-        typedef double type;
-        __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
-        __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
 }}} // namespace cv { namespace gpu { namespace device {
 
 #endif // __OPENCV_GPU_LIMITS_GPU_HPP__
diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp
index 83eaaa21ce..85e81acf08 100644
--- a/modules/gpu/include/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp
@@ -124,8 +124,8 @@ namespace cv { namespace gpu { namespace device
 
     struct WithOutMask
     {
-        __device__ __forceinline__ WithOutMask(){}
-        __device__ __forceinline__ WithOutMask(const WithOutMask& mask){}
+        __host__ __device__ __forceinline__ WithOutMask(){}
+        __host__ __device__ __forceinline__ WithOutMask(const WithOutMask&){}
 
         __device__ __forceinline__ void next() const
         {
diff --git a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
index 1c46dc0c33..a6cb43a2fa 100644
--- a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
@@ -43,288 +43,880 @@
 #ifndef __OPENCV_GPU_VECMATH_HPP__
 #define __OPENCV_GPU_VECMATH_HPP__
 
-#include "saturate_cast.hpp"
 #include "vec_traits.hpp"
-#include "functional.hpp"
+#include "saturate_cast.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
-    namespace vec_math_detail
-    {
-        template <int cn, typename VecD> struct SatCastHelper;
-        template <typename VecD> struct SatCastHelper<1, VecD>
-        {
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
-            {
-                typedef typename VecTraits<VecD>::elem_type D;
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x));
-            }
-        };
-        template <typename VecD> struct SatCastHelper<2, VecD>
-        {
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
-            {
-                typedef typename VecTraits<VecD>::elem_type D;
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
-            }
-        };
-        template <typename VecD> struct SatCastHelper<3, VecD>
-        {
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
-            {
-                typedef typename VecTraits<VecD>::elem_type D;
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
-            }
-        };
-        template <typename VecD> struct SatCastHelper<4, VecD>
-        {
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
-            {
-                typedef typename VecTraits<VecD>::elem_type D;
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
-            }
-        };
 
-        template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
+// saturate_cast
+
+namespace vec_math_detail
+{
+    template <int cn, typename VecD> struct SatCastHelper;
+    template <typename VecD> struct SatCastHelper<1, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
         {
-            return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
+            typedef typename VecTraits<VecD>::elem_type D;
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x));
         }
-    }
-
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-
-#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x), f(a.y)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x), f(a.y), f(a.z)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
-    }
-
-    namespace vec_math_detail
+    };
+    template <typename VecD> struct SatCastHelper<2, VecD>
     {
-        template <typename T1, typename T2> struct BinOpTraits
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
         {
-            typedef int argument_type;
-        };
-        template <typename T> struct BinOpTraits<T, T>
+            typedef typename VecTraits<VecD>::elem_type D;
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
+        }
+    };
+    template <typename VecD> struct SatCastHelper<3, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
         {
-            typedef T argument_type;
-        };
-        template <typename T> struct BinOpTraits<T, double>
+            typedef typename VecTraits<VecD>::elem_type D;
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
+        }
+    };
+    template <typename VecD> struct SatCastHelper<4, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
         {
-            typedef double argument_type;
-        };
-        template <typename T> struct BinOpTraits<double, T>
-        {
-            typedef double argument_type;
-        };
-        template <> struct BinOpTraits<double, double>
-        {
-            typedef double argument_type;
-        };
-        template <typename T> struct BinOpTraits<T, float>
-        {
-            typedef float argument_type;
-        };
-        template <typename T> struct BinOpTraits<float, T>
-        {
-            typedef float argument_type;
-        };
-        template <> struct BinOpTraits<float, float>
-        {
-            typedef float argument_type;
-        };
-        template <> struct BinOpTraits<double, float>
-        {
-            typedef double argument_type;
-        };
-        template <> struct BinOpTraits<float, double>
-        {
-            typedef double argument_type;
-        };
+            typedef typename VecTraits<VecD>::elem_type D;
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
+        }
+    };
+
+    template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_helper(const VecS& v)
+    {
+        return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
+    }
+}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+// unary operators
+
+#define CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(op, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & a) \
+    { \
+        return VecTraits<output_type ## 1>::make(op (a.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & a) \
+    { \
+        return VecTraits<output_type ## 2>::make(op (a.x), op (a.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & a) \
+    { \
+        return VecTraits<output_type ## 3>::make(op (a.x), op (a.y), op (a.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a) \
+    { \
+        return VecTraits<output_type ## 4>::make(op (a.x), op (a.y), op (a.z), op (a.w)); \
     }
 
-#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_OP
+
+// unary functions
+
+#define CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(func_name, func, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a) \
     { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
+        return VecTraits<output_type ## 1>::make(func (a.x)); \
     } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a) \
     { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
+        return VecTraits<output_type ## 2>::make(func (a.x), func (a.y)); \
     } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a) \
     { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
+        return VecTraits<output_type ## 3>::make(func (a.x), func (a.y), func (a.z)); \
     } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a) \
     { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<T, type>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
+        return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
     }
 
-#define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator +, plus) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator -, minus) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator *, multiplies) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator /, divides) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator -, negate) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ==, equal_to) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator !=, not_equal_to) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator > , greater) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator < , less) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator >=, greater_equal) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator <=, less_equal) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &&, logical_and) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ||, logical_or) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp10, exp10_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log, log_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log2, log2_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log10, log10_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sin, sin_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cos, cos_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tan, tan_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asin, asin_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acos, acos_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atan, atan_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sinh, sinh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cosh, cosh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tanh, tanh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asinh, asinh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acosh, acosh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atanh, atanh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot, hypot_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, atan2, atan2_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, pow, pow_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot_sqr, hypot_sqr_func)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabs, double, double)
 
-#define OPENCV_GPU_IMPLEMENT_VEC_INT_OP(type) \
-    OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &, bit_and) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator |, bit_or) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrt, double, double)
 
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)
-    OPENCV_GPU_IMPLEMENT_VEC_OP(float)
-    OPENCV_GPU_IMPLEMENT_VEC_OP(double)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::exp, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::log, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sin, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cos, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tan, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asin, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acos, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atan, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::cosh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acosh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanh, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
+
+// binary operators (vec & vec)
+
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(op, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & a, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(a.x op b.x); \
+    } \
+    __device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & a, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(a.x op b.x, a.y op b.y); \
+    } \
+    __device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & a, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(a.x op b.x, a.y op b.y, a.z op b.z); \
+    } \
+    __device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \
+    }
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uint, uint)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uint, uint)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP
+
+// binary operators (vec & scalar)
+
+#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(op, input_type, scalar_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 1>::make(a.x op s); \
+    } \
+    __device__ __forceinline__ output_type ## 1 operator op(scalar_type s, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(s op b.x); \
+    } \
+    __device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 2>::make(a.x op s, a.y op s); \
+    } \
+    __device__ __forceinline__ output_type ## 2 operator op(scalar_type s, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(s op b.x, s op b.y); \
+    } \
+    __device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 3>::make(a.x op s, a.y op s, a.z op s); \
+    } \
+    __device__ __forceinline__ output_type ## 3 operator op(scalar_type s, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(s op b.x, s op b.y, s op b.z); \
+    } \
+    __device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 4>::make(a.x op s, a.y op s, a.z op s, a.w op s); \
+    } \
+    __device__ __forceinline__ output_type ## 4 operator op(scalar_type s, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(s op b.x, s op b.y, s op b.z, s op b.w); \
+    }
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uint, uint, uint)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uint, uint, uint)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uint, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP
+
+// binary function (vec & vec)
+
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(func_name, func, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(func (a.x, b.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(func (a.x, b.x), func (a.y, b.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(func (a.x, b.x), func (a.y, b.y), func (a.z, b.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(func (a.x, b.x), func (a.y, b.y), func (a.z, b.z), func (a.w, b.w)); \
+    }
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmaxf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmax, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fminf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fmin, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypot, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC
+
+// binary function (vec & scalar)
+
+#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(func_name, func, input_type, scalar_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 1>::make(func ((output_type) a.x, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 1 func_name(scalar_type s, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(func ((output_type) s, (output_type) b.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 2>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(scalar_type s, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 3>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s), func ((output_type) a.z, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(scalar_type s, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y), func ((output_type) s, (output_type) b.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 4>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s), func ((output_type) a.z, (output_type) s), func ((output_type) a.w, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(scalar_type s, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y), func ((output_type) s, (output_type) b.z), func ((output_type) s, (output_type) b.w)); \
+    }
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, double, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC
 
-    #undef OPENCV_GPU_IMPLEMENT_VEC_UNOP
-    #undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
-    #undef OPENCV_GPU_IMPLEMENT_VEC_OP
-    #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
 }}} // namespace cv { namespace gpu { namespace device
 
 #endif // __OPENCV_GPU_VECMATH_HPP__
diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp
index 40d88aad45..adfc294f6d 100644
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@@ -72,7 +72,7 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur,
 
         TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));
 
-        GPU_SANITY_CHECK(dst);
+        GPU_SANITY_CHECK(dst, 1);
     }
     else
     {
diff --git a/modules/gpu/perf/perf_main.cpp b/modules/gpu/perf/perf_main.cpp
index a7ac1ccce8..53a19ca412 100644
--- a/modules/gpu/perf/perf_main.cpp
+++ b/modules/gpu/perf/perf_main.cpp
@@ -44,4 +44,11 @@
 
 using namespace perf;
 
-CV_PERF_TEST_MAIN(gpu, printCudaInfo())
+static const char * impls[] = {
+#ifdef HAVE_CUDA
+    "cuda",
+#endif
+    "plain"
+};
+
+CV_PERF_TEST_MAIN_WITH_IMPLS(gpu, impls, printCudaInfo())
diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp
index 1ab01a75be..672d657b21 100644
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
@@ -103,7 +103,7 @@ PERF_TEST_P(ImagePair, Video_InterpolateFrames,
 
         TEST_CYCLE() cv::gpu::interpolateFrames(d_frame0, d_frame1, d_fu, d_fv, d_bu, d_bv, 0.5f, newFrame, d_buf);
 
-        GPU_SANITY_CHECK(newFrame);
+        GPU_SANITY_CHECK(newFrame, 1e-4);
     }
     else
     {
@@ -142,7 +142,7 @@ PERF_TEST_P(ImagePair, Video_CreateOpticalFlowNeedleMap,
 
         TEST_CYCLE() cv::gpu::createOpticalFlowNeedleMap(u, v, vertex, colors);
 
-        GPU_SANITY_CHECK(vertex);
+        GPU_SANITY_CHECK(vertex, 1e-6);
         GPU_SANITY_CHECK(colors);
     }
     else
@@ -219,8 +219,8 @@ PERF_TEST_P(ImagePair, Video_BroxOpticalFlow,
 
         TEST_CYCLE() d_flow(d_frame0, d_frame1, u, v);
 
-        GPU_SANITY_CHECK(u);
-        GPU_SANITY_CHECK(v);
+        GPU_SANITY_CHECK(u, 1e-1);
+        GPU_SANITY_CHECK(v, 1e-1);
     }
     else
     {
diff --git a/modules/gpu/src/calib3d.cpp b/modules/gpu/src/calib3d.cpp
index e83213f90f..b84f09d0ab 100644
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@@ -151,7 +151,7 @@ namespace
     }
 
     // Computes rotation, translation pair for small subsets if the input data
-    class TransformHypothesesGenerator
+    class TransformHypothesesGenerator : public ParallelLoopBody
     {
     public:
         TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
@@ -161,7 +161,7 @@ namespace
                   num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
                   transl_vectors(transl_vectors_) {}
 
-        void operator()(const BlockedRange& range) const
+        void operator()(const Range& range) const
         {
             // Input data for generation of the current hypothesis
             vector<int> subset_indices(subset_size);
@@ -173,7 +173,7 @@ namespace
             Mat rot_mat(3, 3, CV_64F);
             Mat transl_vec(1, 3, CV_64F);
 
-            for (int iter = range.begin(); iter < range.end(); ++iter)
+            for (int iter = range.start; iter < range.end; ++iter)
             {
                 selectRandom(subset_size, num_points, subset_indices);
                 for (int i = 0; i < subset_size; ++i)
@@ -239,7 +239,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
     // Generate set of hypotheses using small subsets of the input data
     TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
                                       num_points, subset_size, rot_matrices, transl_vectors);
-    parallel_for(BlockedRange(0, num_iters), body);
+    parallel_for_(Range(0, num_iters), body);
 
     // Compute scores (i.e. number of inliers) for each hypothesis
     GpuMat d_object(object);
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 814a96bc0a..7b95b69091 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -406,7 +406,7 @@ public:
         GpuMat dclassified(1, 1, CV_32S);
         cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );
 
-        PyrLavel level(0, 1.0f, image.size(), NxM, minObjectSize);
+        PyrLavel level(0, scaleFactor, image.size(), NxM, minObjectSize);
 
         while (level.isFeasible(maxObjectSize))
         {
diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu
index 0fd482c41a..f29471f025 100644
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -67,8 +67,8 @@ namespace cv { namespace gpu { namespace device
                         crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
                         crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
             }
-            __device__ __forceinline__ TransformOp() {}
-            __device__ __forceinline__ TransformOp(const TransformOp&) {}
+            __host__ __device__ __forceinline__ TransformOp() {}
+            __host__ __device__ __forceinline__ TransformOp(const TransformOp&) {}
         };
 
         void call(const PtrStepSz<float3> src, const float* rot,
@@ -106,8 +106,8 @@ namespace cv { namespace gpu { namespace device
                         (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
                         (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
             }
-            __device__ __forceinline__ ProjectOp() {}
-            __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
+            __host__ __device__ __forceinline__ ProjectOp() {}
+            __host__ __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
         };
 
         void call(const PtrStepSz<float3> src, const float* rot,
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
index 1afcddc9c9..aab922f22c 100644
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -62,8 +62,8 @@ namespace canny
             return ::abs(x) + ::abs(y);
         }
 
-        __device__ __forceinline__ L1() {}
-        __device__ __forceinline__ L1(const L1&) {}
+        __host__ __device__ __forceinline__ L1() {}
+        __host__ __device__ __forceinline__ L1(const L1&) {}
     };
     struct L2 : binary_function<int, int, float>
     {
@@ -72,8 +72,8 @@ namespace canny
             return ::sqrtf(x * x + y * y);
         }
 
-        __device__ __forceinline__ L2() {}
-        __device__ __forceinline__ L2(const L2&) {}
+        __host__ __device__ __forceinline__ L2() {}
+        __host__ __device__ __forceinline__ L2(const L2&) {}
     };
 }
 
@@ -470,8 +470,8 @@ namespace canny
             return (uchar)(-(e >> 1));
         }
 
-        __device__ __forceinline__ GetEdges() {}
-        __device__ __forceinline__ GetEdges(const GetEdges&) {}
+        __host__ __device__ __forceinline__ GetEdges() {}
+        __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
     };
 }
 
diff --git a/modules/gpu/src/cuda/ccomponetns.cu b/modules/gpu/src/cuda/ccomponetns.cu
index 7f3d4ae338..c4d79bd80b 100644
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace device
 
             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
             {
-                I d = a - b;
+                I d = saturate_cast<I>(a - b);
                 return lo.x <= d.x && d.x <= hi.x &&
                        lo.y <= d.y && d.y <= hi.y &&
                        lo.z <= d.z && d.z <= hi.z;
@@ -169,7 +169,7 @@ namespace cv { namespace gpu { namespace device
 
             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
             {
-                I d = a - b;
+                I d = saturate_cast<I>(a - b);
                 return lo.x <= d.x && d.x <= hi.x &&
                        lo.y <= d.y && d.y <= hi.y &&
                        lo.z <= d.z && d.z <= hi.z &&
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index e9397e534f..876d4ad3c4 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -162,8 +162,8 @@ namespace arithm
             return vadd4(a, b);
         }
 
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
+        __host__ __device__ __forceinline__ VAdd4() {}
+        __host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
     };
 
     ////////////////////////////////////
@@ -175,8 +175,8 @@ namespace arithm
             return vadd2(a, b);
         }
 
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
+        __host__ __device__ __forceinline__ VAdd2() {}
+        __host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
     };
 
     ////////////////////////////////////
@@ -188,8 +188,8 @@ namespace arithm
             return saturate_cast<D>(a + b);
         }
 
-        __device__ __forceinline__ AddMat() {}
-        __device__ __forceinline__ AddMat(const AddMat& other) {}
+        __host__ __device__ __forceinline__ AddMat() {}
+        __host__ __device__ __forceinline__ AddMat(const AddMat&) {}
     };
 }
 
@@ -397,8 +397,8 @@ namespace arithm
             return vsub4(a, b);
         }
 
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
+        __host__ __device__ __forceinline__ VSub4() {}
+        __host__ __device__ __forceinline__ VSub4(const VSub4&) {}
     };
 
     ////////////////////////////////////
@@ -410,8 +410,8 @@ namespace arithm
             return vsub2(a, b);
         }
 
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
+        __host__ __device__ __forceinline__ VSub2() {}
+        __host__ __device__ __forceinline__ VSub2(const VSub2&) {}
     };
 
     ////////////////////////////////////
@@ -423,8 +423,8 @@ namespace arithm
             return saturate_cast<D>(a - b);
         }
 
-        __device__ __forceinline__ SubMat() {}
-        __device__ __forceinline__ SubMat(const SubMat& other) {}
+        __host__ __device__ __forceinline__ SubMat() {}
+        __host__ __device__ __forceinline__ SubMat(const SubMat&) {}
     };
 }
 
@@ -617,8 +617,8 @@ namespace arithm
             return res;
         }
 
-        __device__ __forceinline__ Mul_8uc4_32f() {}
-        __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
+        __host__ __device__ __forceinline__ Mul_8uc4_32f() {}
+        __host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
     };
 
     struct Mul_16sc4_32f : binary_function<short4, float, short4>
@@ -629,8 +629,8 @@ namespace arithm
                                saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
         }
 
-        __device__ __forceinline__ Mul_16sc4_32f() {}
-        __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
+        __host__ __device__ __forceinline__ Mul_16sc4_32f() {}
+        __host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
     };
 
     template <typename T, typename D> struct Mul : binary_function<T, T, D>
@@ -640,8 +640,8 @@ namespace arithm
             return saturate_cast<D>(a * b);
         }
 
-        __device__ __forceinline__ Mul() {}
-        __device__ __forceinline__ Mul(const Mul& other) {}
+        __host__ __device__ __forceinline__ Mul() {}
+        __host__ __device__ __forceinline__ Mul(const Mul&) {}
     };
 
     template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
@@ -888,8 +888,8 @@ namespace arithm
             return b != 0 ? saturate_cast<D>(a / b) : 0;
         }
 
-        __device__ __forceinline__ Div() {}
-        __device__ __forceinline__ Div(const Div& other) {}
+        __host__ __device__ __forceinline__ Div() {}
+        __host__ __device__ __forceinline__ Div(const Div&) {}
     };
     template <typename T> struct Div<T, float> : binary_function<T, T, float>
     {
@@ -898,8 +898,8 @@ namespace arithm
             return b != 0 ? static_cast<float>(a) / b : 0;
         }
 
-        __device__ __forceinline__ Div() {}
-        __device__ __forceinline__ Div(const Div& other) {}
+        __host__ __device__ __forceinline__ Div() {}
+        __host__ __device__ __forceinline__ Div(const Div&) {}
     };
     template <typename T> struct Div<T, double> : binary_function<T, T, double>
     {
@@ -908,8 +908,8 @@ namespace arithm
             return b != 0 ? static_cast<double>(a) / b : 0;
         }
 
-        __device__ __forceinline__ Div() {}
-        __device__ __forceinline__ Div(const Div& other) {}
+        __host__ __device__ __forceinline__ Div() {}
+        __host__ __device__ __forceinline__ Div(const Div&) {}
     };
 
     template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
@@ -1196,8 +1196,8 @@ namespace arithm
             return vabsdiff4(a, b);
         }
 
-        __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
+        __host__ __device__ __forceinline__ VAbsDiff4() {}
+        __host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
     };
 
     ////////////////////////////////////
@@ -1209,8 +1209,8 @@ namespace arithm
             return vabsdiff2(a, b);
         }
 
-        __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
+        __host__ __device__ __forceinline__ VAbsDiff2() {}
+        __host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
     };
 
     ////////////////////////////////////
@@ -1235,8 +1235,8 @@ namespace arithm
             return saturate_cast<T>(_abs(a - b));
         }
 
-        __device__ __forceinline__ AbsDiffMat() {}
-        __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {}
+        __host__ __device__ __forceinline__ AbsDiffMat() {}
+        __host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
     };
 }
 
@@ -1370,8 +1370,8 @@ namespace arithm
             return saturate_cast<T>(x * x);
         }
 
-        __device__ __forceinline__ Sqr() {}
-        __device__ __forceinline__ Sqr(const Sqr& other) {}
+        __host__ __device__ __forceinline__ Sqr() {}
+        __host__ __device__ __forceinline__ Sqr(const Sqr&) {}
     };
 }
 
@@ -1466,8 +1466,8 @@ namespace arithm
             return saturate_cast<T>(f(x));
         }
 
-        __device__ __forceinline__ Exp() {}
-        __device__ __forceinline__ Exp(const Exp& other) {}
+        __host__ __device__ __forceinline__ Exp() {}
+        __host__ __device__ __forceinline__ Exp(const Exp&) {}
     };
 }
 
@@ -1507,8 +1507,8 @@ namespace arithm
             return vcmpeq4(a, b);
         }
 
-        __device__ __forceinline__ VCmpEq4() {}
-        __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
+        __host__ __device__ __forceinline__ VCmpEq4() {}
+        __host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
     };
     struct VCmpNe4 : binary_function<uint, uint, uint>
     {
@@ -1517,8 +1517,8 @@ namespace arithm
             return vcmpne4(a, b);
         }
 
-        __device__ __forceinline__ VCmpNe4() {}
-        __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
+        __host__ __device__ __forceinline__ VCmpNe4() {}
+        __host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
     };
     struct VCmpLt4 : binary_function<uint, uint, uint>
     {
@@ -1527,8 +1527,8 @@ namespace arithm
             return vcmplt4(a, b);
         }
 
-        __device__ __forceinline__ VCmpLt4() {}
-        __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
+        __host__ __device__ __forceinline__ VCmpLt4() {}
+        __host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
     };
     struct VCmpLe4 : binary_function<uint, uint, uint>
     {
@@ -1537,8 +1537,8 @@ namespace arithm
             return vcmple4(a, b);
         }
 
-        __device__ __forceinline__ VCmpLe4() {}
-        __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
+        __host__ __device__ __forceinline__ VCmpLe4() {}
+        __host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
     };
 
     ////////////////////////////////////
@@ -2008,8 +2008,8 @@ namespace arithm
             return vmin4(a, b);
         }
 
-        __device__ __forceinline__ VMin4() {}
-        __device__ __forceinline__ VMin4(const VMin4& other) {}
+        __host__ __device__ __forceinline__ VMin4() {}
+        __host__ __device__ __forceinline__ VMin4(const VMin4&) {}
     };
 
     ////////////////////////////////////
@@ -2021,8 +2021,8 @@ namespace arithm
             return vmin2(a, b);
         }
 
-        __device__ __forceinline__ VMin2() {}
-        __device__ __forceinline__ VMin2(const VMin2& other) {}
+        __host__ __device__ __forceinline__ VMin2() {}
+        __host__ __device__ __forceinline__ VMin2(const VMin2&) {}
     };
 }
 
@@ -2100,8 +2100,8 @@ namespace arithm
             return vmax4(a, b);
         }
 
-        __device__ __forceinline__ VMax4() {}
-        __device__ __forceinline__ VMax4(const VMax4& other) {}
+        __host__ __device__ __forceinline__ VMax4() {}
+        __host__ __device__ __forceinline__ VMax4(const VMax4&) {}
     };
 
     ////////////////////////////////////
@@ -2113,8 +2113,8 @@ namespace arithm
             return vmax2(a, b);
         }
 
-        __device__ __forceinline__ VMax2() {}
-        __device__ __forceinline__ VMax2(const VMax2& other) {}
+        __host__ __device__ __forceinline__ VMax2() {}
+        __host__ __device__ __forceinline__ VMax2(const VMax2&) {}
     };
 }
 
diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu
index faec89b95c..59eba26081 100644
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@@ -48,6 +48,7 @@
 #include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/emulation.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/dynamic_smem.hpp"
 
@@ -811,7 +812,7 @@ namespace cv { namespace gpu { namespace device
 
             const int ind = ::atomicAdd(r_sizes + n, 1);
             if (ind < maxSize)
-                r_table(n, ind) = p - templCenter;
+                r_table(n, ind) = saturate_cast<short2>(p - templCenter);
         }
 
         void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
@@ -855,7 +856,7 @@ namespace cv { namespace gpu { namespace device
 
             for (int j = 0; j < r_row_size; ++j)
             {
-                short2 c = p - r_row[j];
+                int2 c = p - r_row[j];
 
                 c.x = __float2int_rn(c.x * idp);
                 c.y = __float2int_rn(c.y * idp);
diff --git a/modules/gpu/src/error.cpp b/modules/gpu/src/error.cpp
index c155aa83bf..7f5d5f38d5 100644
--- a/modules/gpu/src/error.cpp
+++ b/modules/gpu/src/error.cpp
@@ -81,48 +81,90 @@ namespace
 
     const ErrorEntry npp_errors [] =
     {
-        error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
-        error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
-        error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),
-
 #if defined (_MSC_VER)
         error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ),
 #endif
 
+#if NPP_VERSION < 5500
         error_entry( NPP_BAD_ARG_ERROR ),
-        error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
-        error_entry( NPP_TEXTURE_BIND_ERROR ),
         error_entry( NPP_COEFF_ERROR ),
         error_entry( NPP_RECT_ERROR ),
         error_entry( NPP_QUAD_ERROR ),
+        error_entry( NPP_MEMFREE_ERR ),
+        error_entry( NPP_MEMSET_ERR ),
+        error_entry( NPP_MEM_ALLOC_ERR ),
+        error_entry( NPP_HISTO_NUMBER_OF_LEVELS_ERROR ),
+        error_entry( NPP_MIRROR_FLIP_ERR ),
+        error_entry( NPP_INVALID_INPUT ),
+        error_entry( NPP_POINTER_ERROR ),
+        error_entry( NPP_WARNING ),
+        error_entry( NPP_ODD_ROI_WARNING ),
+#else
+        error_entry( NPP_INVALID_HOST_POINTER_ERROR ),
+        error_entry( NPP_INVALID_DEVICE_POINTER_ERROR ),
+        error_entry( NPP_LUT_PALETTE_BITSIZE_ERROR ),
+        error_entry( NPP_ZC_MODE_NOT_SUPPORTED_ERROR ),
+        error_entry( NPP_MEMFREE_ERROR ),
+        error_entry( NPP_MEMSET_ERROR ),
+        error_entry( NPP_QUALITY_INDEX_ERROR ),
+        error_entry( NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR ),
+        error_entry( NPP_CHANNEL_ORDER_ERROR ),
+        error_entry( NPP_ZERO_MASK_VALUE_ERROR ),
+        error_entry( NPP_QUADRANGLE_ERROR ),
+        error_entry( NPP_RECTANGLE_ERROR ),
+        error_entry( NPP_COEFFICIENT_ERROR ),
+        error_entry( NPP_NUMBER_OF_CHANNELS_ERROR ),
+        error_entry( NPP_COI_ERROR ),
+        error_entry( NPP_DIVISOR_ERROR ),
+        error_entry( NPP_CHANNEL_ERROR ),
+        error_entry( NPP_STRIDE_ERROR ),
+        error_entry( NPP_ANCHOR_ERROR ),
+        error_entry( NPP_MASK_SIZE_ERROR ),
+        error_entry( NPP_MIRROR_FLIP_ERROR ),
+        error_entry( NPP_MOMENT_00_ZERO_ERROR ),
+        error_entry( NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR ),
+        error_entry( NPP_THRESHOLD_ERROR ),
+        error_entry( NPP_CONTEXT_MATCH_ERROR ),
+        error_entry( NPP_FFT_FLAG_ERROR ),
+        error_entry( NPP_FFT_ORDER_ERROR ),
+        error_entry( NPP_SCALE_RANGE_ERROR ),
+        error_entry( NPP_DATA_TYPE_ERROR ),
+        error_entry( NPP_OUT_OFF_RANGE_ERROR ),
+        error_entry( NPP_DIVIDE_BY_ZERO_ERROR ),
+        error_entry( NPP_MEMORY_ALLOCATION_ERR ),
+        error_entry( NPP_RANGE_ERROR ),
+        error_entry( NPP_BAD_ARGUMENT_ERROR ),
+        error_entry( NPP_NO_MEMORY_ERROR ),
+        error_entry( NPP_ERROR_RESERVED ),
+        error_entry( NPP_NO_OPERATION_WARNING ),
+        error_entry( NPP_DIVIDE_BY_ZERO_WARNING ),
+        error_entry( NPP_WRONG_INTERSECTION_ROI_WARNING ),
+#endif
+
+        error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
+        error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
+        error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),
+        error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
+        error_entry( NPP_TEXTURE_BIND_ERROR ),
         error_entry( NPP_WRONG_INTERSECTION_ROI_ERROR ),
         error_entry( NPP_NOT_EVEN_STEP_ERROR ),
         error_entry( NPP_INTERPOLATION_ERROR ),
         error_entry( NPP_RESIZE_FACTOR_ERROR ),
         error_entry( NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR ),
-        error_entry( NPP_MEMFREE_ERR ),
-        error_entry( NPP_MEMSET_ERR ),
         error_entry( NPP_MEMCPY_ERROR ),
-        error_entry( NPP_MEM_ALLOC_ERR ),
-        error_entry( NPP_HISTO_NUMBER_OF_LEVELS_ERROR ),
-        error_entry( NPP_MIRROR_FLIP_ERR ),
-        error_entry( NPP_INVALID_INPUT ),
         error_entry( NPP_ALIGNMENT_ERROR ),
         error_entry( NPP_STEP_ERROR ),
         error_entry( NPP_SIZE_ERROR ),
-        error_entry( NPP_POINTER_ERROR ),
         error_entry( NPP_NULL_POINTER_ERROR ),
         error_entry( NPP_CUDA_KERNEL_EXECUTION_ERROR ),
         error_entry( NPP_NOT_IMPLEMENTED_ERROR ),
         error_entry( NPP_ERROR ),
         error_entry( NPP_NO_ERROR ),
         error_entry( NPP_SUCCESS ),
-        error_entry( NPP_WARNING ),
         error_entry( NPP_WRONG_INTERSECTION_QUAD_WARNING ),
         error_entry( NPP_MISALIGNED_DST_ROI_WARNING ),
         error_entry( NPP_AFFINE_QUAD_INCORRECT_WARNING ),
-        error_entry( NPP_DOUBLE_SIZE_WARNING ),
-        error_entry( NPP_ODD_ROI_WARNING )
+        error_entry( NPP_DOUBLE_SIZE_WARNING )
     };
 
     const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);
diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp
index 761abb525f..056e5ef701 100644
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -187,10 +187,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
     CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
     CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
 
-    typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
-        NppiSize oSizeROI, Npp64f* pRetVal);
+#if CUDA_VERSION < 5050
+    typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);
 
-    static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+    static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+#else
+    typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
+        NppiSize oSizeROI, Npp64f* pRetVal, Npp8u * pDeviceBuffer);
+
+    typedef NppStatus (*buf_size_func_t)(NppiSize oSizeROI, int* hpBufferSize);
+
+    static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+
+    static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
+#endif
 
     NppiSize sz;
     sz.width  = src1.cols;
@@ -202,7 +212,16 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
 
     DeviceBuffer dbuf;
 
-    nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
+#if CUDA_VERSION < 5050
+    nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
+#else
+    int bufSize;
+    buf_size_funcs[funcIdx](sz, &bufSize);
+
+    GpuMat buf(1, bufSize, CV_8UC1);
+
+    nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
+#endif
 
     cudaSafeCall( cudaDeviceSynchronize() );
 
diff --git a/modules/gpu/src/nvidia/core/NCV.hpp b/modules/gpu/src/nvidia/core/NCV.hpp
index 0394dba186..80e1da7953 100644
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@@ -130,7 +130,7 @@ typedef                int Ncv32s;
 typedef       unsigned int Ncv32u;
 typedef              short Ncv16s;
 typedef     unsigned short Ncv16u;
-typedef               char Ncv8s;
+typedef        signed char Ncv8s;
 typedef      unsigned char Ncv8u;
 typedef              float Ncv32f;
 typedef             double Ncv64f;
diff --git a/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp b/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp
index ec2f16ebb7..c1e06b434e 100644
--- a/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp
+++ b/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp
@@ -51,7 +51,7 @@ template<typename TBase> inline __host__ __device__ TBase _pixMaxVal();
 template<> static inline __host__ __device__ Ncv8u  _pixMaxVal<Ncv8u>()  {return UCHAR_MAX;}
 template<> static inline __host__ __device__ Ncv16u _pixMaxVal<Ncv16u>() {return USHRT_MAX;}
 template<> static inline __host__ __device__ Ncv32u _pixMaxVal<Ncv32u>() {return  UINT_MAX;}
-template<> static inline __host__ __device__ Ncv8s  _pixMaxVal<Ncv8s>()  {return  CHAR_MAX;}
+template<> static inline __host__ __device__ Ncv8s  _pixMaxVal<Ncv8s>()  {return  SCHAR_MAX;}
 template<> static inline __host__ __device__ Ncv16s _pixMaxVal<Ncv16s>() {return  SHRT_MAX;}
 template<> static inline __host__ __device__ Ncv32s _pixMaxVal<Ncv32s>() {return   INT_MAX;}
 template<> static inline __host__ __device__ Ncv32f _pixMaxVal<Ncv32f>() {return   FLT_MAX;}
@@ -61,7 +61,7 @@ template<typename TBase> inline __host__ __device__ TBase _pixMinVal();
 template<> static inline __host__ __device__ Ncv8u  _pixMinVal<Ncv8u>()  {return 0;}
 template<> static inline __host__ __device__ Ncv16u _pixMinVal<Ncv16u>() {return 0;}
 template<> static inline __host__ __device__ Ncv32u _pixMinVal<Ncv32u>() {return 0;}
-template<> static inline __host__ __device__ Ncv8s  _pixMinVal<Ncv8s>()  {return CHAR_MIN;}
+template<> static inline __host__ __device__ Ncv8s  _pixMinVal<Ncv8s>()  {return SCHAR_MIN;}
 template<> static inline __host__ __device__ Ncv16s _pixMinVal<Ncv16s>() {return SHRT_MIN;}
 template<> static inline __host__ __device__ Ncv32s _pixMinVal<Ncv32s>() {return INT_MIN;}
 template<> static inline __host__ __device__ Ncv32f _pixMinVal<Ncv32f>() {return FLT_MIN;}
diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp
index f219089321..06d5386405 100644
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -116,11 +116,13 @@
     #define CUDART_MINIMUM_REQUIRED_VERSION 4010
     #define NPP_MINIMUM_REQUIRED_VERSION 4100
 
+    #define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)
+
     #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
         #error "Insufficient Cuda Runtime library version, please update it."
     #endif
 
-    #if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
+    #if (NPP_VERSION < NPP_MINIMUM_REQUIRED_VERSION)
         #error "Insufficient NPP version, please update it."
     #endif
 
diff --git a/modules/gpu/test/test_core.cpp b/modules/gpu/test/test_core.cpp
index 1bc952c7a1..b622ad8ea9 100644
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -352,7 +352,7 @@ GPU_TEST_P(Add_Scalar, WithOutMask)
         cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
         cv::add(mat, val, dst_gold, cv::noArray(), depth.second);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
     }
 }
 
@@ -383,7 +383,7 @@ GPU_TEST_P(Add_Scalar, WithMask)
         cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
         cv::add(mat, val, dst_gold, mask, depth.second);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
     }
 }
 
@@ -567,7 +567,7 @@ GPU_TEST_P(Subtract_Scalar, WithOutMask)
         cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
         cv::subtract(mat, val, dst_gold, cv::noArray(), depth.second);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
     }
 }
 
@@ -598,7 +598,7 @@ GPU_TEST_P(Subtract_Scalar, WithMask)
         cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
         cv::subtract(mat, val, dst_gold, mask, depth.second);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
     }
 }
 
@@ -2148,7 +2148,7 @@ GPU_TEST_P(Min, Scalar)
 
         cv::Mat dst_gold = cv::min(src, val);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
     }
 }
 
@@ -2231,7 +2231,7 @@ GPU_TEST_P(Max, Scalar)
 
         cv::Mat dst_gold = cv::max(src, val);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
     }
 }
 
diff --git a/modules/gpu/test/test_optflow.cpp b/modules/gpu/test/test_optflow.cpp
index 9e30b92087..53b93a096b 100644
--- a/modules/gpu/test/test_optflow.cpp
+++ b/modules/gpu/test/test_optflow.cpp
@@ -102,8 +102,8 @@ GPU_TEST_P(BroxOpticalFlow, Regression)
     for (int i = 0; i < v_gold.rows; ++i)
         f.read(v_gold.ptr<char>(i), v_gold.cols * sizeof(float));
 
-    EXPECT_MAT_NEAR(u_gold, u, 0);
-    EXPECT_MAT_NEAR(v_gold, v, 0);
+    EXPECT_MAT_SIMILAR(u_gold, u, 1e-3);
+    EXPECT_MAT_SIMILAR(v_gold, v, 1e-3);
 #else
     std::ofstream f(fname.c_str(), std::ios_base::binary);
 
diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index 4c60867af3..05ab99a78c 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -95,14 +95,10 @@ elseif(HAVE_QT)
   endif()
   include(${QT_USE_FILE})
 
-  if(QT_INCLUDE_DIR)
-    ocv_include_directories(${QT_INCLUDE_DIR})
-  endif()
-
   QT4_ADD_RESOURCES(_RCC_OUTFILES src/window_QT.qrc)
   QT4_WRAP_CPP(_MOC_OUTFILES src/window_QT.h)
 
-  list(APPEND HIGHGUI_LIBRARIES ${QT_LIBRARIES} ${QT_QTTEST_LIBRARY})
+  list(APPEND HIGHGUI_LIBRARIES ${QT_LIBRARIES})
   list(APPEND highgui_srcs src/window_QT.cpp ${_MOC_OUTFILES} ${_RCC_OUTFILES})
   ocv_check_flag_support(CXX -Wno-missing-declarations _have_flag)
   if(${_have_flag})
@@ -183,7 +179,11 @@ if(HAVE_XIMEA)
   if(XIMEA_LIBRARY_DIR)
     link_directories(${XIMEA_LIBRARY_DIR})
   endif()
-  list(APPEND HIGHGUI_LIBRARIES m3api)
+  if(CMAKE_CL_64)
+    list(APPEND HIGHGUI_LIBRARIES m3apiX64)
+  else()
+    list(APPEND HIGHGUI_LIBRARIES m3api)
+  endif()
 endif(HAVE_XIMEA)
 
 if(HAVE_FFMPEG)
diff --git a/modules/highgui/include/opencv2/highgui/cap_ios.h b/modules/highgui/include/opencv2/highgui/cap_ios.h
index 5bd5fe3c67..db3928f13b 100644
--- a/modules/highgui/include/opencv2/highgui/cap_ios.h
+++ b/modules/highgui/include/opencv2/highgui/cap_ios.h
@@ -1,6 +1,4 @@
-/*
- *  cap_ios.h
- *  For iOS video I/O
+/*  For iOS video I/O
  *  by Eduard Feicho on 29/07/12
  *  Copyright 2012. All rights reserved.
  *
@@ -90,6 +88,12 @@
 - (void)createVideoPreviewLayer;
 - (void)updateOrientation;
 
+- (void)lockFocus;
+- (void)unlockFocus;
+- (void)lockExposure;
+- (void)unlockExposure;
+- (void)lockBalance;
+- (void)unlockBalance;
 
 @end
 
@@ -116,6 +120,7 @@
     BOOL grayscaleMode;
 
     BOOL recordVideo;
+    BOOL rotateVideo;
     AVAssetWriterInput* recordAssetWriterInput;
     AVAssetWriterInputPixelBufferAdaptor* recordPixelBufferAdaptor;
     AVAssetWriter* recordAssetWriter;
@@ -128,6 +133,7 @@
 @property (nonatomic, assign) BOOL grayscaleMode;
 
 @property (nonatomic, assign) BOOL recordVideo;
+@property (nonatomic, assign) BOOL rotateVideo;
 @property (nonatomic, retain) AVAssetWriterInput* recordAssetWriterInput;
 @property (nonatomic, retain) AVAssetWriterInputPixelBufferAdaptor* recordPixelBufferAdaptor;
 @property (nonatomic, retain) AVAssetWriter* recordAssetWriter;
diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h
index 12be9867a2..9204ee81f4 100644
--- a/modules/highgui/include/opencv2/highgui/highgui_c.h
+++ b/modules/highgui/include/opencv2/highgui/highgui_c.h
@@ -558,9 +558,11 @@ CVAPI(int)    cvGetCaptureDomain( CvCapture* capture);
 /* "black box" video file writer structure */
 typedef struct CvVideoWriter CvVideoWriter;
 
+#define CV_FOURCC_MACRO(c1, c2, c3, c4) (((c1) & 255) + (((c2) & 255) << 8) + (((c3) & 255) << 16) + (((c4) & 255) << 24))
+
 CV_INLINE int CV_FOURCC(char c1, char c2, char c3, char c4)
 {
-    return (c1 & 255) + ((c2 & 255) << 8) + ((c3 & 255) << 16) + ((c4 & 255) << 24);
+    return CV_FOURCC_MACRO(c1, c2, c3, c4);
 }
 
 #define CV_FOURCC_PROMPT -1  /* Open Codec Selection Dialog (Windows only) */
diff --git a/modules/highgui/perf/perf_input.cpp b/modules/highgui/perf/perf_input.cpp
index 0c1e8e0a73..414c85365f 100644
--- a/modules/highgui/perf/perf_input.cpp
+++ b/modules/highgui/perf/perf_input.cpp
@@ -11,11 +11,21 @@ using std::tr1::get;
 
 typedef perf::TestBaseWithParam<String> VideoCapture_Reading;
 
+#if defined(HAVE_MSMF)
+// MPEG2 is not supported by Media Foundation yet
+// http://social.msdn.microsoft.com/Forums/en-US/mediafoundationdevelopment/thread/39a36231-8c01-40af-9af5-3c105d684429
+PERF_TEST_P(VideoCapture_Reading, ReadFile, testing::Values( "highgui/video/big_buck_bunny.avi",
+                                               "highgui/video/big_buck_bunny.mov",
+                                               "highgui/video/big_buck_bunny.mp4",
+                                               "highgui/video/big_buck_bunny.wmv" ) )
+
+#else
 PERF_TEST_P(VideoCapture_Reading, ReadFile, testing::Values( "highgui/video/big_buck_bunny.avi",
                                                "highgui/video/big_buck_bunny.mov",
                                                "highgui/video/big_buck_bunny.mp4",
                                                "highgui/video/big_buck_bunny.mpg",
                                                "highgui/video/big_buck_bunny.wmv" ) )
+#endif
 {
   string filename = getDataPath(GetParam());
 
diff --git a/modules/highgui/perf/perf_output.cpp b/modules/highgui/perf/perf_output.cpp
index 6428bb4f03..2adfe89655 100644
--- a/modules/highgui/perf/perf_output.cpp
+++ b/modules/highgui/perf/perf_output.cpp
@@ -22,10 +22,16 @@ PERF_TEST_P(VideoWriter_Writing, WriteFrame,
 {
   string filename = getDataPath(get<0>(GetParam()));
   bool isColor = get<1>(GetParam());
+  Mat image = imread(filename, 1);
+#if defined(HAVE_MSMF) && !defined(HAVE_VFW) && !defined(HAVE_FFMPEG) // VFW has greater priority
+  VideoWriter writer(cv::tempfile(".wmv"), CV_FOURCC('W', 'M', 'V', '3'),
+                            25, cv::Size(image.cols, image.rows), isColor);
+#else
+  VideoWriter writer(cv::tempfile(".avi"), CV_FOURCC('X', 'V', 'I', 'D'),
+                            25, cv::Size(image.cols, image.rows), isColor);
+#endif
 
-  VideoWriter writer(cv::tempfile(".avi"), CV_FOURCC('X', 'V', 'I', 'D'), 25, cv::Size(640, 480), isColor);
-
-  TEST_CYCLE() { Mat image = imread(filename, 1); writer << image; }
+  TEST_CYCLE() { image = imread(filename, 1); writer << image; }
 
   bool dummy = writer.isOpened();
   SANITY_CHECK(dummy);
diff --git a/modules/highgui/perf/perf_precomp.hpp b/modules/highgui/perf/perf_precomp.hpp
index 529187d3b2..d6b28b6d23 100644
--- a/modules/highgui/perf/perf_precomp.hpp
+++ b/modules/highgui/perf/perf_precomp.hpp
@@ -21,6 +21,7 @@
     defined(HAVE_QUICKTIME)    || \
     defined(HAVE_AVFOUNDATION) || \
     defined(HAVE_FFMPEG)       || \
+    defined(HAVE_MSMF)         || \
     defined(HAVE_VFW)
     /*defined(HAVE_OPENNI) too specialized */ \
 
@@ -34,6 +35,7 @@
     defined(HAVE_QUICKTIME)    || \
     defined(HAVE_AVFOUNDATION) || \
     defined(HAVE_FFMPEG)       || \
+    defined(HAVE_MSMF)         || \
     defined(HAVE_VFW)
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 1
 #else
diff --git a/modules/highgui/src/cap.cpp b/modules/highgui/src/cap.cpp
index 2c3b3a94c3..cc92da3d0c 100644
--- a/modules/highgui/src/cap.cpp
+++ b/modules/highgui/src/cap.cpp
@@ -117,6 +117,9 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
 #ifdef HAVE_DSHOW
         CV_CAP_DSHOW,
 #endif
+#ifdef HAVE_MSMF
+        CV_CAP_MSMF,
+#endif
 #if 1
         CV_CAP_IEEE1394,   // identical to CV_CAP_DC1394
 #endif
@@ -196,13 +199,6 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
 
         switch (domains[i])
         {
-#ifdef HAVE_MSMF
-        case CV_CAP_MSMF:
-             capture = cvCreateCameraCapture_MSMF (index);
-             if (capture)
-                 return capture;
-            break;
-#endif
 #ifdef HAVE_DSHOW
         case CV_CAP_DSHOW:
              capture = cvCreateCameraCapture_DShow (index);
@@ -210,7 +206,13 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
                  return capture;
             break;
 #endif
-
+#ifdef HAVE_MSMF
+        case CV_CAP_MSMF:
+             capture = cvCreateCameraCapture_MSMF (index);
+             if (capture)
+                 return capture;
+            break;
+#endif
 #ifdef HAVE_TYZX
         case CV_CAP_STEREO:
             capture = cvCreateCameraCapture_TYZX (index);
@@ -218,14 +220,12 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
                 return capture;
             break;
 #endif
-
-        case CV_CAP_VFW:
 #ifdef HAVE_VFW
+        case CV_CAP_VFW:
             capture = cvCreateCameraCapture_VFW (index);
             if (capture)
                 return capture;
 #endif
-
 #if defined HAVE_LIBV4L || defined HAVE_CAMV4L || defined HAVE_CAMV4L2 || defined HAVE_VIDEOIO
             capture = cvCreateCameraCapture_V4L (index);
             if (capture)
@@ -358,6 +358,16 @@ CV_IMPL CvCapture * cvCreateFileCapture (const char * filename)
     if (! result)
         result = cvCreateFileCapture_FFMPEG_proxy (filename);
 
+#ifdef HAVE_VFW
+    if (! result)
+        result = cvCreateFileCapture_VFW (filename);
+#endif
+
+#ifdef HAVE_MSMF
+    if (! result)
+        result = cvCreateFileCapture_MSMF (filename);
+#endif
+
 #ifdef HAVE_XINE
     if (! result)
         result = cvCreateFileCapture_XINE (filename);
@@ -406,6 +416,16 @@ CV_IMPL CvVideoWriter* cvCreateVideoWriter( const char* filename, int fourcc,
     if(!result)
         result = cvCreateVideoWriter_FFMPEG_proxy (filename, fourcc, fps, frameSize, is_color);
 
+#ifdef HAVE_VFW
+    if(!result)
+        result = cvCreateVideoWriter_VFW(filename, fourcc, fps, frameSize, is_color);
+#endif
+
+#ifdef HAVE_MSMF
+    if (!result)
+        result = cvCreateVideoWriter_MSMF(filename, fourcc, fps, frameSize, is_color);
+#endif
+
 /*  #ifdef HAVE_XINE
     if(!result)
         result = cvCreateVideoWriter_XINE(filename, fourcc, fps, frameSize, is_color);
diff --git a/modules/highgui/src/cap_dc1394_v2.cpp b/modules/highgui/src/cap_dc1394_v2.cpp
index f197337cd1..0d5f898186 100644
--- a/modules/highgui/src/cap_dc1394_v2.cpp
+++ b/modules/highgui/src/cap_dc1394_v2.cpp
@@ -45,7 +45,16 @@
 
 #include <unistd.h>
 #include <stdint.h>
-#include <sys/select.h>
+#ifdef WIN32
+  // On Windows, we have no sys/select.h, but we need to pick up
+  // select() which is in winsock2.
+  #ifndef __SYS_SELECT_H__
+    #define __SYS_SELECT_H__ 1
+    #include <winsock2.h>
+  #endif
+#else
+  #include <sys/select.h>
+#endif /*WIN32*/
 #include <dc1394/dc1394.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/modules/highgui/src/cap_dshow.cpp b/modules/highgui/src/cap_dshow.cpp
index 21fb947b15..b7cfbd94b3 100644
--- a/modules/highgui/src/cap_dshow.cpp
+++ b/modules/highgui/src/cap_dshow.cpp
@@ -3195,8 +3195,10 @@ IplImage* CvCaptureCAM_DShow::retrieveFrame(int)
         frame = cvCreateImage( cvSize(w,h), 8, 3 );
     }
 
-    VI.getPixels( index, (uchar*)frame->imageData, false, true );
-    return frame;
+    if (VI.getPixels( index, (uchar*)frame->imageData, false, true ))
+        return frame;
+    else
+        return NULL;
 }
 
 double CvCaptureCAM_DShow::getProperty( int property_id )
diff --git a/modules/highgui/src/cap_ffmpeg.cpp b/modules/highgui/src/cap_ffmpeg.cpp
index 669ebda125..bf73c0810f 100644
--- a/modules/highgui/src/cap_ffmpeg.cpp
+++ b/modules/highgui/src/cap_ffmpeg.cpp
@@ -209,11 +209,7 @@ CvCapture* cvCreateFileCapture_FFMPEG_proxy(const char * filename)
     if( result->open( filename ))
         return result;
     delete result;
-#ifdef HAVE_VFW
-    return cvCreateFileCapture_VFW(filename);
-#else
     return 0;
-#endif
 }
 
 class CvVideoWriter_FFMPEG_proxy : 
@@ -263,9 +259,5 @@ CvVideoWriter* cvCreateVideoWriter_FFMPEG_proxy( const char* filename, int fourc
     if( result->open( filename, fourcc, fps, frameSize, isColor != 0 ))
         return result;
     delete result;
-#ifdef HAVE_VFW
-     return cvCreateVideoWriter_VFW(filename, fourcc, fps, frameSize, isColor);
- #else
     return 0;
-#endif
 }
diff --git a/modules/highgui/src/cap_ios_abstract_camera.mm b/modules/highgui/src/cap_ios_abstract_camera.mm
index b6a7d944fa..38e1c12e68 100644
--- a/modules/highgui/src/cap_ios_abstract_camera.mm
+++ b/modules/highgui/src/cap_ios_abstract_camera.mm
@@ -2,6 +2,7 @@
  *  cap_ios_abstract_camera.mm
  *  For iOS video I/O
  *  by Eduard Feicho on 29/07/12
+ *  by Alexander Shishkov on 17/07/13
  *  Copyright 2012. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -405,4 +406,89 @@
     }
 }
 
+- (void)lockFocus;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isFocusModeSupported:AVCaptureFocusModeLocked]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.focusMode = AVCaptureFocusModeLocked;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for locked focus configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void) unlockFocus;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isFocusModeSupported:AVCaptureFocusModeContinuousAutoFocus]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.focusMode = AVCaptureFocusModeContinuousAutoFocus;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for autofocus configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void)lockExposure;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isExposureModeSupported:AVCaptureExposureModeLocked]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.exposureMode = AVCaptureExposureModeLocked;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for locked exposure configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void) unlockExposure;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isExposureModeSupported:AVCaptureExposureModeContinuousAutoExposure]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.exposureMode = AVCaptureExposureModeContinuousAutoExposure;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for autoexposure configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void)lockBalance;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isWhiteBalanceModeSupported:AVCaptureWhiteBalanceModeLocked]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.whiteBalanceMode = AVCaptureWhiteBalanceModeLocked;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for locked white balance configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void) unlockBalance;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isWhiteBalanceModeSupported:AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.whiteBalanceMode = AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for auto white balance configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
 @end
+
diff --git a/modules/highgui/src/cap_ios_video_camera.mm b/modules/highgui/src/cap_ios_video_camera.mm
index 1f9ea14bf8..ac85f79ee5 100644
--- a/modules/highgui/src/cap_ios_video_camera.mm
+++ b/modules/highgui/src/cap_ios_video_camera.mm
@@ -2,6 +2,7 @@
  *  cap_ios_video_camera.mm
  *  For iOS video I/O
  *  by Eduard Feicho on 29/07/12
+ *  by Alexander Shishkov on 17/07/13
  *  Copyright 2012. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,7 +31,6 @@
 
 #import "opencv2/highgui/cap_ios.h"
 #include "precomp.hpp"
-
 #import <AssetsLibrary/AssetsLibrary.h>
 
 
@@ -70,6 +70,7 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
 @synthesize videoDataOutput;
 
 @synthesize recordVideo;
+@synthesize rotateVideo;
 //@synthesize videoFileOutput;
 @synthesize recordAssetWriterInput;
 @synthesize recordPixelBufferAdaptor;
@@ -85,6 +86,7 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
     if (self) {
         self.useAVCaptureVideoPreviewLayer = NO;
         self.recordVideo = NO;
+        self.rotateVideo = NO;
     }
     return self;
 }
@@ -269,13 +271,8 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
 
 }
 
-
-
-
 #pragma mark - Private Interface
 
-
-
 - (void)createVideoDataOutput;
 {
     // Make a video data output
@@ -389,6 +386,38 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
     [self.parentView.layer addSublayer:self.customPreviewLayer];
 }
 
+- (CVPixelBufferRef) pixelBufferFromCGImage: (CGImageRef) image
+{
+    
+    CGSize frameSize = CGSizeMake(CGImageGetWidth(image), CGImageGetHeight(image));
+    NSDictionary *options = [NSDictionary dictionaryWithObjectsAndKeys:
+                             [NSNumber numberWithBool:NO], kCVPixelBufferCGImageCompatibilityKey,
+                             [NSNumber numberWithBool:NO], kCVPixelBufferCGBitmapContextCompatibilityKey,
+                             nil];
+    CVPixelBufferRef pxbuffer = NULL;
+    CVReturn status = CVPixelBufferCreate(kCFAllocatorDefault, frameSize.width,
+                                          frameSize.height,  kCVPixelFormatType_32ARGB, (CFDictionaryRef) CFBridgingRetain(options),
+                                          &pxbuffer);
+    NSParameterAssert(status == kCVReturnSuccess && pxbuffer != NULL);
+    
+    CVPixelBufferLockBaseAddress(pxbuffer, 0);
+    void *pxdata = CVPixelBufferGetBaseAddress(pxbuffer);
+    
+    
+    CGColorSpaceRef rgbColorSpace = CGColorSpaceCreateDeviceRGB();
+    CGContextRef context = CGBitmapContextCreate(pxdata, frameSize.width,
+                                                 frameSize.height, 8, 4*frameSize.width, rgbColorSpace,
+                                                 kCGImageAlphaPremultipliedFirst);
+    
+    CGContextDrawImage(context, CGRectMake(0, 0, CGImageGetWidth(image),
+                                           CGImageGetHeight(image)), image);
+    CGColorSpaceRelease(rgbColorSpace);
+    CGContextRelease(context);
+    
+    CVPixelBufferUnlockBaseAddress(pxbuffer, 0);
+    
+    return pxbuffer;
+}
 
 #pragma mark - Protocol AVCaptureVideoDataOutputSampleBufferDelegate
 
@@ -522,7 +551,8 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
             }
 
             if (self.recordAssetWriterInput.readyForMoreMediaData) {
-                if (! [self.recordPixelBufferAdaptor appendPixelBuffer:imageBuffer
+                CVImageBufferRef pixelBuffer = [self pixelBufferFromCGImage:dstImage];
+                if (! [self.recordPixelBufferAdaptor appendPixelBuffer:pixelBuffer
                                                   withPresentationTime:lastSampleTime] ) {
                     NSLog(@"Video Writing Error");
                 }
@@ -543,9 +573,12 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
 
 - (void)updateOrientation;
 {
-    NSLog(@"rotate..");
-    self.customPreviewLayer.bounds = CGRectMake(0, 0, self.parentView.frame.size.width, self.parentView.frame.size.height);
-    [self layoutPreviewLayer];
+    if (self.rotateVideo == YES)
+    {
+        NSLog(@"rotate..");
+        self.customPreviewLayer.bounds = CGRectMake(0, 0, self.parentView.frame.size.width, self.parentView.frame.size.height);
+        [self layoutPreviewLayer];
+    }
 }
 
 
@@ -583,3 +616,4 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
 }
 
 @end
+
diff --git a/modules/highgui/src/cap_msmf.cpp b/modules/highgui/src/cap_msmf.cpp
index 52b780463a..09f65b7e87 100644
--- a/modules/highgui/src/cap_msmf.cpp
+++ b/modules/highgui/src/cap_msmf.cpp
@@ -53,7 +53,8 @@
 #include <Mfapi.h>
 #include <mfplay.h>
 #include <mfobjects.h>
-#include "Strsafe.h"
+#include <strsafe.h>
+#include <Mfreadwrite.h>
 #include <new>
 #include <map>
 #include <vector>
@@ -61,18 +62,27 @@
 #include <stdio.h>
 #include <stdarg.h>
 #include <string.h>
+
 #pragma warning(disable:4503)
 #pragma comment(lib, "mfplat")
 #pragma comment(lib, "mf")
 #pragma comment(lib, "mfuuid")
 #pragma comment(lib, "Strmiids")
+#pragma comment(lib, "Mfreadwrite")
 #pragma comment(lib, "MinCore_Downlevel")
+
+// for ComPtr usage
+#include <wrl/client.h>
+using namespace Microsoft::WRL;
+
 struct IMFMediaType;
 struct IMFActivate;
 struct IMFMediaSource;
 struct IMFAttributes;
+
 namespace
 {
+
 template <class T> void SafeRelease(T **ppT)
 {
     if (*ppT)
@@ -81,7 +91,8 @@ template <class T> void SafeRelease(T **ppT)
         *ppT = NULL;
     }
 }
- /// Class for printing info into consol
+
+/// Class for printing info into consol
 class DebugPrintOut
 {
 public:
@@ -93,6 +104,7 @@ public:
 private:
     DebugPrintOut(void);
 };
+
 // Structure for collecting info about types of video, which are supported by current video device
 struct MediaType
 {
@@ -101,14 +113,14 @@ struct MediaType
     unsigned int width;
     unsigned int MF_MT_YUV_MATRIX;
     unsigned int MF_MT_VIDEO_LIGHTING;
-    unsigned int MF_MT_DEFAULT_STRIDE;
+    int MF_MT_DEFAULT_STRIDE; // stride is negative if image is bottom-up
     unsigned int MF_MT_VIDEO_CHROMA_SITING;
     GUID MF_MT_AM_FORMAT_TYPE;
     wchar_t *pMF_MT_AM_FORMAT_TYPEName;
     unsigned int MF_MT_FIXED_SIZE_SAMPLES;
     unsigned int MF_MT_VIDEO_NOMINAL_RANGE;
-    unsigned int MF_MT_FRAME_RATE;
-    unsigned int MF_MT_FRAME_RATE_low;
+    unsigned int MF_MT_FRAME_RATE_NUMERATOR;
+    unsigned int MF_MT_FRAME_RATE_DENOMINATOR;
     unsigned int MF_MT_PIXEL_ASPECT_RATIO;
     unsigned int MF_MT_PIXEL_ASPECT_RATIO_low;
     unsigned int MF_MT_ALL_SAMPLES_INDEPENDENT;
@@ -127,6 +139,7 @@ struct MediaType
     ~MediaType();
     void Clear();
 };
+
 /// Class for parsing info from IMFMediaType into the local MediaType
 class FormatReader
 {
@@ -136,9 +149,10 @@ public:
 private:
     FormatReader(void);
 };
+
 DWORD WINAPI MainThreadFunction( LPVOID lpParam );
 typedef void(*emergensyStopEventCallback)(int, void *);
-typedef unsigned char BYTE;
+
 class RawImage
 {
 public:
@@ -156,6 +170,7 @@ private:
     unsigned char *ri_pixels;
     RawImage(unsigned int size);
 };
+
 // Class for grabbing image from video stream
 class ImageGrabber : public IMFSampleGrabberSinkCallback
 {
@@ -163,13 +178,21 @@ public:
     ~ImageGrabber(void);
     HRESULT initImageGrabber(IMFMediaSource *pSource, GUID VideoFormat);
     HRESULT startGrabbing(void);
+    void pauseGrabbing();
+    void resumeGrabbing();
     void stopGrabbing();
     RawImage *getRawImage();
     // Function of creation of the instance of the class
-    static HRESULT CreateInstance(ImageGrabber **ppIG,unsigned int deviceID);
+    static HRESULT CreateInstance(ImageGrabber **ppIG, unsigned int deviceID, bool synchronous = false);
+
+    const HANDLE ig_hFrameReady;
+    const HANDLE ig_hFrameGrabbed;
+    const HANDLE ig_hFinish;
+
 private:
     bool ig_RIE;
     bool ig_Close;
+    bool ig_Synchronous;
     long m_cRef;
     unsigned int ig_DeviceID;
     IMFMediaSource *ig_pSource;
@@ -178,19 +201,11 @@ private:
     RawImage *ig_RIFirst;
     RawImage *ig_RISecond;
     RawImage *ig_RIOut;
-    ImageGrabber(unsigned int deviceID);
+    ImageGrabber(unsigned int deviceID, bool synchronous);
     HRESULT CreateTopology(IMFMediaSource *pSource, IMFActivate *pSinkActivate, IMFTopology **ppTopo);
-    HRESULT AddSourceNode(
-    IMFTopology *pTopology,
-    IMFMediaSource *pSource,
-    IMFPresentationDescriptor *pPD,
-    IMFStreamDescriptor *pSD,
-    IMFTopologyNode **ppNode);
-    HRESULT AddOutputNode(
-    IMFTopology *pTopology,
-    IMFActivate *pActivate,
-    DWORD dwId,
-    IMFTopologyNode **ppNode);
+    HRESULT AddSourceNode(IMFTopology *pTopology, IMFMediaSource *pSource,
+        IMFPresentationDescriptor *pPD, IMFStreamDescriptor *pSD, IMFTopologyNode **ppNode);
+    HRESULT AddOutputNode(IMFTopology *pTopology, IMFActivate *pActivate, DWORD dwId, IMFTopologyNode **ppNode);
     // IUnknown methods
     STDMETHODIMP QueryInterface(REFIID iid, void** ppv);
     STDMETHODIMP_(ULONG) AddRef();
@@ -208,13 +223,14 @@ private:
         DWORD dwSampleSize);
     STDMETHODIMP OnShutdown();
 };
+
 /// Class for controlling of thread of the grabbing raw data from video device
 class ImageGrabberThread
 {
     friend DWORD WINAPI MainThreadFunction( LPVOID lpParam );
 public:
     ~ImageGrabberThread(void);
-    static HRESULT CreateInstance(ImageGrabberThread **ppIGT, IMFMediaSource *pSource, unsigned int deviceID);
+    static HRESULT CreateInstance(ImageGrabberThread **ppIGT, IMFMediaSource *pSource, unsigned int deviceID, bool synchronious = false);
     void start();
     void stop();
     void setEmergencyStopEvent(void *userData, void(*func)(int, void *));
@@ -222,7 +238,7 @@ public:
 protected:
     virtual void run();
 private:
-    ImageGrabberThread(IMFMediaSource *pSource, unsigned int deviceID);
+    ImageGrabberThread(IMFMediaSource *pSource, unsigned int deviceID, bool synchronious);
     HANDLE igt_Handle;
     DWORD   igt_ThreadIdArray;
     ImageGrabber *igt_pImageGrabber;
@@ -231,6 +247,7 @@ private:
     bool igt_stop;
     unsigned int igt_DeviceID;
 };
+
 // Structure for collecting info about one parametr of current video device
 struct Parametr
 {
@@ -242,6 +259,7 @@ struct Parametr
     long Flag;
     Parametr();
 };
+
 // Structure for collecting info about 17 parametrs of current video device
 struct CamParametrs
 {
@@ -263,11 +281,13 @@ struct CamParametrs
         Parametr Iris;
         Parametr Focus;
 };
+
 typedef std::wstring String;
 typedef std::vector<int> vectorNum;
 typedef std::map<String, vectorNum> SUBTYPEMap;
 typedef std::map<UINT64, SUBTYPEMap> FrameRateMap;
 typedef void(*emergensyStopEventCallback)(int, void *);
+
 /// Class for controlling of video device
 class videoDevice
 {
@@ -311,7 +331,7 @@ private:
     IMFMediaSource *vd_pSource;
     emergensyStopEventCallback vd_func;
     void *vd_userData;
-    long enumerateCaptureFormats(IMFMediaSource *pSource);
+    HRESULT enumerateCaptureFormats(IMFMediaSource *pSource);
     long setDeviceFormat(IMFMediaSource *pSource, unsigned long dwFormatIndex);
     void buildLibraryofTypes();
     int findType(unsigned int size, unsigned int frameRate = 0);
@@ -319,6 +339,7 @@ private:
     long initDevice();
     long checkDevice(IMFAttributes *pAttributes, IMFActivate **pDevice);
 };
+
 /// Class for managing of list of video devices
 class videoDevices
 {
@@ -334,6 +355,7 @@ private:
     std::vector<videoDevice *> vds_Devices;
     videoDevices(void);
 };
+
 // Class for creating of Media Foundation context
 class Media_Foundation
 {
@@ -344,6 +366,7 @@ public:
 private:
     Media_Foundation(void);
 };
+
 /// The only visiable class for controlling of video devices in format singelton
 class videoInput
 {
@@ -393,23 +416,27 @@ public:
     bool isFrameNew(int deviceID);
     // Writing of Raw Data pixels from video device with deviceID with correction of RedAndBlue flipping flipRedAndBlue and vertical flipping flipImage
     bool getPixels(int deviceID, unsigned char * pixels, bool flipRedAndBlue = false, bool flipImage = false);
+    static void processPixels(unsigned char * src, unsigned char * dst, unsigned int width, unsigned int height, unsigned int bpp, bool bRGB, bool bFlip);
 private:
     bool accessToDevices;
     videoInput(void);
-    void processPixels(unsigned char * src, unsigned char * dst, unsigned int width, unsigned int height, unsigned int bpp, bool bRGB, bool bFlip);
     void updateListOfDevices();
 };
+
 DebugPrintOut::DebugPrintOut(void):verbose(true)
 {
 }
+
 DebugPrintOut::~DebugPrintOut(void)
 {
 }
+
 DebugPrintOut& DebugPrintOut::getInstance()
 {
     static DebugPrintOut instance;
     return instance;
 }
+
 void DebugPrintOut::printOut(const wchar_t *format, ...)
 {
     if(verbose)
@@ -430,14 +457,17 @@ void DebugPrintOut::printOut(const wchar_t *format, ...)
         va_end (args);
     }
 }
+
 void DebugPrintOut::setVerbose(bool state)
 {
     verbose = state;
 }
+
 LPCWSTR GetGUIDNameConstNew(const GUID& guid);
 HRESULT GetGUIDNameNew(const GUID& guid, WCHAR **ppwsz);
 HRESULT LogAttributeValueByIndexNew(IMFAttributes *pAttr, DWORD index);
 HRESULT SpecialCaseAttributeValueNew(GUID guid, const PROPVARIANT& var, MediaType &out);
+
 unsigned int *GetParametr(GUID guid, MediaType &out)
 {
     if(guid == MF_MT_YUV_MATRIX)
@@ -445,7 +475,7 @@ unsigned int *GetParametr(GUID guid, MediaType &out)
     if(guid == MF_MT_VIDEO_LIGHTING)
         return &(out.MF_MT_VIDEO_LIGHTING);
     if(guid == MF_MT_DEFAULT_STRIDE)
-        return &(out.MF_MT_DEFAULT_STRIDE);
+        return (unsigned int*)&(out.MF_MT_DEFAULT_STRIDE);
     if(guid == MF_MT_VIDEO_CHROMA_SITING)
         return &(out.MF_MT_VIDEO_CHROMA_SITING);
     if(guid == MF_MT_VIDEO_NOMINAL_RANGE)
@@ -462,6 +492,7 @@ unsigned int *GetParametr(GUID guid, MediaType &out)
         return &(out.MF_MT_INTERLACE_MODE);
     return NULL;
 }
+
 HRESULT LogAttributeValueByIndexNew(IMFAttributes *pAttr, DWORD index, MediaType &out)
 {
     WCHAR *pGuidName = NULL;
@@ -548,6 +579,7 @@ done:
     PropVariantClear(&var);
     return hr;
 }
+
 HRESULT GetGUIDNameNew(const GUID& guid, WCHAR **ppwsz)
 {
     HRESULT hr = S_OK;
@@ -589,14 +621,17 @@ done:
     }
     return hr;
 }
+
 void LogUINT32AsUINT64New(const PROPVARIANT& var, UINT32 &uHigh, UINT32 &uLow)
 {
     Unpack2UINT32AsUINT64(var.uhVal.QuadPart, &uHigh, &uLow);
 }
+
 float OffsetToFloatNew(const MFOffset& offset)
 {
     return offset.value + (static_cast<float>(offset.fract) / 65536.0f);
 }
+
 HRESULT LogVideoAreaNew(const PROPVARIANT& var)
 {
     if (var.caub.cElems < sizeof(MFVideoArea))
@@ -605,8 +640,13 @@ HRESULT LogVideoAreaNew(const PROPVARIANT& var)
     }
     return S_OK;
 }
+
 HRESULT SpecialCaseAttributeValueNew(GUID guid, const PROPVARIANT& var, MediaType &out)
 {
+    if (guid == MF_MT_DEFAULT_STRIDE)
+    {
+        out.MF_MT_DEFAULT_STRIDE = var.intVal;
+    } else
     if (guid == MF_MT_FRAME_SIZE)
     {
         UINT32 uHigh = 0, uLow = 0;
@@ -620,8 +660,8 @@ HRESULT SpecialCaseAttributeValueNew(GUID guid, const PROPVARIANT& var, MediaTyp
     {
         UINT32 uHigh = 0, uLow = 0;
         LogUINT32AsUINT64New(var, uHigh, uLow);
-        out.MF_MT_FRAME_RATE = uHigh;
-        out.MF_MT_FRAME_RATE_low = uLow;
+        out.MF_MT_FRAME_RATE_NUMERATOR = uHigh;
+        out.MF_MT_FRAME_RATE_DENOMINATOR = uLow;
     }
     else
     if (guid == MF_MT_FRAME_RATE_RANGE_MAX)
@@ -653,9 +693,11 @@ HRESULT SpecialCaseAttributeValueNew(GUID guid, const PROPVARIANT& var, MediaTyp
     }
     return S_OK;
 }
+
 #ifndef IF_EQUAL_RETURN
 #define IF_EQUAL_RETURN(param, val) if(val == param) return L#val
 #endif
+
 LPCWSTR GetGUIDNameConstNew(const GUID& guid)
 {
     IF_EQUAL_RETURN(guid, MF_MT_MAJOR_TYPE);
@@ -800,9 +842,11 @@ LPCWSTR GetGUIDNameConstNew(const GUID& guid)
     IF_EQUAL_RETURN(guid, MFAudioFormat_ADTS); //             WAVE_FORMAT_MPEG_ADTS_AAC
     return NULL;
 }
+
 FormatReader::FormatReader(void)
 {
 }
+
 MediaType FormatReader::Read(IMFMediaType *pType)
 {
     UINT32 count = 0;
@@ -833,32 +877,57 @@ MediaType FormatReader::Read(IMFMediaType *pType)
     }
     return out;
 }
+
 FormatReader::~FormatReader(void)
 {
 }
+
 #define CHECK_HR(x) if (FAILED(x)) { goto done; }
-ImageGrabber::ImageGrabber(unsigned int deviceID): m_cRef(1), ig_DeviceID(deviceID), ig_pSource(NULL), ig_pSession(NULL), ig_pTopology(NULL), ig_RIE(true), ig_Close(false)
-{
-}
+
+ImageGrabber::ImageGrabber(unsigned int deviceID, bool synchronous):
+    m_cRef(1),
+    ig_DeviceID(deviceID),
+    ig_pSource(NULL),
+    ig_pSession(NULL),
+    ig_pTopology(NULL),
+    ig_RIE(true),
+    ig_Close(false),
+    ig_Synchronous(synchronous),
+    ig_hFrameReady(synchronous ? CreateEvent(NULL, FALSE, FALSE, NULL): 0),
+    ig_hFrameGrabbed(synchronous ? CreateEvent(NULL, FALSE, TRUE, NULL): 0),
+    ig_hFinish(CreateEvent(NULL, TRUE, FALSE, NULL))
+{}
+
 ImageGrabber::~ImageGrabber(void)
 {
     if (ig_pSession)
     {
         ig_pSession->Shutdown();
     }
-    //SafeRelease(&ig_pSession);
-    //SafeRelease(&ig_pTopology);
+
+    CloseHandle(ig_hFinish);
+
+    if (ig_Synchronous)
+    {
+        CloseHandle(ig_hFrameReady);
+        CloseHandle(ig_hFrameGrabbed);
+    }
+
+    SafeRelease(&ig_pSession);
+    SafeRelease(&ig_pTopology);
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
-    DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Destroing instance of the ImageGrabber class \n", ig_DeviceID);
+
+    DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Destroing instance of the ImageGrabber class\n", ig_DeviceID);
 }
+
 HRESULT ImageGrabber::initImageGrabber(IMFMediaSource *pSource, GUID VideoFormat)
 {
-    IMFActivate *pSinkActivate = NULL;
-    IMFMediaType *pType = NULL;
-    IMFPresentationDescriptor *pPD = NULL;
-    IMFStreamDescriptor *pSD = NULL;
-    IMFMediaTypeHandler *pHandler = NULL;
-    IMFMediaType *pCurrentType = NULL;
+    ComPtr<IMFActivate> pSinkActivate = NULL;
+    ComPtr<IMFMediaType> pType = NULL;
+    ComPtr<IMFPresentationDescriptor> pPD = NULL;
+    ComPtr<IMFStreamDescriptor> pSD = NULL;
+    ComPtr<IMFMediaTypeHandler> pHandler = NULL;
+    ComPtr<IMFMediaType> pCurrentType = NULL;
     HRESULT hr = S_OK;
     MediaType MT;
      // Clean up.
@@ -871,30 +940,32 @@ HRESULT ImageGrabber::initImageGrabber(IMFMediaSource *pSource, GUID VideoFormat
     ig_pSource = pSource;
     hr = pSource->CreatePresentationDescriptor(&pPD);
     if (FAILED(hr))
+    {
         goto err;
+    }
     BOOL fSelected;
     hr = pPD->GetStreamDescriptorByIndex(0, &fSelected, &pSD);
-    if (FAILED(hr))
+    if (FAILED(hr)) {
         goto err;
+    }
     hr = pSD->GetMediaTypeHandler(&pHandler);
-    if (FAILED(hr))
+    if (FAILED(hr)) {
         goto err;
+    }
     DWORD cTypes = 0;
     hr = pHandler->GetMediaTypeCount(&cTypes);
-    if (FAILED(hr))
+    if (FAILED(hr)) {
         goto err;
+    }
     if(cTypes > 0)
     {
         hr = pHandler->GetCurrentMediaType(&pCurrentType);
-        if (FAILED(hr))
+        if (FAILED(hr)) {
             goto err;
-        MT = FormatReader::Read(pCurrentType);
+        }
+        MT = FormatReader::Read(pCurrentType.Get());
     }
 err:
-    SafeRelease(&pPD);
-    SafeRelease(&pSD);
-    SafeRelease(&pHandler);
-    SafeRelease(&pCurrentType);
     unsigned int sizeRawImage = 0;
     if(VideoFormat == MFVideoFormat_RGB24)
     {
@@ -910,17 +981,17 @@ err:
     // Configure the media type that the Sample Grabber will receive.
     // Setting the major and subtype is usually enough for the topology loader
     // to resolve the topology.
-    CHECK_HR(hr = MFCreateMediaType(&pType));
+    CHECK_HR(hr = MFCreateMediaType(pType.GetAddressOf()));
     CHECK_HR(hr = pType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video));
     CHECK_HR(hr = pType->SetGUID(MF_MT_SUBTYPE, VideoFormat));
     // Create the sample grabber sink.
-    CHECK_HR(hr = MFCreateSampleGrabberSinkActivate(pType, this, &pSinkActivate));
+    CHECK_HR(hr = MFCreateSampleGrabberSinkActivate(pType.Get(), this, pSinkActivate.GetAddressOf()));
     // To run as fast as possible, set this attribute (requires Windows 7):
     CHECK_HR(hr = pSinkActivate->SetUINT32(MF_SAMPLEGRABBERSINK_IGNORE_CLOCK, TRUE));
     // Create the Media Session.
     CHECK_HR(hr = MFCreateMediaSession(NULL, &ig_pSession));
     // Create the topology.
-    CHECK_HR(hr = CreateTopology(pSource, pSinkActivate, &ig_pTopology));
+    CHECK_HR(hr = CreateTopology(pSource, pSinkActivate.Get(), &ig_pTopology));
 done:
     // Clean up.
     if (FAILED(hr))
@@ -932,10 +1003,10 @@ done:
         SafeRelease(&ig_pSession);
         SafeRelease(&ig_pTopology);
     }
-    SafeRelease(&pSinkActivate);
-    SafeRelease(&pType);
+
     return hr;
 }
+
 void ImageGrabber::stopGrabbing()
 {
     if(ig_pSession)
@@ -943,16 +1014,17 @@ void ImageGrabber::stopGrabbing()
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
     DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Stopping of of grabbing of images\n", ig_DeviceID);
 }
+
 HRESULT ImageGrabber::startGrabbing(void)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
-    IMFMediaEvent *pEvent = NULL;
+    ComPtr<IMFMediaEvent> pEvent = NULL;
     PROPVARIANT var;
     PropVariantInit(&var);
     HRESULT hr = S_OK;
-    CHECK_HR(hr = ig_pSession->SetTopology(0, ig_pTopology));
-    CHECK_HR(hr = ig_pSession->Start(&GUID_NULL, &var));
+    hr = ig_pSession->SetTopology(0, ig_pTopology);
     DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Start Grabbing of the images\n", ig_DeviceID);
+    hr = ig_pSession->Start(&GUID_NULL, &var);
     for(;;)
     {
         HRESULT hrStatus = S_OK;
@@ -992,27 +1064,41 @@ HRESULT ImageGrabber::startGrabbing(void)
             DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: MEVideoCaptureDeviceRemoved \n", ig_DeviceID);
             break;
         }
-        SafeRelease(&pEvent);
+        if ((met == MEError) || (met == MENonFatalError))
+        {
+            pEvent->GetStatus(&hrStatus);
+            DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: MEError | MENonFatalError: %u\n", ig_DeviceID, hrStatus);
+            break;
+        }
     }
     DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Finish startGrabbing \n", ig_DeviceID);
+
 done:
-    SafeRelease(&pEvent);
-    SafeRelease(&ig_pSession);
-    SafeRelease(&ig_pTopology);
+    SetEvent(ig_hFinish);
+
     return hr;
 }
+
+void ImageGrabber::pauseGrabbing()
+{
+}
+
+void ImageGrabber::resumeGrabbing()
+{
+}
+
 HRESULT ImageGrabber::CreateTopology(IMFMediaSource *pSource, IMFActivate *pSinkActivate, IMFTopology **ppTopo)
 {
-    IMFTopology *pTopology = NULL;
-    IMFPresentationDescriptor *pPD = NULL;
-    IMFStreamDescriptor *pSD = NULL;
-    IMFMediaTypeHandler *pHandler = NULL;
-    IMFTopologyNode *pNode1 = NULL;
-    IMFTopologyNode *pNode2 = NULL;
+    IMFTopology* pTopology = NULL;
+    ComPtr<IMFPresentationDescriptor> pPD = NULL;
+    ComPtr<IMFStreamDescriptor> pSD = NULL;
+    ComPtr<IMFMediaTypeHandler> pHandler = NULL;
+    ComPtr<IMFTopologyNode> pNode1 = NULL;
+    ComPtr<IMFTopologyNode> pNode2 = NULL;
     HRESULT hr = S_OK;
     DWORD cStreams = 0;
     CHECK_HR(hr = MFCreateTopology(&pTopology));
-    CHECK_HR(hr = pSource->CreatePresentationDescriptor(&pPD));
+    CHECK_HR(hr = pSource->CreatePresentationDescriptor(pPD.GetAddressOf()));
     CHECK_HR(hr = pPD->GetStreamDescriptorCount(&cStreams));
     for (DWORD i = 0; i < cStreams; i++)
     {
@@ -1024,29 +1110,23 @@ HRESULT ImageGrabber::CreateTopology(IMFMediaSource *pSource, IMFActivate *pSink
         CHECK_HR(hr = pHandler->GetMajorType(&majorType));
         if (majorType == MFMediaType_Video && fSelected)
         {
-            CHECK_HR(hr = AddSourceNode(pTopology, pSource, pPD, pSD, &pNode1));
-            CHECK_HR(hr = AddOutputNode(pTopology, pSinkActivate, 0, &pNode2));
-            CHECK_HR(hr = pNode1->ConnectOutput(0, pNode2, 0));
+            CHECK_HR(hr = AddSourceNode(pTopology, pSource, pPD.Get(), pSD.Get(), pNode1.GetAddressOf()));
+            CHECK_HR(hr = AddOutputNode(pTopology, pSinkActivate, 0, pNode2.GetAddressOf()));
+            CHECK_HR(hr = pNode1->ConnectOutput(0, pNode2.Get(), 0));
             break;
         }
         else
         {
             CHECK_HR(hr = pPD->DeselectStream(i));
         }
-        SafeRelease(&pSD);
-        SafeRelease(&pHandler);
     }
     *ppTopo = pTopology;
     (*ppTopo)->AddRef();
+
 done:
-    SafeRelease(&pTopology);
-    SafeRelease(&pNode1);
-    SafeRelease(&pNode2);
-    SafeRelease(&pPD);
-    SafeRelease(&pSD);
-    SafeRelease(&pHandler);
     return hr;
 }
+
 HRESULT ImageGrabber::AddSourceNode(
     IMFTopology *pTopology,           // Topology.
     IMFMediaSource *pSource,          // Media source.
@@ -1054,43 +1134,45 @@ HRESULT ImageGrabber::AddSourceNode(
     IMFStreamDescriptor *pSD,         // Stream descriptor.
     IMFTopologyNode **ppNode)         // Receives the node pointer.
 {
-    IMFTopologyNode *pNode = NULL;
+    ComPtr<IMFTopologyNode> pNode = NULL;
     HRESULT hr = S_OK;
-    CHECK_HR(hr = MFCreateTopologyNode(MF_TOPOLOGY_SOURCESTREAM_NODE, &pNode));
+    CHECK_HR(hr = MFCreateTopologyNode(MF_TOPOLOGY_SOURCESTREAM_NODE, pNode.GetAddressOf()));
     CHECK_HR(hr = pNode->SetUnknown(MF_TOPONODE_SOURCE, pSource));
     CHECK_HR(hr = pNode->SetUnknown(MF_TOPONODE_PRESENTATION_DESCRIPTOR, pPD));
     CHECK_HR(hr = pNode->SetUnknown(MF_TOPONODE_STREAM_DESCRIPTOR, pSD));
-    CHECK_HR(hr = pTopology->AddNode(pNode));
+    CHECK_HR(hr = pTopology->AddNode(pNode.Get()));
     // Return the pointer to the caller.
-    *ppNode = pNode;
+    *ppNode = pNode.Get();
     (*ppNode)->AddRef();
+
 done:
-    SafeRelease(&pNode);
     return hr;
 }
+
 HRESULT ImageGrabber::AddOutputNode(
     IMFTopology *pTopology,     // Topology.
     IMFActivate *pActivate,     // Media sink activation object.
     DWORD dwId,                 // Identifier of the stream sink.
     IMFTopologyNode **ppNode)   // Receives the node pointer.
 {
-    IMFTopologyNode *pNode = NULL;
+    ComPtr<IMFTopologyNode> pNode = NULL;
     HRESULT hr = S_OK;
-    CHECK_HR(hr = MFCreateTopologyNode(MF_TOPOLOGY_OUTPUT_NODE, &pNode));
+    CHECK_HR(hr = MFCreateTopologyNode(MF_TOPOLOGY_OUTPUT_NODE, pNode.GetAddressOf()));
     CHECK_HR(hr = pNode->SetObject(pActivate));
     CHECK_HR(hr = pNode->SetUINT32(MF_TOPONODE_STREAMID, dwId));
     CHECK_HR(hr = pNode->SetUINT32(MF_TOPONODE_NOSHUTDOWN_ON_REMOVE, FALSE));
-    CHECK_HR(hr = pTopology->AddNode(pNode));
+    CHECK_HR(hr = pTopology->AddNode(pNode.Get()));
     // Return the pointer to the caller.
-    *ppNode = pNode;
+    *ppNode = pNode.Get();
     (*ppNode)->AddRef();
+
 done:
-    SafeRelease(&pNode);
     return hr;
 }
-HRESULT ImageGrabber::CreateInstance(ImageGrabber **ppIG, unsigned int deviceID)
+
+HRESULT ImageGrabber::CreateInstance(ImageGrabber **ppIG, unsigned int deviceID, bool synchronious)
 {
-    *ppIG = new (std::nothrow) ImageGrabber(deviceID);
+    *ppIG = new (std::nothrow) ImageGrabber(deviceID, synchronious);
     if (ppIG == NULL)
     {
         return E_OUTOFMEMORY;
@@ -1099,6 +1181,7 @@ HRESULT ImageGrabber::CreateInstance(ImageGrabber **ppIG, unsigned int deviceID)
     DPO->printOut(L"IMAGEGRABBER VIDEODEVICE %i: Creating instance of ImageGrabber\n", deviceID);
     return S_OK;
 }
+
 STDMETHODIMP ImageGrabber::QueryInterface(REFIID riid, void** ppv)
 {
     HRESULT hr = E_NOINTERFACE;
@@ -1119,10 +1202,12 @@ STDMETHODIMP ImageGrabber::QueryInterface(REFIID riid, void** ppv)
     }
     return hr;
 }
+
 STDMETHODIMP_(ULONG) ImageGrabber::AddRef()
 {
     return InterlockedIncrement(&m_cRef);
 }
+
 STDMETHODIMP_(ULONG) ImageGrabber::Release()
 {
     ULONG cRef = InterlockedDecrement(&m_cRef);
@@ -1132,38 +1217,45 @@ STDMETHODIMP_(ULONG) ImageGrabber::Release()
     }
     return cRef;
 }
+
 STDMETHODIMP ImageGrabber::OnClockStart(MFTIME hnsSystemTime, LONGLONG llClockStartOffset)
 {
     (void)hnsSystemTime;
     (void)llClockStartOffset;
     return S_OK;
 }
+
 STDMETHODIMP ImageGrabber::OnClockStop(MFTIME hnsSystemTime)
 {
     (void)hnsSystemTime;
     return S_OK;
 }
+
 STDMETHODIMP ImageGrabber::OnClockPause(MFTIME hnsSystemTime)
 {
     (void)hnsSystemTime;
     return S_OK;
 }
+
 STDMETHODIMP ImageGrabber::OnClockRestart(MFTIME hnsSystemTime)
 {
     (void)hnsSystemTime;
     return S_OK;
 }
+
 STDMETHODIMP ImageGrabber::OnClockSetRate(MFTIME hnsSystemTime, float flRate)
 {
     (void)flRate;
     (void)hnsSystemTime;
     return S_OK;
 }
+
 STDMETHODIMP ImageGrabber::OnSetPresentationClock(IMFPresentationClock* pClock)
 {
     (void)pClock;
     return S_OK;
 }
+
 STDMETHODIMP ImageGrabber::OnProcessSample(REFGUID guidMajorMediaType, DWORD dwSampleFlags,
     LONGLONG llSampleTime, LONGLONG llSampleDuration, const BYTE * pSampleBuffer,
     DWORD dwSampleSize)
@@ -1173,6 +1265,16 @@ STDMETHODIMP ImageGrabber::OnProcessSample(REFGUID guidMajorMediaType, DWORD dwS
     (void)dwSampleFlags;
     (void)llSampleDuration;
     (void)dwSampleSize;
+
+    HANDLE tmp[] = {ig_hFinish, ig_hFrameGrabbed, NULL};
+
+    DWORD status = WaitForMultipleObjects(2, tmp, FALSE, INFINITE);
+    if (status == WAIT_OBJECT_0)
+    {
+        printf("OnProcessFrame called after ig_hFinish event\n");
+        return S_OK;
+    }
+
     if(ig_RIE)
     {
         ig_RIFirst->fastCopy(pSampleBuffer);
@@ -1183,27 +1285,41 @@ STDMETHODIMP ImageGrabber::OnProcessSample(REFGUID guidMajorMediaType, DWORD dwS
         ig_RISecond->fastCopy(pSampleBuffer);
         ig_RIOut = ig_RISecond;
     }
-    ig_RIE = !ig_RIE;
+
+    if (ig_Synchronous)
+    {
+        SetEvent(ig_hFrameReady);
+    }
+    else
+    {
+        ig_RIE = !ig_RIE;
+    }
+
     return S_OK;
 }
+
 STDMETHODIMP ImageGrabber::OnShutdown()
 {
+    SetEvent(ig_hFinish);
     return S_OK;
 }
+
 RawImage *ImageGrabber::getRawImage()
 {
     return ig_RIOut;
 }
+
 DWORD WINAPI MainThreadFunction( LPVOID lpParam )
 {
     ImageGrabberThread *pIGT = (ImageGrabberThread *)lpParam;
     pIGT->run();
     return 0;
 }
-HRESULT ImageGrabberThread::CreateInstance(ImageGrabberThread **ppIGT, IMFMediaSource *pSource, unsigned int deviceID)
+
+HRESULT ImageGrabberThread::CreateInstance(ImageGrabberThread **ppIGT, IMFMediaSource *pSource, unsigned int deviceID, bool synchronious)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
-    *ppIGT = new (std::nothrow) ImageGrabberThread(pSource, deviceID);
+    *ppIGT = new (std::nothrow) ImageGrabberThread(pSource, deviceID, synchronious);
     if (ppIGT == NULL)
     {
         DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Memory cannot be allocated\n", deviceID);
@@ -1213,10 +1329,14 @@ HRESULT ImageGrabberThread::CreateInstance(ImageGrabberThread **ppIGT, IMFMediaS
         DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Creating of the instance of ImageGrabberThread\n", deviceID);
     return S_OK;
 }
-ImageGrabberThread::ImageGrabberThread(IMFMediaSource *pSource, unsigned int deviceID): igt_Handle(NULL), igt_stop(false)
+
+ImageGrabberThread::ImageGrabberThread(IMFMediaSource *pSource, unsigned int deviceID, bool synchronious):
+    igt_func(NULL),
+    igt_Handle(NULL),
+    igt_stop(false)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
-    HRESULT hr = ImageGrabber::CreateInstance(&igt_pImageGrabber, deviceID);
+    HRESULT hr = ImageGrabber::CreateInstance(&igt_pImageGrabber, deviceID, synchronious);
     igt_DeviceID = deviceID;
     if(SUCCEEDED(hr))
     {
@@ -1235,6 +1355,7 @@ ImageGrabberThread::ImageGrabberThread(IMFMediaSource *pSource, unsigned int dev
         DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i There is a problem with creation of the instance of the ImageGrabber class\n", deviceID);
     }
 }
+
 void ImageGrabberThread::setEmergencyStopEvent(void *userData, void(*func)(int, void *))
 {
     if(func)
@@ -1243,12 +1364,16 @@ void ImageGrabberThread::setEmergencyStopEvent(void *userData, void(*func)(int,
         igt_userData = userData;
     }
 }
+
 ImageGrabberThread::~ImageGrabberThread(void)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
     DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Destroing ImageGrabberThread\n", igt_DeviceID);
+    if (igt_Handle)
+        WaitForSingleObject(igt_Handle, INFINITE);
     delete igt_pImageGrabber;
 }
+
 void ImageGrabberThread::stop()
 {
     igt_stop = true;
@@ -1257,16 +1382,18 @@ void ImageGrabberThread::stop()
         igt_pImageGrabber->stopGrabbing();
     }
 }
+
 void ImageGrabberThread::start()
 {
     igt_Handle = CreateThread(
-            NULL,                   // default security attributes
-            0,                      // use default stack size
-            MainThreadFunction,       // thread function name
-            this,          // argument to thread function
-            0,                      // use default creation flags
+            NULL,                  // default security attributes
+            0,                     // use default stack size
+            MainThreadFunction,    // thread function name
+            this,                  // argument to thread function
+            0,                     // use default creation flags
             &igt_ThreadIdArray);   // returns the thread identifier
 }
+
 void ImageGrabberThread::run()
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -1294,10 +1421,12 @@ void ImageGrabberThread::run()
     else
         DPO->printOut(L"IMAGEGRABBERTHREAD VIDEODEVICE %i: Finish thread\n", igt_DeviceID);
 }
+
 ImageGrabber *ImageGrabberThread::getImageGrabber()
 {
     return igt_pImageGrabber;
 }
+
 Media_Foundation::Media_Foundation(void)
 {
     HRESULT hr = MFStartup(MF_VERSION);
@@ -1307,6 +1436,7 @@ Media_Foundation::Media_Foundation(void)
         DPO->printOut(L"MEDIA FOUNDATION: It cannot be created!!!\n");
     }
 }
+
 Media_Foundation::~Media_Foundation(void)
 {
     HRESULT hr = MFShutdown();
@@ -1316,12 +1446,13 @@ Media_Foundation::~Media_Foundation(void)
         DPO->printOut(L"MEDIA FOUNDATION: Resources cannot be released\n");
     }
 }
+
 bool Media_Foundation::buildListOfDevices()
 {
     HRESULT hr = S_OK;
-    IMFAttributes *pAttributes = NULL;
+    ComPtr<IMFAttributes> pAttributes = NULL;
     CoInitialize(NULL);
-    hr = MFCreateAttributes(&pAttributes, 1);
+    hr = MFCreateAttributes(pAttributes.GetAddressOf(), 1);
     if (SUCCEEDED(hr))
     {
         hr = pAttributes->SetGUID(
@@ -1332,40 +1463,46 @@ bool Media_Foundation::buildListOfDevices()
     if (SUCCEEDED(hr))
     {
         videoDevices *vDs = &videoDevices::getInstance();
-        hr = vDs->initDevices(pAttributes);
+        hr = vDs->initDevices(pAttributes.Get());
     }
     else
     {
        DebugPrintOut *DPO = &DebugPrintOut::getInstance();
        DPO->printOut(L"MEDIA FOUNDATION: The access to the video cameras denied\n");
     }
-    SafeRelease(&pAttributes);
+
     return (SUCCEEDED(hr));
 }
+
 Media_Foundation& Media_Foundation::getInstance()
 {
     static Media_Foundation instance;
     return instance;
 }
+
 RawImage::RawImage(unsigned int size): ri_new(false), ri_pixels(NULL)
 {
     ri_size = size;
     ri_pixels = new unsigned char[size];
     memset((void *)ri_pixels,0,ri_size);
 }
+
 bool RawImage::isNew()
 {
     return ri_new;
 }
+
 unsigned int RawImage::getSize()
 {
     return ri_size;
 }
+
 RawImage::~RawImage(void)
 {
     delete []ri_pixels;
     ri_pixels = NULL;
 }
+
 long RawImage::CreateInstance(RawImage **ppRImage,unsigned int size)
 {
     *ppRImage = new (std::nothrow) RawImage(size);
@@ -1375,25 +1512,30 @@ long RawImage::CreateInstance(RawImage **ppRImage,unsigned int size)
     }
     return S_OK;
 }
+
 void RawImage::setCopy(const BYTE * pSampleBuffer)
 {
     memcpy(ri_pixels, pSampleBuffer, ri_size);
     ri_new = true;
 }
+
 void RawImage::fastCopy(const BYTE * pSampleBuffer)
 {
     memcpy(ri_pixels, pSampleBuffer, ri_size);
     ri_new = true;
 }
+
 unsigned char * RawImage::getpPixels()
 {
     ri_new = false;
     return ri_pixels;
 }
+
 videoDevice::videoDevice(void): vd_IsSetuped(false), vd_LockOut(OpenLock), vd_pFriendlyName(NULL),
     vd_Width(0), vd_Height(0), vd_pSource(NULL), vd_func(NULL), vd_userData(NULL)
 {
 }
+
 void videoDevice::setParametrs(CamParametrs parametrs)
 {
     if(vd_IsSetuped)
@@ -1428,6 +1570,7 @@ void videoDevice::setParametrs(CamParametrs parametrs)
         }
     }
 }
+
 CamParametrs videoDevice::getParametrs()
 {
     CamParametrs out;
@@ -1472,6 +1615,7 @@ CamParametrs videoDevice::getParametrs()
     }
     return out;
 }
+
 long videoDevice::resetDevice(IMFActivate *pActivate)
 {
     HRESULT hr = -1;
@@ -1503,6 +1647,7 @@ long videoDevice::resetDevice(IMFActivate *pActivate)
     }
     return hr;
 }
+
 long videoDevice::readInfoOfDevice(IMFActivate *pActivate, unsigned int Num)
 {
     HRESULT hr = -1;
@@ -1510,6 +1655,7 @@ long videoDevice::readInfoOfDevice(IMFActivate *pActivate, unsigned int Num)
     hr = resetDevice(pActivate);
     return hr;
 }
+
 long videoDevice::checkDevice(IMFAttributes *pAttributes, IMFActivate **pDevice)
 {
     HRESULT hr = S_OK;
@@ -1568,14 +1714,15 @@ long videoDevice::checkDevice(IMFAttributes *pAttributes, IMFActivate **pDevice)
     }
     return hr;
 }
+
 long videoDevice::initDevice()
 {
     HRESULT hr = -1;
-    IMFAttributes *pAttributes = NULL;
-    IMFActivate * vd_pActivate= NULL;
+    ComPtr<IMFAttributes> pAttributes = NULL;
+    IMFActivate *vd_pActivate = NULL;
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
     CoInitialize(NULL);
-    hr = MFCreateAttributes(&pAttributes, 1);
+    hr = MFCreateAttributes(pAttributes.GetAddressOf(), 1);
     if (SUCCEEDED(hr))
     {
         hr = pAttributes->SetGUID(
@@ -1585,7 +1732,7 @@ long videoDevice::initDevice()
     }
     if (SUCCEEDED(hr))
     {
-        hr = checkDevice(pAttributes, &vd_pActivate);
+        hr = checkDevice(pAttributes.Get(), &vd_pActivate);
         if (SUCCEEDED(hr) && vd_pActivate)
         {
             SafeRelease(&vd_pSource);
@@ -1607,9 +1754,10 @@ long videoDevice::initDevice()
     {
         DPO->printOut(L"VIDEODEVICE %i: The attribute of video cameras cannot be getting \n", vd_CurrentNumber);
     }
-    SafeRelease(&pAttributes);
+
     return hr;
 }
+
 MediaType videoDevice::getFormat(unsigned int id)
 {
     if(id < vd_CurrentFormats.size())
@@ -1713,6 +1861,7 @@ int videoDevice::findType(unsigned int size, unsigned int frameRate)
         return 0;
     return VN[0];
 }
+
 void videoDevice::buildLibraryofTypes()
 {
     unsigned int size;
@@ -1722,7 +1871,7 @@ void videoDevice::buildLibraryofTypes()
     for(; i != vd_CurrentFormats.end(); i++)
     {
         size = (*i).MF_MT_FRAME_SIZE;
-        framerate = (*i).MF_MT_FRAME_RATE;
+        framerate = (*i).MF_MT_FRAME_RATE_NUMERATOR;
         FrameRateMap FRM = vd_CaptureFormats[size];
         SUBTYPEMap STM = FRM[framerate];
         String subType((*i).pMF_MT_SUBTYPEName);
@@ -1734,45 +1883,45 @@ void videoDevice::buildLibraryofTypes()
         count++;
     }
 }
+
 long videoDevice::setDeviceFormat(IMFMediaSource *pSource, unsigned long  dwFormatIndex)
 {
-    IMFPresentationDescriptor *pPD = NULL;
-    IMFStreamDescriptor *pSD = NULL;
-    IMFMediaTypeHandler *pHandler = NULL;
-    IMFMediaType *pType = NULL;
-    HRESULT hr = pSource->CreatePresentationDescriptor(&pPD);
+    ComPtr<IMFPresentationDescriptor> pPD = NULL;
+    ComPtr<IMFStreamDescriptor> pSD = NULL;
+    ComPtr<IMFMediaTypeHandler> pHandler = NULL;
+    ComPtr<IMFMediaType> pType = NULL;
+    HRESULT hr = pSource->CreatePresentationDescriptor(pPD.GetAddressOf());
     if (FAILED(hr))
     {
         goto done;
     }
     BOOL fSelected;
-    hr = pPD->GetStreamDescriptorByIndex(0, &fSelected, &pSD);
+    hr = pPD->GetStreamDescriptorByIndex(0, &fSelected, pSD.GetAddressOf());
     if (FAILED(hr))
     {
         goto done;
     }
-    hr = pSD->GetMediaTypeHandler(&pHandler);
+    hr = pSD->GetMediaTypeHandler(pHandler.GetAddressOf());
     if (FAILED(hr))
     {
         goto done;
     }
-    hr = pHandler->GetMediaTypeByIndex((DWORD)dwFormatIndex, &pType);
+    hr = pHandler->GetMediaTypeByIndex((DWORD)dwFormatIndex, pType.GetAddressOf());
     if (FAILED(hr))
     {
         goto done;
     }
-    hr = pHandler->SetCurrentMediaType(pType);
+    hr = pHandler->SetCurrentMediaType(pType.Get());
+
 done:
-    SafeRelease(&pPD);
-    SafeRelease(&pSD);
-    SafeRelease(&pHandler);
-    SafeRelease(&pType);
     return hr;
 }
+
 bool videoDevice::isDeviceSetup()
 {
     return vd_IsSetuped;
 }
+
 RawImage * videoDevice::getRawImageOut()
 {
     if(!vd_IsSetuped) return NULL;
@@ -1785,6 +1934,7 @@ RawImage * videoDevice::getRawImageOut()
     }
     return NULL;
 }
+
 bool videoDevice::isFrameNew()
 {
     if(!vd_IsSetuped) return false;
@@ -1809,16 +1959,19 @@ bool videoDevice::isFrameNew()
     }
     return false;
 }
+
 bool videoDevice::isDeviceMediaSource()
 {
     if(vd_LockOut == MediaSourceLock) return true;
     return false;
 }
+
 bool videoDevice::isDeviceRawDataSource()
 {
     if(vd_LockOut == RawDataLock) return true;
     return false;
 }
+
 bool videoDevice::setupDevice(unsigned int id)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -1849,15 +2002,18 @@ bool videoDevice::setupDevice(unsigned int id)
         return false;
     }
 }
+
 bool videoDevice::setupDevice(unsigned int w, unsigned int h, unsigned int idealFramerate)
 {
     unsigned int id = findType(w * h, idealFramerate);
     return setupDevice(id);
 }
+
 wchar_t *videoDevice::getName()
 {
     return vd_pFriendlyName;
 }
+
 videoDevice::~videoDevice(void)
 {
     closeDevice();
@@ -1865,24 +2021,25 @@ videoDevice::~videoDevice(void)
     if(vd_pFriendlyName)
         CoTaskMemFree(vd_pFriendlyName);
 }
-long videoDevice::enumerateCaptureFormats(IMFMediaSource *pSource)
+
+HRESULT videoDevice::enumerateCaptureFormats(IMFMediaSource *pSource)
 {
-    IMFPresentationDescriptor *pPD = NULL;
-    IMFStreamDescriptor *pSD = NULL;
-    IMFMediaTypeHandler *pHandler = NULL;
-    IMFMediaType *pType = NULL;
-    HRESULT hr = pSource->CreatePresentationDescriptor(&pPD);
+    ComPtr<IMFPresentationDescriptor> pPD = NULL;
+    ComPtr<IMFStreamDescriptor> pSD = NULL;
+    ComPtr<IMFMediaTypeHandler> pHandler = NULL;
+    ComPtr<IMFMediaType> pType = NULL;
+    HRESULT hr = pSource->CreatePresentationDescriptor(pPD.GetAddressOf());
     if (FAILED(hr))
     {
         goto done;
     }
     BOOL fSelected;
-    hr = pPD->GetStreamDescriptorByIndex(0, &fSelected, &pSD);
+    hr = pPD->GetStreamDescriptorByIndex(0, &fSelected, pSD.GetAddressOf());
     if (FAILED(hr))
     {
         goto done;
     }
-    hr = pSD->GetMediaTypeHandler(&pHandler);
+    hr = pSD->GetMediaTypeHandler(pHandler.GetAddressOf());
     if (FAILED(hr))
     {
         goto done;
@@ -1895,24 +2052,22 @@ long videoDevice::enumerateCaptureFormats(IMFMediaSource *pSource)
     }
     for (DWORD i = 0; i < cTypes; i++)
     {
-        hr = pHandler->GetMediaTypeByIndex(i, &pType);
+        hr = pHandler->GetMediaTypeByIndex(i, pType.GetAddressOf());
         if (FAILED(hr))
         {
             goto done;
         }
-        MediaType MT = FormatReader::Read(pType);
+        MediaType MT = FormatReader::Read(pType.Get());
         vd_CurrentFormats.push_back(MT);
-        SafeRelease(&pType);
     }
+
 done:
-    SafeRelease(&pPD);
-    SafeRelease(&pSD);
-    SafeRelease(&pHandler);
-    SafeRelease(&pType);
     return hr;
 }
+
 videoDevices::videoDevices(void): count(0)
 {}
+
 void videoDevices::clearDevices()
 {
     std::vector<videoDevice *>::iterator i = vds_Devices.begin();
@@ -1920,10 +2075,12 @@ void videoDevices::clearDevices()
         delete (*i);
     vds_Devices.clear();
 }
+
 videoDevices::~videoDevices(void)
 {
     clearDevices();
 }
+
 videoDevice * videoDevices::getDevice(unsigned int i)
 {
     if(i >= vds_Devices.size())
@@ -1936,6 +2093,7 @@ videoDevice * videoDevices::getDevice(unsigned int i)
     }
     return vds_Devices[i];
 }
+
 long videoDevices::initDevices(IMFAttributes *pAttributes)
 {
     HRESULT hr = S_OK;
@@ -1965,15 +2123,18 @@ long videoDevices::initDevices(IMFAttributes *pAttributes)
     }
     return hr;
 }
+
 size_t videoDevices::getCount()
 {
     return vds_Devices.size();
 }
+
 videoDevices& videoDevices::getInstance()
 {
     static videoDevices instance;
     return instance;
 }
+
 Parametr::Parametr()
 {
     CurrentValue = 0;
@@ -1983,6 +2144,7 @@ Parametr::Parametr()
     Default = 0;
     Flag = 0;
 }
+
 MediaType::MediaType()
 {
     pMF_MT_AM_FORMAT_TYPEName = NULL;
@@ -1990,10 +2152,12 @@ MediaType::MediaType()
     pMF_MT_SUBTYPEName = NULL;
     Clear();
 }
+
 MediaType::~MediaType()
 {
     Clear();
 }
+
 void MediaType::Clear()
 {
     MF_MT_FRAME_SIZE = 0;
@@ -2005,8 +2169,8 @@ void MediaType::Clear()
     MF_MT_VIDEO_CHROMA_SITING = 0;
     MF_MT_FIXED_SIZE_SAMPLES = 0;
     MF_MT_VIDEO_NOMINAL_RANGE = 0;
-    MF_MT_FRAME_RATE = 0;
-    MF_MT_FRAME_RATE_low = 0;
+    MF_MT_FRAME_RATE_NUMERATOR = 0;
+    MF_MT_FRAME_RATE_DENOMINATOR = 0;
     MF_MT_PIXEL_ASPECT_RATIO = 0;
     MF_MT_PIXEL_ASPECT_RATIO_low = 0;
     MF_MT_ALL_SAMPLES_INDEPENDENT = 0;
@@ -2021,6 +2185,7 @@ void MediaType::Clear()
     memset(&MF_MT_AM_FORMAT_TYPE, 0, sizeof(GUID));
     memset(&MF_MT_SUBTYPE, 0, sizeof(GUID));
 }
+
 videoInput::videoInput(void): accessToDevices(false)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2029,6 +2194,7 @@ videoInput::videoInput(void): accessToDevices(false)
     if(!accessToDevices)
         DPO->printOut(L"INITIALIZATION: Ther is not any suitable video device\n");
 }
+
 void videoInput::updateListOfDevices()
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2037,11 +2203,13 @@ void videoInput::updateListOfDevices()
     if(!accessToDevices)
         DPO->printOut(L"UPDATING: Ther is not any suitable video device\n");
 }
+
 videoInput::~videoInput(void)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
     DPO->printOut(L"\n***** CLOSE VIDEOINPUT LIBRARY - 2013 *****\n\n");
 }
+
 IMFMediaSource *videoInput::getMediaSource(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2063,6 +2231,7 @@ IMFMediaSource *videoInput::getMediaSource(int deviceID)
     }
     return NULL;
 }
+
 bool videoInput::setupDevice(int deviceID, unsigned int id)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2089,6 +2258,7 @@ bool videoInput::setupDevice(int deviceID, unsigned int id)
     }
     return false;
 }
+
 bool videoInput::setupDevice(int deviceID, unsigned int w, unsigned int h, unsigned int idealFramerate)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2115,6 +2285,7 @@ bool videoInput::setupDevice(int deviceID, unsigned int w, unsigned int h, unsig
     }
     return false;
 }
+
 MediaType videoInput::getFormat(int deviceID, unsigned int id)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2136,6 +2307,7 @@ MediaType videoInput::getFormat(int deviceID, unsigned int id)
     }
     return MediaType();
 }
+
 bool videoInput::isDeviceSetup(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2157,6 +2329,7 @@ bool videoInput::isDeviceSetup(int deviceID)
     }
     return false;
 }
+
 bool videoInput::isDeviceMediaSource(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2178,6 +2351,7 @@ bool videoInput::isDeviceMediaSource(int deviceID)
     }
     return false;
 }
+
 bool videoInput::isDeviceRawDataSource(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2202,6 +2376,7 @@ bool videoInput::isDeviceRawDataSource(int deviceID)
     }
     return false;
 }
+
 bool videoInput::isFrameNew(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2230,6 +2405,7 @@ bool videoInput::isFrameNew(int deviceID)
     }
     return false;
 }
+
 unsigned int videoInput::getCountFormats(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2251,12 +2427,14 @@ unsigned int videoInput::getCountFormats(int deviceID)
     }
     return 0;
 }
+
 void videoInput::closeAllDevices()
 {
     videoDevices *VDS = &videoDevices::getInstance();
     for(unsigned int i = 0; i < VDS->getCount(); i++)
         closeDevice(i);
 }
+
 void videoInput::setParametrs(int deviceID, CamParametrs parametrs)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2277,6 +2455,7 @@ void videoInput::setParametrs(int deviceID, CamParametrs parametrs)
         DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
     }
 }
+
 CamParametrs videoInput::getParametrs(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2299,6 +2478,7 @@ CamParametrs videoInput::getParametrs(int deviceID)
     }
     return out;
 }
+
 void videoInput::closeDevice(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2319,6 +2499,7 @@ void videoInput::closeDevice(int deviceID)
         DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
     }
 }
+
 unsigned int videoInput::getWidth(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2340,6 +2521,7 @@ unsigned int videoInput::getWidth(int deviceID)
     }
     return 0;
 }
+
 unsigned int videoInput::getHeight(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2361,6 +2543,7 @@ unsigned int videoInput::getHeight(int deviceID)
     }
     return 0;
 }
+
 wchar_t *videoInput::getNameVideoDevice(int deviceID)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2382,6 +2565,7 @@ wchar_t *videoInput::getNameVideoDevice(int deviceID)
     }
     return L"Empty";
 }
+
 unsigned int videoInput::listDevices(bool silent)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2405,20 +2589,24 @@ unsigned int videoInput::listDevices(bool silent)
     }
     return out;
 }
+
 videoInput& videoInput::getInstance()
 {
     static videoInput instance;
     return instance;
 }
+
 bool videoInput::isDevicesAcceable()
 {
     return accessToDevices;
 }
+
 void videoInput::setVerbose(bool state)
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
     DPO->setVerbose(state);
 }
+
 void videoInput::setEmergencyStopEvent(int deviceID, void *userData, void(*func)(int, void *))
 {
     DebugPrintOut *DPO = &DebugPrintOut::getInstance();
@@ -2442,6 +2630,7 @@ void videoInput::setEmergencyStopEvent(int deviceID, void *userData, void(*func)
         DPO->printOut(L"VIDEODEVICE(s): There is not any suitable video device\n");
     }
 }
+
 bool videoInput::getPixels(int deviceID, unsigned char * dstBuffer, bool flipRedAndBlue, bool flipImage)
 {
     bool success = false;
@@ -2491,6 +2680,7 @@ bool videoInput::getPixels(int deviceID, unsigned char * dstBuffer, bool flipRed
     }
     return success;
 }
+
 void videoInput::processPixels(unsigned char * src, unsigned char * dst, unsigned int width,
                                 unsigned int height, unsigned int bpp, bool bRGB, bool bFlip)
 {
@@ -2553,6 +2743,7 @@ void videoInput::processPixels(unsigned char * src, unsigned char * dst, unsigne
     }
 }
 }
+
 /******* Capturing video from camera via Microsoft Media Foundation **********/
 class CvCaptureCAM_MSMF : public CvCapture
 {
@@ -2568,33 +2759,35 @@ public:
     virtual int getCaptureDomain() { return CV_CAP_MSMF; } // Return the type of the capture object: CV_CAP_VFW, etc...
 protected:
     void init();
-    int index, width, height,fourcc;
-    int widthSet, heightSet;
+    int index, width, height, fourcc;
     IplImage* frame;
     videoInput VI;
 };
+
 struct SuppressVideoInputMessages
 {
     SuppressVideoInputMessages() { videoInput::setVerbose(true); }
 };
+
 static SuppressVideoInputMessages do_it;
+
 CvCaptureCAM_MSMF::CvCaptureCAM_MSMF():
     index(-1),
     width(-1),
     height(-1),
     fourcc(-1),
-    widthSet(-1),
-    heightSet(-1),
-    frame(0),
+    frame(NULL),
     VI(videoInput::getInstance())
 {
     CoInitialize(0);
 }
+
 CvCaptureCAM_MSMF::~CvCaptureCAM_MSMF()
 {
     close();
     CoUninitialize();
 }
+
 void CvCaptureCAM_MSMF::close()
 {
     if( index >= 0 )
@@ -2603,8 +2796,9 @@ void CvCaptureCAM_MSMF::close()
         index = -1;
         cvReleaseImage(&frame);
     }
-    widthSet = heightSet = width = height = -1;
+    width = height = -1;
 }
+
 // Initialize camera input
 bool CvCaptureCAM_MSMF::open( int _index )
 {
@@ -2621,10 +2815,14 @@ bool CvCaptureCAM_MSMF::open( int _index )
     index = try_index;
     return true;
 }
+
 bool CvCaptureCAM_MSMF::grabFrame()
 {
-    return true;
+    while (VI.isDeviceSetup(index) && !VI.isFrameNew(index))
+        Sleep(1);
+    return VI.isDeviceSetup(index);
 }
+
 IplImage* CvCaptureCAM_MSMF::retrieveFrame(int)
 {
     if( !frame || (int)VI.getWidth(index) != frame->width || (int)VI.getHeight(index) != frame->height )
@@ -2637,6 +2835,7 @@ IplImage* CvCaptureCAM_MSMF::retrieveFrame(int)
     VI.getPixels( index, (uchar*)frame->imageData, false, true );
     return frame;
 }
+
 double CvCaptureCAM_MSMF::getProperty( int property_id )
 {
     // image format proprrties
@@ -2646,50 +2845,7 @@ double CvCaptureCAM_MSMF::getProperty( int property_id )
         return VI.getWidth(index);
     case CV_CAP_PROP_FRAME_HEIGHT:
         return VI.getHeight(index);
-    case CV_CAP_PROP_FOURCC:
-        // FIXME: implement method in VideoInput back end
-        //return VI.getFourcc(index);
-        ;
-    case CV_CAP_PROP_FPS:
-        // FIXME: implement method in VideoInput back end
-        //return VI.getFPS(index);
-        ;
     }
-    // video filter properties
-    switch( property_id )
-    {
-    case CV_CAP_PROP_BRIGHTNESS:
-    case CV_CAP_PROP_CONTRAST:
-    case CV_CAP_PROP_HUE:
-    case CV_CAP_PROP_SATURATION:
-    case CV_CAP_PROP_SHARPNESS:
-    case CV_CAP_PROP_GAMMA:
-    case CV_CAP_PROP_MONOCROME:
-    case CV_CAP_PROP_WHITE_BALANCE_BLUE_U:
-    case CV_CAP_PROP_BACKLIGHT:
-    case CV_CAP_PROP_GAIN:
-        // FIXME: implement method in VideoInput back end
-        // if ( VI.getVideoSettingFilter(index, VI.getVideoPropertyFromCV(property_id), min_value,
-        //                               max_value, stepping_delta, current_value, flags,defaultValue) )
-        //     return (double)current_value;
-        return 0.;
-    }
-    // camera properties
-    switch( property_id )
-    {
-    case CV_CAP_PROP_PAN:
-    case CV_CAP_PROP_TILT:
-    case CV_CAP_PROP_ROLL:
-    case CV_CAP_PROP_ZOOM:
-    case CV_CAP_PROP_EXPOSURE:
-    case CV_CAP_PROP_IRIS:
-    case CV_CAP_PROP_FOCUS:
-    // FIXME: implement method in VideoInput back end
-    //     if (VI.getVideoSettingCamera(index,VI.getCameraPropertyFromCV(property_id),min_value,
-    //          max_value,stepping_delta,current_value,flags,defaultValue) ) return (double)current_value;
-        return 0.;
-    }
-    // unknown parameter or value not available
     return -1;
 }
 bool CvCaptureCAM_MSMF::setProperty( int property_id, double value )
@@ -2706,91 +2862,272 @@ bool CvCaptureCAM_MSMF::setProperty( int property_id, double value )
         height = cvRound(value);
         handled = true;
         break;
-    case CV_CAP_PROP_FOURCC:
-        fourcc = (int)(unsigned long)(value);
-        if ( fourcc == -1 ) {
-            // following cvCreateVideo usage will pop up caprturepindialog here if fourcc=-1
-            // TODO - how to create a capture pin dialog
-        }
-        handled = true;
-        break;
-    case CV_CAP_PROP_FPS:
-        // FIXME: implement method in VideoInput back end
-        // int fps = cvRound(value);
-        // if (fps != VI.getFPS(index))
-        // {
-        //     VI.stopDevice(index);
-        //     VI.setIdealFramerate(index,fps);
-        //     if (widthSet > 0 && heightSet > 0)
-        //         VI.setupDevice(index, widthSet, heightSet);
-        //     else
-        //         VI.setupDevice(index);
-        // }
-        // return VI.isDeviceSetup(index);
-        ;
     }
+
     if ( handled ) {
-        // a stream setting
         if( width > 0 && height > 0 )
         {
-            if( width != (int)VI.getWidth(index) || height != (int)VI.getHeight(index) )//|| fourcc != VI.getFourcc(index) )
+            if( width != (int)VI.getWidth(index) || height != (int)VI.getHeight(index)  && VI.isDeviceSetup(index))//|| fourcc != VI.getFourcc(index) )
             {
-                // FIXME: implement method in VideoInput back end
-                // int fps = static_cast<int>(VI.getFPS(index));
-                // VI.stopDevice(index);
-                // VI.setIdealFramerate(index, fps);
-                // VI.setupDeviceFourcc(index, width, height, fourcc);
+                VI.closeDevice(index);
+                VI.setupDevice(index, width, height);
             }
-            bool success = VI.isDeviceSetup(index);
-            if (success)
-            {
-                widthSet = width;
-                heightSet = height;
-                width = height = fourcc = -1;
-            }
-            return success;
+            return VI.isDeviceSetup(index);
         }
         return true;
     }
-    // show video/camera filter dialog
-    // FIXME: implement method in VideoInput back end
-    // if ( property_id == CV_CAP_PROP_SETTINGS ) {
-    //     VI.showSettingsWindow(index);
-    //     return true;
-    // }
-    //video Filter properties
-    switch( property_id )
-    {
-    case CV_CAP_PROP_BRIGHTNESS:
-    case CV_CAP_PROP_CONTRAST:
-    case CV_CAP_PROP_HUE:
-    case CV_CAP_PROP_SATURATION:
-    case CV_CAP_PROP_SHARPNESS:
-    case CV_CAP_PROP_GAMMA:
-    case CV_CAP_PROP_MONOCROME:
-    case CV_CAP_PROP_WHITE_BALANCE_BLUE_U:
-    case CV_CAP_PROP_BACKLIGHT:
-    case CV_CAP_PROP_GAIN:
-        // FIXME: implement method in VideoInput back end
-        //return VI.setVideoSettingFilter(index,VI.getVideoPropertyFromCV(property_id),(long)value);
-        ;
-    }
-    //camera properties
-    switch( property_id )
-    {
-    case CV_CAP_PROP_PAN:
-    case CV_CAP_PROP_TILT:
-    case CV_CAP_PROP_ROLL:
-    case CV_CAP_PROP_ZOOM:
-    case CV_CAP_PROP_EXPOSURE:
-    case CV_CAP_PROP_IRIS:
-    case CV_CAP_PROP_FOCUS:
-        // FIXME: implement method in VideoInput back end
-        //return VI.setVideoSettingCamera(index,VI.getCameraPropertyFromCV(property_id),(long)value);
-        ;
-    }
+
     return false;
 }
+
+class CvCaptureFile_MSMF : public CvCapture
+{
+public:
+    CvCaptureFile_MSMF();
+    virtual ~CvCaptureFile_MSMF();
+
+    virtual bool open( const char* filename );
+    virtual void close();
+
+    virtual double getProperty(int);
+    virtual bool setProperty(int, double);
+    virtual bool grabFrame();
+    virtual IplImage* retrieveFrame(int);
+    virtual int getCaptureDomain() { return CV_CAP_MSMF; }
+protected:
+    ImageGrabberThread* grabberThread;
+    IMFMediaSource* videoFileSource;
+    std::vector<MediaType> captureFormats;
+    int captureFormatIndex;
+    IplImage* frame;
+    bool isOpened;
+
+    HRESULT enumerateCaptureFormats(IMFMediaSource *pSource);
+    HRESULT getSourceDuration(IMFMediaSource *pSource, MFTIME *pDuration);
+};
+
+CvCaptureFile_MSMF::CvCaptureFile_MSMF():
+    grabberThread(NULL),
+    videoFileSource(NULL),
+    captureFormatIndex(0),
+    frame(NULL),
+    isOpened(false)
+{
+    MFStartup(MF_VERSION);
+}
+
+CvCaptureFile_MSMF::~CvCaptureFile_MSMF()
+{
+    close();
+    MFShutdown();
+}
+
+bool CvCaptureFile_MSMF::open(const char* filename)
+{
+    if (!filename)
+        return false;
+
+    wchar_t* unicodeFileName = new wchar_t[strlen(filename)+1];
+    MultiByteToWideChar(CP_ACP, 0, filename, -1, unicodeFileName, strlen(filename)+1);
+
+    HRESULT hr = S_OK;
+
+    MF_OBJECT_TYPE ObjectType = MF_OBJECT_INVALID;
+
+    ComPtr<IMFSourceResolver> pSourceResolver = NULL;
+    IUnknown* pUnkSource = NULL;
+
+    hr = MFCreateSourceResolver(pSourceResolver.GetAddressOf());
+
+    if (SUCCEEDED(hr))
+    {
+        hr = pSourceResolver->CreateObjectFromURL(
+            unicodeFileName,
+            MF_RESOLUTION_MEDIASOURCE,
+            NULL, // Optional property store.
+            &ObjectType,
+            &pUnkSource
+            );
+    }
+
+    // Get the IMFMediaSource from the IUnknown pointer.
+    if (SUCCEEDED(hr))
+    {
+        hr = pUnkSource->QueryInterface(IID_PPV_ARGS(&videoFileSource));
+    }
+
+    SafeRelease(&pUnkSource);
+
+    if (SUCCEEDED(hr))
+    {
+        hr = enumerateCaptureFormats(videoFileSource);
+    }
+
+    if (SUCCEEDED(hr))
+    {
+        hr = ImageGrabberThread::CreateInstance(&grabberThread, videoFileSource, (unsigned int)-2, true);
+    }
+
+    if (SUCCEEDED(hr))
+    {
+        grabberThread->start();
+    }
+
+    isOpened = SUCCEEDED(hr);
+
+    return isOpened;
+}
+
+void CvCaptureFile_MSMF::close()
+{
+    if (grabberThread)
+    {
+        isOpened = false;
+        SetEvent(grabberThread->getImageGrabber()->ig_hFinish);
+        grabberThread->stop();
+        delete grabberThread;
+    }
+
+    if (videoFileSource)
+    {
+        videoFileSource->Shutdown();
+    }
+}
+
+bool CvCaptureFile_MSMF::setProperty(int property_id, double value)
+{
+    // image capture properties
+    // FIXME: implement method in VideoInput back end
+    (void) property_id;
+    (void) value;
+    return false;
+}
+
+double CvCaptureFile_MSMF::getProperty(int property_id)
+{
+    // image format proprrties
+    switch( property_id )
+    {
+    case CV_CAP_PROP_FRAME_WIDTH:
+        return captureFormats[captureFormatIndex].width;
+    case CV_CAP_PROP_FRAME_HEIGHT:
+        return captureFormats[captureFormatIndex].height;
+    case CV_CAP_PROP_FRAME_COUNT:
+        {
+            MFTIME duration;
+            getSourceDuration(this->videoFileSource, &duration);
+            double fps = ((double)captureFormats[captureFormatIndex].MF_MT_FRAME_RATE_NUMERATOR) /
+            ((double)captureFormats[captureFormatIndex].MF_MT_FRAME_RATE_DENOMINATOR);
+            return (double)floor(((double)duration/1e7)*fps+0.5);
+        }
+    case CV_CAP_PROP_FOURCC:
+        return captureFormats[captureFormatIndex].MF_MT_SUBTYPE.Data1;
+    case CV_CAP_PROP_FPS:
+        return ((double)captureFormats[captureFormatIndex].MF_MT_FRAME_RATE_NUMERATOR) /
+            ((double)captureFormats[captureFormatIndex].MF_MT_FRAME_RATE_DENOMINATOR);
+    }
+ 
+    return -1;
+}
+
+bool CvCaptureFile_MSMF::grabFrame()
+{
+    DWORD waitResult = (DWORD)-1;
+    if (isOpened)
+    {
+        SetEvent(grabberThread->getImageGrabber()->ig_hFrameGrabbed);
+        HANDLE tmp[] = {grabberThread->getImageGrabber()->ig_hFrameReady, grabberThread->getImageGrabber()->ig_hFinish, 0};
+        waitResult = WaitForMultipleObjects(2, tmp, FALSE, INFINITE);
+    }
+
+    return isOpened && grabberThread->getImageGrabber()->getRawImage()->isNew() && (waitResult == WAIT_OBJECT_0);
+}
+
+IplImage* CvCaptureFile_MSMF::retrieveFrame(int)
+{
+    unsigned int width = captureFormats[captureFormatIndex].width;
+    unsigned int height = captureFormats[captureFormatIndex].height;
+    unsigned int bytes = 3;
+    if( !frame || (int)width != frame->width || (int)height != frame->height )
+    {
+        if (frame)
+            cvReleaseImage( &frame );
+        frame = cvCreateImage( cvSize(width,height), 8, 3 );
+    }
+
+    RawImage *RIOut = grabberThread->getImageGrabber()->getRawImage();
+    unsigned int size = bytes * width * height;
+
+    bool verticalFlip = captureFormats[captureFormatIndex].MF_MT_DEFAULT_STRIDE < 0;
+
+    if(RIOut && size == RIOut->getSize())
+    {
+         videoInput::processPixels(RIOut->getpPixels(), (unsigned char*)frame->imageData, width, 
+             height, bytes, false, verticalFlip);
+    }
+
+    return frame;
+}
+
+HRESULT CvCaptureFile_MSMF::enumerateCaptureFormats(IMFMediaSource *pSource)
+{
+    ComPtr<IMFPresentationDescriptor> pPD = NULL;
+    ComPtr<IMFStreamDescriptor> pSD = NULL;
+    ComPtr<IMFMediaTypeHandler> pHandler = NULL;
+    ComPtr<IMFMediaType> pType = NULL;
+    HRESULT hr = pSource->CreatePresentationDescriptor(pPD.GetAddressOf());
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    BOOL fSelected;
+    hr = pPD->GetStreamDescriptorByIndex(0, &fSelected, pSD.GetAddressOf());
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    hr = pSD->GetMediaTypeHandler(pHandler.GetAddressOf());
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    DWORD cTypes = 0;
+    hr = pHandler->GetMediaTypeCount(&cTypes);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+    for (DWORD i = 0; i < cTypes; i++)
+    {
+        hr = pHandler->GetMediaTypeByIndex(i, pType.GetAddressOf());
+        if (FAILED(hr))
+        {
+            goto done;
+        }
+        MediaType MT = FormatReader::Read(pType.Get());
+        captureFormats.push_back(MT);
+    }
+
+done:
+    return hr;
+}
+
+HRESULT CvCaptureFile_MSMF::getSourceDuration(IMFMediaSource *pSource, MFTIME *pDuration)
+{
+    *pDuration = 0;
+
+    IMFPresentationDescriptor *pPD = NULL;
+
+    HRESULT hr = pSource->CreatePresentationDescriptor(&pPD);
+    if (SUCCEEDED(hr))
+    {
+        hr = pPD->GetUINT64(MF_PD_DURATION, (UINT64*)pDuration);
+        pPD->Release();
+    }
+    return hr;
+}
+
 CvCapture* cvCreateCameraCapture_MSMF( int index )
 {
     CvCaptureCAM_MSMF* capture = new CvCaptureCAM_MSMF;
@@ -2807,4 +3144,392 @@ CvCapture* cvCreateCameraCapture_MSMF( int index )
     delete capture;
     return 0;
 }
+
+CvCapture* cvCreateFileCapture_MSMF (const char* filename)
+{
+    CvCaptureFile_MSMF* capture = new CvCaptureFile_MSMF;
+    try
+    {
+        if( capture->open(filename) )
+            return capture;
+        else
+        {
+            delete capture;
+            return NULL;
+        }
+    }
+    catch(...)
+    {
+        delete capture;
+        throw;
+    }
+}
+
+//
+//
+// Media Foundation-based Video Writer
+//
+//
+
+class CvVideoWriter_MSMF : public CvVideoWriter
+{
+public:
+    CvVideoWriter_MSMF();
+    virtual ~CvVideoWriter_MSMF();
+    virtual bool open(const char* filename, int fourcc,
+                       double fps, CvSize frameSize, bool isColor);
+    virtual void close();
+    virtual bool writeFrame(const IplImage* img);
+
+private:
+    UINT32 videoWidth;
+    UINT32 videoHeight;
+    double fps;
+    UINT32 bitRate;
+    UINT32 frameSize;
+    GUID   encodingFormat;
+    GUID   inputFormat;
+
+    DWORD  streamIndex;
+    ComPtr<IMFSinkWriter> sinkWriter;
+
+    bool   initiated;
+
+    LONGLONG rtStart;
+    UINT64 rtDuration;
+
+    HRESULT InitializeSinkWriter(const char* filename);
+    static const GUID FourCC2GUID(int fourcc);
+    HRESULT WriteFrame(DWORD *videoFrameBuffer, const LONGLONG& rtStart, const LONGLONG& rtDuration);
+};
+
+CvVideoWriter_MSMF::CvVideoWriter_MSMF():
+    initiated(false)
+{
+}
+
+CvVideoWriter_MSMF::~CvVideoWriter_MSMF()
+{
+    close();
+}
+
+const GUID CvVideoWriter_MSMF::FourCC2GUID(int fourcc)
+{
+    switch(fourcc)
+    {
+        case CV_FOURCC_MACRO('d', 'v', '2', '5'):
+            return MFVideoFormat_DV25; break;
+        case CV_FOURCC_MACRO('d', 'v', '5', '0'):
+            return MFVideoFormat_DV50; break;
+        case CV_FOURCC_MACRO('d', 'v', 'c', ' '):
+            return MFVideoFormat_DVC; break;
+        case CV_FOURCC_MACRO('d', 'v', 'h', '1'):
+            return MFVideoFormat_DVH1; break;
+        case CV_FOURCC_MACRO('d', 'v', 'h', 'd'):
+            return MFVideoFormat_DVHD; break;
+        case CV_FOURCC_MACRO('d', 'v', 's', 'd'):
+            return MFVideoFormat_DVSD; break;
+        case CV_FOURCC_MACRO('d', 'v', 's', 'l'):
+                return MFVideoFormat_DVSL; break;
+        case CV_FOURCC_MACRO('H', '2', '6', '3'):
+                return MFVideoFormat_H263; break;
+        case CV_FOURCC_MACRO('H', '2', '6', '4'):
+                return MFVideoFormat_H264; break;
+        case CV_FOURCC_MACRO('M', '4', 'S', '2'):
+                return MFVideoFormat_M4S2; break;
+        case CV_FOURCC_MACRO('M', 'J', 'P', 'G'):
+                return MFVideoFormat_MJPG; break;
+        case CV_FOURCC_MACRO('M', 'P', '4', '3'):
+                return MFVideoFormat_MP43; break;
+        case CV_FOURCC_MACRO('M', 'P', '4', 'S'):
+                return MFVideoFormat_MP4S; break;
+        case CV_FOURCC_MACRO('M', 'P', '4', 'V'):
+                return MFVideoFormat_MP4V; break;
+        case CV_FOURCC_MACRO('M', 'P', 'G', '1'):
+                return MFVideoFormat_MPG1; break;
+        case CV_FOURCC_MACRO('M', 'S', 'S', '1'):
+                return MFVideoFormat_MSS1; break;
+        case CV_FOURCC_MACRO('M', 'S', 'S', '2'):
+                return MFVideoFormat_MSS2; break;
+        case CV_FOURCC_MACRO('W', 'M', 'V', '1'):
+                return MFVideoFormat_WMV1; break;
+        case CV_FOURCC_MACRO('W', 'M', 'V', '2'):
+                return MFVideoFormat_WMV2; break;
+        case CV_FOURCC_MACRO('W', 'M', 'V', '3'):
+                return MFVideoFormat_WMV3; break;
+        case CV_FOURCC_MACRO('W', 'V', 'C', '1'):
+                return MFVideoFormat_WVC1; break;
+        default:
+            return MFVideoFormat_H264;
+    }
+}
+
+bool CvVideoWriter_MSMF::open( const char* filename, int fourcc,
+                       double _fps, CvSize frameSize, bool /*isColor*/ )
+{
+    videoWidth = frameSize.width;
+    videoHeight = frameSize.height;
+    fps = _fps;
+    bitRate = (UINT32)fps*videoWidth*videoHeight; // 1-bit per pixel
+    encodingFormat = FourCC2GUID(fourcc);
+    inputFormat = MFVideoFormat_RGB32;
+
+    HRESULT hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
+    if (SUCCEEDED(hr))
+    {
+        hr = MFStartup(MF_VERSION);
+        if (SUCCEEDED(hr))
+        {
+            hr = InitializeSinkWriter(filename);
+            if (SUCCEEDED(hr))
+            {
+                initiated = true;
+                rtStart = 0;
+                MFFrameRateToAverageTimePerFrame((UINT32)fps, 1, &rtDuration);
+            }
+        }
+    }
+
+    return SUCCEEDED(hr);
+}
+
+void CvVideoWriter_MSMF::close()
+{
+    if (!initiated)
+    {
+        return;
+    }
+
+    initiated = false;
+    sinkWriter->Finalize();
+    MFShutdown();
+}
+
+bool CvVideoWriter_MSMF::writeFrame(const IplImage* img)
+{
+    if (!img)
+        return false;
+
+    int length = img->width * img->height * 4;
+    DWORD* target = new DWORD[length];
+
+    for (int rowIdx = 0; rowIdx < img->height; rowIdx++)
+    {
+        char* rowStart = img->imageData + rowIdx*img->widthStep;
+        for (int colIdx = 0; colIdx < img->width; colIdx++)
+        {
+            BYTE b = rowStart[colIdx * img->nChannels + 0];
+            BYTE g = rowStart[colIdx * img->nChannels + 1];
+            BYTE r = rowStart[colIdx * img->nChannels + 2];
+
+            target[rowIdx*img->width+colIdx] = (r << 16) + (g << 8) + b;
+        }
+    }
+
+    // Send frame to the sink writer.
+    HRESULT hr = WriteFrame(target, rtStart, rtDuration);
+    if (FAILED(hr))
+    {
+        delete[] target;
+        return false;
+    }
+    rtStart += rtDuration;
+
+    delete[] target;
+
+    return true;
+}
+
+HRESULT CvVideoWriter_MSMF::InitializeSinkWriter(const char* filename)
+{
+    ComPtr<IMFAttributes> spAttr;
+    ComPtr<IMFMediaType>  mediaTypeOut;
+    ComPtr<IMFMediaType>  mediaTypeIn;
+    ComPtr<IMFByteStream> spByteStream;
+
+    MFCreateAttributes(&spAttr, 10);
+    spAttr->SetUINT32(MF_READWRITE_ENABLE_HARDWARE_TRANSFORMS, true);
+
+    wchar_t* unicodeFileName = new wchar_t[strlen(filename)+1];
+    MultiByteToWideChar(CP_ACP, 0, filename, -1, unicodeFileName, strlen(filename)+1);
+
+    HRESULT hr = MFCreateSinkWriterFromURL(unicodeFileName, NULL, spAttr.Get(), &sinkWriter);
+
+    delete[] unicodeFileName;
+
+    // Set the output media type.
+    if (SUCCEEDED(hr))
+    {
+        hr = MFCreateMediaType(&mediaTypeOut);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = mediaTypeOut->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = mediaTypeOut->SetGUID(MF_MT_SUBTYPE, encodingFormat);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = mediaTypeOut->SetUINT32(MF_MT_AVG_BITRATE, bitRate);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = mediaTypeOut->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = MFSetAttributeSize(mediaTypeOut.Get(), MF_MT_FRAME_SIZE, videoWidth, videoHeight);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = MFSetAttributeRatio(mediaTypeOut.Get(), MF_MT_FRAME_RATE, (UINT32)fps, 1);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = MFSetAttributeRatio(mediaTypeOut.Get(), MF_MT_PIXEL_ASPECT_RATIO, 1, 1);
+    }
+
+    if (SUCCEEDED(hr))
+    {
+        hr = sinkWriter->AddStream(mediaTypeOut.Get(), &streamIndex);
+    }
+
+    // Set the input media type.
+    if (SUCCEEDED(hr))
+    {
+        hr = MFCreateMediaType(&mediaTypeIn);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = mediaTypeIn->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = mediaTypeIn->SetGUID(MF_MT_SUBTYPE, inputFormat);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = mediaTypeIn->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = MFSetAttributeSize(mediaTypeIn.Get(), MF_MT_FRAME_SIZE, videoWidth, videoHeight);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = MFSetAttributeRatio(mediaTypeIn.Get(), MF_MT_FRAME_RATE, (UINT32)fps, 1);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = MFSetAttributeRatio(mediaTypeIn.Get(), MF_MT_PIXEL_ASPECT_RATIO, 1, 1);
+    }
+
+    if (SUCCEEDED(hr))
+    {
+        hr = sinkWriter->SetInputMediaType(streamIndex, mediaTypeIn.Get(), NULL);
+    }
+
+    // Tell the sink writer to start accepting data.
+    if (SUCCEEDED(hr))
+    {
+        hr = sinkWriter->BeginWriting();
+    }
+
+    return hr;
+}
+
+HRESULT CvVideoWriter_MSMF::WriteFrame(DWORD *videoFrameBuffer, const LONGLONG& Start, const LONGLONG& Duration)
+{
+    ComPtr<IMFSample> sample;
+    ComPtr<IMFMediaBuffer> buffer;
+
+    const LONG cbWidth = 4 * videoWidth;
+    const DWORD cbBuffer = cbWidth * videoHeight;
+
+    BYTE *pData = NULL;
+
+    // Create a new memory buffer.
+    HRESULT hr = MFCreateMemoryBuffer(cbBuffer, &buffer);
+
+    // Lock the buffer and copy the video frame to the buffer.
+    if (SUCCEEDED(hr))
+    {
+        hr = buffer->Lock(&pData, NULL, NULL);
+    }
+
+    if (SUCCEEDED(hr))
+    {
+#if defined(_M_ARM)
+        hr = MFCopyImage(
+            pData,                      // Destination buffer.
+            -cbWidth,                   // Destination stride.
+            (BYTE*)videoFrameBuffer,    // First row in source image.
+            cbWidth,                    // Source stride.
+            cbWidth,                    // Image width in bytes.
+            videoHeight                 // Image height in pixels.
+            );
+#else
+        hr = MFCopyImage(
+            pData,                      // Destination buffer.
+            cbWidth,                    // Destination stride.
+            (BYTE*)videoFrameBuffer,    // First row in source image.
+            cbWidth,                    // Source stride.
+            cbWidth,                    // Image width in bytes.
+            videoHeight                 // Image height in pixels.
+            );
+#endif
+    }
+
+    if (buffer)
+    {
+        buffer->Unlock();
+    }
+
+    // Set the data length of the buffer.
+    if (SUCCEEDED(hr))
+    {
+        hr = buffer->SetCurrentLength(cbBuffer);
+    }
+
+    // Create a media sample and add the buffer to the sample.
+    if (SUCCEEDED(hr))
+    {
+        hr = MFCreateSample(&sample);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = sample->AddBuffer(buffer.Get());
+    }
+
+    // Set the time stamp and the duration.
+    if (SUCCEEDED(hr))
+    {
+        hr = sample->SetSampleTime(Start);
+    }
+    if (SUCCEEDED(hr))
+    {
+        hr = sample->SetSampleDuration(Duration);
+    }
+
+    // Send the sample to the Sink Writer.
+    if (SUCCEEDED(hr))
+    {
+        hr = sinkWriter->WriteSample(streamIndex, sample.Get());
+    }
+
+    return hr;
+}
+
+CvVideoWriter* cvCreateVideoWriter_MSMF( const char* filename, int fourcc,
+                                        double fps, CvSize frameSize, int isColor )
+{
+    CvVideoWriter_MSMF* writer = new CvVideoWriter_MSMF;
+    if( writer->open( filename, fourcc, fps, frameSize, isColor != 0 ))
+        return writer;
+    delete writer;
+    return NULL;
+}
+
 #endif
\ No newline at end of file
diff --git a/modules/highgui/src/cap_vfw.cpp b/modules/highgui/src/cap_vfw.cpp
index d419a48912..d845953f8e 100644
--- a/modules/highgui/src/cap_vfw.cpp
+++ b/modules/highgui/src/cap_vfw.cpp
@@ -613,8 +613,10 @@ bool CvVideoWriter_VFW::open( const char* filename, int _fourcc, double _fps, Cv
             close();
             return false;
         }
+        return true;
     }
-    return true;
+    else
+        return false;
 }
 
 
diff --git a/modules/highgui/src/cap_ximea.cpp b/modules/highgui/src/cap_ximea.cpp
index dbb8f58683..5acf2c09d1 100644
--- a/modules/highgui/src/cap_ximea.cpp
+++ b/modules/highgui/src/cap_ximea.cpp
@@ -20,25 +20,24 @@ public:
     virtual IplImage* retrieveFrame(int);
     virtual int getCaptureDomain() { return CV_CAP_XIAPI; } // Return the type of the capture object: CV_CAP_VFW, etc...
 
-protected:
+private:
     void init();
     void errMsg(const char* msg, int errNum);
+    void resetCvImage();
+    int  getBpp();
     IplImage* frame;
 
     HANDLE    hmv;
     DWORD     numDevices;
-    XI_IMG    image;
-    int       width;
-    int       height;
-    int       format;
     int       timeout;
+    XI_IMG    image;
 };
 
 /**********************************************************************************/
 
 CvCapture* cvCreateCameraCapture_XIMEA( int index )
 {
-     CvCaptureCAM_XIMEA* capture = new CvCaptureCAM_XIMEA;
+    CvCaptureCAM_XIMEA* capture = new CvCaptureCAM_XIMEA;
 
     if( capture->open( index ))
         return capture;
@@ -79,18 +78,19 @@ bool CvCaptureCAM_XIMEA::open( int wIndex )
     // always use auto white ballance
     mvret = xiSetParamInt( hmv, XI_PRM_AUTO_WB, 1);
     if(mvret != XI_OK) goto error;
+    
+    // default image format RGB24
+    mvret = xiSetParamInt( hmv, XI_PRM_IMAGE_DATA_FORMAT, XI_RGB24);
+    if(mvret != XI_OK) goto error;
 
+    int width = 0;
     mvret = xiGetParamInt( hmv, XI_PRM_WIDTH, &width);
     if(mvret != XI_OK) goto error;
 
+    int height = 0;
     mvret = xiGetParamInt( hmv, XI_PRM_HEIGHT, &height);
     if(mvret != XI_OK) goto error;
 
-    // default image format RGB24
-    format = XI_RGB24;
-    mvret = xiSetParamInt( hmv, XI_PRM_IMAGE_DATA_FORMAT, format);
-    if(mvret != XI_OK) goto error;
-
     // allocate frame buffer for RGB24 image
     frame = cvCreateImage(cvSize( width, height), IPL_DEPTH_8U, 3);
 
@@ -103,10 +103,10 @@ bool CvCaptureCAM_XIMEA::open( int wIndex )
         errMsg("StartAcquisition XI_DEVICE failed", mvret);
         goto error;
     }
-
     return true;
 
 error:
+    errMsg("Open XI_DEVICE failed", mvret);
     xiCloseDevice(hmv);
     hmv = NULL;
     return false;
@@ -116,18 +116,19 @@ error:
 
 void CvCaptureCAM_XIMEA::close()
 {
-    if(hmv)
-    {
-        xiStopAcquisition(hmv);
-        xiCloseDevice(hmv);
-        hmv = NULL;
-    }
+    if(frame)
+        cvReleaseImage(&frame);
+
+    xiStopAcquisition(hmv);
+    xiCloseDevice(hmv);
+    hmv = NULL;
 }
 
 /**********************************************************************************/
 
 bool CvCaptureCAM_XIMEA::grabFrame()
 {
+    memset(&image, 0, sizeof(XI_IMG));
     image.size = sizeof(XI_IMG);
     int mvret = xiGetImage( hmv, timeout, &image);
 
@@ -151,31 +152,18 @@ bool CvCaptureCAM_XIMEA::grabFrame()
 IplImage* CvCaptureCAM_XIMEA::retrieveFrame(int)
 {
     // update cvImage after format has changed
-    if( (int)image.width != width || (int)image.height != height || image.frm != (XI_IMG_FORMAT)format)
-    {
-        cvReleaseImage(&frame);
-        switch( image.frm)
-        {
-        case XI_MONO8  : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_8U, 1); break;
-        case XI_MONO16 : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_16U, 1); break;
-        case XI_RGB24  : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_8U, 3); break;
-        case XI_RGB32  : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_8U, 4); break;
-        default :
-            return frame;
-        }
-        // update global image format
-        format = image.frm;
-        width = image.width;
-        height = image.height;
-    }
-
+    resetCvImage();
+    
     // copy pixel data
     switch( image.frm)
     {
-    case XI_MONO8  : memcpy( frame->imageData, image.bp, image.width*image.height); break;
-    case XI_MONO16 : memcpy( frame->imageData, image.bp, image.width*image.height*sizeof(WORD)); break;
-    case XI_RGB24  : memcpy( frame->imageData, image.bp, image.width*image.height*3); break;
-    case XI_RGB32  : memcpy( frame->imageData, image.bp, image.width*image.height*sizeof(DWORD)); break;
+    case XI_MONO8       : 
+    case XI_RAW8        : memcpy( frame->imageData, image.bp, image.width*image.height); break;
+    case XI_MONO16      :
+    case XI_RAW16       : memcpy( frame->imageData, image.bp, image.width*image.height*sizeof(WORD)); break;
+    case XI_RGB24       :
+    case XI_RGB_PLANAR  : memcpy( frame->imageData, image.bp, image.width*image.height*3); break;
+    case XI_RGB32       : memcpy( frame->imageData, image.bp, image.width*image.height*4); break;
     default: break;
     }
     return frame;
@@ -183,6 +171,35 @@ IplImage* CvCaptureCAM_XIMEA::retrieveFrame(int)
 
 /**********************************************************************************/
 
+void CvCaptureCAM_XIMEA::resetCvImage()
+{
+    int width = 0, height = 0, format = 0;
+    xiGetParamInt( hmv, XI_PRM_WIDTH, &width);
+    xiGetParamInt( hmv, XI_PRM_HEIGHT, &height);
+    xiGetParamInt( hmv, XI_PRM_IMAGE_DATA_FORMAT, &format);
+
+    if( (int)image.width != width || (int)image.height != height || image.frm != (XI_IMG_FORMAT)format)
+    {
+        if(frame) cvReleaseImage(&frame);
+        frame = NULL;
+
+        switch( image.frm)
+        {
+        case XI_MONO8       :
+        case XI_RAW8        : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_8U, 1); break;
+        case XI_MONO16      : 
+        case XI_RAW16       : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_16U, 1); break;
+        case XI_RGB24       : 
+        case XI_RGB_PLANAR  : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_8U, 3); break;
+        case XI_RGB32       : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_8U, 4); break;
+        default :
+            return;
+        }
+    }	
+    cvZero(frame);
+}
+/**********************************************************************************/
+
 double CvCaptureCAM_XIMEA::getProperty( int property_id )
 {
     if(hmv == NULL)
@@ -238,20 +255,14 @@ bool CvCaptureCAM_XIMEA::setProperty( int property_id, double value )
     switch(property_id)
     {
     // OCV parameters
-    case CV_CAP_PROP_FRAME_WIDTH  : mvret = xiSetParamInt( hmv, XI_PRM_WIDTH, ival);
-        if(mvret == XI_OK) width = ival;
-        break;
-    case CV_CAP_PROP_FRAME_HEIGHT : mvret = xiSetParamInt( hmv, XI_PRM_HEIGHT, ival);
-        if(mvret == XI_OK) height = ival;
-        break;
+    case CV_CAP_PROP_FRAME_WIDTH  : mvret = xiSetParamInt( hmv, XI_PRM_WIDTH, ival); break;
+    case CV_CAP_PROP_FRAME_HEIGHT : mvret = xiSetParamInt( hmv, XI_PRM_HEIGHT, ival); break;
     case CV_CAP_PROP_FPS          : mvret = xiSetParamFloat( hmv, XI_PRM_FRAMERATE, fval); break;
     case CV_CAP_PROP_GAIN         : mvret = xiSetParamFloat( hmv, XI_PRM_GAIN, fval); break;
     case CV_CAP_PROP_EXPOSURE     : mvret = xiSetParamInt( hmv, XI_PRM_EXPOSURE, ival); break;
     // XIMEA camera properties
     case CV_CAP_PROP_XI_DOWNSAMPLING  : mvret = xiSetParamInt( hmv, XI_PRM_DOWNSAMPLING, ival); break;
-    case CV_CAP_PROP_XI_DATA_FORMAT   : mvret = xiSetParamInt( hmv, XI_PRM_IMAGE_DATA_FORMAT, ival);
-        if(mvret == XI_OK) format = ival;
-        break;
+    case CV_CAP_PROP_XI_DATA_FORMAT   : mvret = xiSetParamInt( hmv, XI_PRM_IMAGE_DATA_FORMAT, ival); break;
     case CV_CAP_PROP_XI_OFFSET_X      : mvret = xiSetParamInt( hmv, XI_PRM_OFFSET_X, ival); break;
     case CV_CAP_PROP_XI_OFFSET_Y      : mvret = xiSetParamInt( hmv, XI_PRM_OFFSET_Y, ival); break;
     case CV_CAP_PROP_XI_TRG_SOURCE    : mvret = xiSetParamInt( hmv, XI_PRM_TRG_SOURCE, ival); break;
@@ -288,7 +299,7 @@ bool CvCaptureCAM_XIMEA::setProperty( int property_id, double value )
 void CvCaptureCAM_XIMEA::errMsg(const char* msg, int errNum)
 {
 #if defined WIN32 || defined _WIN32
-    char buf[512];
+    char buf[512]="";
     sprintf( buf, "%s : %d\n", msg, errNum);
     OutputDebugString(buf);
 #else
@@ -296,4 +307,22 @@ void CvCaptureCAM_XIMEA::errMsg(const char* msg, int errNum)
 #endif
 }
 
+/**********************************************************************************/
+
+int  CvCaptureCAM_XIMEA::getBpp()
+{
+    switch( image.frm)
+    {
+    case XI_MONO8       :
+    case XI_RAW8        : return 1;
+    case XI_MONO16      : 
+    case XI_RAW16       : return 2;
+    case XI_RGB24       : 
+    case XI_RGB_PLANAR  : return 3;
+    case XI_RGB32       : return 4;
+    default :
+        return 0;
+    }
+}
+
 /**********************************************************************************/
\ No newline at end of file
diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp
index aa327d6d7c..dcd4afdc01 100644
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -119,6 +119,9 @@ CvVideoWriter* cvCreateVideoWriter_VFW( const char* filename, int fourcc,
                                         double fps, CvSize frameSize, int is_color );
 CvCapture* cvCreateCameraCapture_DShow( int index );
 CvCapture* cvCreateCameraCapture_MSMF( int index );
+CvCapture* cvCreateFileCapture_MSMF (const char* filename);
+CvVideoWriter* cvCreateVideoWriter_MSMF( const char* filename, int fourcc,
+                                        double fps, CvSize frameSize, int is_color );
 CvCapture* cvCreateCameraCapture_OpenNI( int index );
 CvCapture* cvCreateFileCapture_OpenNI( const char* filename );
 CvCapture* cvCreateCameraCapture_Android( int index );
diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp
index 6d29534643..1e47bf6ee3 100644
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -256,12 +256,17 @@ namespace
 
 void cv::imshow( const string& winname, InputArray _img )
 {
+    const Size size = _img.size();
 #ifndef HAVE_OPENGL
-    Mat img = _img.getMat();
-    CvMat c_img = img;
-    cvShowImage(winname.c_str(), &c_img);
+    CV_Assert(size.width>0 && size.height>0);
+    {
+        Mat img = _img.getMat();
+        CvMat c_img = img;
+        cvShowImage(winname.c_str(), &c_img);
+    }
 #else
     const double useGl = getWindowProperty(winname, WND_PROP_OPENGL);
+    CV_Assert(size.width>0 && size.height>0);
 
     if (useGl <= 0)
     {
@@ -275,7 +280,6 @@ void cv::imshow( const string& winname, InputArray _img )
 
         if (autoSize > 0)
         {
-            Size size = _img.size();
             resizeWindow(winname, size.width, size.height);
         }
 
diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp
index 50f2b9e787..64d57ab269 100644
--- a/modules/highgui/src/window_QT.cpp
+++ b/modules/highgui/src/window_QT.cpp
@@ -38,6 +38,7 @@
 
 //--------------------Google Code 2010 -- Yannick Verdie--------------------//
 
+#include "precomp.hpp"
 
 #if defined(HAVE_QT)
 
@@ -2473,35 +2474,33 @@ void DefaultViewPort::saveView()
     if (!fileName.isEmpty()) //save the picture
     {
         QString extension = fileName.right(3);
-
-        //   (no need anymore) create the image resized to receive the 'screenshot'
-        //    image2Draw_qt_resized = QImage(viewport()->width(), viewport()->height(),QImage::Format_RGB888);
-
-        QPainter saveimage(&image2Draw_qt_resized);
-        this->render(&saveimage);
+        
+        // Create a new pixmap to render the viewport into
+        QPixmap viewportPixmap(viewport()->size());
+        viewport()->render(&viewportPixmap);
 
         // Save it..
         if (QString::compare(extension, "png", Qt::CaseInsensitive) == 0)
         {
-            image2Draw_qt_resized.save(fileName, "PNG");
+            viewportPixmap.save(fileName, "PNG");
             return;
         }
 
         if (QString::compare(extension, "jpg", Qt::CaseInsensitive) == 0)
         {
-            image2Draw_qt_resized.save(fileName, "JPG");
+            viewportPixmap.save(fileName, "JPG");
             return;
         }
 
         if (QString::compare(extension, "bmp", Qt::CaseInsensitive) == 0)
         {
-            image2Draw_qt_resized.save(fileName, "BMP");
+            viewportPixmap.save(fileName, "BMP");
             return;
         }
 
         if (QString::compare(extension, "jpeg", Qt::CaseInsensitive) == 0)
         {
-            image2Draw_qt_resized.save(fileName, "JPEG");
+            viewportPixmap.save(fileName, "JPEG");
             return;
         }
 
@@ -2651,17 +2650,16 @@ void DefaultViewPort::paintEvent(QPaintEvent* evnt)
     //Now disable matrixWorld for overlay display
     myPainter.setWorldMatrixEnabled(false);
 
+    //overlay pixel values if zoomed in far enough
+    if (param_matrixWorld.m11()*ratioX >= threshold_zoom_img_region &&
+        param_matrixWorld.m11()*ratioY >= threshold_zoom_img_region)
+    {
+        drawImgRegion(&myPainter);
+    }
+
     //in mode zoom/panning
     if (param_matrixWorld.m11() > 1)
     {
-        if (param_matrixWorld.m11() >= threshold_zoom_img_region)
-        {
-            if (centralWidget->param_flags == CV_WINDOW_NORMAL)
-                startDisplayInfo("WARNING: The values displayed are the resized image's values. If you want the original image's values, use CV_WINDOW_AUTOSIZE", 1000);
-
-            drawImgRegion(&myPainter);
-        }
-
         drawViewOverview(&myPainter);
     }
 
@@ -2887,22 +2885,24 @@ void DefaultViewPort::drawStatusBar()
 //accept only CV_8UC1 and CV_8UC8 image for now
 void DefaultViewPort::drawImgRegion(QPainter *painter)
 {
-
     if (nbChannelOriginImage!=CV_8UC1 && nbChannelOriginImage!=CV_8UC3)
         return;
 
-    qreal offsetX = param_matrixWorld.dx()/param_matrixWorld.m11();
+    double pixel_width = param_matrixWorld.m11()*ratioX;
+    double pixel_height = param_matrixWorld.m11()*ratioY;
+
+    qreal offsetX = param_matrixWorld.dx()/pixel_width;
     offsetX = offsetX - floor(offsetX);
-    qreal offsetY = param_matrixWorld.dy()/param_matrixWorld.m11();
+    qreal offsetY = param_matrixWorld.dy()/pixel_height;
     offsetY = offsetY - floor(offsetY);
 
     QSize view = size();
     QVarLengthArray<QLineF, 30> linesX;
-    for (qreal _x = offsetX*param_matrixWorld.m11(); _x < view.width(); _x += param_matrixWorld.m11() )
+    for (qreal _x = offsetX*pixel_width; _x < view.width(); _x += pixel_width )
         linesX.append(QLineF(_x, 0, _x, view.height()));
 
     QVarLengthArray<QLineF, 30> linesY;
-    for (qreal _y = offsetY*param_matrixWorld.m11(); _y < view.height(); _y += param_matrixWorld.m11() )
+    for (qreal _y = offsetY*pixel_height; _y < view.height(); _y += pixel_height )
         linesY.append(QLineF(0, _y, view.width(), _y));
 
 
@@ -2910,27 +2910,25 @@ void DefaultViewPort::drawImgRegion(QPainter *painter)
     int original_font_size = f.pointSize();
     //change font size
     //f.setPointSize(4+(param_matrixWorld.m11()-threshold_zoom_img_region)/5);
-    f.setPixelSize(10+(param_matrixWorld.m11()-threshold_zoom_img_region)/5);
+    f.setPixelSize(10+(pixel_height-threshold_zoom_img_region)/5);
     painter->setFont(f);
-    QString val;
-    QRgb rgbValue;
 
-    QPointF point1;//sorry, I do not know how to name it
-    QPointF point2;//idem
 
-    for (int j=-1;j<height()/param_matrixWorld.m11();j++)//-1 because display the pixels top rows left colums
-        for (int i=-1;i<width()/param_matrixWorld.m11();i++)//-1
+    for (int j=-1;j<height()/pixel_height;j++)//-1 because display the pixels top rows left columns
+        for (int i=-1;i<width()/pixel_width;i++)//-1
         {
-            point1.setX((i+offsetX)*param_matrixWorld.m11());
-            point1.setY((j+offsetY)*param_matrixWorld.m11());
+            // Calculate top left of the pixel's position in the viewport (screen space)
+            QPointF pos_in_view((i+offsetX)*pixel_width, (j+offsetY)*pixel_height);
 
-            matrixWorld_inv.map(point1.x(),point1.y(),&point2.rx(),&point2.ry());
+            // Calculate top left of the pixel's position in the image (image space)
+            QPointF pos_in_image = matrixWorld_inv.map(pos_in_view);// Top left of pixel in view
+            pos_in_image.rx() = pos_in_image.x()/ratioX;
+            pos_in_image.ry() = pos_in_image.y()/ratioY;
+            QPoint point_in_image(pos_in_image.x() + 0.5f,pos_in_image.y() + 0.5f);// Add 0.5 for rounding
 
-            point2.rx()= (long) (point2.x() + 0.5);
-            point2.ry()= (long) (point2.y() + 0.5);
-
-            if (point2.x() >= 0 && point2.y() >= 0)
-                rgbValue = image2Draw_qt_resized.pixel(QPoint(point2.x(),point2.y()));
+            QRgb rgbValue;
+            if (image2Draw_qt.valid(point_in_image))
+                rgbValue = image2Draw_qt.pixel(point_in_image);
             else
                 rgbValue = qRgb(0,0,0);
 
@@ -2943,29 +2941,29 @@ void DefaultViewPort::drawImgRegion(QPainter *painter)
                 painter->drawText(QRect(point1.x(),point1.y(),param_matrixWorld.m11(),param_matrixWorld.m11()/2),
                     Qt::AlignCenter, val);
                 */
+                QString val;
 
                 val = tr("%1").arg(qRed(rgbValue));
                 painter->setPen(QPen(Qt::red, 1));
-                painter->drawText(QRect(point1.x(),point1.y(),param_matrixWorld.m11(),param_matrixWorld.m11()/3),
+                painter->drawText(QRect(pos_in_view.x(),pos_in_view.y(),pixel_width,pixel_height/3),
                     Qt::AlignCenter, val);
 
                 val = tr("%1").arg(qGreen(rgbValue));
                 painter->setPen(QPen(Qt::green, 1));
-                painter->drawText(QRect(point1.x(),point1.y()+param_matrixWorld.m11()/3,param_matrixWorld.m11(),param_matrixWorld.m11()/3),
+                painter->drawText(QRect(pos_in_view.x(),pos_in_view.y()+pixel_height/3,pixel_width,pixel_height/3),
                     Qt::AlignCenter, val);
 
                 val = tr("%1").arg(qBlue(rgbValue));
                 painter->setPen(QPen(Qt::blue, 1));
-                painter->drawText(QRect(point1.x(),point1.y()+2*param_matrixWorld.m11()/3,param_matrixWorld.m11(),param_matrixWorld.m11()/3),
+                painter->drawText(QRect(pos_in_view.x(),pos_in_view.y()+2*pixel_height/3,pixel_width,pixel_height/3),
                     Qt::AlignCenter, val);
 
             }
 
             if (nbChannelOriginImage==CV_8UC1)
             {
-
-                val = tr("%1").arg(qRed(rgbValue));
-                painter->drawText(QRect(point1.x(),point1.y(),param_matrixWorld.m11(),param_matrixWorld.m11()),
+                QString val = tr("%1").arg(qRed(rgbValue));
+                painter->drawText(QRect(pos_in_view.x(),pos_in_view.y(),pixel_width,pixel_height),
                     Qt::AlignCenter, val);
             }
         }
diff --git a/modules/highgui/src/window_QT.h b/modules/highgui/src/window_QT.h
index 089997f514..a96a8c6e69 100644
--- a/modules/highgui/src/window_QT.h
+++ b/modules/highgui/src/window_QT.h
@@ -522,7 +522,6 @@ private:
 
     CvMat* image2Draw_mat;
     QImage image2Draw_qt;
-    QImage image2Draw_qt_resized;
     int nbChannelOriginImage;
 
     //for mouse callback
diff --git a/modules/highgui/test/test_precomp.hpp b/modules/highgui/test/test_precomp.hpp
index 0d0bd80228..be06c0643a 100644
--- a/modules/highgui/test/test_precomp.hpp
+++ b/modules/highgui/test/test_precomp.hpp
@@ -47,7 +47,8 @@
     defined(HAVE_QUICKTIME)    || \
     defined(HAVE_AVFOUNDATION) || \
     /*defined(HAVE_OPENNI)     || too specialized */ \
-    defined(HAVE_FFMPEG)
+    defined(HAVE_FFMPEG)       || \
+    defined(HAVE_MSMF)
 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
 #else
 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
@@ -57,7 +58,8 @@
     defined(HAVE_GSTREAMER)    || \
     defined(HAVE_QUICKTIME)    || \
     defined(HAVE_AVFOUNDATION) || \
-    defined(HAVE_FFMPEG)
+    defined(HAVE_FFMPEG)       || \
+    defined(HAVE_MSMF)
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 1
 #else
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 0
diff --git a/modules/highgui/test/test_video_io.cpp b/modules/highgui/test/test_video_io.cpp
index b0c2e53ba5..5d4de7ecb0 100644
--- a/modules/highgui/test/test_video_io.cpp
+++ b/modules/highgui/test/test_video_io.cpp
@@ -54,6 +54,35 @@ string fourccToString(int fourcc)
     return format("%c%c%c%c", fourcc & 255, (fourcc >> 8) & 255, (fourcc >> 16) & 255, (fourcc >> 24) & 255);
 }
 
+#ifdef HAVE_MSMF
+const VideoFormat g_specific_fmt_list[] =
+{
+        /*VideoFormat("wmv", CV_FOURCC_MACRO('d', 'v', '2', '5')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('d', 'v', '5', '0')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('d', 'v', 'c', ' ')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('d', 'v', 'h', '1')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('d', 'v', 'h', 'd')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('d', 'v', 's', 'd')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('d', 'v', 's', 'l')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('H', '2', '6', '3')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('M', '4', 'S', '2')),
+        VideoFormat("avi", CV_FOURCC_MACRO('M', 'J', 'P', 'G')),
+        VideoFormat("mp4", CV_FOURCC_MACRO('M', 'P', '4', 'S')),
+        VideoFormat("mp4", CV_FOURCC_MACRO('M', 'P', '4', 'V')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('M', 'P', '4', '3')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('M', 'P', 'G', '1')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('M', 'S', 'S', '1')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('M', 'S', 'S', '2')),*/
+#if !defined(_M_ARM)
+        VideoFormat("wmv", CV_FOURCC_MACRO('W', 'M', 'V', '1')),
+        VideoFormat("wmv", CV_FOURCC_MACRO('W', 'M', 'V', '2')),
+#endif
+        VideoFormat("wmv", CV_FOURCC_MACRO('W', 'M', 'V', '3')),
+        VideoFormat("avi", CV_FOURCC_MACRO('H', '2', '6', '4')),
+        //VideoFormat("wmv", CV_FOURCC_MACRO('W', 'V', 'C', '1')),
+        VideoFormat()
+};
+#else
 const VideoFormat g_specific_fmt_list[] =
 {
     VideoFormat("avi", CV_FOURCC('X', 'V', 'I', 'D')),
@@ -63,17 +92,17 @@ const VideoFormat g_specific_fmt_list[] =
     VideoFormat("mkv", CV_FOURCC('X', 'V', 'I', 'D')),
     VideoFormat("mkv", CV_FOURCC('M', 'P', 'E', 'G')),
     VideoFormat("mkv", CV_FOURCC('M', 'J', 'P', 'G')),
-
     VideoFormat("mov", CV_FOURCC('m', 'p', '4', 'v')),
     VideoFormat()
 };
+#endif
 
 }
 
 class CV_HighGuiTest : public cvtest::BaseTest
 {
 protected:
-    void ImageTest(const string& dir);
+    void ImageTest (const string& dir);
     void VideoTest (const string& dir, const cvtest::VideoFormat& fmt);
     void SpecificImageTest (const string& dir);
     void SpecificVideoTest (const string& dir, const cvtest::VideoFormat& fmt);
@@ -242,19 +271,19 @@ void CV_HighGuiTest::VideoTest(const string& dir, const cvtest::VideoFormat& fmt
 
     for(;;)
     {
-        IplImage * img = cvQueryFrame( cap );
+        IplImage* img = cvQueryFrame( cap );
 
         if (!img)
             break;
 
         frames.push_back(Mat(img).clone());
 
-        if (writer == 0)
+        if (writer == NULL)
         {
             writer = cvCreateVideoWriter(tmp_name.c_str(), fmt.fourcc, 24, cvGetSize(img));
-            if (writer == 0)
+            if (writer == NULL)
             {
-                ts->printf(ts->LOG, "can't create writer (with fourcc : %d)\n",
+                ts->printf(ts->LOG, "can't create writer (with fourcc : %s)\n",
                            cvtest::fourccToString(fmt.fourcc).c_str());
                 cvReleaseCapture( &cap );
                 ts->set_failed_test_info(ts->FAIL_MISMATCH);
@@ -290,15 +319,22 @@ void CV_HighGuiTest::VideoTest(const string& dir, const cvtest::VideoFormat& fmt
         double psnr = PSNR(img1, img);
         if (psnr < thresDbell)
         {
-            printf("Too low psnr = %gdb\n", psnr);
-            // imwrite("img.png", img);
-            // imwrite("img1.png", img1);
+            ts->printf(ts->LOG, "Too low frame %d psnr = %gdb\n", i, psnr);
             ts->set_failed_test_info(ts->FAIL_MISMATCH);
+
+            //imwrite("original.png", img);
+            //imwrite("after_test.png", img1);
+            //Mat diff;
+            //absdiff(img, img1, diff);
+            //imwrite("diff.png", diff);
+
             break;
         }
     }
 
+    printf("Before saved release for %s\n", tmp_name.c_str());
     cvReleaseCapture( &saved );
+    printf("After release\n");
 
     ts->printf(ts->LOG, "end test function : ImagesVideo \n");
 }
diff --git a/modules/imgproc/doc/miscellaneous_transformations.rst b/modules/imgproc/doc/miscellaneous_transformations.rst
index 4ebf6d5ee5..9fd8df517a 100644
--- a/modules/imgproc/doc/miscellaneous_transformations.rst
+++ b/modules/imgproc/doc/miscellaneous_transformations.rst
@@ -116,6 +116,7 @@ If you use ``cvtColor`` with 8-bit images, the conversion will have some informa
 The function can do the following transformations:
 
 *
+    RGB :math:`\leftrightarrow` GRAY ( ``CV_BGR2GRAY, CV_RGB2GRAY, CV_GRAY2BGR, CV_GRAY2RGB``     )
     Transformations within RGB space like adding/removing the alpha channel, reversing the channel order, conversion to/from 16-bit RGB color (R5:G6:B5 or R5:G5:B5), as well as conversion to/from grayscale using:
 
     .. math::
@@ -765,7 +766,7 @@ Runs the GrabCut algorithm.
 
         * **GC_PR_BGD** defines a possible background pixel.
 
-        * **GC_PR_BGD** defines a possible foreground pixel.
+        * **GC_PR_FGD** defines a possible foreground pixel.
 
     :param rect: ROI containing a segmented object. The pixels outside of the ROI are marked as "obvious background". The parameter is only used when  ``mode==GC_INIT_WITH_RECT`` .
 
diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp
index 9b87afe99c..601beb8996 100644
--- a/modules/imgproc/perf/perf_cvt_color.cpp
+++ b/modules/imgproc/perf/perf_cvt_color.cpp
@@ -258,7 +258,8 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u,
     declare.time(100);
     declare.in(src, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() cvtColor(src, dst, mode, ch.dcn);
+    int runs = sz.width <= 320 ? 70 : 5;
+    TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn);
 
     SANITY_CHECK(dst, 1);
 }
@@ -334,7 +335,8 @@ PERF_TEST_P(Size_CvtMode3, cvtColorRGB2YUV420p,
     declare.time(100);
     declare.in(src, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() cvtColor(src, dst, mode, ch.dcn);
+    int runs = (sz.width <= 640) ? 10 : 1;
+    TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn);
 
     SANITY_CHECK(dst, 1);
 }
diff --git a/modules/imgproc/perf/perf_morph.cpp b/modules/imgproc/perf/perf_morph.cpp
index 9aadeaff52..d3dbba38fb 100644
--- a/modules/imgproc/perf/perf_morph.cpp
+++ b/modules/imgproc/perf/perf_morph.cpp
@@ -19,7 +19,8 @@ PERF_TEST_P(Size_MatType, erode, TYPICAL_MATS_MORPH)
 
     declare.in(src, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() erode(src, dst, noArray());
+    int runs = (sz.width <= 320) ? 15 : 1;
+    TEST_CYCLE_MULTIRUN(runs) erode(src, dst, noArray());
 
     SANITY_CHECK(dst);
 }
diff --git a/modules/imgproc/perf/perf_remap.cpp b/modules/imgproc/perf/perf_remap.cpp
index 334c5ff960..92c6007a2b 100644
--- a/modules/imgproc/perf/perf_remap.cpp
+++ b/modules/imgproc/perf/perf_remap.cpp
@@ -63,7 +63,8 @@ PERF_TEST_P( TestRemap, Remap,
 
     declare.in(src, WARMUP_RNG).out(dst).time(20);
 
-    TEST_CYCLE() remap(src, dst, map1, map2, inter_type);
+    int runs = (sz.width <= 640) ? 3 : 1;
+    TEST_CYCLE_MULTIRUN(runs) remap(src, dst, map1, map2, inter_type);
 
     SANITY_CHECK(dst);
 }
diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp
index 7aef05ee52..ea959a627a 100644
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@@ -85,7 +85,8 @@ PERF_TEST_P(MatInfo_Size_Scale, ResizeAreaFast,
 
     declare.in(src, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() resize(src, dst, dst.size(), 0, 0, INTER_AREA);
+    int runs = 15;
+    TEST_CYCLE_MULTIRUN(runs) resize(src, dst, dst.size(), 0, 0, INTER_AREA);
 
     //difference equal to 1 is allowed because of different possible rounding modes: round-to-nearest vs bankers' rounding
     SANITY_CHECK(dst, 1);
diff --git a/modules/imgproc/perf/perf_threshold.cpp b/modules/imgproc/perf/perf_threshold.cpp
index 61255e2283..9ccafd6b54 100644
--- a/modules/imgproc/perf/perf_threshold.cpp
+++ b/modules/imgproc/perf/perf_threshold.cpp
@@ -32,7 +32,7 @@ PERF_TEST_P(Size_MatType_ThreshType, threshold,
 
     declare.in(src, WARMUP_RNG).out(dst);
 
-    int runs = (sz.width <= 640) ? 8 : 1;
+    int runs = (sz.width <= 640) ? 40 : 1;
     TEST_CYCLE_MULTIRUN(runs) threshold(src, dst, thresh, maxval, threshType);
 
     SANITY_CHECK(dst);
@@ -51,7 +51,8 @@ PERF_TEST_P(Size_Only, threshold_otsu, testing::Values(TYPICAL_MAT_SIZES))
 
     declare.in(src, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() threshold(src, dst, 0, maxval, THRESH_BINARY|THRESH_OTSU);
+    int runs = 15;
+    TEST_CYCLE_MULTIRUN(runs) threshold(src, dst, 0, maxval, THRESH_BINARY|THRESH_OTSU);
 
     SANITY_CHECK(dst);
 }
diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp
new file mode 100644
index 0000000000..4ce479713e
--- /dev/null
+++ b/modules/imgproc/src/clahe.cpp
@@ -0,0 +1,334 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+// ----------------------------------------------------------------------
+// CLAHE
+
+namespace
+{
+    class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
+    {
+    public:
+        CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY, int clipLimit, float lutScale) :
+            src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY), clipLimit_(clipLimit), lutScale_(lutScale)
+        {
+        }
+
+        void operator ()(const cv::Range& range) const;
+
+    private:
+        cv::Mat src_;
+        mutable cv::Mat lut_;
+
+        cv::Size tileSize_;
+        int tilesX_;
+        int tilesY_;
+        int clipLimit_;
+        float lutScale_;
+    };
+
+    void CLAHE_CalcLut_Body::operator ()(const cv::Range& range) const
+    {
+        const int histSize = 256;
+
+        uchar* tileLut = lut_.ptr(range.start);
+        const size_t lut_step = lut_.step;
+
+        for (int k = range.start; k < range.end; ++k, tileLut += lut_step)
+        {
+            const int ty = k / tilesX_;
+            const int tx = k % tilesX_;
+
+            // retrieve tile submatrix
+
+            cv::Rect tileROI;
+            tileROI.x = tx * tileSize_.width;
+            tileROI.y = ty * tileSize_.height;
+            tileROI.width = tileSize_.width;
+            tileROI.height = tileSize_.height;
+
+            const cv::Mat tile = src_(tileROI);
+
+            // calc histogram
+
+            int tileHist[histSize] = {0, };
+
+            int height = tileROI.height;
+            const size_t sstep = tile.step;
+            for (const uchar* ptr = tile.ptr<uchar>(0); height--; ptr += sstep)
+            {
+                int x = 0;
+                for (; x <= tileROI.width - 4; x += 4)
+                {
+                    int t0 = ptr[x], t1 = ptr[x+1];
+                    tileHist[t0]++; tileHist[t1]++;
+                    t0 = ptr[x+2]; t1 = ptr[x+3];
+                    tileHist[t0]++; tileHist[t1]++;
+                }
+
+                for (; x < tileROI.width; ++x)
+                    tileHist[ptr[x]]++;
+            }
+
+            // clip histogram
+
+            if (clipLimit_ > 0)
+            {
+                // how many pixels were clipped
+                int clipped = 0;
+                for (int i = 0; i < histSize; ++i)
+                {
+                    if (tileHist[i] > clipLimit_)
+                    {
+                        clipped += tileHist[i] - clipLimit_;
+                        tileHist[i] = clipLimit_;
+                    }
+                }
+
+                // redistribute clipped pixels
+                int redistBatch = clipped / histSize;
+                int residual = clipped - redistBatch * histSize;
+
+                for (int i = 0; i < histSize; ++i)
+                    tileHist[i] += redistBatch;
+
+                for (int i = 0; i < residual; ++i)
+                    tileHist[i]++;
+            }
+
+            // calc Lut
+
+            int sum = 0;
+            for (int i = 0; i < histSize; ++i)
+            {
+                sum += tileHist[i];
+                tileLut[i] = cv::saturate_cast<uchar>(sum * lutScale_);
+            }
+        }
+    }
+
+    class CLAHE_Interpolation_Body : public cv::ParallelLoopBody
+    {
+    public:
+        CLAHE_Interpolation_Body(const cv::Mat& src, cv::Mat& dst, const cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY) :
+            src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY)
+        {
+        }
+
+        void operator ()(const cv::Range& range) const;
+
+    private:
+        cv::Mat src_;
+        mutable cv::Mat dst_;
+        cv::Mat lut_;
+
+        cv::Size tileSize_;
+        int tilesX_;
+        int tilesY_;
+    };
+
+    void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const
+    {
+        const size_t lut_step = lut_.step;
+
+        for (int y = range.start; y < range.end; ++y)
+        {
+            const uchar* srcRow = src_.ptr<uchar>(y);
+            uchar* dstRow = dst_.ptr<uchar>(y);
+
+            const float tyf = (static_cast<float>(y) / tileSize_.height) - 0.5f;
+
+            int ty1 = cvFloor(tyf);
+            int ty2 = ty1 + 1;
+
+            const float ya = tyf - ty1;
+
+            ty1 = std::max(ty1, 0);
+            ty2 = std::min(ty2, tilesY_ - 1);
+
+            const uchar* lutPlane1 = lut_.ptr(ty1 * tilesX_);
+            const uchar* lutPlane2 = lut_.ptr(ty2 * tilesX_);
+
+            for (int x = 0; x < src_.cols; ++x)
+            {
+                const float txf = (static_cast<float>(x) / tileSize_.width) - 0.5f;
+
+                int tx1 = cvFloor(txf);
+                int tx2 = tx1 + 1;
+
+                const float xa = txf - tx1;
+
+                tx1 = std::max(tx1, 0);
+                tx2 = std::min(tx2, tilesX_ - 1);
+
+                const int srcVal = srcRow[x];
+
+                const size_t ind1 = tx1 * lut_step + srcVal;
+                const size_t ind2 = tx2 * lut_step + srcVal;
+
+                float res = 0;
+
+                res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya));
+                res += lutPlane1[ind2] * ((xa) * (1.0f - ya));
+                res += lutPlane2[ind1] * ((1.0f - xa) * (ya));
+                res += lutPlane2[ind2] * ((xa) * (ya));
+
+                dstRow[x] = cv::saturate_cast<uchar>(res);
+            }
+        }
+    }
+
+    class CLAHE_Impl : public cv::CLAHE
+    {
+    public:
+        CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
+
+        cv::AlgorithmInfo* info() const;
+
+        void apply(cv::InputArray src, cv::OutputArray dst);
+
+        void setClipLimit(double clipLimit);
+        double getClipLimit() const;
+
+        void setTilesGridSize(cv::Size tileGridSize);
+        cv::Size getTilesGridSize() const;
+
+        void collectGarbage();
+
+    private:
+        double clipLimit_;
+        int tilesX_;
+        int tilesY_;
+
+        cv::Mat srcExt_;
+        cv::Mat lut_;
+    };
+
+    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
+        clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
+    {
+    }
+
+    CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE",
+        obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
+        obj.info()->addParam(obj, "tilesX", obj.tilesX_);
+        obj.info()->addParam(obj, "tilesY", obj.tilesY_))
+
+    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
+    {
+        cv::Mat src = _src.getMat();
+
+        CV_Assert( src.type() == CV_8UC1 );
+
+        _dst.create( src.size(), src.type() );
+        cv::Mat dst = _dst.getMat();
+
+        const int histSize = 256;
+
+        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
+
+        cv::Size tileSize;
+        cv::Mat srcForLut;
+
+        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
+        {
+            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
+            srcForLut = src;
+        }
+        else
+        {
+            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
+
+            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
+            srcForLut = srcExt_;
+        }
+
+        const int tileSizeTotal = tileSize.area();
+        const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
+
+        int clipLimit = 0;
+        if (clipLimit_ > 0.0)
+        {
+            clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
+            clipLimit = std::max(clipLimit, 1);
+        }
+
+        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
+        cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
+
+        CLAHE_Interpolation_Body interpolationBody(src, dst, lut_, tileSize, tilesX_, tilesY_);
+        cv::parallel_for_(cv::Range(0, src.rows), interpolationBody);
+    }
+
+    void CLAHE_Impl::setClipLimit(double clipLimit)
+    {
+        clipLimit_ = clipLimit;
+    }
+
+    double CLAHE_Impl::getClipLimit() const
+    {
+        return clipLimit_;
+    }
+
+    void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
+    {
+        tilesX_ = tileGridSize.width;
+        tilesY_ = tileGridSize.height;
+    }
+
+    cv::Size CLAHE_Impl::getTilesGridSize() const
+    {
+        return cv::Size(tilesX_, tilesY_);
+    }
+
+    void CLAHE_Impl::collectGarbage()
+    {
+        srcExt_.release();
+        lut_.release();
+    }
+}
+
+cv::Ptr<cv::CLAHE> cv::createCLAHE(double clipLimit, cv::Size tileGridSize)
+{
+    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
+}
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 3799d435e3..41ca2db9c0 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -2755,7 +2755,7 @@ const int ITUR_BT_601_CGV = -385875;
 const int ITUR_BT_601_CBV = -74448;
 
 template<int bIdx, int uIdx>
-struct YUV420sp2RGB888Invoker
+struct YUV420sp2RGB888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* my1, *muv;
@@ -2764,10 +2764,10 @@ struct YUV420sp2RGB888Invoker
     YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin() * 2;
-        int rangeEnd = range.end() * 2;
+        int rangeBegin = range.start * 2;
+        int rangeEnd = range.end * 2;
 
         //R = 1.164(Y - 16) + 1.596(V - 128)
         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -2824,7 +2824,7 @@ struct YUV420sp2RGB888Invoker
 };
 
 template<int bIdx, int uIdx>
-struct YUV420sp2RGBA8888Invoker
+struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* my1, *muv;
@@ -2833,10 +2833,10 @@ struct YUV420sp2RGBA8888Invoker
     YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin() * 2;
-        int rangeEnd = range.end() * 2;
+        int rangeBegin = range.start * 2;
+        int rangeEnd = range.end * 2;
 
         //R = 1.164(Y - 16) + 1.596(V - 128)
         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -2897,7 +2897,7 @@ struct YUV420sp2RGBA8888Invoker
 };
 
 template<int bIdx>
-struct YUV420p2RGB888Invoker
+struct YUV420p2RGB888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* my1, *mu, *mv;
@@ -2907,19 +2907,19 @@ struct YUV420p2RGB888Invoker
     YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        const int rangeBegin = range.begin() * 2;
-        const int rangeEnd = range.end() * 2;
+        const int rangeBegin = range.start * 2;
+        const int rangeEnd = range.end * 2;
 
         size_t uvsteps[2] = {width/2, stride - width/2};
         int usIdx = ustepIdx, vsIdx = vstepIdx;
 
         const uchar* y1 = my1 + rangeBegin * stride;
-        const uchar* u1 = mu + (range.begin() / 2) * stride;
-        const uchar* v1 = mv + (range.begin() / 2) * stride;
+        const uchar* u1 = mu + (range.start / 2) * stride;
+        const uchar* v1 = mv + (range.start / 2) * stride;
 
-        if(range.begin() % 2 == 1)
+        if(range.start % 2 == 1)
         {
             u1 += uvsteps[(usIdx++) & 1];
             v1 += uvsteps[(vsIdx++) & 1];
@@ -2965,7 +2965,7 @@ struct YUV420p2RGB888Invoker
 };
 
 template<int bIdx>
-struct YUV420p2RGBA8888Invoker
+struct YUV420p2RGBA8888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* my1, *mu, *mv;
@@ -2975,19 +2975,19 @@ struct YUV420p2RGBA8888Invoker
     YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin() * 2;
-        int rangeEnd = range.end() * 2;
+        int rangeBegin = range.start * 2;
+        int rangeEnd = range.end * 2;
 
         size_t uvsteps[2] = {width/2, stride - width/2};
         int usIdx = ustepIdx, vsIdx = vstepIdx;
 
         const uchar* y1 = my1 + rangeBegin * stride;
-        const uchar* u1 = mu + (range.begin() / 2) * stride;
-        const uchar* v1 = mv + (range.begin() / 2) * stride;
+        const uchar* u1 = mu + (range.start / 2) * stride;
+        const uchar* v1 = mv + (range.start / 2) * stride;
 
-        if(range.begin() % 2 == 1)
+        if(range.start % 2 == 1)
         {
             u1 += uvsteps[(usIdx++) & 1];
             v1 += uvsteps[(vsIdx++) & 1];
@@ -3042,48 +3042,40 @@ template<int bIdx, int uIdx>
 inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
 {
     YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }
 
 template<int bIdx, int uIdx>
 inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
 {
     YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }
 
 template<int bIdx>
 inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
 {
     YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }
 
 template<int bIdx>
 inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
 {
     YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }
 
 ///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
@@ -3167,7 +3159,7 @@ static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
 ///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
 
 template<int bIdx, int uIdx, int yIdx>
-struct YUV422toRGB888Invoker
+struct YUV422toRGB888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* src;
@@ -3176,10 +3168,10 @@ struct YUV422toRGB888Invoker
     YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin();
-        int rangeEnd = range.end();
+        int rangeBegin = range.start;
+        int rangeEnd = range.end;
 
         const int uidx = 1 - yIdx + uIdx * 2;
         const int vidx = (2 + uidx) % 4;
@@ -3213,7 +3205,7 @@ struct YUV422toRGB888Invoker
 };
 
 template<int bIdx, int uIdx, int yIdx>
-struct YUV422toRGBA8888Invoker
+struct YUV422toRGBA8888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* src;
@@ -3222,10 +3214,10 @@ struct YUV422toRGBA8888Invoker
     YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin();
-        int rangeEnd = range.end();
+        int rangeBegin = range.start;
+        int rangeEnd = range.end;
 
         const int uidx = 1 - yIdx + uIdx * 2;
         const int vidx = (2 + uidx) % 4;
@@ -3266,24 +3258,20 @@ template<int bIdx, int uIdx, int yIdx>
 inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
 {
     YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows), converter);
+        parallel_for_(Range(0, _dst.rows), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows));
+        converter(Range(0, _dst.rows));
 }
 
 template<int bIdx, int uIdx, int yIdx>
 inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
 {
     YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows), converter);
+        parallel_for_(Range(0, _dst.rows), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows));
+        converter(Range(0, _dst.rows));
 }
 
 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
diff --git a/modules/imgproc/src/distransform.cpp b/modules/imgproc/src/distransform.cpp
index 89d3a550f4..d3e6f90242 100644
--- a/modules/imgproc/src/distransform.cpp
+++ b/modules/imgproc/src/distransform.cpp
@@ -443,7 +443,7 @@ icvGetDistanceTransformMask( int maskType, float *metrics )
 namespace cv
 {
 
-struct DTColumnInvoker
+struct DTColumnInvoker : ParallelLoopBody
 {
     DTColumnInvoker( const CvMat* _src, CvMat* _dst, const int* _sat_tab, const float* _sqr_tab)
     {
@@ -453,9 +453,9 @@ struct DTColumnInvoker
         sqr_tab = _sqr_tab;
     }
 
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
-        int i, i1 = range.begin(), i2 = range.end();
+        int i, i1 = range.start, i2 = range.end;
         int m = src->rows;
         size_t sstep = src->step, dstep = dst->step/sizeof(float);
         AutoBuffer<int> _d(m);
@@ -490,7 +490,7 @@ struct DTColumnInvoker
 };
 
 
-struct DTRowInvoker
+struct DTRowInvoker : ParallelLoopBody
 {
     DTRowInvoker( CvMat* _dst, const float* _sqr_tab, const float* _inv_tab )
     {
@@ -499,10 +499,10 @@ struct DTRowInvoker
         inv_tab = _inv_tab;
     }
 
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
         const float inf = 1e15f;
-        int i, i1 = range.begin(), i2 = range.end();
+        int i, i1 = range.start, i2 = range.end;
         int n = dst->cols;
         AutoBuffer<uchar> _buf((n+2)*2*sizeof(float) + (n+2)*sizeof(int));
         float* f = (float*)(uchar*)_buf;
@@ -586,7 +586,7 @@ icvTrueDistTrans( const CvMat* src, CvMat* dst )
     for( ; i <= m*3; i++ )
         sat_tab[i] = i - shift;
 
-    cv::parallel_for(cv::BlockedRange(0, n), cv::DTColumnInvoker(src, dst, sat_tab, sqr_tab));
+    cv::parallel_for_(cv::Range(0, n), cv::DTColumnInvoker(src, dst, sat_tab, sqr_tab));
 
     // stage 2: compute modified distance transform for each row
     float* inv_tab = sqr_tab + n;
@@ -598,7 +598,7 @@ icvTrueDistTrans( const CvMat* src, CvMat* dst )
         sqr_tab[i] = (float)(i*i);
     }
 
-    cv::parallel_for(cv::BlockedRange(0, m), cv::DTRowInvoker(dst, sqr_tab, inv_tab));
+    cv::parallel_for_(cv::Range(0, m), cv::DTRowInvoker(dst, sqr_tab, inv_tab));
 }
 
 
diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp
index 22dd9beb1f..bfcdee515f 100644
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -2986,29 +2986,23 @@ cvCalcProbDensity( const CvHistogram* hist, const CvHistogram* hist_mask,
     }
 }
 
-class EqualizeHistCalcHist_Invoker
+class EqualizeHistCalcHist_Invoker : public cv::ParallelLoopBody
 {
 public:
     enum {HIST_SZ = 256};
 
-#ifdef HAVE_TBB
-    typedef tbb::mutex* MutextPtr;
-#else
-    typedef void* MutextPtr;
-#endif
-
-    EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, MutextPtr histogramLock)
+    EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, cv::Mutex* histogramLock)
         : src_(src), globalHistogram_(histogram), histogramLock_(histogramLock)
     { }
 
-    void operator()( const cv::BlockedRange& rowRange ) const
+    void operator()( const cv::Range& rowRange ) const
     {
         int localHistogram[HIST_SZ] = {0, };
 
         const size_t sstep = src_.step;
 
         int width = src_.cols;
-        int height = rowRange.end() - rowRange.begin();
+        int height = rowRange.end - rowRange.start;
 
         if (src_.isContinuous())
         {
@@ -3016,7 +3010,7 @@ public:
             height = 1;
         }
 
-        for (const uchar* ptr = src_.ptr<uchar>(rowRange.begin()); height--; ptr += sstep)
+        for (const uchar* ptr = src_.ptr<uchar>(rowRange.start); height--; ptr += sstep)
         {
             int x = 0;
             for (; x <= width - 4; x += 4)
@@ -3031,9 +3025,7 @@ public:
                 localHistogram[ptr[x]]++;
         }
 
-#ifdef HAVE_TBB
-        tbb::mutex::scoped_lock lock(*histogramLock_);
-#endif
+        cv::AutoLock lock(*histogramLock_);
 
         for( int i = 0; i < HIST_SZ; i++ )
             globalHistogram_[i] += localHistogram[i];
@@ -3041,12 +3033,7 @@ public:
 
     static bool isWorthParallel( const cv::Mat& src )
     {
-#ifdef HAVE_TBB
         return ( src.total() >= 640*480 );
-#else
-        (void)src;
-        return false;
-#endif
     }
 
 private:
@@ -3054,10 +3041,10 @@ private:
 
     cv::Mat& src_;
     int* globalHistogram_;
-    MutextPtr histogramLock_;
+    cv::Mutex* histogramLock_;
 };
 
-class EqualizeHistLut_Invoker
+class EqualizeHistLut_Invoker : public cv::ParallelLoopBody
 {
 public:
     EqualizeHistLut_Invoker( cv::Mat& src, cv::Mat& dst, int* lut )
@@ -3066,13 +3053,13 @@ public:
           lut_(lut)
     { }
 
-    void operator()( const cv::BlockedRange& rowRange ) const
+    void operator()( const cv::Range& rowRange ) const
     {
         const size_t sstep = src_.step;
         const size_t dstep = dst_.step;
 
         int width = src_.cols;
-        int height = rowRange.end() - rowRange.begin();
+        int height = rowRange.end - rowRange.start;
         int* lut = lut_;
 
         if (src_.isContinuous() && dst_.isContinuous())
@@ -3081,8 +3068,8 @@ public:
             height = 1;
         }
 
-        const uchar* sptr = src_.ptr<uchar>(rowRange.begin());
-        uchar* dptr = dst_.ptr<uchar>(rowRange.begin());
+        const uchar* sptr = src_.ptr<uchar>(rowRange.start);
+        uchar* dptr = dst_.ptr<uchar>(rowRange.start);
 
         for (; height--; sptr += sstep, dptr += dstep)
         {
@@ -3111,12 +3098,7 @@ public:
 
     static bool isWorthParallel( const cv::Mat& src )
     {
-#ifdef HAVE_TBB
         return ( src.total() >= 640*480 );
-#else
-        (void)src;
-        return false;
-#endif
     }
 
 private:
@@ -3143,23 +3125,18 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
     if(src.empty())
         return;
 
-#ifdef HAVE_TBB
-    tbb::mutex histogramLockInstance;
-    EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = &histogramLockInstance;
-#else
-    EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = 0;
-#endif
+    Mutex histogramLockInstance;
 
     const int hist_sz = EqualizeHistCalcHist_Invoker::HIST_SZ;
     int hist[hist_sz] = {0,};
     int lut[hist_sz];
 
-    EqualizeHistCalcHist_Invoker calcBody(src, hist, histogramLock);
+    EqualizeHistCalcHist_Invoker calcBody(src, hist, &histogramLockInstance);
     EqualizeHistLut_Invoker      lutBody(src, dst, lut);
-    cv::BlockedRange heightRange(0, src.rows);
+    cv::Range heightRange(0, src.rows);
 
     if(EqualizeHistCalcHist_Invoker::isWorthParallel(src))
-        parallel_for(heightRange, calcBody);
+        parallel_for_(heightRange, calcBody);
     else
         calcBody(heightRange);
 
@@ -3183,303 +3160,11 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
     }
 
     if(EqualizeHistLut_Invoker::isWorthParallel(src))
-        parallel_for(heightRange, lutBody);
+        parallel_for_(heightRange, lutBody);
     else
         lutBody(heightRange);
 }
 
-// ----------------------------------------------------------------------
-// CLAHE
-
-namespace
-{
-    class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
-    {
-    public:
-        CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY, int clipLimit, float lutScale) :
-            src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY), clipLimit_(clipLimit), lutScale_(lutScale)
-        {
-        }
-
-        void operator ()(const cv::Range& range) const;
-
-    private:
-        cv::Mat src_;
-        mutable cv::Mat lut_;
-
-        cv::Size tileSize_;
-        int tilesX_;
-        int tilesY_;
-        int clipLimit_;
-        float lutScale_;
-    };
-
-    void CLAHE_CalcLut_Body::operator ()(const cv::Range& range) const
-    {
-        const int histSize = 256;
-
-        uchar* tileLut = lut_.ptr(range.start);
-        const size_t lut_step = lut_.step;
-
-        for (int k = range.start; k < range.end; ++k, tileLut += lut_step)
-        {
-            const int ty = k / tilesX_;
-            const int tx = k % tilesX_;
-
-            // retrieve tile submatrix
-
-            cv::Rect tileROI;
-            tileROI.x = tx * tileSize_.width;
-            tileROI.y = ty * tileSize_.height;
-            tileROI.width = tileSize_.width;
-            tileROI.height = tileSize_.height;
-
-            const cv::Mat tile = src_(tileROI);
-
-            // calc histogram
-
-            int tileHist[histSize] = {0, };
-
-            int height = tileROI.height;
-            const size_t sstep = tile.step;
-            for (const uchar* ptr = tile.ptr<uchar>(0); height--; ptr += sstep)
-            {
-                int x = 0;
-                for (; x <= tileROI.width - 4; x += 4)
-                {
-                    int t0 = ptr[x], t1 = ptr[x+1];
-                    tileHist[t0]++; tileHist[t1]++;
-                    t0 = ptr[x+2]; t1 = ptr[x+3];
-                    tileHist[t0]++; tileHist[t1]++;
-                }
-
-                for (; x < tileROI.width; ++x)
-                    tileHist[ptr[x]]++;
-            }
-
-            // clip histogram
-
-            if (clipLimit_ > 0)
-            {
-                // how many pixels were clipped
-                int clipped = 0;
-                for (int i = 0; i < histSize; ++i)
-                {
-                    if (tileHist[i] > clipLimit_)
-                    {
-                        clipped += tileHist[i] - clipLimit_;
-                        tileHist[i] = clipLimit_;
-                    }
-                }
-
-                // redistribute clipped pixels
-                int redistBatch = clipped / histSize;
-                int residual = clipped - redistBatch * histSize;
-
-                for (int i = 0; i < histSize; ++i)
-                    tileHist[i] += redistBatch;
-
-                for (int i = 0; i < residual; ++i)
-                    tileHist[i]++;
-            }
-
-            // calc Lut
-
-            int sum = 0;
-            for (int i = 0; i < histSize; ++i)
-            {
-                sum += tileHist[i];
-                tileLut[i] = cv::saturate_cast<uchar>(sum * lutScale_);
-            }
-        }
-    }
-
-    class CLAHE_Interpolation_Body : public cv::ParallelLoopBody
-    {
-    public:
-        CLAHE_Interpolation_Body(const cv::Mat& src, cv::Mat& dst, const cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY) :
-            src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY)
-        {
-        }
-
-        void operator ()(const cv::Range& range) const;
-
-    private:
-        cv::Mat src_;
-        mutable cv::Mat dst_;
-        cv::Mat lut_;
-
-        cv::Size tileSize_;
-        int tilesX_;
-        int tilesY_;
-    };
-
-    void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const
-    {
-        const size_t lut_step = lut_.step;
-
-        for (int y = range.start; y < range.end; ++y)
-        {
-            const uchar* srcRow = src_.ptr<uchar>(y);
-            uchar* dstRow = dst_.ptr<uchar>(y);
-
-            const float tyf = (static_cast<float>(y) / tileSize_.height) - 0.5f;
-
-            int ty1 = cvFloor(tyf);
-            int ty2 = ty1 + 1;
-
-            const float ya = tyf - ty1;
-
-            ty1 = std::max(ty1, 0);
-            ty2 = std::min(ty2, tilesY_ - 1);
-
-            const uchar* lutPlane1 = lut_.ptr(ty1 * tilesX_);
-            const uchar* lutPlane2 = lut_.ptr(ty2 * tilesX_);
-
-            for (int x = 0; x < src_.cols; ++x)
-            {
-                const float txf = (static_cast<float>(x) / tileSize_.width) - 0.5f;
-
-                int tx1 = cvFloor(txf);
-                int tx2 = tx1 + 1;
-
-                const float xa = txf - tx1;
-
-                tx1 = std::max(tx1, 0);
-                tx2 = std::min(tx2, tilesX_ - 1);
-
-                const int srcVal = srcRow[x];
-
-                const size_t ind1 = tx1 * lut_step + srcVal;
-                const size_t ind2 = tx2 * lut_step + srcVal;
-
-                float res = 0;
-
-                res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya));
-                res += lutPlane1[ind2] * ((xa) * (1.0f - ya));
-                res += lutPlane2[ind1] * ((1.0f - xa) * (ya));
-                res += lutPlane2[ind2] * ((xa) * (ya));
-
-                dstRow[x] = cv::saturate_cast<uchar>(res);
-            }
-        }
-    }
-
-    class CLAHE_Impl : public cv::CLAHE
-    {
-    public:
-        CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
-
-        cv::AlgorithmInfo* info() const;
-
-        void apply(cv::InputArray src, cv::OutputArray dst);
-
-        void setClipLimit(double clipLimit);
-        double getClipLimit() const;
-
-        void setTilesGridSize(cv::Size tileGridSize);
-        cv::Size getTilesGridSize() const;
-
-        void collectGarbage();
-
-    private:
-        double clipLimit_;
-        int tilesX_;
-        int tilesY_;
-
-        cv::Mat srcExt_;
-        cv::Mat lut_;
-    };
-
-    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
-        clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
-    {
-    }
-
-    CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE",
-        obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
-        obj.info()->addParam(obj, "tilesX", obj.tilesX_);
-        obj.info()->addParam(obj, "tilesY", obj.tilesY_))
-
-    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
-    {
-        cv::Mat src = _src.getMat();
-
-        CV_Assert( src.type() == CV_8UC1 );
-
-        _dst.create( src.size(), src.type() );
-        cv::Mat dst = _dst.getMat();
-
-        const int histSize = 256;
-
-        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
-
-        cv::Size tileSize;
-        cv::Mat srcForLut;
-
-        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
-        {
-            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
-            srcForLut = src;
-        }
-        else
-        {
-            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
-
-            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
-            srcForLut = srcExt_;
-        }
-
-        const int tileSizeTotal = tileSize.area();
-        const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
-
-        int clipLimit = 0;
-        if (clipLimit_ > 0.0)
-        {
-            clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
-            clipLimit = std::max(clipLimit, 1);
-        }
-
-        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
-        cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
-
-        CLAHE_Interpolation_Body interpolationBody(src, dst, lut_, tileSize, tilesX_, tilesY_);
-        cv::parallel_for_(cv::Range(0, src.rows), interpolationBody);
-    }
-
-    void CLAHE_Impl::setClipLimit(double clipLimit)
-    {
-        clipLimit_ = clipLimit;
-    }
-
-    double CLAHE_Impl::getClipLimit() const
-    {
-        return clipLimit_;
-    }
-
-    void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
-    {
-        tilesX_ = tileGridSize.width;
-        tilesY_ = tileGridSize.height;
-    }
-
-    cv::Size CLAHE_Impl::getTilesGridSize() const
-    {
-        return cv::Size(tilesX_, tilesY_);
-    }
-
-    void CLAHE_Impl::collectGarbage()
-    {
-        srcExt_.release();
-        lut_.release();
-    }
-}
-
-cv::Ptr<cv::CLAHE> cv::createCLAHE(double clipLimit, cv::Size tileGridSize)
-{
-    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
-}
-
 // ----------------------------------------------------------------------
 
 /* Implementation of RTTI and Generic Functions for CvHistogram */
diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index a63e08ff01..53d2347ec4 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -1081,7 +1081,7 @@ cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor)
 namespace cv
 {
 
-class MorphologyRunner
+class MorphologyRunner : public ParallelLoopBody
 {
 public:
     MorphologyRunner(Mat _src, Mat _dst, int _nStripes, int _iterations,
@@ -1102,14 +1102,14 @@ public:
         columnBorderType = _columnBorderType;
     }
 
-    void operator () ( const BlockedRange& range ) const
+    void operator () ( const Range& range ) const
     {
-        int row0 = min(cvRound(range.begin() * src.rows / nStripes), src.rows);
-        int row1 = min(cvRound(range.end() * src.rows / nStripes), src.rows);
+        int row0 = min(cvRound(range.start * src.rows / nStripes), src.rows);
+        int row1 = min(cvRound(range.end * src.rows / nStripes), src.rows);
 
         /*if(0)
             printf("Size = (%d, %d), range[%d,%d), row0 = %d, row1 = %d\n",
-                   src.rows, src.cols, range.begin(), range.end(), row0, row1);*/
+                   src.rows, src.cols, range.start, range.end, row0, row1);*/
 
         Mat srcStripe = src.rowRange(row0, row1);
         Mat dstStripe = dst.rowRange(row0, row1);
@@ -1173,15 +1173,15 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
     }
 
     int nStripes = 1;
-#if defined HAVE_TBB && defined HAVE_TEGRA_OPTIMIZATION
+#if defined HAVE_TEGRA_OPTIMIZATION
     if (src.data != dst.data && iterations == 1 &&  //NOTE: threads are not used for inplace processing
         (borderType & BORDER_ISOLATED) == 0 && //TODO: check border types
         src.rows >= 64 ) //NOTE: just heuristics
         nStripes = 4;
 #endif
 
-    parallel_for(BlockedRange(0, nStripes),
-                 MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
+    parallel_for_(Range(0, nStripes),
+                  MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
 
     //Ptr<FilterEngine> f = createMorphologyFilter(op, src.type(),
     //                                             kernel, anchor, borderType, borderType, borderValue );
diff --git a/modules/java/android_test/src/org/opencv/test/calib3d/Calib3dTest.java b/modules/java/android_test/src/org/opencv/test/calib3d/Calib3dTest.java
index 8bcaf58a05..db806b6fc9 100644
--- a/modules/java/android_test/src/org/opencv/test/calib3d/Calib3dTest.java
+++ b/modules/java/android_test/src/org/opencv/test/calib3d/Calib3dTest.java
@@ -585,4 +585,18 @@ public class Calib3dTest extends OpenCVTestCase {
     public void testValidateDisparityMatMatIntIntInt() {
         fail("Not yet implemented");
     }
+
+    public void testComputeCorrespondEpilines()
+    {
+        Mat fundamental = new Mat(3, 3, CvType.CV_64F);
+        fundamental.put(0, 0, 0, -0.577, 0.288, 0.577, 0, 0.288, -0.288, -0.288, 0);
+        MatOfPoint2f left = new MatOfPoint2f();
+        left.alloc(1);
+        left.put(0, 0, 2, 3); //add(new Point(x, y));
+        Mat lines = new Mat();
+        Mat truth = new Mat(1, 1, CvType.CV_32FC3);
+        truth.put(0, 0, -0.70735186, 0.70686162, -0.70588124);
+        Calib3d.computeCorrespondEpilines(left, 1, fundamental, lines);
+        assertMatEqual(truth, lines, EPS);
+    }
 }
diff --git a/modules/java/generator/src/java/android+CameraBridgeViewBase.java b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
index 6c5c3294ff..c0c9f5bde7 100644
--- a/modules/java/generator/src/java/android+CameraBridgeViewBase.java
+++ b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
@@ -81,6 +81,14 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
         styledAttrs.recycle();
     }
 
+    /**
+     * Sets the camera index
+     * @param cameraIndex new camera index
+     */
+    public void setCameraIndex(int cameraIndex) {
+        this.mCameraIndex = cameraIndex;
+    }
+
     public interface CvCameraViewListener {
         /**
          * This method is invoked when camera preview has started. After this method is invoked
diff --git a/modules/java/generator/src/java/core+MatOfByte.java b/modules/java/generator/src/java/core+MatOfByte.java
index 0ebdb66733..b3fe5691ee 100644
--- a/modules/java/generator/src/java/core+MatOfByte.java
+++ b/modules/java/generator/src/java/core+MatOfByte.java
@@ -14,7 +14,7 @@ public class MatOfByte extends Mat {
 
     protected MatOfByte(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfByte extends Mat {
 
     public MatOfByte(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfDouble.java b/modules/java/generator/src/java/core+MatOfDouble.java
index cca5251105..4eb7cbc280 100644
--- a/modules/java/generator/src/java/core+MatOfDouble.java
+++ b/modules/java/generator/src/java/core+MatOfDouble.java
@@ -14,7 +14,7 @@ public class MatOfDouble extends Mat {
 
     protected MatOfDouble(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfDouble extends Mat {
 
     public MatOfDouble(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfFloat.java b/modules/java/generator/src/java/core+MatOfFloat.java
index ce73b6f638..96bbeab9fb 100644
--- a/modules/java/generator/src/java/core+MatOfFloat.java
+++ b/modules/java/generator/src/java/core+MatOfFloat.java
@@ -14,7 +14,7 @@ public class MatOfFloat extends Mat {
 
     protected MatOfFloat(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfFloat extends Mat {
 
     public MatOfFloat(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfFloat4.java b/modules/java/generator/src/java/core+MatOfFloat4.java
index 8a3e51014f..aaa97b7990 100644
--- a/modules/java/generator/src/java/core+MatOfFloat4.java
+++ b/modules/java/generator/src/java/core+MatOfFloat4.java
@@ -14,7 +14,7 @@ public class MatOfFloat4 extends Mat {
 
     protected MatOfFloat4(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfFloat4 extends Mat {
 
     public MatOfFloat4(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfFloat6.java b/modules/java/generator/src/java/core+MatOfFloat6.java
index 1e23101a72..68e6249b6d 100644
--- a/modules/java/generator/src/java/core+MatOfFloat6.java
+++ b/modules/java/generator/src/java/core+MatOfFloat6.java
@@ -14,7 +14,7 @@ public class MatOfFloat6 extends Mat {
 
     protected MatOfFloat6(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfFloat6 extends Mat {
 
     public MatOfFloat6(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfInt.java b/modules/java/generator/src/java/core+MatOfInt.java
index 80c5b3a5c2..33e5124e4f 100644
--- a/modules/java/generator/src/java/core+MatOfInt.java
+++ b/modules/java/generator/src/java/core+MatOfInt.java
@@ -15,7 +15,7 @@ public class MatOfInt extends Mat {
 
     protected MatOfInt(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -26,7 +26,7 @@ public class MatOfInt extends Mat {
 
     public MatOfInt(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfInt4.java b/modules/java/generator/src/java/core+MatOfInt4.java
index 60277103cc..c924233a6c 100644
--- a/modules/java/generator/src/java/core+MatOfInt4.java
+++ b/modules/java/generator/src/java/core+MatOfInt4.java
@@ -15,7 +15,7 @@ public class MatOfInt4 extends Mat {
 
     protected MatOfInt4(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -26,7 +26,7 @@ public class MatOfInt4 extends Mat {
 
     public MatOfInt4(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfKeyPoint.java b/modules/java/generator/src/java/core+MatOfKeyPoint.java
index b91fedcee8..b402fe1245 100644
--- a/modules/java/generator/src/java/core+MatOfKeyPoint.java
+++ b/modules/java/generator/src/java/core+MatOfKeyPoint.java
@@ -16,7 +16,7 @@ public class MatOfKeyPoint extends Mat {
 
     protected MatOfKeyPoint(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -27,7 +27,7 @@ public class MatOfKeyPoint extends Mat {
 
     public MatOfKeyPoint(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfPoint.java b/modules/java/generator/src/java/core+MatOfPoint.java
index 23eeed0ebb..6d23ed1162 100644
--- a/modules/java/generator/src/java/core+MatOfPoint.java
+++ b/modules/java/generator/src/java/core+MatOfPoint.java
@@ -14,7 +14,7 @@ public class MatOfPoint extends Mat {
 
     protected MatOfPoint(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfPoint extends Mat {
 
     public MatOfPoint(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfPoint2f.java b/modules/java/generator/src/java/core+MatOfPoint2f.java
index ba4be4ac5e..0c6960730b 100644
--- a/modules/java/generator/src/java/core+MatOfPoint2f.java
+++ b/modules/java/generator/src/java/core+MatOfPoint2f.java
@@ -14,7 +14,7 @@ public class MatOfPoint2f extends Mat {
 
     protected MatOfPoint2f(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfPoint2f extends Mat {
 
     public MatOfPoint2f(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfPoint3.java b/modules/java/generator/src/java/core+MatOfPoint3.java
index 16e21301ef..0c8374f250 100644
--- a/modules/java/generator/src/java/core+MatOfPoint3.java
+++ b/modules/java/generator/src/java/core+MatOfPoint3.java
@@ -14,7 +14,7 @@ public class MatOfPoint3 extends Mat {
 
     protected MatOfPoint3(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfPoint3 extends Mat {
 
     public MatOfPoint3(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfPoint3f.java b/modules/java/generator/src/java/core+MatOfPoint3f.java
index 97e2a95702..b0d50d4500 100644
--- a/modules/java/generator/src/java/core+MatOfPoint3f.java
+++ b/modules/java/generator/src/java/core+MatOfPoint3f.java
@@ -14,7 +14,7 @@ public class MatOfPoint3f extends Mat {
 
     protected MatOfPoint3f(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfPoint3f extends Mat {
 
     public MatOfPoint3f(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfRect.java b/modules/java/generator/src/java/core+MatOfRect.java
index 2e58bfe897..3844d9dfbf 100644
--- a/modules/java/generator/src/java/core+MatOfRect.java
+++ b/modules/java/generator/src/java/core+MatOfRect.java
@@ -15,7 +15,7 @@ public class MatOfRect extends Mat {
 
     protected MatOfRect(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -26,7 +26,7 @@ public class MatOfRect extends Mat {
 
     public MatOfRect(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp
index 438872ae8c..7323ab57a7 100644
--- a/modules/ml/src/ann_mlp.cpp
+++ b/modules/ml/src/ann_mlp.cpp
@@ -40,10 +40,6 @@
 
 #include "precomp.hpp"
 
-#ifdef HAVE_TBB
-#include <tbb/tbb.h>
-#endif
-
 CvANN_MLP_TrainParams::CvANN_MLP_TrainParams()
 {
     term_crit = cvTermCriteria( CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 1000, 0.01 );
@@ -255,7 +251,7 @@ void CvANN_MLP::create( const CvMat* _layer_sizes, int _activ_func,
     buf_sz += (l_dst[0] + l_dst[l_count-1]*2)*2;
 
     CV_CALL( wbuf = cvCreateMat( 1, buf_sz, CV_64F ));
-    CV_CALL( weights = (double**)cvAlloc( (l_count+1)*sizeof(weights[0]) ));
+    CV_CALL( weights = (double**)cvAlloc( (l_count+2)*sizeof(weights[0]) ));
 
     weights[0] = wbuf->data.db;
     weights[1] = weights[0] + l_dst[0]*2;
@@ -1022,7 +1018,7 @@ int CvANN_MLP::train_backprop( CvVectors x0, CvVectors u, const double* sw )
     return iter;
 }
 
-struct rprop_loop {
+struct rprop_loop : cv::ParallelLoopBody {
   rprop_loop(const CvANN_MLP* _point, double**& _weights, int& _count, int& _ivcount, CvVectors* _x0,
      int& _l_count, CvMat*& _layer_sizes, int& _ovcount, int& _max_count,
      CvVectors* _u, const double*& _sw, double& _inv_count, CvMat*& _dEdw, int& _dcount0, double* _E, int _buf_sz)
@@ -1063,7 +1059,7 @@ struct rprop_loop {
   int buf_sz;
 
 
-  void operator()( const cv::BlockedRange& range ) const
+  void operator()( const cv::Range& range ) const
   {
     double* buf_ptr;
     double** x = 0;
@@ -1084,7 +1080,7 @@ struct rprop_loop {
         buf_ptr += (df[i] - x[i])*2;
     }
 
-    for(int si = range.begin(); si < range.end(); si++ )
+    for(int si = range.start; si < range.end; si++ )
     {
         if (si % dcount0 != 0) continue;
         int n1, n2, k;
@@ -1170,36 +1166,33 @@ struct rprop_loop {
             }
 
         // backward pass, update dEdw
-        #ifdef HAVE_TBB
-        static tbb::spin_mutex mutex;
-        tbb::spin_mutex::scoped_lock lock;
-        #endif
+        static cv::Mutex mutex;
+
         for(int i = l_count-1; i > 0; i-- )
         {
             n1 = layer_sizes->data.i[i-1]; n2 = layer_sizes->data.i[i];
             cvInitMatHeader( &_df, dcount, n2, CV_64F, df[i] );
             cvMul( grad1, &_df, grad1 );
-            #ifdef HAVE_TBB
-            lock.acquire(mutex);
-            #endif
-            cvInitMatHeader( &_dEdw, n1, n2, CV_64F, dEdw->data.db+(weights[i]-weights[0]) );
-            cvInitMatHeader( x1, dcount, n1, CV_64F, x[i-1] );
-            cvGEMM( x1, grad1, 1, &_dEdw, 1, &_dEdw, CV_GEMM_A_T );
 
-            // update bias part of dEdw
-           for( k = 0; k < dcount; k++ )
-           {
-               double* dst = _dEdw.data.db + n1*n2;
-               const double* src = grad1->data.db + k*n2;
-               for(int j = 0; j < n2; j++ )
-                   dst[j] += src[j];
+            {
+                cv::AutoLock lock(mutex);
+                cvInitMatHeader( &_dEdw, n1, n2, CV_64F, dEdw->data.db+(weights[i]-weights[0]) );
+                cvInitMatHeader( x1, dcount, n1, CV_64F, x[i-1] );
+                cvGEMM( x1, grad1, 1, &_dEdw, 1, &_dEdw, CV_GEMM_A_T );
+
+                // update bias part of dEdw
+                for( k = 0; k < dcount; k++ )
+                {
+                    double* dst = _dEdw.data.db + n1*n2;
+                    const double* src = grad1->data.db + k*n2;
+                    for(int j = 0; j < n2; j++ )
+                        dst[j] += src[j];
+                }
+
+                if (i > 1)
+                    cvInitMatHeader( &_w, n1, n2, CV_64F, weights[i] );
            }
 
-           if (i > 1)
-               cvInitMatHeader( &_w, n1, n2, CV_64F, weights[i] );
-           #ifdef HAVE_TBB
-           lock.release();
-           #endif
            cvInitMatHeader( grad2, dcount, n1, CV_64F, grad2->data.db );
            if( i > 1 )
                cvGEMM( grad1, &_w, 1, 0, 0, grad2, CV_GEMM_B_T );
@@ -1297,7 +1290,7 @@ int CvANN_MLP::train_rprop( CvVectors x0, CvVectors u, const double* sw )
         double E = 0;
 
         // first, iterate through all the samples and compute dEdw
-        cv::parallel_for(cv::BlockedRange(0, count),
+        cv::parallel_for_(cv::Range(0, count),
             rprop_loop(this, weights, count, ivcount, &x0, l_count, layer_sizes,
                        ovcount, max_count, &u, sw, inv_count, dEdw, dcount0, &E, buf_sz)
         );
diff --git a/modules/ml/src/gbt.cpp b/modules/ml/src/gbt.cpp
index 6671a3495b..b52ffbe5a3 100644
--- a/modules/ml/src/gbt.cpp
+++ b/modules/ml/src/gbt.cpp
@@ -900,7 +900,7 @@ float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
 }
 
 
-class Tree_predictor
+class Tree_predictor : public cv::ParallelLoopBody
 {
 private:
     pCvSeq* weak;
@@ -910,9 +910,7 @@ private:
     const CvMat* missing;
     const float shrinkage;
 
-#ifdef HAVE_TBB
-    static tbb::spin_mutex SumMutex;
-#endif
+    static cv::Mutex SumMutex;
 
 
 public:
@@ -931,14 +929,11 @@ public:
     Tree_predictor& operator=( const Tree_predictor& )
     { return *this; }
 
-    virtual void operator()(const cv::BlockedRange& range) const
+    virtual void operator()(const cv::Range& range) const
     {
-#ifdef HAVE_TBB
-        tbb::spin_mutex::scoped_lock lock;
-#endif
         CvSeqReader reader;
-        int begin = range.begin();
-        int end = range.end();
+        int begin = range.start;
+        int end = range.end;
 
         int weak_count = end - begin;
         CvDTree* tree;
@@ -956,13 +951,11 @@ public:
                     tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
                 }
             }
-#ifdef HAVE_TBB
-            lock.acquire(SumMutex);
-            sum[i] += tmp_sum;
-            lock.release();
-#else
-            sum[i] += tmp_sum;
-#endif
+
+            {
+                cv::AutoLock lock(SumMutex);
+                sum[i] += tmp_sum;
+            }
         }
     } // Tree_predictor::operator()
 
@@ -970,11 +963,7 @@ public:
 
 }; // class Tree_predictor
 
-
-#ifdef HAVE_TBB
-tbb::spin_mutex Tree_predictor::SumMutex;
-#endif
-
+cv::Mutex Tree_predictor::SumMutex;
 
 
 float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
@@ -992,12 +981,7 @@ float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
         Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
                                     params.shrinkage, _sample, _missing, sum);
 
-//#ifdef HAVE_TBB
-//      tbb::parallel_for(cv::BlockedRange(begin, end), predictor,
-//                          tbb::auto_partitioner());
-//#else
-        cv::parallel_for(cv::BlockedRange(begin, end), predictor);
-//#endif
+        cv::parallel_for_(cv::Range(begin, end), predictor);
 
         for (int i=0; i<class_count; ++i)
             sum[i] = sum[i] /** params.shrinkage*/ + base_value;
@@ -1228,7 +1212,7 @@ void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
 
 //===========================================================================
 
-class Sample_predictor
+class Sample_predictor : public cv::ParallelLoopBody
 {
 private:
     const CvGBTrees* gbt;
@@ -1258,10 +1242,10 @@ public:
     {}
 
 
-    virtual void operator()(const cv::BlockedRange& range) const
+    virtual void operator()(const cv::Range& range) const
     {
-        int begin = range.begin();
-        int end = range.end();
+        int begin = range.start;
+        int end = range.end;
 
         CvMat x;
         CvMat miss;
@@ -1317,11 +1301,7 @@ CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
     Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
             _data->get_missing(), _sample_idx);
 
-//#ifdef HAVE_TBB
-//    tbb::parallel_for(cv::BlockedRange(0,n), predictor, tbb::auto_partitioner());
-//#else
-    cv::parallel_for(cv::BlockedRange(0,n), predictor);
-//#endif
+    cv::parallel_for_(cv::Range(0,n), predictor);
 
     int* sidx = _sample_idx ? _sample_idx->data.i : 0;
     int r_step = CV_IS_MAT_CONT(response->type) ?
diff --git a/modules/ml/src/knearest.cpp b/modules/ml/src/knearest.cpp
index 3c2f9ebada..6b6f5e6afa 100644
--- a/modules/ml/src/knearest.cpp
+++ b/modules/ml/src/knearest.cpp
@@ -306,7 +306,7 @@ float CvKNearest::write_results( int k, int k1, int start, int end,
     return result;
 }
 
-struct P1 {
+struct P1 : cv::ParallelLoopBody {
   P1(const CvKNearest* _pointer, int _buf_sz, int _k, const CvMat* __samples, const float** __neighbors,
      int _k1, CvMat* __results, CvMat* __neighbor_responses, CvMat* __dist, float* _result)
   {
@@ -333,10 +333,10 @@ struct P1 {
   float* result;
   int buf_sz;
 
-  void operator()( const cv::BlockedRange& range ) const
+  void operator()( const cv::Range& range ) const
   {
     cv::AutoBuffer<float> buf(buf_sz);
-    for(int i = range.begin(); i < range.end(); i += 1 )
+    for(int i = range.start; i < range.end; i += 1 )
     {
         float* neighbor_responses = &buf[0];
         float* dist = neighbor_responses + 1*k;
@@ -410,8 +410,8 @@ float CvKNearest::find_nearest( const CvMat* _samples, int k, CvMat* _results,
     int k1 = get_sample_count();
     k1 = MIN( k1, k );
 
-    cv::parallel_for(cv::BlockedRange(0, count), P1(this, buf_sz, k, _samples, _neighbors, k1,
-                                                    _results, _neighbor_responses, _dist, &result)
+    cv::parallel_for_(cv::Range(0, count), P1(this, buf_sz, k, _samples, _neighbors, k1,
+                                             _results, _neighbor_responses, _dist, &result)
     );
 
     return result;
diff --git a/modules/ml/src/nbayes.cpp b/modules/ml/src/nbayes.cpp
index 15146d6f4e..f1f7a24ec0 100644
--- a/modules/ml/src/nbayes.cpp
+++ b/modules/ml/src/nbayes.cpp
@@ -277,7 +277,7 @@ bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _res
     return result;
 }
 
-struct predict_body {
+struct predict_body : cv::ParallelLoopBody {
   predict_body(CvMat* _c, CvMat** _cov_rotate_mats, CvMat** _inv_eigen_values, CvMat** _avg,
      const CvMat* _samples, const int* _vidx, CvMat* _cls_labels,
      CvMat* _results, float* _value, int _var_count1
@@ -307,7 +307,7 @@ struct predict_body {
   float* value;
   int var_count1;
 
-  void operator()( const cv::BlockedRange& range ) const
+  void operator()( const cv::Range& range ) const
   {
 
     int cls = -1;
@@ -324,7 +324,7 @@ struct predict_body {
     cv::AutoBuffer<double> buffer(nclasses + var_count1);
     CvMat diff = cvMat( 1, var_count1, CV_64FC1, &buffer[0] );
 
-    for(int k = range.begin(); k < range.end(); k += 1 )
+    for(int k = range.start; k < range.end; k += 1 )
     {
         int ival;
         double opt = FLT_MAX;
@@ -397,9 +397,9 @@ float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) c
 
     const int* vidx = var_idx ? var_idx->data.i : 0;
 
-    cv::parallel_for(cv::BlockedRange(0, samples->rows), predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
-                                                                      vidx, cls_labels, results, &value, var_count
-    ));
+    cv::parallel_for_(cv::Range(0, samples->rows),
+                      predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
+                                   vidx, cls_labels, results, &value, var_count));
 
     return value;
 }
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 9752848b9a..2e1b2e3565 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -2143,7 +2143,7 @@ float CvSVM::predict( const CvMat* sample, bool returnDFVal ) const
     return result;
 }
 
-struct predict_body_svm {
+struct predict_body_svm : ParallelLoopBody {
     predict_body_svm(const CvSVM* _pointer, float* _result, const CvMat* _samples, CvMat* _results)
     {
         pointer = _pointer;
@@ -2157,9 +2157,9 @@ struct predict_body_svm {
     const CvMat* samples;
     CvMat* results;
 
-    void operator()( const cv::BlockedRange& range ) const
+    void operator()( const cv::Range& range ) const
     {
-        for(int i = range.begin(); i < range.end(); i++ )
+        for(int i = range.start; i < range.end; i++ )
         {
             CvMat sample;
             cvGetRow( samples, &sample, i );
@@ -2175,7 +2175,7 @@ struct predict_body_svm {
 float CvSVM::predict(const CvMat* samples, CV_OUT CvMat* results) const
 {
     float result = 0;
-    cv::parallel_for(cv::BlockedRange(0, samples->rows),
+    cv::parallel_for_(cv::Range(0, samples->rows),
              predict_body_svm(this, &result, samples, results)
     );
     return result;
diff --git a/modules/nonfree/doc/background_subtraction.rst b/modules/nonfree/doc/background_subtraction.rst
deleted file mode 100644
index 11603ca566..0000000000
--- a/modules/nonfree/doc/background_subtraction.rst
+++ /dev/null
@@ -1,79 +0,0 @@
-Background Subtraction
-======================
-
-.. highlight:: cpp
-
-
-
-gpu::VIBE_GPU
--------------
-.. ocv:class:: gpu::VIBE_GPU
-
-Class used for background/foreground segmentation. ::
-
-    class VIBE_GPU
-    {
-    public:
-        explicit VIBE_GPU(unsigned long rngSeed = 1234567);
-
-        void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null());
-
-        void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null());
-
-        void release();
-
-        ...
-    };
-
-The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [VIBE2011]_.
-
-
-
-gpu::VIBE_GPU::VIBE_GPU
------------------------
-The constructor.
-
-.. ocv:function:: gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed = 1234567)
-
-    :param rngSeed: Value used to initiate a random sequence.
-
-Default constructor sets all parameters to default values.
-
-
-
-gpu::VIBE_GPU::initialize
--------------------------
-Initialize background model and allocates all inner buffers.
-
-.. ocv:function:: void gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null())
-
-    :param firstFrame: First frame from video sequence.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::VIBE_GPU::operator()
--------------------------
-Updates the background model and returns the foreground mask
-
-.. ocv:function:: void gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null())
-
-    :param frame: Next video frame.
-
-    :param fgmask: The output foreground mask as an 8-bit binary image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::VIBE_GPU::release
-----------------------
-Releases all inner buffer's memory.
-
-.. ocv:function:: void gpu::VIBE_GPU::release()
-
-
-
-
-.. [VIBE2011] O. Barnich and M. Van D Roogenbroeck. *ViBe: A universal background subtraction algorithm for video sequences*. IEEE Transactions on Image Processing, 20(6) :1709-1724, June 2011
diff --git a/modules/nonfree/doc/nonfree.rst b/modules/nonfree/doc/nonfree.rst
index f8fa1d6eba..e524ea82f8 100644
--- a/modules/nonfree/doc/nonfree.rst
+++ b/modules/nonfree/doc/nonfree.rst
@@ -8,4 +8,3 @@ The module contains algorithms that may be patented in some countries or have so
     :maxdepth: 2
 
     feature_detection
-    background_subtraction
diff --git a/modules/nonfree/include/opencv2/nonfree/gpu.hpp b/modules/nonfree/include/opencv2/nonfree/gpu.hpp
index c8a24e01ec..3cb0b47621 100644
--- a/modules/nonfree/include/opencv2/nonfree/gpu.hpp
+++ b/modules/nonfree/include/opencv2/nonfree/gpu.hpp
@@ -125,41 +125,6 @@ public:
     GpuMat maxPosBuffer;
 };
 
-/*!
- * The class implements the following algorithm:
- * "ViBe: A universal background subtraction algorithm for video sequences"
- * O. Barnich and M. Van D Roogenbroeck
- * IEEE Transactions on Image Processing, 20(6) :1709-1724, June 2011
- */
-class CV_EXPORTS VIBE_GPU
-{
-public:
-    //! the default constructor
-    explicit VIBE_GPU(unsigned long rngSeed = 1234567);
-
-    //! re-initiaization method
-    void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null());
-
-    //! the update operator
-    void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null());
-
-    //! releases all inner buffers
-    void release();
-
-    int nbSamples;         // number of samples per pixel
-    int reqMatches;        // #_min
-    int radius;            // R
-    int subsamplingFactor; // amount of random subsampling
-
-private:
-    Size frameSize_;
-
-    unsigned long rngSeed_;
-    GpuMat randStates_;
-
-    GpuMat samples_;
-};
-
 } // namespace gpu
 
 } // namespace cv
diff --git a/modules/nonfree/perf/perf_gpu.cpp b/modules/nonfree/perf/perf_gpu.cpp
index aa8516b1c4..9f451deaba 100644
--- a/modules/nonfree/perf/perf_gpu.cpp
+++ b/modules/nonfree/perf/perf_gpu.cpp
@@ -50,18 +50,6 @@ using namespace std;
 using namespace testing;
 using namespace perf;
 
-#if defined(HAVE_XINE)         || \
-    defined(HAVE_GSTREAMER)    || \
-    defined(HAVE_QUICKTIME)    || \
-    defined(HAVE_AVFOUNDATION) || \
-    defined(HAVE_FFMPEG)       || \
-    defined(WIN32) /* assume that we have ffmpeg */
-
-#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
-#else
-#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
-#endif
-
 //////////////////////////////////////////////////////////////////////
 // SURF
 
@@ -108,75 +96,4 @@ PERF_TEST_P(Image, GPU_SURF,
     }
 }
 
-//////////////////////////////////////////////////////
-// VIBE
-
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT
-
-DEF_PARAM_TEST(Video_Cn, string, int);
-
-PERF_TEST_P(Video_Cn, GPU_VIBE,
-            Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"),
-                    GPU_CHANNELS_1_3_4))
-{
-    const string inputFile = perf::TestBase::getDataPath(GET_PARAM(0));
-    const int cn = GET_PARAM(1);
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    if (cn != 3)
-    {
-        cv::Mat temp;
-        if (cn == 1)
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-        else
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-        cv::swap(temp, frame);
-    }
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat d_frame(frame);
-        cv::gpu::VIBE_GPU vibe;
-        cv::gpu::GpuMat foreground;
-
-        vibe(d_frame, foreground);
-
-        for (int i = 0; i < 10; ++i)
-        {
-            cap >> frame;
-            ASSERT_FALSE(frame.empty());
-
-            if (cn != 3)
-            {
-                cv::Mat temp;
-                if (cn == 1)
-                    cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-                else
-                    cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-                cv::swap(temp, frame);
-            }
-
-            d_frame.upload(frame);
-
-            startTimer(); next();
-            vibe(d_frame, foreground);
-            stopTimer();
-        }
-
-        GPU_SANITY_CHECK(foreground);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-#endif
-
 #endif
diff --git a/modules/nonfree/perf/perf_main.cpp b/modules/nonfree/perf/perf_main.cpp
index de1242149e..d5f4a1a512 100644
--- a/modules/nonfree/perf/perf_main.cpp
+++ b/modules/nonfree/perf/perf_main.cpp
@@ -1,4 +1,11 @@
 #include "perf_precomp.hpp"
 #include "opencv2/ts/gpu_perf.hpp"
 
-CV_PERF_TEST_MAIN(nonfree, perf::printCudaInfo())
+static const char * impls[] = {
+#ifdef HAVE_CUDA
+    "cuda",
+#endif
+    "plain"
+};
+
+CV_PERF_TEST_MAIN_WITH_IMPLS(nonfree, impls, perf::printCudaInfo())
diff --git a/modules/nonfree/src/cuda/vibe.cu b/modules/nonfree/src/cuda/vibe.cu
deleted file mode 100644
index ba678abae2..0000000000
--- a/modules/nonfree/src/cuda/vibe.cu
+++ /dev/null
@@ -1,271 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/opencv_modules.hpp"
-
-#ifdef HAVE_OPENCV_GPU
-
-#include "opencv2/gpu/device/common.hpp"
-
-namespace cv { namespace gpu { namespace device
-{
-    namespace vibe
-    {
-        void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);
-
-        void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
-
-        void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
-    }
-}}}
-
-namespace cv { namespace gpu { namespace device
-{
-    namespace vibe
-    {
-        __constant__ int c_nbSamples;
-        __constant__ int c_reqMatches;
-        __constant__ int c_radius;
-        __constant__ int c_subsamplingFactor;
-
-        void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_nbSamples, &nbSamples, sizeof(int)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_reqMatches, &reqMatches, sizeof(int)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_radius, &radius, sizeof(int)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_subsamplingFactor, &subsamplingFactor, sizeof(int)) );
-        }
-
-        __device__ __forceinline__ uint nextRand(uint& state)
-        {
-            const unsigned int CV_RNG_COEFF = 4164903690U;
-            state = state * CV_RNG_COEFF + (state >> 16);
-            return state;
-        }
-
-        __constant__ int c_xoff[9] = {-1,  0,  1, -1, 1, -1, 0, 1, 0};
-        __constant__ int c_yoff[9] = {-1, -1, -1,  0, 0,  1, 1, 1, 0};
-
-        __device__ __forceinline__ int2 chooseRandomNeighbor(int x, int y, uint& randState, int count = 8)
-        {
-            int idx = nextRand(randState) % count;
-
-            return make_int2(x + c_xoff[idx], y + c_yoff[idx]);
-        }
-
-        __device__ __forceinline__ uchar cvt(uchar val)
-        {
-            return val;
-        }
-        __device__ __forceinline__ uchar4 cvt(const uchar3& val)
-        {
-            return make_uchar4(val.x, val.y, val.z, 0);
-        }
-        __device__ __forceinline__ uchar4 cvt(const uchar4& val)
-        {
-            return val;
-        }
-
-        template <typename SrcT, typename SampleT>
-        __global__ void init(const PtrStepSz<SrcT> frame, PtrStep<SampleT> samples, PtrStep<uint> randStates)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= frame.cols || y >= frame.rows)
-                return;
-
-            uint localState = randStates(y, x);
-
-            for (int k = 0; k < c_nbSamples; ++k)
-            {
-                int2 np = chooseRandomNeighbor(x, y, localState, 9);
-
-                np.x = ::max(0, ::min(np.x, frame.cols - 1));
-                np.y = ::max(0, ::min(np.y, frame.rows - 1));
-
-                SrcT pix = frame(np.y, np.x);
-
-                samples(k * frame.rows + y, x) = cvt(pix);
-            }
-
-            randStates(y, x) = localState;
-        }
-
-        template <typename SrcT, typename SampleT>
-        void init_caller(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(init<SrcT, SampleT>, cudaFuncCachePreferL1) );
-
-            init<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, (PtrStepSz<SampleT>) samples, randStates);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
-        {
-            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
-            static const func_t funcs[] =
-            {
-                0, init_caller<uchar, uchar>, 0, init_caller<uchar3, uchar4>, init_caller<uchar4, uchar4>
-            };
-
-            funcs[cn](frame, samples, randStates, stream);
-        }
-
-        __device__ __forceinline__ int calcDist(uchar a, uchar b)
-        {
-            return ::abs(a - b);
-        }
-        __device__ __forceinline__ int calcDist(const uchar3& a, const uchar4& b)
-        {
-            return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
-        }
-        __device__ __forceinline__ int calcDist(const uchar4& a, const uchar4& b)
-        {
-            return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
-        }
-
-        template <typename SrcT, typename SampleT>
-        __global__ void update(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStep<SampleT> samples, PtrStep<uint> randStates)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= frame.cols || y >= frame.rows)
-                return;
-
-            uint localState = randStates(y, x);
-
-            SrcT imgPix = frame(y, x);
-
-            // comparison with the model
-
-            int count = 0;
-            for (int k = 0; (count < c_reqMatches) && (k < c_nbSamples); ++k)
-            {
-                SampleT samplePix = samples(k * frame.rows + y, x);
-
-                int distance = calcDist(imgPix, samplePix);
-
-                if (distance < c_radius)
-                    ++count;
-            }
-
-            // pixel classification according to reqMatches
-
-            fgmask(y, x) = (uchar) (-(count < c_reqMatches));
-
-            if (count >= c_reqMatches)
-            {
-                // the pixel belongs to the background
-
-                // gets a random number between 0 and subsamplingFactor-1
-                int randomNumber = nextRand(localState) % c_subsamplingFactor;
-
-                // update of the current pixel model
-                if (randomNumber == 0)
-                {
-                    // random subsampling
-
-                    int k = nextRand(localState) % c_nbSamples;
-
-                    samples(k * frame.rows + y, x) = cvt(imgPix);
-                }
-
-                // update of a neighboring pixel model
-                randomNumber = nextRand(localState) % c_subsamplingFactor;
-
-                if (randomNumber == 0)
-                {
-                    // random subsampling
-
-                    // chooses a neighboring pixel randomly
-                    int2 np = chooseRandomNeighbor(x, y, localState);
-
-                    np.x = ::max(0, ::min(np.x, frame.cols - 1));
-                    np.y = ::max(0, ::min(np.y, frame.rows - 1));
-
-                    // chooses the value to be replaced randomly
-                    int k = nextRand(localState) % c_nbSamples;
-
-                    samples(k * frame.rows + np.y, np.x) = cvt(imgPix);
-                }
-            }
-
-            randStates(y, x) = localState;
-        }
-
-        template <typename SrcT, typename SampleT>
-        void update_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT, SampleT>, cudaFuncCachePreferL1) );
-
-            update<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, (PtrStepSz<SampleT>) samples, randStates);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
-        {
-            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
-            static const func_t funcs[] =
-            {
-                0, update_caller<uchar, uchar>, 0, update_caller<uchar3, uchar4>, update_caller<uchar4, uchar4>
-            };
-
-            funcs[cn](frame, fgmask, samples, randStates, stream);
-        }
-    }
-}}}
-
-#endif /* HAVE_OPENCV_GPU */
diff --git a/modules/nonfree/src/sift.cpp b/modules/nonfree/src/sift.cpp
index 58ebd31016..5a7fd89407 100644
--- a/modules/nonfree/src/sift.cpp
+++ b/modules/nonfree/src/sift.cpp
@@ -774,9 +774,6 @@ void SIFT::operator()(InputArray _image, InputArray _mask,
         findScaleSpaceExtrema(gpyr, dogpyr, keypoints);
         KeyPointsFilter::removeDuplicated( keypoints );
 
-        if( !mask.empty() )
-            KeyPointsFilter::runByPixelsMask( keypoints, mask );
-
         if( nfeatures > 0 )
             KeyPointsFilter::retainBest(keypoints, nfeatures);
         //t = (double)getTickCount() - t;
@@ -791,6 +788,9 @@ void SIFT::operator()(InputArray _image, InputArray _mask,
                 kpt.pt *= scale;
                 kpt.size *= scale;
             }
+
+        if( !mask.empty() )
+            KeyPointsFilter::runByPixelsMask( keypoints, mask );
     }
     else
     {
diff --git a/modules/nonfree/src/surf.cpp b/modules/nonfree/src/surf.cpp
index bb6d53e4b9..2fc459fb61 100644
--- a/modules/nonfree/src/surf.cpp
+++ b/modules/nonfree/src/surf.cpp
@@ -258,7 +258,7 @@ interpolateKeypoint( float N9[3][9], int dx, int dy, int ds, KeyPoint& kpt )
 }
 
 // Multi-threaded construction of the scale-space pyramid
-struct SURFBuildInvoker
+struct SURFBuildInvoker : ParallelLoopBody
 {
     SURFBuildInvoker( const Mat& _sum, const vector<int>& _sizes,
                       const vector<int>& _sampleSteps,
@@ -271,9 +271,9 @@ struct SURFBuildInvoker
         traces = &_traces;
     }
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        for( int i=range.begin(); i<range.end(); i++ )
+        for( int i=range.start; i<range.end; i++ )
             calcLayerDetAndTrace( *sum, (*sizes)[i], (*sampleSteps)[i], (*dets)[i], (*traces)[i] );
     }
 
@@ -285,7 +285,7 @@ struct SURFBuildInvoker
 };
 
 // Multi-threaded search of the scale-space pyramid for keypoints
-struct SURFFindInvoker
+struct SURFFindInvoker : ParallelLoopBody
 {
     SURFFindInvoker( const Mat& _sum, const Mat& _mask_sum,
                      const vector<Mat>& _dets, const vector<Mat>& _traces,
@@ -310,9 +310,9 @@ struct SURFFindInvoker
                    const vector<int>& sizes, vector<KeyPoint>& keypoints,
                    int octave, int layer, float hessianThreshold, int sampleStep );
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        for( int i=range.begin(); i<range.end(); i++ )
+        for( int i=range.start; i<range.end; i++ )
         {
             int layer = (*middleIndices)[i];
             int octave = i / nOctaveLayers;
@@ -333,14 +333,10 @@ struct SURFFindInvoker
     int nOctaveLayers;
     float hessianThreshold;
 
-#ifdef HAVE_TBB
-    static tbb::mutex findMaximaInLayer_m;
-#endif
+    static Mutex findMaximaInLayer_m;
 };
 
-#ifdef HAVE_TBB
-tbb::mutex SURFFindInvoker::findMaximaInLayer_m;
-#endif
+Mutex SURFFindInvoker::findMaximaInLayer_m;
 
 
 /*
@@ -437,9 +433,7 @@ void SURFFindInvoker::findMaximaInLayer( const Mat& sum, const Mat& mask_sum,
                     if( interp_ok  )
                     {
                         /*printf( "KeyPoint %f %f %d\n", point.pt.x, point.pt.y, point.size );*/
-#ifdef HAVE_TBB
-                        tbb::mutex::scoped_lock lock(findMaximaInLayer_m);
-#endif
+                        cv::AutoLock lock(findMaximaInLayer_m);
                         keypoints.push_back(kpt);
                     }
                 }
@@ -505,20 +499,20 @@ static void fastHessianDetector( const Mat& sum, const Mat& mask_sum, vector<Key
     }
 
     // Calculate hessian determinant and trace samples in each layer
-    parallel_for( BlockedRange(0, nTotalLayers),
-                      SURFBuildInvoker(sum, sizes, sampleSteps, dets, traces) );
+    parallel_for_( Range(0, nTotalLayers),
+                   SURFBuildInvoker(sum, sizes, sampleSteps, dets, traces) );
 
     // Find maxima in the determinant of the hessian
-    parallel_for( BlockedRange(0, nMiddleLayers),
-                      SURFFindInvoker(sum, mask_sum, dets, traces, sizes,
-                                      sampleSteps, middleIndices, keypoints,
-                                      nOctaveLayers, hessianThreshold) );
+    parallel_for_( Range(0, nMiddleLayers),
+                   SURFFindInvoker(sum, mask_sum, dets, traces, sizes,
+                                   sampleSteps, middleIndices, keypoints,
+                                   nOctaveLayers, hessianThreshold) );
 
     std::sort(keypoints.begin(), keypoints.end(), KeypointGreater());
 }
 
 
-struct SURFInvoker
+struct SURFInvoker : ParallelLoopBody
 {
     enum { ORI_RADIUS = 6, ORI_WIN = 60, PATCH_SZ = 20 };
 
@@ -566,7 +560,7 @@ struct SURFInvoker
         }
     }
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
         /* X and Y gradient wavelet data */
         const int NX=2, NY=2;
@@ -587,7 +581,7 @@ struct SURFInvoker
 
         int dsize = extended ? 128 : 64;
 
-        int k, k1 = range.begin(), k2 = range.end();
+        int k, k1 = range.start, k2 = range.end;
         float maxSize = 0;
         for( k = k1; k < k2; k++ )
         {
@@ -954,7 +948,7 @@ void SURF::operator()(InputArray _img, InputArray _mask,
 
         // we call SURFInvoker in any case, even if we do not need descriptors,
         // since it computes orientation of each feature.
-        parallel_for(BlockedRange(0, N), SURFInvoker(img, sum, keypoints, descriptors, extended, upright) );
+        parallel_for_(Range(0, N), SURFInvoker(img, sum, keypoints, descriptors, extended, upright) );
 
         // remove keypoints that were marked for deletion
         for( i = j = 0; i < N; i++ )
diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp
index acc188edf8..de7cac2fdc 100644
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -60,27 +60,24 @@ namespace cv
 
         const char noImage2dOption [] = "-D DISABLE_IMAGE2D";
 
-        static char SURF_OPTIONS [1024] = ""; 
-        static bool USE_IMAGE2d = false;
+        static bool use_image2d = false;
+
         static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
             size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels, int depth)
         {
-            char * pSURF_OPTIONS = SURF_OPTIONS;
-            static bool OPTION_INIT = false;
-            if(!OPTION_INIT)
+            char optBuf [100] = {0};
+            char * optBufPtr = optBuf;
+            if( !use_image2d )
             {
-                if( !USE_IMAGE2d )
-                {
-                    strcat(pSURF_OPTIONS, noImage2dOption);
-                    pSURF_OPTIONS += strlen(noImage2dOption);
-                }
-
-                size_t wave_size = 0;
-                queryDeviceInfo(WAVEFRONT_SIZE, &wave_size);
-                std::sprintf(pSURF_OPTIONS, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
-                OPTION_INIT = true;
+                strcat(optBufPtr, noImage2dOption);
+                optBufPtr += strlen(noImage2dOption);
             }
-            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS);
+            cl_kernel kernel;
+            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr);
+            size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>(kernel);
+            CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
+            sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
+            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr);
         }
     }
 }
@@ -161,22 +158,12 @@ public:
         counters.setTo(Scalar::all(0));
 
         integral(img, surf_.sum);
-        if(support_image2d())
+        use_image2d = support_image2d();
+        if(use_image2d)
         {
-            try
-            {
-                bindImgTex(img, imgTex);
-                bindImgTex(surf_.sum, sumTex);
-                USE_IMAGE2d = true;
-            }
-            catch (const cv::Exception& e)
-            {
-                USE_IMAGE2d = false;
-                if(e.code != CL_IMAGE_FORMAT_NOT_SUPPORTED && e.code != -217)
-                {
-                    throw e;
-                }
-            }
+            bindImgTex(img, imgTex);
+            bindImgTex(surf_.sum, sumTex);
+            finish();
         }
 
         maskSumTex = 0;
diff --git a/modules/nonfree/src/vibe_gpu.cpp b/modules/nonfree/src/vibe_gpu.cpp
deleted file mode 100644
index e34862765d..0000000000
--- a/modules/nonfree/src/vibe_gpu.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if defined(HAVE_OPENCV_GPU)
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long) { throw_nogpu(); }
-void cv::gpu::VIBE_GPU::initialize(const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::VIBE_GPU::operator()(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::VIBE_GPU::release() {}
-
-#else
-
-namespace cv { namespace gpu { namespace device
-{
-    namespace vibe
-    {
-        void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);
-
-        void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
-
-        void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
-    }
-}}}
-
-namespace
-{
-    const int defaultNbSamples = 20;
-    const int defaultReqMatches = 2;
-    const int defaultRadius = 20;
-    const int defaultSubsamplingFactor = 16;
-}
-
-cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed) :
-    frameSize_(0, 0), rngSeed_(rngSeed)
-{
-    nbSamples = defaultNbSamples;
-    reqMatches = defaultReqMatches;
-    radius = defaultRadius;
-    subsamplingFactor = defaultSubsamplingFactor;
-}
-
-void cv::gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& s)
-{
-    using namespace cv::gpu::device::vibe;
-
-    CV_Assert(firstFrame.type() == CV_8UC1 || firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    loadConstants(nbSamples, reqMatches, radius, subsamplingFactor);
-
-    frameSize_ = firstFrame.size();
-
-    if (randStates_.size() != frameSize_)
-    {
-        cv::RNG rng(rngSeed_);
-        cv::Mat h_randStates(frameSize_, CV_8UC4);
-        rng.fill(h_randStates, cv::RNG::UNIFORM, 0, 255);
-        randStates_.upload(h_randStates);
-    }
-
-    int ch = firstFrame.channels();
-    int sample_ch = ch == 1 ? 1 : 4;
-
-    samples_.create(nbSamples * frameSize_.height, frameSize_.width, CV_8UC(sample_ch));
-
-    init_gpu(firstFrame, ch, samples_, randStates_, stream);
-}
-
-void cv::gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& s)
-{
-    using namespace cv::gpu::device::vibe;
-
-    CV_Assert(frame.depth() == CV_8U);
-
-    int ch = frame.channels();
-    int sample_ch = ch == 1 ? 1 : 4;
-
-    if (frame.size() != frameSize_ || sample_ch != samples_.channels())
-        initialize(frame);
-
-    fgmask.create(frameSize_, CV_8UC1);
-
-    update_gpu(frame, ch, fgmask, samples_, randStates_, StreamAccessor::getStream(s));
-}
-
-void cv::gpu::VIBE_GPU::release()
-{
-    frameSize_ = Size(0, 0);
-
-    randStates_.release();
-
-    samples_.release();
-}
-
-#endif
-
-#endif // defined(HAVE_OPENCV_GPU)
diff --git a/modules/nonfree/test/test_features2d.cpp b/modules/nonfree/test/test_features2d.cpp
index 001d628aaa..4cce77b9d5 100644
--- a/modules/nonfree/test/test_features2d.cpp
+++ b/modules/nonfree/test/test_features2d.cpp
@@ -1146,3 +1146,76 @@ protected:
 TEST(Features2d_SIFTHomographyTest, regression) { CV_DetectPlanarTest test("SIFT", 80); test.safe_run(); }
 TEST(Features2d_SURFHomographyTest, regression) { CV_DetectPlanarTest test("SURF", 80); test.safe_run(); }
 
+class FeatureDetectorUsingMaskTest : public cvtest::BaseTest
+{
+public:
+    FeatureDetectorUsingMaskTest(const Ptr<FeatureDetector>& featureDetector) :
+        featureDetector_(featureDetector)
+    {
+        CV_Assert(!featureDetector_.empty());
+    }
+
+protected:
+
+    void run(int)
+    {
+        const int nStepX = 2;
+        const int nStepY = 2;
+
+        const string imageFilename = string(ts->get_data_path()) + "/features2d/tsukuba.png";
+
+        Mat image = imread(imageFilename);
+        if(image.empty())
+        {
+            ts->printf(cvtest::TS::LOG, "Image %s can not be read.\n", imageFilename.c_str());
+            ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
+            return;
+        }
+
+        Mat mask(image.size(), CV_8U);
+
+        const int stepX = image.size().width / nStepX;
+        const int stepY = image.size().height / nStepY;
+
+        vector<KeyPoint> keyPoints;
+        vector<Point2f> points;
+        for(int i=0; i<nStepX; ++i)
+            for(int j=0; j<nStepY; ++j)
+            {
+
+                mask.setTo(0);
+                Rect whiteArea(i * stepX, j * stepY, stepX, stepY);
+                mask(whiteArea).setTo(255);
+
+                featureDetector_->detect(image, keyPoints, mask);
+                KeyPoint::convert(keyPoints, points);
+
+                for(size_t k=0; k<points.size(); ++k)
+                {
+                    if ( !whiteArea.contains(points[k]) )
+                    {
+                        ts->printf(cvtest::TS::LOG, "The feature point is outside of the mask.");
+                        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+                        return;
+                    }
+                }
+            }
+
+        ts->set_failed_test_info( cvtest::TS::OK );
+    }
+
+    Ptr<FeatureDetector> featureDetector_;
+};
+
+TEST(Features2d_SIFT_using_mask, regression)
+{
+    FeatureDetectorUsingMaskTest test(Algorithm::create<FeatureDetector>("Feature2D.SIFT"));
+    test.safe_run();
+}
+
+TEST(DISABLED_Features2d_SURF_using_mask, regression)
+{
+    FeatureDetectorUsingMaskTest test(Algorithm::create<FeatureDetector>("Feature2D.SURF"));
+    test.safe_run();
+}
+
diff --git a/modules/nonfree/test/test_gpu.cpp b/modules/nonfree/test/test_gpu.cpp
index 30aec352cd..3f63eeddf2 100644
--- a/modules/nonfree/test/test_gpu.cpp
+++ b/modules/nonfree/test/test_gpu.cpp
@@ -191,42 +191,4 @@ INSTANTIATE_TEST_CASE_P(GPU_Features2D, SURF, testing::Combine(
     testing::Values(SURF_Extended(false), SURF_Extended(true)),
     testing::Values(SURF_Upright(false), SURF_Upright(true))));
 
-//////////////////////////////////////////////////////
-// VIBE
-
-PARAM_TEST_CASE(VIBE, cv::Size, MatType, UseRoi)
-{
-};
-
-GPU_TEST_P(VIBE, Accuracy)
-{
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const bool useRoi = GET_PARAM(2);
-
-    const cv::Mat fullfg(size, CV_8UC1, cv::Scalar::all(255));
-
-    cv::Mat frame = randomMat(size, type, 0.0, 100);
-    cv::gpu::GpuMat d_frame = loadMat(frame, useRoi);
-
-    cv::gpu::VIBE_GPU vibe;
-    cv::gpu::GpuMat d_fgmask = createMat(size, CV_8UC1, useRoi);
-    vibe.initialize(d_frame);
-
-    for (int i = 0; i < 20; ++i)
-        vibe(d_frame, d_fgmask);
-
-    frame = randomMat(size, type, 160, 255);
-    d_frame = loadMat(frame, useRoi);
-    vibe(d_frame, d_fgmask);
-
-    // now fgmask should be entirely foreground
-    ASSERT_MAT_NEAR(fullfg, d_fgmask, 0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Video, VIBE, testing::Combine(
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4)),
-    WHOLE_SUBMAT));
-
 #endif
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index 46a232ed6a..9e78dce243 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -1141,7 +1141,7 @@ void CascadeClassifier::detectMultiScale( const Mat& image, vector<Rect>& object
 
         Size windowSize( cvRound(originalWindowSize.width*factor), cvRound(originalWindowSize.height*factor) );
         Size scaledImageSize( cvRound( grayImage.cols/factor ), cvRound( grayImage.rows/factor ) );
-        Size processingRectSize( scaledImageSize.width - originalWindowSize.width + 1, scaledImageSize.height - originalWindowSize.height + 1 );
+        Size processingRectSize( scaledImageSize.width - originalWindowSize.width, scaledImageSize.height - originalWindowSize.height );
 
         if( processingRectSize.width <= 0 || processingRectSize.height <= 0 )
             break;
@@ -1165,15 +1165,10 @@ void CascadeClassifier::detectMultiScale( const Mat& image, vector<Rect>& object
 
         int stripCount, stripSize;
 
-    #ifdef HAVE_TBB
         const int PTS_PER_THREAD = 1000;
         stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
         stripCount = std::min(std::max(stripCount, 1), 100);
         stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;
-    #else
-        stripCount = 1;
-        stripSize = processingRectSize.height;
-    #endif
 
         if( !detectSingleScale( scaledImage, stripCount, processingRectSize, stripSize, yStep, factor, candidates,
             rejectLevels, levelWeights, outputRejectLevels ) )
diff --git a/modules/objdetect/src/latentsvm.cpp b/modules/objdetect/src/latentsvm.cpp
index 521f0fdf56..5a45965e77 100644
--- a/modules/objdetect/src/latentsvm.cpp
+++ b/modules/objdetect/src/latentsvm.cpp
@@ -582,7 +582,6 @@ int searchObjectThresholdSomeComponents(const CvLSVMFeaturePyramid *H,
     // For each component perform searching
     for (i = 0; i < kComponents; i++)
     {
-#ifdef HAVE_TBB
         int error = searchObjectThreshold(H, &(filters[componentIndex]), kPartFilters[i],
             b[i], maxXBorder, maxYBorder, scoreThreshold,
             &(pointsArr[i]), &(levelsArr[i]), &(kPointsArr[i]),
@@ -598,13 +597,6 @@ int searchObjectThresholdSomeComponents(const CvLSVMFeaturePyramid *H,
             free(partsDisplacementArr);
             return LATENT_SVM_SEARCH_OBJECT_FAILED;
         }
-#else
-    (void)numThreads;
-        searchObjectThreshold(H, &(filters[componentIndex]), kPartFilters[i],
-            b[i], maxXBorder, maxYBorder, scoreThreshold,
-            &(pointsArr[i]), &(levelsArr[i]), &(kPointsArr[i]),
-            &(scoreArr[i]), &(partsDisplacementArr[i]));
-#endif
         estimateBoxes(pointsArr[i], levelsArr[i], kPointsArr[i],
             filters[componentIndex]->sizeX, filters[componentIndex]->sizeY, &(oppPointsArr[i]));
         componentIndex += (kPartFilters[i] + 1);
diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt
index a7cd3a0715..05b28b83fe 100644
--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@@ -3,5 +3,5 @@ if(NOT HAVE_OPENCL)
 endif()
 
 set(the_description "OpenCL-accelerated Computer Vision")
-ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video)
+ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_calib3d)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index 6e34d27881..8bd1c9f112 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -118,12 +118,10 @@ namespace cv
         //the devnum is the index of the selected device in DeviceName vector of INfo
         CV_EXPORTS void setDevice(Info &oclinfo, int devnum = 0);
 
-        //optional function, if you want save opencl binary kernel to the file, set its path
-        CV_EXPORTS  void setBinpath(const char *path);
-
         //The two functions below enable other opencl program to use ocl module's cl_context and cl_command_queue
+        //returns cl_context * 
         CV_EXPORTS void* getoclContext();
-
+        //returns cl_command_queue *
         CV_EXPORTS void* getoclCommandQueue();
 
         //explicit call clFinish. The global command queue will be used.
@@ -133,6 +131,9 @@ namespace cv
         //getDevice also need to be called before this function
         CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0);
 
+        //returns true when global OpenCL context is initialized
+        CV_EXPORTS bool initialized();
+
         //////////////////////////////// Error handling ////////////////////////
         CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
 
@@ -143,7 +144,7 @@ namespace cv
         protected:
             Context();
             friend class auto_ptr<Context>;
-
+            friend bool initialized();
         private:
             static auto_ptr<Context> clCxt;
             static int val;
@@ -180,6 +181,29 @@ namespace cv
                                                         bool finish = true, bool measureKernelTime = false,
                                                         bool cleanUp = true);
 
+        //! Enable or disable OpenCL program binary caching onto local disk
+        // After a program (*.cl files in opencl/ folder) is built at runtime, we allow the
+        // compiled OpenCL program to be cached to the path automatically as "path/*.clb" 
+        // binary file, which will be reused when the OpenCV executable is started again. 
+        //
+        // Caching mode is controlled by the following enums
+        // Notes
+        //   1. the feature is by default enabled when OpenCV is built in release mode.
+        //   2. the CACHE_DEBUG / CACHE_RELEASE flags only effectively work with MSVC compiler;
+        //      for GNU compilers, the function always treats the build as release mode (enabled by default).
+        enum
+        {
+            CACHE_NONE    = 0,        // do not cache OpenCL binary
+            CACHE_DEBUG   = 0x1 << 0, // cache OpenCL binary when built in debug mode (only work with MSVC)
+            CACHE_RELEASE = 0x1 << 1, // default behavior, only cache when built in release mode (only work with MSVC)
+            CACHE_ALL     = CACHE_DEBUG | CACHE_RELEASE, // always cache opencl binary
+            CACHE_UPDATE  = 0x1 << 2  // if the binary cache file with the same name is already on the disk, it will be updated.
+        };
+        CV_EXPORTS void setBinaryDiskCache(int mode = CACHE_RELEASE, cv::String path = "./");
+
+        //! set where binary cache to be saved to 
+        CV_EXPORTS void setBinpath(const char *path);
+
         class CV_EXPORTS oclMatExpr;
         //////////////////////////////// oclMat ////////////////////////////////
         class CV_EXPORTS oclMat
@@ -224,6 +248,11 @@ namespace cv
             operator Mat() const;
             void download(cv::Mat &m) const;
 
+            //! convert to _InputArray
+            operator _InputArray();
+
+            //! convert to _OutputArray
+            operator _OutputArray();
 
             //! returns a new oclMatrix header for the specified row
             oclMat row(int y) const;
@@ -363,6 +392,9 @@ namespace cv
             int wholecols;
         };
 
+        // convert InputArray/OutputArray to oclMat references
+        CV_EXPORTS oclMat& getOclMatRef(InputArray src);
+        CV_EXPORTS oclMat& getOclMatRef(OutputArray src);
 
         ///////////////////// mat split and merge /////////////////////////////////
         //! Compose a multi-channel array from several single-channel arrays
@@ -407,6 +439,9 @@ namespace cv
         //! computes element-wise product of the two arrays (c = a * b)
         // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
         CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
+        //! multiplies matrix to a number (dst = scalar * src)
+        // supports CV_32FC1 only
+        CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
         //! computes element-wise quotient of the two arrays (c = a / b)
         // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
         CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
@@ -458,6 +493,7 @@ namespace cv
         // support all C1 types
 
         CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
+        CV_EXPORTS void minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat& buf);
 
         //! finds global minimum and maximum array elements and returns their values with locations
         // support all C1 types
@@ -478,6 +514,10 @@ namespace cv
         CV_EXPORTS void calcHist(const oclMat &mat_src, oclMat &mat_hist);
         //! only 8UC1 and 256 bins is supported now
         CV_EXPORTS void equalizeHist(const oclMat &mat_src, oclMat &mat_dst);
+        
+        //! only 8UC1 is supported now
+        CV_EXPORTS Ptr<cv::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
+        
         //! bilateralFilter
         // supports 8UC1 8UC4
         CV_EXPORTS void bilateralFilter(const oclMat& src, oclMat& dst, int d, double sigmaColor, double sigmaSpave, int borderType=BORDER_DEFAULT);
@@ -684,6 +724,8 @@ namespace cv
         }
 
         //! applies non-separable 2D linear filter to the image
+        //  Note, at the moment this function only works when anchor point is in the kernel center
+        //  and kernel size supported is either 3x3 or 5x5; otherwise the function will fail to output valid result
         CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel,
                                  Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 
@@ -786,7 +828,11 @@ namespace cv
         CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum);
         CV_EXPORTS void integral(const oclMat &src, oclMat &sum);
         CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
+        CV_EXPORTS void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
+            int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
         CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
+        CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
+            int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
 
         ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
         ///////////////////////////////////////////CascadeClassifier//////////////////////////////////////////////////////////////////
@@ -808,7 +854,7 @@ namespace cv
             OclCascadeClassifierBuf() :
                 m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {}
 
-            ~OclCascadeClassifierBuf() {}
+            ~OclCascadeClassifierBuf() { release(); }
 
             void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
                                   double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
@@ -866,7 +912,6 @@ namespace cv
             std::vector<oclMat> image_sqsums;
         };
 
-
         //! computes the proximity map for the raster template and the image where the template is searched for
         // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
         // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
@@ -877,71 +922,36 @@ namespace cv
         // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
         CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf);
 
-
-
         ///////////////////////////////////////////// Canny /////////////////////////////////////////////
-
         struct CV_EXPORTS CannyBuf;
-
-
-
         //! compute edges of the input image using Canny operator
-
         // Support CV_8UC1 only
-
         CV_EXPORTS void Canny(const oclMat &image, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-
         CV_EXPORTS void Canny(const oclMat &image, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-
         CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false);
-
         CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false);
 
-
-
         struct CV_EXPORTS CannyBuf
-
         {
-
             CannyBuf() : counter(NULL) {}
-
             ~CannyBuf()
             {
                 release();
             }
-
             explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(NULL)
-
             {
-
                 create(image_size, apperture_size);
-
             }
-
             CannyBuf(const oclMat &dx_, const oclMat &dy_);
 
-
-
             void create(const Size &image_size, int apperture_size = 3);
-
-
-
             void release();
-
-
-
             oclMat dx, dy;
-
             oclMat dx_buf, dy_buf;
-
             oclMat edgeBuf;
-
             oclMat trackBuf1, trackBuf2;
-
             void *counter;
-
             Ptr<FilterEngine_GPU> filterDX, filterDY;
-
         };
 
         ///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
@@ -966,159 +976,69 @@ namespace cv
                              const oclMat &src3, double beta, oclMat &dst, int flags = 0);
 
         //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
-
         struct CV_EXPORTS HOGDescriptor
-
         {
-
             enum { DEFAULT_WIN_SIGMA = -1 };
-
             enum { DEFAULT_NLEVELS = 64 };
-
             enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
-
-
-
             HOGDescriptor(Size win_size = Size(64, 128), Size block_size = Size(16, 16),
-
                           Size block_stride = Size(8, 8), Size cell_size = Size(8, 8),
-
                           int nbins = 9, double win_sigma = DEFAULT_WIN_SIGMA,
-
                           double threshold_L2hys = 0.2, bool gamma_correction = true,
-
                           int nlevels = DEFAULT_NLEVELS);
 
-
-
             size_t getDescriptorSize() const;
-
             size_t getBlockHistogramSize() const;
-
-
-
             void setSVMDetector(const vector<float> &detector);
-
-
-
             static vector<float> getDefaultPeopleDetector();
-
             static vector<float> getPeopleDetector48x96();
-
             static vector<float> getPeopleDetector64x128();
-
-
-
             void detect(const oclMat &img, vector<Point> &found_locations,
-
                         double hit_threshold = 0, Size win_stride = Size(),
-
                         Size padding = Size());
-
-
-
             void detectMultiScale(const oclMat &img, vector<Rect> &found_locations,
-
                                   double hit_threshold = 0, Size win_stride = Size(),
-
                                   Size padding = Size(), double scale0 = 1.05,
-
                                   int group_threshold = 2);
-
-
-
             void getDescriptors(const oclMat &img, Size win_stride,
-
                                 oclMat &descriptors,
-
                                 int descr_format = DESCR_FORMAT_COL_BY_COL);
-
-
-
             Size win_size;
-
             Size block_size;
-
             Size block_stride;
-
             Size cell_size;
 
             int nbins;
-
             double win_sigma;
-
             double threshold_L2hys;
-
             bool gamma_correction;
-
             int nlevels;
 
-
-
         protected:
-
             // initialize buffers; only need to do once in case of multiscale detection
-
             void init_buffer(const oclMat &img, Size win_stride);
-
-
-
             void computeBlockHistograms(const oclMat &img);
-
             void computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle);
-
-
-
             double getWinSigma() const;
-
             bool checkDetectorSize() const;
 
-
-
             static int numPartsWithin(int size, int part_size, int stride);
-
             static Size numPartsWithin(Size size, Size part_size, Size stride);
 
-
-
             // Coefficients of the separating plane
-
             float free_coef;
-
             oclMat detector;
-
-
-
             // Results of the last classification step
-
             oclMat labels;
-
             Mat labels_host;
-
-
-
             // Results of the last histogram evaluation step
-
             oclMat block_hists;
-
-
-
             // Gradients conputation results
-
             oclMat grad, qangle;
-
-
-
             // scaled image
-
             oclMat image_scale;
-
-
-
             // effect size of input image (might be different from original size after scaling)
-
             Size effect_size;
-
         };
 
 
@@ -1126,13 +1046,11 @@ namespace cv
         /****************************************************************************************\
         *                                      Distance                                          *
         \****************************************************************************************/
-
         template<typename T>
         struct CV_EXPORTS Accumulator
         {
             typedef T Type;
         };
-
         template<> struct Accumulator<unsigned char>
         {
             typedef float Type;
@@ -1206,469 +1124,276 @@ namespace cv
         {
         public:
             enum DistType {L1Dist = 0, L2Dist, HammingDist};
-
             explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist);
-
-
-
             // Add descriptors to train descriptor collection
-
             void add(const std::vector<oclMat> &descCollection);
-
-
-
             // Get train descriptors collection
-
             const std::vector<oclMat> &getTrainDescriptors() const;
-
-
-
             // Clear train descriptors collection
-
             void clear();
-
-
-
             // Return true if there are not train descriptors in collection
-
             bool empty() const;
 
-
-
             // Return true if the matcher supports mask in match methods
-
             bool isMaskSupported() const;
 
-
-
             // Find one best match for each query descriptor
-
             void matchSingle(const oclMat &query, const oclMat &train,
-
                              oclMat &trainIdx, oclMat &distance,
-
                              const oclMat &mask = oclMat());
 
-
-
             // Download trainIdx and distance and convert it to CPU vector with DMatch
-
             static void matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector<DMatch> &matches);
-
             // Convert trainIdx and distance to vector with DMatch
-
             static void matchConvert(const Mat &trainIdx, const Mat &distance, std::vector<DMatch> &matches);
 
-
-
             // Find one best match for each query descriptor
-
             void match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask = oclMat());
 
-
-
             // Make gpu collection of trains and masks in suitable format for matchCollection function
-
             void makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector<oclMat> &masks = std::vector<oclMat>());
 
 
-
             // Find one best match from train collection for each query descriptor
-
             void matchCollection(const oclMat &query, const oclMat &trainCollection,
-
                                  oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
-
                                  const oclMat &masks = oclMat());
 
-
-
             // Download trainIdx, imgIdx and distance and convert it to vector with DMatch
-
             static void matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector<DMatch> &matches);
-
             // Convert trainIdx, imgIdx and distance to vector with DMatch
-
             static void matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector<DMatch> &matches);
 
-
-
             // Find one best match from train collection for each query descriptor.
-
             void match(const oclMat &query, std::vector<DMatch> &matches, const std::vector<oclMat> &masks = std::vector<oclMat>());
 
-
-
             // Find k best matches for each query descriptor (in increasing order of distances)
-
             void knnMatchSingle(const oclMat &query, const oclMat &train,
-
                                 oclMat &trainIdx, oclMat &distance, oclMat &allDist, int k,
-
                                 const oclMat &mask = oclMat());
 
-
-
             // Download trainIdx and distance and convert it to vector with DMatch
-
             // compactResult is used when mask is not empty. If compactResult is false matches
-
             // vector will have the same size as queryDescriptors rows. If compactResult is true
-
             // matches vector will not contain matches for fully masked out query descriptors.
-
             static void knnMatchDownload(const oclMat &trainIdx, const oclMat &distance,
-
                                          std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 
             // Convert trainIdx and distance to vector with DMatch
-
             static void knnMatchConvert(const Mat &trainIdx, const Mat &distance,
-
                                         std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 
-
-
             // Find k best matches for each query descriptor (in increasing order of distances).
-
             // compactResult is used when mask is not empty. If compactResult is false matches
-
             // vector will have the same size as queryDescriptors rows. If compactResult is true
-
             // matches vector will not contain matches for fully masked out query descriptors.
-
             void knnMatch(const oclMat &query, const oclMat &train,
-
                           std::vector< std::vector<DMatch> > &matches, int k, const oclMat &mask = oclMat(),
-
                           bool compactResult = false);
 
-
-
             // Find k best matches from train collection for each query descriptor (in increasing order of distances)
-
             void knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
-
                                      oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
-
                                      const oclMat &maskCollection = oclMat());
 
-
-
             // Download trainIdx and distance and convert it to vector with DMatch
-
             // compactResult is used when mask is not empty. If compactResult is false matches
-
             // vector will have the same size as queryDescriptors rows. If compactResult is true
-
             // matches vector will not contain matches for fully masked out query descriptors.
-
             static void knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance,
-
                                           std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 
             // Convert trainIdx and distance to vector with DMatch
-
             static void knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance,
-
                                          std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 
-
-
             // Find k best matches  for each query descriptor (in increasing order of distances).
-
             // compactResult is used when mask is not empty. If compactResult is false matches
-
             // vector will have the same size as queryDescriptors rows. If compactResult is true
-
             // matches vector will not contain matches for fully masked out query descriptors.
-
             void knnMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, int k,
-
                           const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
 
-
-
             // Find best matches for each query descriptor which have distance less than maxDistance.
-
             // nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
-
             // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
-
             // because it didn't have enough memory.
-
             // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
-
             // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
-
             // Matches doesn't sorted.
-
             void radiusMatchSingle(const oclMat &query, const oclMat &train,
-
                                    oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
-
                                    const oclMat &mask = oclMat());
 
-
-
             // Download trainIdx, nMatches and distance and convert it to vector with DMatch.
-
             // matches will be sorted in increasing order of distances.
-
             // compactResult is used when mask is not empty. If compactResult is false matches
-
             // vector will have the same size as queryDescriptors rows. If compactResult is true
-
             // matches vector will not contain matches for fully masked out query descriptors.
-
             static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
-
                                             std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-
             // Convert trainIdx, nMatches and distance to vector with DMatch.
-
             static void radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches,
-
                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-
-
-
             // Find best matches for each query descriptor which have distance less than maxDistance
-
             // in increasing order of distances).
-
             void radiusMatch(const oclMat &query, const oclMat &train,
-
                              std::vector< std::vector<DMatch> > &matches, float maxDistance,
-
                              const oclMat &mask = oclMat(), bool compactResult = false);
-
-
-
             // Find best matches for each query descriptor which have distance less than maxDistance.
-
             // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
-
             // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
-
             // Matches doesn't sorted.
-
             void radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
-
                                        const std::vector<oclMat> &masks = std::vector<oclMat>());
-
-
-
             // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
-
             // matches will be sorted in increasing order of distances.
-
             // compactResult is used when mask is not empty. If compactResult is false matches
-
             // vector will have the same size as queryDescriptors rows. If compactResult is true
-
             // matches vector will not contain matches for fully masked out query descriptors.
-
             static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, const oclMat &nMatches,
-
                                             std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-
             // Convert trainIdx, nMatches and distance to vector with DMatch.
-
             static void radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches,
-
                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-
-
-
             // Find best matches from train collection for each query descriptor which have distance less than
-
             // maxDistance (in increasing order of distances).
-
             void radiusMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, float maxDistance,
-
                              const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
-
-
-
             DistType distType;
-
-
-
         private:
-
             std::vector<oclMat> trainDescCollection;
-
         };
 
-
-
         template <class Distance>
-
         class CV_EXPORTS BruteForceMatcher_OCL;
 
-
-
         template <typename T>
-
         class CV_EXPORTS BruteForceMatcher_OCL< L1<T> > : public BruteForceMatcher_OCL_base
-
         {
-
         public:
-
             explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L1Dist) {}
-
             explicit BruteForceMatcher_OCL(L1<T> /*d*/) : BruteForceMatcher_OCL_base(L1Dist) {}
-
         };
 
         template <typename T>
-
         class CV_EXPORTS BruteForceMatcher_OCL< L2<T> > : public BruteForceMatcher_OCL_base
-
         {
-
         public:
-
             explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L2Dist) {}
-
             explicit BruteForceMatcher_OCL(L2<T> /*d*/) : BruteForceMatcher_OCL_base(L2Dist) {}
-
         };
 
         template <> class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base
-
         {
-
         public:
-
             explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(HammingDist) {}
-
             explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {}
-
         };
 
+        class CV_EXPORTS BFMatcher_OCL : public BruteForceMatcher_OCL_base
+        {
+        public:
+            explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {}
+        };
 
+        class CV_EXPORTS GoodFeaturesToTrackDetector_OCL
+        {
+        public:
+            explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
+                int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
+
+            //! return 1 rows matrix with CV_32FC2 type
+            void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
+            //! download points of type Point2f to a vector. the vector's content will be erased
+            void downloadPoints(const oclMat &points, vector<Point2f> &points_v);
+
+            int maxCorners;
+            double qualityLevel;
+            double minDistance;
+
+            int blockSize;
+            bool useHarrisDetector;
+            double harrisK;
+            void releaseMemory()
+            {
+                Dx_.release();
+                Dy_.release();
+                eig_.release();
+                minMaxbuf_.release();
+                tmpCorners_.release();
+            }
+        private:
+            oclMat Dx_;
+            oclMat Dy_;
+            oclMat eig_;
+            oclMat minMaxbuf_;
+            oclMat tmpCorners_;
+        };
+
+        inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
+            int blockSize_, bool useHarrisDetector_, double harrisK_)
+        {
+            maxCorners = maxCorners_;
+            qualityLevel = qualityLevel_;
+            minDistance = minDistance_;
+            blockSize = blockSize_;
+            useHarrisDetector = useHarrisDetector_;
+            harrisK = harrisK_;
+        }
 
         /////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
-
         class CV_EXPORTS PyrLKOpticalFlow
-
         {
-
         public:
-
             PyrLKOpticalFlow()
-
             {
-
                 winSize = Size(21, 21);
-
                 maxLevel = 3;
-
                 iters = 30;
-
                 derivLambda = 0.5;
-
                 useInitialFlow = false;
-
                 minEigThreshold = 1e-4f;
-
                 getMinEigenVals = false;
-
                 isDeviceArch11_ = false;
-
             }
 
-
-
             void sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts,
-
                         oclMat &status, oclMat *err = 0);
-
-
-
             void dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err = 0);
-
-
-
             Size winSize;
-
             int maxLevel;
-
             int iters;
-
             double derivLambda;
-
             bool useInitialFlow;
-
             float minEigThreshold;
-
             bool getMinEigenVals;
-
-
-
             void releaseMemory()
-
             {
-
                 dx_calcBuf_.release();
-
                 dy_calcBuf_.release();
 
-
-
                 prevPyr_.clear();
-
                 nextPyr_.clear();
 
-
-
                 dx_buf_.release();
-
                 dy_buf_.release();
-
             }
-
-
-
         private:
-
             void calcSharrDeriv(const oclMat &src, oclMat &dx, oclMat &dy);
-
-
-
             void buildImagePyramid(const oclMat &img0, vector<oclMat> &pyr, bool withBorder);
 
-
-
             oclMat dx_calcBuf_;
-
             oclMat dy_calcBuf_;
 
-
-
             vector<oclMat> prevPyr_;
-
             vector<oclMat> nextPyr_;
 
-
-
             oclMat dx_buf_;
-
             oclMat dy_buf_;
-
-
-
             oclMat uPyr_[2];
-
             oclMat vPyr_[2];
-
-
-
             bool isDeviceArch11_;
-
         };
         //////////////// build warping maps ////////////////////
         //! builds plane warping maps
@@ -1739,6 +1464,7 @@ namespace cv
         private:
             oclMat minSSD, leBuf, riBuf;
         };
+
         class CV_EXPORTS StereoBeliefPropagation
         {
         public:
@@ -1769,6 +1495,7 @@ namespace cv
             std::vector<oclMat> datas;
             oclMat out;
         };
+
         class CV_EXPORTS StereoConstantSpaceBP
         {
         public:
@@ -1807,6 +1534,94 @@ namespace cv
             oclMat temp;
             oclMat out;
         };
+
+        // Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
+        //
+        // see reference:
+        //   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+        //   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+        class CV_EXPORTS OpticalFlowDual_TVL1_OCL
+        {
+        public:
+            OpticalFlowDual_TVL1_OCL();
+
+            void operator ()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy);
+
+            void collectGarbage();
+
+            /**
+            * Time step of the numerical scheme.
+            */
+            double tau;
+
+            /**
+            * Weight parameter for the data term, attachment parameter.
+            * This is the most relevant parameter, which determines the smoothness of the output.
+            * The smaller this parameter is, the smoother the solutions we obtain.
+            * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
+            */
+            double lambda;
+
+            /**
+            * Weight parameter for (u - v)^2, tightness parameter.
+            * It serves as a link between the attachment and the regularization terms.
+            * In theory, it should have a small value in order to maintain both parts in correspondence.
+            * The method is stable for a large range of values of this parameter.
+            */
+            double theta;
+
+            /**
+            * Number of scales used to create the pyramid of images.
+            */
+            int nscales;
+
+            /**
+            * Number of warpings per scale.
+            * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
+            * This is a parameter that assures the stability of the method.
+            * It also affects the running time, so it is a compromise between speed and accuracy.
+            */
+            int warps;
+
+            /**
+            * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
+            * A small value will yield more accurate solutions at the expense of a slower convergence.
+            */
+            double epsilon;
+
+            /**
+            * Stopping criterion iterations number used in the numerical scheme.
+            */
+            int iterations;
+
+            bool useInitialFlow;
+
+        private:
+            void procOneScale(const oclMat& I0, const oclMat& I1, oclMat& u1, oclMat& u2);
+
+            std::vector<oclMat> I0s;
+            std::vector<oclMat> I1s;
+            std::vector<oclMat> u1s;
+            std::vector<oclMat> u2s;
+
+            oclMat I1x_buf;
+            oclMat I1y_buf;
+
+            oclMat I1w_buf;
+            oclMat I1wx_buf;
+            oclMat I1wy_buf;
+
+            oclMat grad_buf;
+            oclMat rho_c_buf;
+
+            oclMat p11_buf;
+            oclMat p12_buf;
+            oclMat p21_buf;
+            oclMat p22_buf;
+
+            oclMat diff_buf;
+            oclMat norm_buf;
+        };
     }
 }
 #if defined _MSC_VER && _MSC_VER >= 1200
diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp
index 081d2343dc..634f2f2b15 100644
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -49,7 +49,7 @@
 #include "opencv2/ocl/ocl.hpp"
 
 #if defined __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/opencl.h>
 #endif
@@ -120,6 +120,33 @@ namespace cv
         cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
         void CV_EXPORTS releaseTexture(cl_mem& texture);
 
+        //Represents an image texture object
+        class CV_EXPORTS TextureCL
+        {
+        public:
+            TextureCL(cl_mem tex, int r, int c, int t)
+                : tex_(tex), rows(r), cols(c), type(t) {}
+            ~TextureCL()
+            {
+                openCLFree(tex_);
+            }
+            operator cl_mem() 
+            {
+                return tex_;
+            }
+            cl_mem const tex_;
+            const int rows;
+            const int cols;
+            const int type;
+        private:
+            //disable assignment
+            void operator=(const TextureCL&);
+        };
+        // bind oclMat to OpenCL image textures and retunrs an TextureCL object
+        // note:
+        //   for faster clamping, there is no buffer padding for the constructed texture
+        Ptr<TextureCL> CV_EXPORTS bindTexturePtr(const oclMat &mat);
+
         // returns whether the current context supports image2d_t format or not
         bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
 
@@ -128,11 +155,17 @@ namespace cv
         enum DEVICE_INFO
         {
             WAVEFRONT_SIZE,             //in AMD speak
-            WARP_SIZE = WAVEFRONT_SIZE, //in nvidia speak
             IS_CPU_DEVICE               //check if the device is CPU
         };
-        //info should have been pre-allocated
-        void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info);
+        template<DEVICE_INFO _it, typename _ty>
+        _ty queryDeviceInfo(cl_kernel kernel = NULL);
+
+        template<>
+        int CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel);
+        template<>
+        size_t CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel);
+        template<>
+        bool CV_EXPORTS queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel kernel);
 
     }//namespace ocl
 
diff --git a/modules/ocl/perf/main.cpp b/modules/ocl/perf/main.cpp
index 2da17755eb..bd2a4ec4b6 100644
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@@ -44,41 +44,21 @@
 
 int main(int argc, const char *argv[])
 {
-    vector<ocl::Info> oclinfo;
-    int num_devices = getDevice(oclinfo);
-
-    if (num_devices < 1)
-    {
-        cerr << "no device found\n";
-        return -1;
-    }
-
-    int devidx = 0;
-
-    for (size_t i = 0; i < oclinfo.size(); i++)
-    {
-        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
-        {
-            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
-        }
-    }
-
-    redirectError(cvErrorCallback);
-
     const char *keys =
         "{ h | help    | false | print help message }"
         "{ f | filter  |       | filter for test }"
         "{ w | workdir |       | set working directory }"
         "{ l | list    | false | show all tests }"
         "{ d | device  | 0     | device id }"
+        "{ c | cpu_ocl | false | use cpu as ocl device}"
         "{ i | iters   | 10    | iteration count }"
         "{ m | warmup  | 1     | gpu warm up iteration count}"
-        "{ t | xtop    | 1.1	  | xfactor top boundary}"
-        "{ b | xbottom | 0.9	  | xfactor bottom boundary}"
+        "{ t | xtop    | 1.1   | xfactor top boundary}"
+        "{ b | xbottom | 0.9   | xfactor bottom boundary}"
         "{ v | verify  | false | only run gpu once to verify if problems occur}";
 
+    redirectError(cvErrorCallback);
     CommandLineParser cmd(argc, argv, keys);
-
     if (cmd.get<bool>("help"))
     {
         cout << "Avaible options:" << endl;
@@ -86,14 +66,40 @@ int main(int argc, const char *argv[])
         return 0;
     }
 
-    int device = cmd.get<int>("device");
+    // get ocl devices
+    bool use_cpu = cmd.get<bool>("c");
+    vector<ocl::Info> oclinfo;
+    int num_devices = 0;
+    if(use_cpu)
+        num_devices = getDevice(oclinfo, ocl::CVCL_DEVICE_TYPE_CPU);
+    else
+        num_devices = getDevice(oclinfo);
+    if (num_devices < 1)
+    {
+        cerr << "no device found\n";
+        return -1;
+    }
 
+    // show device info
+    int devidx = 0;
+    for (size_t i = 0; i < oclinfo.size(); i++)
+    {
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
+        {
+            cout << "device " << devidx++ << ": " << oclinfo[i].DeviceName[j] << endl;
+        }
+    }
+
+    int device = cmd.get<int>("device");
     if (device < 0 || device >= num_devices)
     {
         cerr << "Invalid device ID" << endl;
         return -1;
     }
 
+    // set this to overwrite binary cache every time the test starts
+    ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
+    
     if (cmd.get<bool>("verify"))
     {
         TestSystem::instance().setNumIters(1);
@@ -102,7 +108,6 @@ int main(int argc, const char *argv[])
     }
 
     devidx = 0;
-
     for (size_t i = 0; i < oclinfo.size(); i++)
     {
         for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
@@ -111,7 +116,7 @@ int main(int argc, const char *argv[])
             {
                 ocl::setDevice(oclinfo[i], (int)j);
                 TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
-                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
+                cout << "use " << devidx << ": " <<oclinfo[i].DeviceName[j] << endl;
                 goto END_DEV;
             }
         }
diff --git a/modules/ocl/perf/perf_arithm.cpp b/modules/ocl/perf/perf_arithm.cpp
index e6e957641b..3ef0634e70 100644
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,9 +46,9 @@
 
 #include "precomp.hpp"
 ///////////// Lut ////////////////////////
-TEST(lut)
+PERFTEST(lut)
 {
-    Mat src, lut, dst;
+    Mat src, lut, dst, ocl_dst;
     ocl::oclMat d_src, d_lut, d_dst;
 
     int all_type[] = {CV_8UC1, CV_8UC3};
@@ -61,7 +62,6 @@ TEST(lut)
 
             gen(src, size, size, all_type[j], 0, 256);
             gen(lut, 1, 256, CV_8UC1, 0, 1);
-            gen(dst, size, size, all_type[j], 0, 256);
 
             LUT(src, lut, dst);
 
@@ -78,33 +78,32 @@ TEST(lut)
 
             GPU_ON;
             ocl::LUT(d_src, d_lut, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             d_lut.upload(lut);
             ocl::LUT(d_src, d_lut, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
 
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0);
         }
 
     }
 }
 
 ///////////// Exp ////////////////////////
-TEST(Exp)
+PERFTEST(Exp)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
         SUBTEST << size << 'x' << size << "; CV_32FC1";
 
-        gen(src, size, size, CV_32FC1, 0, 256);
-        gen(dst, size, size, CV_32FC1, 0, 256);
+        gen(src, size, size, CV_32FC1, 5, 16);
 
         exp(src, dst);
 
@@ -119,21 +118,22 @@ TEST(Exp)
 
         GPU_ON;
         ocl::exp(d_src, d_dst);
-         ;
         GPU_OFF;
 
         GPU_FULL_ON;
         d_src.upload(src);
         ocl::exp(d_src, d_dst);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
         GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 2);
     }
 }
 
 ///////////// LOG ////////////////////////
-TEST(Log)
+PERFTEST(Log)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -155,21 +155,22 @@ TEST(Log)
 
         GPU_ON;
         ocl::log(d_src, d_dst);
-         ;
         GPU_OFF;
 
         GPU_FULL_ON;
         d_src.upload(src);
         ocl::log(d_src, d_dst);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
         GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
     }
 }
 
 ///////////// Add ////////////////////////
-TEST(Add)
+PERFTEST(Add)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_dst;
 
     int all_type[] = {CV_8UC1, CV_32FC1};
@@ -189,6 +190,7 @@ TEST(Add)
             CPU_ON;
             add(src1, src2, dst);
             CPU_OFF;
+
             d_src1.upload(src1);
             d_src2.upload(src2);
 
@@ -198,24 +200,25 @@ TEST(Add)
 
             GPU_ON;
             ocl::add(d_src1, d_src2, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::add(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
         }
 
     }
 }
 
 ///////////// Mul ////////////////////////
-TEST(Mul)
+PERFTEST(Mul)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_dst;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
@@ -229,8 +232,6 @@ TEST(Mul)
 
             gen(src1, size, size, all_type[j], 0, 256);
             gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
 
             multiply(src1, src2, dst);
 
@@ -246,24 +247,25 @@ TEST(Mul)
 
             GPU_ON;
             ocl::multiply(d_src1, d_src2, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::multiply(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
         }
 
     }
 }
 
 ///////////// Div ////////////////////////
-TEST(Div)
+PERFTEST(Div)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_dst;
     int all_type[] = {CV_8UC1, CV_8UC4};
     std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -276,14 +278,13 @@ TEST(Div)
 
             gen(src1, size, size, all_type[j], 0, 256);
             gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
 
             divide(src1, src2, dst);
 
             CPU_ON;
             divide(src1, src2, dst);
             CPU_OFF;
+
             d_src1.upload(src1);
             d_src2.upload(src2);
 
@@ -293,24 +294,25 @@ TEST(Div)
 
             GPU_ON;
             ocl::divide(d_src1, d_src2, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::divide(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
         }
 
     }
 }
 
 ///////////// Absdiff ////////////////////////
-TEST(Absdiff)
+PERFTEST(Absdiff)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_dst;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
@@ -326,12 +328,12 @@ TEST(Absdiff)
             gen(src2, size, size, all_type[j], 0, 256);
             gen(dst, size, size, all_type[j], 0, 256);
 
-
             absdiff(src1, src2, dst);
 
             CPU_ON;
             absdiff(src1, src2, dst);
             CPU_OFF;
+
             d_src1.upload(src1);
             d_src2.upload(src2);
 
@@ -341,24 +343,25 @@ TEST(Absdiff)
 
             GPU_ON;
             ocl::absdiff(d_src1, d_src2, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::absdiff(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
         }
 
     }
 }
 
 ///////////// CartToPolar ////////////////////////
-TEST(CartToPolar)
+PERFTEST(CartToPolar)
 {
-    Mat src1, src2, dst, dst1;
+    Mat src1, src2, dst, dst1, ocl_dst, ocl_dst1;
     ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
 
     int all_type[] = {CV_32FC1};
@@ -381,6 +384,7 @@ TEST(CartToPolar)
             CPU_ON;
             cartToPolar(src1, src2, dst, dst1, 1);
             CPU_OFF;
+
             d_src1.upload(src1);
             d_src2.upload(src2);
 
@@ -390,25 +394,30 @@ TEST(CartToPolar)
 
             GPU_ON;
             ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
-            d_dst.download(dst);
-            d_dst1.download(dst1);
+            d_dst.download(ocl_dst);
+            d_dst1.download(ocl_dst1);
             GPU_FULL_OFF;
+
+            double diff1 = checkNorm(ocl_dst1, dst1);
+            double diff2 = checkNorm(ocl_dst, dst);
+            double max_diff = max(diff1, diff2);
+            TestSystem::instance().setAccurate(max_diff<=.5?1:0, max_diff);
+
         }
 
     }
 }
 
 ///////////// PolarToCart ////////////////////////
-TEST(PolarToCart)
+PERFTEST(PolarToCart)
 {
-    Mat src1, src2, dst, dst1;
+    Mat src1, src2, dst, dst1, ocl_dst, ocl_dst1;
     ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
 
     int all_type[] = {CV_32FC1};
@@ -440,25 +449,30 @@ TEST(PolarToCart)
 
             GPU_ON;
             ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
-            d_dst.download(dst);
-            d_dst1.download(dst1);
+            d_dst.download(ocl_dst);
+            d_dst1.download(ocl_dst1);
             GPU_FULL_OFF;
+
+            double diff1 = checkNorm(ocl_dst1, dst1);
+            double diff2 = checkNorm(ocl_dst, dst);
+            double max_diff = max(diff1, diff2);
+            TestSystem::instance().setAccurate(max_diff<=.5?1:0, max_diff);
+
         }
 
     }
 }
 
 ///////////// Magnitude ////////////////////////
-TEST(magnitude)
+PERFTEST(magnitude)
 {
-    Mat x, y, mag;
+    Mat x, y, mag, ocl_mag;
     ocl::oclMat d_x, d_y, d_mag;
 
     int all_type[] = {CV_32FC1};
@@ -487,24 +501,25 @@ TEST(magnitude)
 
             GPU_ON;
             ocl::magnitude(d_x, d_y, d_mag);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_x.upload(x);
             d_y.upload(y);
             ocl::magnitude(d_x, d_y, d_mag);
-            d_mag.download(mag);
+            d_mag.download(ocl_mag);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_mag, mag, 1e-5);
         }
 
     }
 }
 
 ///////////// Transpose ////////////////////////
-TEST(Transpose)
+PERFTEST(Transpose)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
@@ -532,23 +547,24 @@ TEST(Transpose)
 
             GPU_ON;
             ocl::transpose(d_src, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::transpose(d_src, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
         }
 
     }
 }
 
 ///////////// Flip ////////////////////////
-TEST(Flip)
+PERFTEST(Flip)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
@@ -576,26 +592,28 @@ TEST(Flip)
 
             GPU_ON;
             ocl::flip(d_src, d_dst, 0);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::flip(d_src, d_dst, 0);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
         }
 
     }
 }
 
 ///////////// minMax ////////////////////////
-TEST(minMax)
+PERFTEST(minMax)
 {
     Mat src;
     ocl::oclMat d_src;
 
-    double min_val, max_val;
+    double min_val = 0.0, max_val = 0.0;
+    double min_val_ = 0.0, max_val_ = 0.0;
     Point min_loc, max_loc;
     int all_type[] = {CV_8UC1, CV_32FC1};
     std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
@@ -614,12 +632,16 @@ TEST(minMax)
             d_src.upload(src);
 
             WARMUP_ON;
-            ocl::minMax(d_src, &min_val, &max_val);
+            ocl::minMax(d_src, &min_val_, &max_val_);
             WARMUP_OFF;
 
+            if(EeceptDoubleEQ<double>(max_val_, max_val) && EeceptDoubleEQ<double>(min_val_, min_val))
+                TestSystem::instance().setAccurate(1, max(fabs(max_val_-max_val), fabs(min_val_-min_val)));
+            else
+                TestSystem::instance().setAccurate(0, max(fabs(max_val_-max_val), fabs(min_val_-min_val)));
+
             GPU_ON;
             ocl::minMax(d_src, &min_val, &max_val);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
@@ -633,13 +655,15 @@ TEST(minMax)
 }
 
 ///////////// minMaxLoc ////////////////////////
-TEST(minMaxLoc)
+PERFTEST(minMaxLoc)
 {
     Mat src;
     ocl::oclMat d_src;
 
-    double min_val, max_val;
+    double min_val = 0.0, max_val = 0.0;
+    double min_val_ = 0.0, max_val_ = 0.0;
     Point min_loc, max_loc;
+    Point min_loc_, max_loc_;
     int all_type[] = {CV_8UC1, CV_32FC1};
     std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
@@ -657,12 +681,71 @@ TEST(minMaxLoc)
             d_src.upload(src);
 
             WARMUP_ON;
-            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
+            ocl::minMaxLoc(d_src, &min_val_, &max_val_, &min_loc_, &max_loc_);
             WARMUP_OFF;
 
+            double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.;
+            if(src.depth() == 0)
+            {
+                minlocVal = src.at<unsigned char>(min_loc);
+                minlocVal_ = src.at<unsigned char>(min_loc_);
+                maxlocVal = src.at<unsigned char>(max_loc);
+                maxlocVal_ = src.at<unsigned char>(max_loc_);
+            }
+            if(src.depth() == 1)
+            {
+                minlocVal = src.at<signed char>(min_loc);
+                minlocVal_ = src.at<signed char>(min_loc_);
+                maxlocVal = src.at<signed char>(max_loc);
+                maxlocVal_ = src.at<signed char>(max_loc_);
+            }
+            if(src.depth() == 2)
+            {
+                minlocVal = src.at<unsigned short>(min_loc);
+                minlocVal_ = src.at<unsigned short>(min_loc_);
+                maxlocVal = src.at<unsigned short>(max_loc);
+                maxlocVal_ = src.at<unsigned short>(max_loc_);
+            }
+            if(src.depth() == 3)
+            {
+                minlocVal = src.at<signed short>(min_loc);
+                minlocVal_ = src.at<signed short>(min_loc_);
+                maxlocVal = src.at<signed short>(max_loc);
+                maxlocVal_ = src.at<signed short>(max_loc_);
+            }
+            if(src.depth() == 4)
+            {
+                minlocVal = src.at<int>(min_loc);
+                minlocVal_ = src.at<int>(min_loc_);
+                maxlocVal = src.at<int>(max_loc);
+                maxlocVal_ = src.at<int>(max_loc_);
+            }
+            if(src.depth() == 5)
+            {
+                minlocVal = src.at<float>(min_loc);
+                minlocVal_ = src.at<float>(min_loc_);
+                maxlocVal = src.at<float>(max_loc);
+                maxlocVal_ = src.at<float>(max_loc_);
+            }
+            if(src.depth() == 6)
+            {
+                minlocVal = src.at<double>(min_loc);
+                minlocVal_ = src.at<double>(min_loc_);
+                maxlocVal = src.at<double>(max_loc);
+                maxlocVal_ = src.at<double>(max_loc_);
+            }
+            error0 = ::abs(minlocVal_ - minlocVal);
+            error1 = ::abs(maxlocVal_ - maxlocVal);
+            if( EeceptDoubleEQ<double>(maxlocVal_, maxlocVal)
+                &&EeceptDoubleEQ<double>(minlocVal_, minlocVal)
+                &&EeceptDoubleEQ<double>(max_val_, max_val)
+                &&EeceptDoubleEQ<double>(min_val_, min_val))
+                TestSystem::instance().setAccurate(1, 0.);
+            else
+                TestSystem::instance().setAccurate(0, max(error0, error1));
+
             GPU_ON;
             ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
@@ -675,7 +758,7 @@ TEST(minMaxLoc)
 }
 
 ///////////// Sum ////////////////////////
-TEST(Sum)
+PERFTEST(Sum)
 {
     Mat src;
     Scalar cpures, gpures;
@@ -690,7 +773,7 @@ TEST(Sum)
         {
             SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            gen(src, size, size, all_type[j], 0, 256);
+            gen(src, size, size, all_type[j], 0, 60);
 
             cpures = sum(src);
 
@@ -703,9 +786,16 @@ TEST(Sum)
             gpures = ocl::sum(d_src);
             WARMUP_OFF;
 
+            vector<double> diffs(4);
+            diffs[3] = fabs(cpures[3] - gpures[3]);
+            diffs[2] = fabs(cpures[2] - gpures[2]);
+            diffs[1] = fabs(cpures[1] - gpures[1]);
+            diffs[0] = fabs(cpures[0] - gpures[0]);
+            double max_diff = *max_element(diffs.begin(), diffs.end());
+            TestSystem::instance().setAccurate(max_diff<0.1?1:0, max_diff);
+
             GPU_ON;
             gpures = ocl::sum(d_src);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
@@ -718,7 +808,7 @@ TEST(Sum)
 }
 
 ///////////// countNonZero ////////////////////////
-TEST(countNonZero)
+PERFTEST(countNonZero)
 {
     Mat src;
     ocl::oclMat d_src;
@@ -736,18 +826,24 @@ TEST(countNonZero)
 
             countNonZero(src);
 
+            int cpures = 0, gpures = 0;
             CPU_ON;
-            countNonZero(src);
+            cpures = countNonZero(src);
             CPU_OFF;
             d_src.upload(src);
 
             WARMUP_ON;
-            ocl::countNonZero(d_src);
+            gpures = ocl::countNonZero(d_src);
             WARMUP_OFF;
 
+            int diff = abs(cpures - gpures);
+            if(diff == 0)
+                TestSystem::instance().setAccurate(1, 0);
+            else
+                TestSystem::instance().setAccurate(0, diff);
+
             GPU_ON;
             ocl::countNonZero(d_src);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
@@ -760,9 +856,9 @@ TEST(countNonZero)
 }
 
 ///////////// Phase ////////////////////////
-TEST(Phase)
+PERFTEST(Phase)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_dst;
 
     int all_type[] = {CV_32FC1};
@@ -778,12 +874,12 @@ TEST(Phase)
             gen(src2, size, size, all_type[j], 0, 256);
             gen(dst, size, size, all_type[j], 0, 256);
 
-
             phase(src1, src2, dst, 1);
 
             CPU_ON;
             phase(src1, src2, dst, 1);
             CPU_OFF;
+
             d_src1.upload(src1);
             d_src2.upload(src2);
 
@@ -793,24 +889,25 @@ TEST(Phase)
 
             GPU_ON;
             ocl::phase(d_src1, d_src2, d_dst, 1);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::phase(d_src1, d_src2, d_dst, 1);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-2);
         }
 
     }
 }
 
 ///////////// bitwise_and////////////////////////
-TEST(bitwise_and)
+PERFTEST(bitwise_and)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_dst;
 
     int all_type[] = {CV_8UC1, CV_32SC1};
@@ -826,7 +923,6 @@ TEST(bitwise_and)
             gen(src2, size, size, all_type[j], 0, 256);
             gen(dst, size, size, all_type[j], 0, 256);
 
-
             bitwise_and(src1, src2, dst);
 
             CPU_ON;
@@ -841,120 +937,25 @@ TEST(bitwise_and)
 
             GPU_ON;
             ocl::bitwise_and(d_src1, d_src2, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::bitwise_and(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
-        }
 
-    }
-}
-
-///////////// bitwise_or////////////////////////
-TEST(bitwise_or)
-{
-    Mat src1, src2, dst;
-    ocl::oclMat d_src1, d_src2, d_dst;
-
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_or(src1, src2, dst);
-
-            CPU_ON;
-            bitwise_or(src1, src2, dst);
-            CPU_OFF;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::bitwise_or(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_or(d_src1, d_src2, d_dst);
-             ;
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::bitwise_or(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-        }
-
-    }
-}
-
-///////////// bitwise_xor////////////////////////
-TEST(bitwise_xor)
-{
-    Mat src1, src2, dst;
-    ocl::oclMat d_src1, d_src2, d_dst;
-
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_xor(src1, src2, dst);
-
-            CPU_ON;
-            bitwise_xor(src1, src2, dst);
-            CPU_OFF;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::bitwise_xor(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_xor(d_src1, d_src2, d_dst);
-             ;
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::bitwise_xor(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
         }
 
     }
 }
 
 ///////////// bitwise_not////////////////////////
-TEST(bitwise_not)
+PERFTEST(bitwise_not)
 {
-    Mat src1, dst;
+    Mat src1, dst, ocl_dst;
     ocl::oclMat d_src1, d_dst;
 
     int all_type[] = {CV_8UC1, CV_32SC1};
@@ -969,7 +970,6 @@ TEST(bitwise_not)
             gen(src1, size, size, all_type[j], 0, 256);
             gen(dst, size, size, all_type[j], 0, 256);
 
-
             bitwise_not(src1, dst);
 
             CPU_ON;
@@ -983,23 +983,24 @@ TEST(bitwise_not)
 
             GPU_ON;
             ocl::bitwise_not(d_src1, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             ocl::bitwise_not(d_src1, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
         }
 
     }
 }
 
 ///////////// compare////////////////////////
-TEST(compare)
+PERFTEST(compare)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_dst;
 
     int CMP_EQ = 0;
@@ -1016,12 +1017,12 @@ TEST(compare)
             gen(src2, size, size, all_type[j], 0, 256);
             gen(dst, size, size, all_type[j], 0, 256);
 
-
             compare(src1, src2, dst, CMP_EQ);
 
             CPU_ON;
             compare(src1, src2, dst, CMP_EQ);
             CPU_OFF;
+
             d_src1.upload(src1);
             d_src2.upload(src2);
 
@@ -1031,24 +1032,25 @@ TEST(compare)
 
             GPU_ON;
             ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
         }
 
     }
 }
 
 ///////////// pow ////////////////////////
-TEST(pow)
+PERFTEST(pow)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int all_type[] = {CV_32FC1};
@@ -1060,8 +1062,7 @@ TEST(pow)
         {
             SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            gen(src, size, size, all_type[j], 0, 100);
-            gen(dst, size, size, all_type[j], 0, 100);
+            gen(src, size, size, all_type[j], 5, 16);
 
             pow(src, -2.0, dst);
 
@@ -1077,23 +1078,24 @@ TEST(pow)
 
             GPU_ON;
             ocl::pow(d_src, -2.0, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::pow(d_src, -2.0, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
         }
 
     }
 }
 
 ///////////// MagnitudeSqr////////////////////////
-TEST(MagnitudeSqr)
+PERFTEST(MagnitudeSqr)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_dst;
 
     int all_type[] = {CV_32FC1};
@@ -1109,31 +1111,17 @@ TEST(MagnitudeSqr)
             gen(src2, size, size, all_type[t], 0, 256);
             gen(dst, size, size, all_type[t], 0, 256);
 
-
-            for (int i = 0; i < src1.rows; ++i)
-
-                for (int j = 0; j < src1.cols; ++j)
-                {
-                    float val1 = src1.at<float>(i, j);
-                    float val2 = src2.at<float>(i, j);
-
-                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
-
-                }
-
             CPU_ON;
-
             for (int i = 0; i < src1.rows; ++i)
                 for (int j = 0; j < src1.cols; ++j)
                 {
                     float val1 = src1.at<float>(i, j);
                     float val2 = src2.at<float>(i, j);
-
                     ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
 
                 }
-
             CPU_OFF;
+
             d_src1.upload(src1);
             d_src2.upload(src2);
 
@@ -1143,24 +1131,25 @@ TEST(MagnitudeSqr)
 
             GPU_ON;
             ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
         }
 
     }
 }
 
 ///////////// AddWeighted////////////////////////
-TEST(AddWeighted)
+PERFTEST(AddWeighted)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_dst;
 
     double alpha = 2.0, beta = 1.0, gama = 3.0;
@@ -1192,15 +1181,16 @@ TEST(AddWeighted)
 
             GPU_ON;
             ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             d_src2.upload(src2);
             ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
         }
 
     }
diff --git a/modules/ocl/perf/perf_blend.cpp b/modules/ocl/perf/perf_blend.cpp
index 00034700b4..8ebb6482ba 100644
--- a/modules/ocl/perf/perf_blend.cpp
+++ b/modules/ocl/perf/perf_blend.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -68,9 +69,9 @@ void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &we
         }
     }
 }
-TEST(blend)
+PERFTEST(blend)
 {
-    Mat src1, src2, weights1, weights2, dst;
+    Mat src1, src2, weights1, weights2, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
@@ -104,7 +105,6 @@ TEST(blend)
 
             GPU_ON;
             ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
@@ -113,8 +113,10 @@ TEST(blend)
             d_weights1.upload(weights1);
             d_weights2.upload(weights2);
             ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.f);
         }
     }
 }
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_brute_force_matcher.cpp b/modules/ocl/perf/perf_brute_force_matcher.cpp
index 6562f91e43..406b46a324 100644
--- a/modules/ocl/perf/perf_brute_force_matcher.cpp
+++ b/modules/ocl/perf/perf_brute_force_matcher.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,7 +46,7 @@
 #include "precomp.hpp"
 
 //////////////////// BruteForceMatch /////////////////
-TEST(BruteForceMatcher)
+PERFTEST(BruteForceMatcher)
 {
     Mat trainIdx_cpu;
     Mat distance_cpu;
@@ -66,6 +67,7 @@ TEST(BruteForceMatcher)
         gen(train, size, desc_len, CV_32F, 0, 1);
         // Output
         vector< vector<DMatch> > matches(2);
+        vector< vector<DMatch> > d_matches(2);
         // Init GPU matcher
         ocl::BruteForceMatcher_OCL_base d_matcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
 
@@ -88,15 +90,20 @@ TEST(BruteForceMatcher)
 
         GPU_ON;
         d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
-         ;
         GPU_OFF;
 
         GPU_FULL_ON;
         d_query.upload(query);
         d_train.upload(train);
-        d_matcher.match(d_query, d_train, matches[0]);
+        d_matcher.match(d_query, d_train, d_matches[0]);
         GPU_FULL_OFF;
 
+        int diff = abs((int)d_matches[0].size() - (int)matches[0].size());
+        if(diff == 0)
+            TestSystem::instance().setAccurate(1, 0);
+        else
+            TestSystem::instance().setAccurate(0, diff);
+
         SUBTEST << size << "; knnMatch";
 
         matcher.knnMatch(query, train, matches, 2);
@@ -111,15 +118,20 @@ TEST(BruteForceMatcher)
 
         GPU_ON;
         d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
-         ;
         GPU_OFF;
 
         GPU_FULL_ON;
         d_query.upload(query);
         d_train.upload(train);
-        d_matcher.knnMatch(d_query, d_train, matches, 2);
+        d_matcher.knnMatch(d_query, d_train, d_matches, 2);
         GPU_FULL_OFF;
 
+        diff = abs((int)d_matches[0].size() - (int)matches[0].size());
+        if(diff == 0)
+            TestSystem::instance().setAccurate(1, 0);
+        else
+            TestSystem::instance().setAccurate(0, diff);
+
         SUBTEST << size << "; radiusMatch";
 
         float max_distance = 2.0f;
@@ -138,13 +150,18 @@ TEST(BruteForceMatcher)
 
         GPU_ON;
         d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
-         ;
         GPU_OFF;
 
         GPU_FULL_ON;
         d_query.upload(query);
         d_train.upload(train);
-        d_matcher.radiusMatch(d_query, d_train, matches, max_distance);
+        d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
         GPU_FULL_OFF;
+
+        diff = abs((int)d_matches[0].size() - (int)matches[0].size());
+        if(diff == 0)
+            TestSystem::instance().setAccurate(1, 0);
+        else
+            TestSystem::instance().setAccurate(0, diff);
     }
 }
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_columnsum.cpp b/modules/ocl/perf/perf_calib3d.cpp
similarity index 66%
rename from modules/ocl/perf/perf_columnsum.cpp
rename to modules/ocl/perf/perf_calib3d.cpp
index d2e3b45e53..f998ddf0f3 100644
--- a/modules/ocl/perf/perf_columnsum.cpp
+++ b/modules/ocl/perf/perf_calib3d.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -42,47 +43,59 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #include "precomp.hpp"
-
-///////////// columnSum////////////////////////
-TEST(columnSum)
+///////////// StereoMatchBM ////////////////////////
+PERFTEST(StereoMatchBM)
 {
-    Mat src, dst;
-    ocl::oclMat d_src, d_dst;
+	Mat left_image = imread(abspath("aloeL.jpg"), cv::IMREAD_GRAYSCALE);
+	Mat right_image = imread(abspath("aloeR.jpg"), cv::IMREAD_GRAYSCALE);
+	Mat disp,dst;
+	ocl::oclMat d_left, d_right,d_disp;
+	int n_disp= 128;
+	int winSize =19;
 
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        SUBTEST << size << 'x' << size << "; CV_32FC1";
+	SUBTEST << left_image.cols << 'x' << left_image.rows << "; aloeL.jpg ;"<< right_image.cols << 'x' << right_image.rows << "; aloeR.jpg ";
 
-        gen(src, size, size, CV_32FC1, 0, 256);
+	StereoBM bm(0, n_disp, winSize);
+	bm(left_image, right_image, dst);
 
-        CPU_ON;
-        dst.create(src.size(), src.type());
+	CPU_ON;
+	bm(left_image, right_image, dst);
+	CPU_OFF;
 
-        for (int i = 1; i < src.rows; ++i)
-        {
-            for (int j = 0; j < src.cols; ++j)
-            {
-                dst.at<float>(i, j) = src.at<float>(i, j) += src.at<float>(i - 1, j);
-            }
-        }
+	d_left.upload(left_image);
+	d_right.upload(right_image);
 
-        CPU_OFF;
+	ocl::StereoBM_OCL d_bm(0, n_disp, winSize);
 
-        d_src.upload(src);
-        WARMUP_ON;
-        ocl::columnSum(d_src, d_dst);
-        WARMUP_OFF;
+	WARMUP_ON;
+	d_bm(d_left, d_right, d_disp);
+	WARMUP_OFF;
 
-        GPU_ON;
-        ocl::columnSum(d_src, d_dst);
-         ;
-        GPU_OFF;
+    cv::Mat ocl_mat;
+    d_disp.download(ocl_mat);
+    ocl_mat.convertTo(ocl_mat, dst.type());
 
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::columnSum(d_src, d_dst);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-    }
-}
\ No newline at end of file
+	GPU_ON;
+	d_bm(d_left, d_right, d_disp);
+	GPU_OFF;
+
+	GPU_FULL_ON;
+	d_left.upload(left_image);
+	d_right.upload(right_image);
+	d_bm(d_left, d_right, d_disp);
+	d_disp.download(disp);
+	GPU_FULL_OFF;
+    
+    TestSystem::instance().setAccurate(-1, 0.);
+}
+
+
+
+
+
+
+
+
+	
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_canny.cpp b/modules/ocl/perf/perf_canny.cpp
index 428e036d0c..cb23d7ad28 100644
--- a/modules/ocl/perf/perf_canny.cpp
+++ b/modules/ocl/perf/perf_canny.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,7 +46,7 @@
 #include "precomp.hpp"
 
 ///////////// Canny ////////////////////////
-TEST(Canny)
+PERFTEST(Canny)
 {
     Mat img = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
 
@@ -56,7 +57,7 @@ TEST(Canny)
 
     SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";
 
-    Mat edges(img.size(), CV_8UC1);
+    Mat edges(img.size(), CV_8UC1), ocl_edges;
 
     CPU_ON;
     Canny(img, edges, 50.0, 100.0);
@@ -72,12 +73,13 @@ TEST(Canny)
 
     GPU_ON;
     ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-     ;
     GPU_OFF;
 
     GPU_FULL_ON;
     d_img.upload(img);
     ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    d_edges.download(edges);
+    d_edges.download(ocl_edges);
     GPU_FULL_OFF;
+
+    TestSystem::instance().ExceptedMatSimilar(edges, ocl_edges, 2e-2);
 }
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_color.cpp b/modules/ocl/perf/perf_color.cpp
index e32a1839d8..daf1cfdc9c 100644
--- a/modules/ocl/perf/perf_color.cpp
+++ b/modules/ocl/perf/perf_color.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,9 +46,9 @@
 #include "precomp.hpp"
 
 ///////////// cvtColor////////////////////////
-TEST(cvtColor)
+PERFTEST(cvtColor)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int all_type[] = {CV_8UC4};
@@ -74,14 +75,15 @@ TEST(cvtColor)
 
             GPU_ON;
             ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExceptedMatSimilar(dst, ocl_dst, 1e-5);
         }
 
 
diff --git a/modules/ocl/perf/perf_fft.cpp b/modules/ocl/perf/perf_fft.cpp
index 50be2546ee..6e0be3f19d 100644
--- a/modules/ocl/perf/perf_fft.cpp
+++ b/modules/ocl/perf/perf_fft.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,13 +46,13 @@
 #include "precomp.hpp"
 
 ///////////// dft ////////////////////////
-TEST(dft)
+PERFTEST(dft)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
-    int all_type[] = {CV_32FC1, CV_32FC2};
-    std::string type_name[] = {"CV_32FC1", "CV_32FC2"};
+    int all_type[] = {CV_32FC2};
+    std::string type_name[] = {"CV_32FC2"};
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
@@ -75,14 +76,15 @@ TEST(dft)
 
             GPU_ON;
             ocl::dft(d_src, d_dst, Size(size, size));
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::dft(d_src, d_dst, Size(size, size));
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, src.size().area() * 1e-4);
         }
 
     }
diff --git a/modules/ocl/perf/perf_filters.cpp b/modules/ocl/perf/perf_filters.cpp
index e9646c77e2..e988ce09d6 100644
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,9 +46,9 @@
 #include "precomp.hpp"
 
 ///////////// Blur////////////////////////
-TEST(Blur)
+PERFTEST(Blur)
 {
-    Mat src1, dst;
+    Mat src1, dst, ocl_dst;
     ocl::oclMat d_src1, d_dst;
 
     Size ksize = Size(3, 3);
@@ -64,7 +65,6 @@ TEST(Blur)
             gen(src1, size, size, all_type[j], 0, 256);
             gen(dst, size, size, all_type[j], 0, 256);
 
-
             blur(src1, dst, ksize, Point(-1, -1), bordertype);
 
             CPU_ON;
@@ -79,22 +79,23 @@ TEST(Blur)
 
             GPU_ON;
             ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
         }
 
     }
 }
 ///////////// Laplacian////////////////////////
-TEST(Laplacian)
+PERFTEST(Laplacian)
 {
-    Mat src1, dst;
+    Mat src1, dst, ocl_dst;
     ocl::oclMat d_src1, d_dst;
 
     int ksize = 3;
@@ -110,7 +111,6 @@ TEST(Laplacian)
             gen(src1, size, size, all_type[j], 0, 256);
             gen(dst, size, size, all_type[j], 0, 256);
 
-
             Laplacian(src1, dst, -1, ksize, 1);
 
             CPU_ON;
@@ -125,23 +125,24 @@ TEST(Laplacian)
 
             GPU_ON;
             ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src1.upload(src1);
             ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
         }
 
     }
 }
 
 ///////////// Erode ////////////////////
-TEST(Erode)
+PERFTEST(Erode)
 {
-    Mat src, dst, ker;
+    Mat src, dst, ker, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
@@ -170,23 +171,24 @@ TEST(Erode)
 
             GPU_ON;
             ocl::erode(d_src, d_dst, ker);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::erode(d_src, d_dst, ker);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
         }
 
     }
 }
 
 ///////////// Sobel ////////////////////////
-TEST(Sobel)
+PERFTEST(Sobel)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int dx = 1;
@@ -216,22 +218,23 @@ TEST(Sobel)
 
             GPU_ON;
             ocl::Sobel(d_src, d_dst, -1, dx, dy);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::Sobel(d_src, d_dst, -1, dx, dy);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
         }
 
     }
 }
 ///////////// Scharr ////////////////////////
-TEST(Scharr)
+PERFTEST(Scharr)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int dx = 1;
@@ -261,25 +264,27 @@ TEST(Scharr)
 
             GPU_ON;
             ocl::Scharr(d_src, d_dst, -1, dx, dy);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::Scharr(d_src, d_dst, -1, dx, dy);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
         }
 
     }
 }
 
 ///////////// GaussianBlur ////////////////////////
-TEST(GaussianBlur)
+PERFTEST(GaussianBlur)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
     std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
+    const int ksize = 7;	
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
@@ -289,37 +294,37 @@ TEST(GaussianBlur)
 
             gen(src, size, size, all_type[j], 0, 256);
 
-            GaussianBlur(src, dst, Size(9, 9), 0);
+            GaussianBlur(src, dst, Size(ksize, ksize), 0);
 
             CPU_ON;
-            GaussianBlur(src, dst, Size(9, 9), 0);
+            GaussianBlur(src, dst, Size(ksize, ksize), 0);
             CPU_OFF;
 
             ocl::oclMat d_src(src);
-            ocl::oclMat d_dst(src.size(), src.type());
-            ocl::oclMat d_buf;
+            ocl::oclMat d_dst;
 
             WARMUP_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
             WARMUP_OFF;
 
             GPU_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
-             ;
+            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
-            d_dst.download(dst);
+            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
         }
 
     }
 }
 
 ///////////// filter2D////////////////////////
-TEST(filter2D)
+PERFTEST(filter2D)
 {
     Mat src;
 
@@ -332,38 +337,38 @@ TEST(filter2D)
         {
             gen(src, size, size, all_type[j], 0, 256);
 
-            for (int ksize = 3; ksize <= 15; ksize = 2*ksize+1)
-            {
-                SUBTEST << "ksize = " << ksize << "; " << size << 'x' << size << "; " << type_name[j] ;
+            const int ksize = 3;
 
-                Mat kernel;
-                gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);
+            SUBTEST << "ksize = " << ksize << "; " << size << 'x' << size << "; " << type_name[j] ;
 
-                Mat dst;
-                cv::filter2D(src, dst, -1, kernel);
+            Mat kernel;
+            gen(kernel, ksize, ksize, CV_32SC1, -3.0, 3.0);
 
-                CPU_ON;
-                cv::filter2D(src, dst, -1, kernel);
-                CPU_OFF;
+            Mat dst, ocl_dst;
 
-                ocl::oclMat d_src(src);
-                ocl::oclMat d_dst;
+            cv::filter2D(src, dst, -1, kernel);
 
-                WARMUP_ON;
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                WARMUP_OFF;
+            CPU_ON;
+            cv::filter2D(src, dst, -1, kernel);
+            CPU_OFF;
 
-                GPU_ON;
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                 ;
-                GPU_OFF;
+            ocl::oclMat d_src(src), d_dst;
 
-                GPU_FULL_ON;
-                d_src.upload(src);
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                d_dst.download(dst);
-                GPU_FULL_OFF;
-            }
+            WARMUP_ON;
+            ocl::filter2D(d_src, d_dst, -1, kernel);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::filter2D(d_src, d_dst, -1, kernel);
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::filter2D(d_src, d_dst, -1, kernel);
+            d_dst.download(ocl_dst);
+            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
 
         }
 
diff --git a/modules/ocl/perf/perf_gemm.cpp b/modules/ocl/perf/perf_gemm.cpp
index 930ecb0464..f197c5f5a0 100644
--- a/modules/ocl/perf/perf_gemm.cpp
+++ b/modules/ocl/perf/perf_gemm.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,9 +46,9 @@
 #include "precomp.hpp"
 
 ///////////// gemm ////////////////////////
-TEST(gemm)
+PERFTEST(gemm)
 {
-    Mat src1, src2, src3, dst;
+    Mat src1, src2, src3, dst, ocl_dst;
     ocl::oclMat d_src1, d_src2, d_src3, d_dst;
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -74,7 +75,6 @@ TEST(gemm)
 
         GPU_ON;
         ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-         ;
         GPU_OFF;
 
         GPU_FULL_ON;
@@ -82,7 +82,9 @@ TEST(gemm)
         d_src2.upload(src2);
         d_src3.upload(src3);
         ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
         GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(ocl_dst, dst, src1.cols * src1.rows * 1e-4);
     }
 }
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_haar.cpp b/modules/ocl/perf/perf_haar.cpp
index 5a909ace4e..72f01dc935 100644
--- a/modules/ocl/perf/perf_haar.cpp
+++ b/modules/ocl/perf/perf_haar.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -82,7 +83,7 @@ public:
 
 }
 }
-TEST(Haar)
+PERFTEST(Haar)
 {
     Mat img = imread(abspath("basketball1.png"), CV_LOAD_IMAGE_GRAYSCALE);
 
@@ -106,6 +107,8 @@ TEST(Haar)
                                     1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
     CPU_OFF;
 
+
+    vector<Rect> oclfaces;
     ocl::CascadeClassifier_GPU faceCascade;
 
     if (!faceCascade.load(abspath("haarcascade_frontalface_alt.xml")))
@@ -115,24 +118,26 @@ TEST(Haar)
 
     ocl::oclMat d_img(img);
 
-    faces.clear();
-
     WARMUP_ON;
-    faceCascade.detectMultiScale(d_img, faces,
+    faceCascade.detectMultiScale(d_img, oclfaces,
                                  1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
     WARMUP_OFF;
 
+    if(faces.size() == oclfaces.size())
+        TestSystem::instance().setAccurate(1, 0);
+    else
+        TestSystem::instance().setAccurate(0, abs((int)faces.size() - (int)oclfaces.size()));
+
     faces.clear();
 
     GPU_ON;
-    faceCascade.detectMultiScale(d_img, faces,
+    faceCascade.detectMultiScale(d_img, oclfaces,
                                  1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-     ;
     GPU_OFF;
 
     GPU_FULL_ON;
     d_img.upload(img);
-    faceCascade.detectMultiScale(d_img, faces,
+    faceCascade.detectMultiScale(d_img, oclfaces,
                                  1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
     GPU_FULL_OFF;
 }
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_hog.cpp b/modules/ocl/perf/perf_hog.cpp
index b74077ff40..7daa61396c 100644
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,7 +46,8 @@
 #include "precomp.hpp"
 
 ///////////// HOG////////////////////////
-TEST(HOG)
+
+PERFTEST(HOG)
 {
     Mat src = imread(abspath("road.png"), cv::IMREAD_GRAYSCALE);
 
@@ -54,12 +56,12 @@ TEST(HOG)
         throw runtime_error("can't open road.png");
     }
 
-
     cv::HOGDescriptor hog;
     hog.setSVMDetector(hog.getDefaultPeopleDetector());
     std::vector<cv::Rect> found_locations;
+    std::vector<cv::Rect> d_found_locations;
 
-    SUBTEST << 768 << 'x' << 576 << "; road.png";
+    SUBTEST << src.cols << 'x' << src.rows << "; road.png";
 
     hog.detectMultiScale(src, found_locations);
 
@@ -73,12 +75,16 @@ TEST(HOG)
     d_src.upload(src);
 
     WARMUP_ON;
-    ocl_hog.detectMultiScale(d_src, found_locations);
+    ocl_hog.detectMultiScale(d_src, d_found_locations);
     WARMUP_OFF;
+    
+    if(d_found_locations.size() == found_locations.size())
+        TestSystem::instance().setAccurate(1, 0);
+    else
+        TestSystem::instance().setAccurate(0, abs((int)found_locations.size() - (int)d_found_locations.size()));
 
     GPU_ON;
     ocl_hog.detectMultiScale(d_src, found_locations);
-     ;
     GPU_OFF;
 
     GPU_FULL_ON;
diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp
index 756f69556f..ade5019147 100644
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,9 +46,9 @@
 #include "precomp.hpp"
 
 ///////////// equalizeHist ////////////////////////
-TEST(equalizeHist)
+PERFTEST(equalizeHist)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     int all_type[] = {CV_8UC1};
     std::string type_name[] = {"CV_8UC1"};
 
@@ -76,22 +77,23 @@ TEST(equalizeHist)
 
             GPU_ON;
             ocl::equalizeHist(d_src, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::equalizeHist(d_src, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.1);
         }
 
     }
 }
 /////////// CopyMakeBorder //////////////////////
-TEST(CopyMakeBorder)
+PERFTEST(CopyMakeBorder)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_dst;
 
     int bordertype = BORDER_CONSTANT;
@@ -121,22 +123,23 @@ TEST(CopyMakeBorder)
 
             GPU_ON;
             ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
         }
 
     }
 }
 ///////////// cornerMinEigenVal ////////////////////////
-TEST(cornerMinEigenVal)
+PERFTEST(cornerMinEigenVal)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_dst;
 
     int blockSize = 7, apertureSize = 1 + 2 * (rand() % 4);
@@ -150,7 +153,6 @@ TEST(cornerMinEigenVal)
         {
             SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-
             gen(src, size, size, all_type[j], 0, 256);
 
             cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
@@ -167,22 +169,23 @@ TEST(cornerMinEigenVal)
 
             GPU_ON;
             ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
         }
 
     }
 }
 ///////////// cornerHarris ////////////////////////
-TEST(cornerHarris)
+PERFTEST(cornerHarris)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int all_type[] = {CV_8UC1, CV_32FC1};
@@ -210,23 +213,24 @@ TEST(cornerHarris)
 
             GPU_ON;
             ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
         }
 
 
     }
 }
 ///////////// integral ////////////////////////
-TEST(integral)
+PERFTEST(integral)
 {
-    Mat src, sum;
+    Mat src, sum, ocl_sum;
     ocl::oclMat d_src, d_sum, d_buf;
 
     int all_type[] = {CV_8UC1};
@@ -254,28 +258,31 @@ TEST(integral)
 
             GPU_ON;
             ocl::integral(d_src, d_sum);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::integral(d_src, d_sum);
-            d_sum.download(sum);
+            d_sum.download(ocl_sum);
             GPU_FULL_OFF;
+
+            if(sum.type() == ocl_sum.type()) //we won't test accuracy when cpu function overlow
+                TestSystem::instance().ExpectedMatNear(sum, ocl_sum, 0.0);
+
         }
 
     }
 }
 ///////////// WarpAffine ////////////////////////
-TEST(WarpAffine)
+PERFTEST(WarpAffine)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     static const double coeffs[2][3] =
     {
-        {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
-        {sin(3.14 / 6), cos(3.14 / 6), -100.0}
+        {cos(CV_PI / 6), -sin(CV_PI / 6), 100.0},
+        {sin(CV_PI / 6), cos(CV_PI / 6), -100.0}
     };
     Mat M(2, 3, CV_64F, (void *)coeffs);
     int interpolation = INTER_NEAREST;
@@ -308,32 +315,33 @@ TEST(WarpAffine)
 
             GPU_ON;
             ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
         }
 
     }
 }
 ///////////// WarpPerspective ////////////////////////
-TEST(WarpPerspective)
+PERFTEST(WarpPerspective)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     static const double coeffs[3][3] =
     {
-        {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
-        {sin(3.14 / 6), cos(3.14 / 6), -100.0},
+        {cos(CV_PI / 6), -sin(CV_PI / 6), 100.0},
+        {sin(CV_PI / 6), cos(CV_PI / 6), -100.0},
         {0.0, 0.0, 1.0}
     };
     Mat M(3, 3, CV_64F, (void *)coeffs);
-    int interpolation = INTER_NEAREST;
+    int interpolation = INTER_LINEAR;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
     std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -362,23 +370,24 @@ TEST(WarpPerspective)
 
             GPU_ON;
             ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
         }
 
     }
 }
 
 ///////////// resize ////////////////////////
-TEST(resize)
+PERFTEST(resize)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
 
@@ -407,14 +416,15 @@ TEST(resize)
 
             GPU_ON;
             ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
         }
 
     }
@@ -441,25 +451,25 @@ TEST(resize)
 
             GPU_ON;
             ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
         }
 
     }
 }
 ///////////// threshold////////////////////////
-TEST(threshold)
+PERFTEST(threshold)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
-
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
         SUBTEST << size << 'x' << size << "; 8UC1; THRESH_BINARY";
@@ -480,15 +490,15 @@ TEST(threshold)
 
         GPU_ON;
         ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
-         ;
         GPU_OFF;
 
         GPU_FULL_ON;
         d_src.upload(src);
         ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
         GPU_FULL_OFF;
 
+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
     }
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -511,57 +521,18 @@ TEST(threshold)
 
         GPU_ON;
         ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
-         ;
         GPU_OFF;
 
         GPU_FULL_ON;
         d_src.upload(src);
         ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
         GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
     }
 }
 ///////////// meanShiftFiltering////////////////////////
-TEST(meanShiftFiltering)
-{
-    int sp = 10, sr = 10;
-    Mat src, dst;
-
-    ocl::oclMat d_src, d_dst;
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        SUBTEST << size << 'x' << size << "; 8UC3 vs 8UC4";
-
-        gen(src, size, size, CV_8UC3, Scalar::all(0), Scalar::all(256));
-
-        pyrMeanShiftFiltering(src, dst, sp, sr);
-
-        CPU_ON;
-        pyrMeanShiftFiltering(src, dst, sp, sr);
-        CPU_OFF;
-
-        gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-
-        d_src.upload(src);
-
-        WARMUP_ON;
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
-         ;
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-    }
-}
-///////////// meanShiftProc////////////////////////
 COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size size, int sp, int sr, int maxIter, float eps, int *tab)
 {
 
@@ -575,9 +546,8 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
     c1 = sptr[1];
     c2 = sptr[2];
     c3 = sptr[3];
-
     // iterate meanshift procedure
-    for (iter = 0; iter < maxIter; iter++)
+    for(iter = 0; iter < maxIter; iter++ )
     {
         int count = 0;
         int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
@@ -589,27 +559,11 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
         int maxy = y0 + sp;
 
         //deal with the image boundary
-        if (minx < 0)
-        {
-            minx = 0;
-        }
-
-        if (miny < 0)
-        {
-            miny = 0;
-        }
-
-        if (maxx >= size.width)
-        {
-            maxx = size.width - 1;
-        }
-
-        if (maxy >= size.height)
-        {
-            maxy = size.height - 1;
-        }
-
-        if (iter == 0)
+        if(minx < 0) minx = 0;
+        if(miny < 0) miny = 0;
+        if(maxx >= size.width) maxx = size.width - 1;
+        if(maxy >= size.height) maxy = size.height - 1;
+        if(iter == 0)
         {
             pstart = sptr;
         }
@@ -617,22 +571,19 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
         {
             pstart = pstart + revy * sstep + (revx << 2); //point to the new position
         }
-
         ptr = pstart;
         ptr = ptr + (miny - y0) * sstep + ((minx - x0) << 2); //point to the start in the row
 
-        for (int y = miny; y <= maxy; y++, ptr += sstep - ((maxx - minx + 1) << 2))
+        for( int y = miny; y <= maxy; y++, ptr += sstep - ((maxx - minx + 1) << 2))
         {
             int rowCount = 0;
             int x = minx;
 #if CV_ENABLE_UNROLLED
-
-            for (; x + 4 <= maxx; x += 4, ptr += 16)
+            for( ; x + 4 <= maxx; x += 4, ptr += 16)
             {
                 int t0, t1, t2;
                 t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                 {
                     s0 += t0;
                     s1 += t1;
@@ -640,10 +591,8 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
                     sx += x;
                     rowCount++;
                 }
-
                 t0 = ptr[4], t1 = ptr[5], t2 = ptr[6];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                 {
                     s0 += t0;
                     s1 += t1;
@@ -651,10 +600,8 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
                     sx += x + 1;
                     rowCount++;
                 }
-
                 t0 = ptr[8], t1 = ptr[9], t2 = ptr[10];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                 {
                     s0 += t0;
                     s1 += t1;
@@ -662,10 +609,8 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
                     sx += x + 2;
                     rowCount++;
                 }
-
                 t0 = ptr[12], t1 = ptr[13], t2 = ptr[14];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                 {
                     s0 += t0;
                     s1 += t1;
@@ -674,14 +619,11 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
                     rowCount++;
                 }
             }
-
 #endif
-
-            for (; x <= maxx; x++, ptr += 4)
+            for(; x <= maxx; x++, ptr += 4)
             {
                 int t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                 {
                     s0 += t0;
                     s1 += t1;
@@ -690,20 +632,14 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
                     rowCount++;
                 }
             }
-
-            if (rowCount == 0)
-            {
+            if(rowCount == 0)
                 continue;
-            }
-
             count += rowCount;
             sy += y * rowCount;
         }
 
-        if (count == 0)
-        {
+        if( count == 0 )
             break;
-        }
 
         int x1 = sx / count;
         int y1 = sy / count;
@@ -712,7 +648,7 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
         s2 = s2 / count;
 
         bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
-                        tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);
+            tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);
 
         //revise the pointer corresponding to the new (y0,x0)
         revx = x1 - x0;
@@ -724,10 +660,8 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
         c1 = s1;
         c2 = s2;
 
-        if (stopFlag)
-        {
+        if( stopFlag )
             break;
-        }
     } //for iter
 
     dptr[0] = (uchar)c0;
@@ -741,19 +675,101 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
     return coor;
 }
 
+static void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, cv::TermCriteria crit)
+{
+    if( src_roi.empty() )
+        CV_Error( CV_StsBadArg, "The input image is empty" );
+
+    if( src_roi.depth() != CV_8U || src_roi.channels() != 4 )
+        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
+
+    dst_roi.create(src_roi.size(), src_roi.type());
+
+    CV_Assert( (src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) );
+    CV_Assert( !(dst_roi.step & 0x3) );
+
+    if( !(crit.type & cv::TermCriteria::MAX_ITER) )
+        crit.maxCount = 5;
+    int maxIter = std::min(std::max(crit.maxCount, 1), 100);
+    float eps;
+    if( !(crit.type & cv::TermCriteria::EPS) )
+        eps = 1.f;
+    eps = (float)std::max(crit.epsilon, 0.0);
+
+    int tab[512];
+    for(int i = 0; i < 512; i++)
+        tab[i] = (i - 255) * (i - 255);
+    uchar *sptr = src_roi.data;
+    uchar *dptr = dst_roi.data;
+    int sstep = (int)src_roi.step;
+    int dstep = (int)dst_roi.step;
+    cv::Size size = src_roi.size();
+
+    for(int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
+        dptr += dstep - (size.width << 2))
+    {
+        for(int j = 0; j < size.width; j++, sptr += 4, dptr += 4)
+        {
+            do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
+        }
+    }
+}
+
+PERFTEST(meanShiftFiltering)
+{
+    int sp = 5, sr = 6;
+    Mat src, dst, ocl_dst;
+
+    ocl::oclMat d_src, d_dst;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; 8UC3 vs 8UC4";
+
+        gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
+
+        cv::TermCriteria crit(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 5, 1);
+
+        meanShiftFiltering_(src, dst, sp, sr, crit);
+
+        CPU_ON;
+        meanShiftFiltering_(src, dst, sp, sr, crit);
+        CPU_OFF;
+
+        d_src.upload(src);
+
+        WARMUP_ON;
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
+        d_dst.download(ocl_dst);
+        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
+    }
+}
+
 void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp, int sr, cv::TermCriteria crit)
 {
-
     if (src_roi.empty())
     {
         CV_Error(CV_StsBadArg, "The input image is empty");
     }
-
     if (src_roi.depth() != CV_8U || src_roi.channels() != 4)
     {
         CV_Error(CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
     }
 
+    dst_roi.create(src_roi.size(), src_roi.type());
+    dstCoor_roi.create(src_roi.size(), CV_16SC2);
+
     CV_Assert((src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) &&
               (src_roi.cols == dstCoor_roi.cols) && (src_roi.rows == dstCoor_roi.rows));
     CV_Assert(!(dstCoor_roi.step & 0x3));
@@ -798,10 +814,11 @@ void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp,
     }
 
 }
-TEST(meanShiftProc)
+PERFTEST(meanShiftProc)
 {
-    Mat src, dst, dstCoor_roi;
-    ocl::oclMat d_src, d_dst, d_dstCoor_roi;
+    Mat src;
+    vector<Mat> dst(2), ocl_dst(2);
+    ocl::oclMat d_src, d_dst, d_dstCoor;
 
     TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 5, 1);
 
@@ -810,40 +827,39 @@ TEST(meanShiftProc)
         SUBTEST << size << 'x' << size << "; 8UC4 and CV_16SC2 ";
 
         gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(dst, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(dstCoor_roi, size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
 
-        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
+        meanShiftProc_(src, dst[0], dst[1], 5, 6, crit);
 
         CPU_ON;
-        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
+        meanShiftProc_(src, dst[0], dst[1], 5, 6, crit);
         CPU_OFF;
 
         d_src.upload(src);
 
         WARMUP_ON;
-        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
         WARMUP_OFF;
 
         GPU_ON;
-        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
-         ;
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
         GPU_OFF;
 
         GPU_FULL_ON;
         d_src.upload(src);
-        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
-        d_dst.download(dst);
-        d_dstCoor_roi.download(dstCoor_roi);
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
+        d_dst.download(ocl_dst[0]);
+        d_dstCoor.download(ocl_dst[1]);
         GPU_FULL_OFF;
 
+        vector<double> eps(2, 0.);
+        TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);      
     }
 }
 
 ///////////// remap////////////////////////
-TEST(remap)
+PERFTEST(remap)
 {
-    Mat src, dst, xmap, ymap;
+    Mat src, dst, xmap, ymap, ocl_dst;
     ocl::oclMat d_src, d_dst, d_xmap, d_ymap;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
@@ -876,7 +892,6 @@ TEST(remap)
                 }
             }
 
-
             remap(src, dst, xmap, ymap, interpolation, borderMode);
 
             CPU_ON;
@@ -894,15 +909,105 @@ TEST(remap)
 
             GPU_ON;
             ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 2.0);
         }
 
     }
-}
\ No newline at end of file
+}
+///////////// CLAHE ////////////////////////
+PERFTEST(CLAHE)
+{
+    Mat src, dst, ocl_dst;
+    cv::ocl::oclMat d_src, d_dst;
+    int all_type[] = {CV_8UC1};
+    std::string type_name[] = {"CV_8UC1"};
+
+    double clipLimit = 40.0;
+
+    cv::Ptr<cv::CLAHE> clahe   = cv::createCLAHE(clipLimit);
+    cv::Ptr<cv::CLAHE> d_clahe = cv::ocl::createCLAHE(clipLimit);
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src, size, size, all_type[j], 0, 256);
+
+            CPU_ON;
+            clahe->apply(src, dst);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            d_clahe->apply(d_src, d_dst);
+            WARMUP_OFF;
+
+            ocl_dst = d_dst;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
+
+            GPU_ON;
+            d_clahe->apply(d_src, d_dst);
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_clahe->apply(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
+    }
+}
+
+///////////// columnSum////////////////////////
+PERFTEST(columnSum)
+{
+    Mat src, dst, ocl_dst;
+    ocl::oclMat d_src, d_dst;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; CV_32FC1";
+
+        gen(src, size, size, CV_32FC1, 0, 256);
+
+        CPU_ON;
+        dst.create(src.size(), src.type());
+        for (int j = 0; j < src.cols; j++)
+            dst.at<float>(0, j) = src.at<float>(0, j);
+
+        for (int i = 1; i < src.rows; ++i)
+            for (int j = 0; j < src.cols; ++j)
+                dst.at<float>(i, j) = dst.at<float>(i - 1 , j) + src.at<float>(i , j);
+        CPU_OFF;
+
+        d_src.upload(src);
+
+        WARMUP_ON;
+        ocl::columnSum(d_src, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::columnSum(d_src, d_dst);
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::columnSum(d_src, d_dst);
+        d_dst.download(ocl_dst);
+        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 5e-1);
+    }
+}
diff --git a/modules/ocl/perf/perf_match_template.cpp b/modules/ocl/perf/perf_match_template.cpp
index 2828efe01a..5da15aaf64 100644
--- a/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/ocl/perf/perf_match_template.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -52,14 +53,12 @@
 //	ocl::oclMat d_src(src), d_templ(templ), d_dst;
 //	ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
 //}
-TEST(matchTemplate)
+PERFTEST(matchTemplate)
 {
     //InitMatchTemplate();
-
-    Mat src, templ, dst;
+    Mat src, templ, dst, ocl_dst;
     int templ_size = 5;
 
-
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
         int all_type[] = {CV_32FC1, CV_32FC4};
@@ -81,9 +80,7 @@ TEST(matchTemplate)
                 matchTemplate(src, templ, dst, CV_TM_CCORR);
                 CPU_OFF;
 
-                ocl::oclMat d_src(src), d_templ, d_dst;
-
-                d_templ.upload(templ);
+                ocl::oclMat d_src(src), d_templ(templ), d_dst;
 
                 WARMUP_ON;
                 ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
@@ -91,15 +88,16 @@ TEST(matchTemplate)
 
                 GPU_ON;
                 ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-                 ;
                 GPU_OFF;
 
                 GPU_FULL_ON;
                 d_src.upload(src);
                 d_templ.upload(templ);
                 ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-                d_dst.download(dst);
+                d_dst.download(ocl_dst);
                 GPU_FULL_OFF;
+
+                TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
             }
         }
 
@@ -131,15 +129,16 @@ TEST(matchTemplate)
 
                 GPU_ON;
                 ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
-                 ;
                 GPU_OFF;
 
                 GPU_FULL_ON;
                 d_src.upload(src);
                 d_templ.upload(templ);
                 ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
-                d_dst.download(dst);
+                d_dst.download(ocl_dst);
                 GPU_FULL_OFF;
+
+                TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
             }
         }
     }
diff --git a/modules/ocl/perf/perf_matrix_operation.cpp b/modules/ocl/perf/perf_matrix_operation.cpp
index 495b2b82cf..b724cdbe64 100644
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ b/modules/ocl/perf/perf_matrix_operation.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,9 +46,9 @@
 #include "precomp.hpp"
 
 ///////////// ConvertTo////////////////////////
-TEST(ConvertTo)
+PERFTEST(ConvertTo)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
@@ -78,22 +79,23 @@ TEST(ConvertTo)
 
             GPU_ON;
             d_src.convertTo(d_dst, CV_32FC1);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             d_src.convertTo(d_dst, CV_32FC1);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
         }
 
     }
 }
 ///////////// copyTo////////////////////////
-TEST(copyTo)
+PERFTEST(copyTo)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
     ocl::oclMat d_src, d_dst;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
@@ -124,24 +126,25 @@ TEST(copyTo)
 
             GPU_ON;
             d_src.copyTo(d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             d_src.copyTo(d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
         }
 
     }
 }
 ///////////// setTo////////////////////////
-TEST(setTo)
+PERFTEST(setTo)
 {
-    Mat src, dst;
+    Mat src, ocl_src;
     Scalar val(1, 2, 3, 4);
-    ocl::oclMat d_src, d_dst;
+    ocl::oclMat d_src;
 
     int all_type[] = {CV_8UC1, CV_8UC4};
     std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -166,9 +169,11 @@ TEST(setTo)
             d_src.setTo(val);
             WARMUP_OFF;
 
-            GPU_ON;
+            d_src.download(ocl_src);
+            TestSystem::instance().ExpectedMatNear(src, ocl_src, 1.0);
+
+            GPU_ON;;
             d_src.setTo(val);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
diff --git a/modules/ocl/perf/perf_pyrdown.cpp b/modules/ocl/perf/perf_moments.cpp
similarity index 78%
rename from modules/ocl/perf/perf_pyrdown.cpp
rename to modules/ocl/perf/perf_moments.cpp
index 1d1d2dec11..7fa3948dec 100644
--- a/modules/ocl/perf/perf_pyrdown.cpp
+++ b/modules/ocl/perf/perf_moments.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -43,45 +44,49 @@
 //
 //M*/
 #include "precomp.hpp"
-
-///////////// pyrDown //////////////////////
-TEST(pyrDown)
+///////////// Moments ////////////////////////
+PERFTEST(Moments)
 {
-    Mat src, dst;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+    Mat src;
+    bool binaryImage = 0;
+
+    int all_type[] = {CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_16SC1", "CV_32FC1", "CV_64FC1"};
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
         for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
             gen(src, size, size, all_type[j], 0, 256);
 
-            pyrDown(src, dst);
+            cv::Moments CvMom = moments(src, binaryImage);
 
             CPU_ON;
-            pyrDown(src, dst);
+            moments(src, binaryImage);
             CPU_OFF;
 
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
-
+            cv::Moments oclMom;
             WARMUP_ON;
-            ocl::pyrDown(d_src, d_dst);
+            oclMom = ocl::ocl_moments(src, binaryImage);
             WARMUP_OFF;
 
+            Mat gpu_dst, cpu_dst;
+            HuMoments(CvMom, cpu_dst);
+            HuMoments(oclMom, gpu_dst);
+
             GPU_ON;
-            ocl::pyrDown(d_src, d_dst);
-             ;
+            ocl::ocl_moments(src, binaryImage);
             GPU_OFF;
 
             GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pyrDown(d_src, d_dst);
-            d_dst.download(dst);
+            ocl::ocl_moments(src, binaryImage);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(gpu_dst, cpu_dst, .5);
+
         }
+
     }
-}
\ No newline at end of file
+}
diff --git a/modules/ocl/perf/perf_norm.cpp b/modules/ocl/perf/perf_norm.cpp
index 8b7118a6ea..1d986c8e49 100644
--- a/modules/ocl/perf/perf_norm.cpp
+++ b/modules/ocl/perf/perf_norm.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,40 +46,42 @@
 #include "precomp.hpp"
 
 ///////////// norm////////////////////////
-TEST(norm)
+PERFTEST(norm)
 {
-    Mat src, buf;
-    ocl::oclMat d_src, d_buf;
-
+    Mat src1, src2, ocl_src1;
+    ocl::oclMat d_src1, d_src2;
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
         SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
 
-        gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
-        gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+        gen(src1, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+        gen(src2, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
 
-        norm(src, NORM_INF);
+        norm(src1, src2, NORM_INF);
 
         CPU_ON;
-        norm(src, NORM_INF);
+        norm(src1, src2, NORM_INF);
         CPU_OFF;
 
-        d_src.upload(src);
-        d_buf.upload(buf);
+        d_src1.upload(src1);
+        d_src2.upload(src2);
 
         WARMUP_ON;
-        ocl::norm(d_src, d_buf, NORM_INF);
+        ocl::norm(d_src1, d_src2, NORM_INF);
         WARMUP_OFF;
 
+        d_src1.download(ocl_src1);
+        TestSystem::instance().ExpectedMatNear(src1, ocl_src1, .5);                        
+
         GPU_ON;
-        ocl::norm(d_src, d_buf, NORM_INF);
-         ;
+        ocl::norm(d_src1, d_src2, NORM_INF);
         GPU_OFF;
 
         GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::norm(d_src, d_buf, NORM_INF);
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        ocl::norm(d_src1, d_src2, NORM_INF);
         GPU_FULL_OFF;
     }
 }
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_pyrlk.cpp b/modules/ocl/perf/perf_opticalflow.cpp
similarity index 61%
rename from modules/ocl/perf/perf_pyrlk.cpp
rename to modules/ocl/perf/perf_opticalflow.cpp
index f7fc22b9d0..97283b206c 100644
--- a/modules/ocl/perf/perf_pyrlk.cpp
+++ b/modules/ocl/perf/perf_opticalflow.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,10 +46,10 @@
 #include "precomp.hpp"
 
 ///////////// PyrLKOpticalFlow ////////////////////////
-TEST(PyrLKOpticalFlow)
+PERFTEST(PyrLKOpticalFlow)
 {
-    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
-    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};
+    std::string images1[] = {"rubberwhale1.png", "basketball1.png"};
+    std::string images2[] = {"rubberwhale2.png", "basketball2.png"};
 
     for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
     {
@@ -81,8 +82,8 @@ TEST(PyrLKOpticalFlow)
                 SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
             else
                 SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
-            Mat nextPts_cpu;
-            Mat status_cpu;
+            Mat ocl_nextPts;
+            Mat ocl_status;
 
             vector<Point2f> pts;
             goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);
@@ -117,7 +118,6 @@ TEST(PyrLKOpticalFlow)
 
             GPU_ON;
             d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
@@ -127,17 +127,102 @@ TEST(PyrLKOpticalFlow)
             d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
 
             if (!d_nextPts.empty())
-            {
-                d_nextPts.download(nextPts_cpu);
-            }
+                d_nextPts.download(ocl_nextPts);
 
             if (!d_status.empty())
-            {
-                d_status.download(status_cpu);
-            }
-
+                d_status.download(ocl_status);
             GPU_FULL_OFF;
+
+            size_t mismatch = 0;
+            for (int i = 0; i < (int)nextPts.size(); ++i)
+            {
+                if(status[i] != ocl_status.at<unsigned char>(0, i)){
+                    mismatch++;
+                    continue;
+                }
+                if(status[i]){
+                    Point2f gpu_rst = ocl_nextPts.at<Point2f>(0, i);
+                    Point2f cpu_rst = nextPts[i];
+                    if(fabs(gpu_rst.x - cpu_rst.x) >= 1. || fabs(gpu_rst.y - cpu_rst.y) >= 1.)
+                        mismatch++;
+                }
+            }
+            double ratio = (double)mismatch / (double)nextPts.size();
+            if(ratio < .02)
+                TestSystem::instance().setAccurate(1, ratio);
+            else
+                TestSystem::instance().setAccurate(0, ratio);
         }
 
     }
 }
+
+
+PERFTEST(tvl1flow)
+{
+    cv::Mat frame0 = imread("rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    assert(!frame0.empty());
+
+    cv::Mat frame1 = imread("rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+    assert(!frame1.empty());
+
+    cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
+    cv::ocl::oclMat d_flowx(frame0.size(), CV_32FC1);
+    cv::ocl::oclMat d_flowy(frame1.size(), CV_32FC1);
+
+    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+    cv::Mat flow;
+
+
+    SUBTEST << frame0.cols << 'x' << frame0.rows << "; rubberwhale1.png; "<<frame1.cols<<'x'<<frame1.rows<<"; rubberwhale2.png";
+
+    alg->calc(frame0, frame1, flow);
+
+    CPU_ON;
+    alg->calc(frame0, frame1, flow);
+    CPU_OFF;
+
+    cv::Mat gold[2];
+    cv::split(flow, gold);
+
+    cv::ocl::oclMat d0(frame0.size(), CV_32FC1);
+    d0.upload(frame0);
+    cv::ocl::oclMat d1(frame1.size(), CV_32FC1);
+    d1.upload(frame1);
+
+    WARMUP_ON;
+    d_alg(d0, d1, d_flowx, d_flowy);
+    WARMUP_OFF;
+/*
+    double diff1 = 0.0, diff2 = 0.0;
+    if(ExceptedMatSimilar(gold[0], cv::Mat(d_flowx), 3e-3, diff1) == 1
+        &&ExceptedMatSimilar(gold[1], cv::Mat(d_flowy), 3e-3, diff2) == 1)
+        TestSystem::instance().setAccurate(1);
+    else
+        TestSystem::instance().setAccurate(0);
+
+    TestSystem::instance().setDiff(diff1);
+    TestSystem::instance().setDiff(diff2);
+*/
+
+
+    GPU_ON;
+    d_alg(d0, d1, d_flowx, d_flowy);
+    d_alg.collectGarbage();
+    GPU_OFF;
+    
+
+    cv::Mat flowx, flowy;
+
+    GPU_FULL_ON;
+    d0.upload(frame0);
+    d1.upload(frame1);
+    d_alg(d0, d1, d_flowx, d_flowy);
+    d_alg.collectGarbage();
+    d_flowx.download(flowx);
+    d_flowy.download(flowy);
+    GPU_FULL_OFF;
+
+    TestSystem::instance().ExceptedMatSimilar(gold[0], flowx, 3e-3);
+    TestSystem::instance().ExceptedMatSimilar(gold[1], flowy, 3e-3);
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_pyrup.cpp b/modules/ocl/perf/perf_pyramid.cpp
similarity index 70%
rename from modules/ocl/perf/perf_pyrup.cpp
rename to modules/ocl/perf/perf_pyramid.cpp
index d3b3003a2e..3b96251e5d 100644
--- a/modules/ocl/perf/perf_pyrup.cpp
+++ b/modules/ocl/perf/perf_pyramid.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -44,10 +45,53 @@
 //M*/
 #include "precomp.hpp"
 
-///////////// pyrUp ////////////////////////
-TEST(pyrUp)
+///////////// pyrDown //////////////////////
+PERFTEST(pyrDown)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src, size, size, all_type[j], 0, 256);
+
+            pyrDown(src, dst);
+
+            CPU_ON;
+            pyrDown(src, dst);
+            CPU_OFF;
+
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;
+
+            WARMUP_ON;
+            ocl::pyrDown(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::pyrDown(d_src, d_dst);
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::pyrDown(d_src, d_dst);
+            d_dst.download(ocl_dst);
+            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, dst.depth() == CV_32F ? 1e-4f : 1.0f);
+        }
+    }
+}
+
+///////////// pyrUp ////////////////////////
+PERFTEST(pyrUp)
+{
+    Mat src, dst, ocl_dst;
     int all_type[] = {CV_8UC1, CV_8UC4};
     std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
@@ -74,14 +118,15 @@ TEST(pyrUp)
 
             GPU_ON;
             ocl::pyrUp(d_src, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::pyrUp(d_src, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, (src.depth() == CV_32F ? 1e-4f : 1.0));
         }
     }
 }
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_split_merge.cpp b/modules/ocl/perf/perf_split_merge.cpp
index 48ff1ff15a..0fafd14aba 100644
--- a/modules/ocl/perf/perf_split_merge.cpp
+++ b/modules/ocl/perf/perf_split_merge.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,9 +46,9 @@
 #include "precomp.hpp"
 
 ///////////// Merge////////////////////////
-TEST(Merge)
+PERFTEST(Merge)
 {
-    Mat dst;
+    Mat dst, ocl_dst;
     ocl::oclMat d_dst;
 
     int channels = 4;
@@ -86,26 +87,25 @@ TEST(Merge)
 
             GPU_ON;
             ocl::merge(d_src, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
-
             for (int i = 0; i < channels; ++i)
             {
-                d_src[i] = ocl::oclMat(size1, CV_8U, cv::Scalar::all(i));
+                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
             }
-
             ocl::merge(d_src, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
             GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
         }
 
     }
 }
 
 ///////////// Split////////////////////////
-TEST(Split)
+PERFTEST(Split)
 {
     //int channels = 4;
     int all_type[] = {CV_8UC1, CV_32FC1};
@@ -120,7 +120,7 @@ TEST(Split)
 
             Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
 
-            std::vector<cv::Mat> dst;
+            std::vector<cv::Mat> dst, ocl_dst(4);
 
             split(src, dst);
 
@@ -133,17 +133,21 @@ TEST(Split)
 
             WARMUP_ON;
             ocl::split(d_src, d_dst);
-            WARMUP_OFF;
+            WARMUP_OFF;         
 
             GPU_ON;
             ocl::split(d_src, d_dst);
-             ;
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
             ocl::split(d_src, d_dst);
+            for(size_t i = 0; i < dst.size(); i++)
+                d_dst[i].download(ocl_dst[i]);
             GPU_FULL_OFF;
+
+            vector<double> eps(4, 0.);
+            TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);
         }
 
     }
diff --git a/modules/ocl/perf/precomp.cpp b/modules/ocl/perf/precomp.cpp
index e35a071450..dd3b5e4ea1 100644
--- a/modules/ocl/perf/precomp.cpp
+++ b/modules/ocl/perf/precomp.cpp
@@ -41,6 +41,12 @@
 //M*/
 
 #include "precomp.hpp"
+#if GTEST_OS_WINDOWS
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+# include <windows.h>
+#endif
 
 // This program test most of the functions in ocl module and generate data metrix of x-factor in .csv files
 // All images needed in this test are in samples/gpu folder.
@@ -166,7 +172,7 @@ void TestSystem::finishCurrentSubtest()
         deviation = std::sqrt(sum / gpu_times_.size());
     }
 
-    printMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
+    printMetrics(is_accurate_, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
     writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
 
     num_subtests_called_++;
@@ -184,10 +190,19 @@ double TestSystem::meanTime(const vector<int64> &samples)
 void TestSystem::printHeading()
 {
     cout << endl;
-    cout << setiosflags(ios_base::left);
-    cout << TAB << setw(10) << "CPU, ms" << setw(10) << "GPU, ms"
-         << setw(14) << "SPEEDUP" << setw(14) << "GPUTOTAL, ms" << setw(14) << "TOTALSPEEDUP"
-         << "DESCRIPTION\n";
+    cout<< setiosflags(ios_base::left);
+
+#if 0
+    cout<<TAB<<setw(7)<< "Accu." << setw(10) << "CPU (ms)" << setw(10) << "GPU, ms"
+        << setw(8) << "Speedup"<< setw(10)<<"GPUTotal" << setw(10) << "Total"
+        << "Description\n";
+    cout<<TAB<<setw(7)<<""<<setw(10)<<""<<setw(10)<<""<<setw(8)<<""<<setw(10)<<"(ms)"<<setw(10)<<"Speedup\n";
+#endif
+
+    cout<<TAB<< setw(10) << "CPU (ms)" << setw(10) << "GPU, ms"
+        << setw(8) << "Speedup"<< setw(10)<<"GPUTotal" << setw(10) << "Total"
+        << "Description\n";
+    cout<<TAB<<setw(10)<<""<<setw(10)<<""<<setw(8)<<""<<setw(10)<<"(ms)"<<setw(10)<<"Speedup\n";
 
     cout << resetiosflags(ios_base::left);
 }
@@ -198,9 +213,14 @@ void TestSystem::writeHeading()
     {
         recordname_ += "_OCL.csv";
         record_ = fopen(recordname_.c_str(), "w");
+        if(record_ == NULL)
+        {
+            cout<<".csv file open failed.\n";
+            exit(0);
+        }
     }
 
-    fprintf(record_, "NAME,DESCRIPTION,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
+    fprintf(record_, "NAME,DESCRIPTION,ACCURACY,DIFFERENCE,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
 
     fflush(record_);
 }
@@ -209,54 +229,82 @@ void TestSystem::printSummary()
 {
     cout << setiosflags(ios_base::fixed);
     cout << "\naverage GPU speedup: x"
-         << setprecision(3) << speedup_total_ / std::max(1, num_subtests_called_)
-         << endl;
+        << setprecision(3) << speedup_total_ / std::max(1, num_subtests_called_)
+        << endl;
     cout << "\nGPU exceeded: "
-         << setprecision(3) << speedup_faster_count_
-         << "\nGPU passed: "
-         << setprecision(3) << speedup_equal_count_
-         << "\nGPU failed: "
-         << setprecision(3) << speedup_slower_count_
-         << endl;
+        << setprecision(3) << speedup_faster_count_
+        << "\nGPU passed: "
+        << setprecision(3) << speedup_equal_count_
+        << "\nGPU failed: "
+        << setprecision(3) << speedup_slower_count_
+        << endl;
     cout << "\nGPU exceeded rate: "
-         << setprecision(3) << (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPU passed rate: "
-         << setprecision(3) << (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPU failed rate: "
-         << setprecision(3) << (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << endl;
+        << setprecision(3) << (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100
+        << "%"
+        << "\nGPU passed rate: "
+        << setprecision(3) << (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100
+        << "%"
+        << "\nGPU failed rate: "
+        << setprecision(3) << (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100
+        << "%"
+        << endl;
     cout << "\naverage GPUTOTAL speedup: x"
-         << setprecision(3) << speedup_full_total_ / std::max(1, num_subtests_called_)
-         << endl;
+        << setprecision(3) << speedup_full_total_ / std::max(1, num_subtests_called_)
+        << endl;
     cout << "\nGPUTOTAL exceeded: "
-         << setprecision(3) << speedup_full_faster_count_
-         << "\nGPUTOTAL passed: "
-         << setprecision(3) << speedup_full_equal_count_
-         << "\nGPUTOTAL failed: "
-         << setprecision(3) << speedup_full_slower_count_
-         << endl;
+        << setprecision(3) << speedup_full_faster_count_
+        << "\nGPUTOTAL passed: "
+        << setprecision(3) << speedup_full_equal_count_
+        << "\nGPUTOTAL failed: "
+        << setprecision(3) << speedup_full_slower_count_
+        << endl;
     cout << "\nGPUTOTAL exceeded rate: "
-         << setprecision(3) << (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPUTOTAL passed rate: "
-         << setprecision(3) << (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPUTOTAL failed rate: "
-         << setprecision(3) << (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << endl;
+        << setprecision(3) << (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100
+        << "%"
+        << "\nGPUTOTAL passed rate: "
+        << setprecision(3) << (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100
+        << "%"
+        << "\nGPUTOTAL failed rate: "
+        << setprecision(3) << (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
+        << "%"
+        << endl;
     cout << resetiosflags(ios_base::fixed);
 }
 
 
-void TestSystem::printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
-{
-    cout << TAB << setiosflags(ios_base::left);
-    stringstream stream;
+enum GTestColor {
+    COLOR_DEFAULT,
+    COLOR_RED,
+    COLOR_GREEN,
+    COLOR_YELLOW
+};
+#if GTEST_OS_WINDOWS&&!GTEST_OS_WINDOWS_MOBILE
+// Returns the character attribute for the given color.
+static WORD GetColorAttribute(GTestColor color) {
+    switch (color) {
+    case COLOR_RED:    return FOREGROUND_RED;
+    case COLOR_GREEN:  return FOREGROUND_GREEN;
+    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
+    default:           return 0;
+    }
+}
+#else
+static const char* GetAnsiColorCode(GTestColor color) {
+    switch (color) {
+    case COLOR_RED:     return "1";
+    case COLOR_GREEN:   return "2";
+    case COLOR_YELLOW:  return "3";
+    default:            return NULL;
+    };
+}
+#endif
 
+static void printMetricsUti(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, std::stringstream& stream, std::stringstream& cur_subtest_description)
+{
+    //cout <<TAB<< setw(7) << stream.str(); 
+    cout <<TAB; 
+
+    stream.str("");
     stream << cpu_time;
     cout << setw(10) << stream.str();
 
@@ -266,20 +314,71 @@ void TestSystem::printMetrics(double cpu_time, double gpu_time, double gpu_full_
 
     stream.str("");
     stream << "x" << setprecision(3) << speedup;
-    cout << setw(14) << stream.str();
+    cout << setw(8) << stream.str();
 
     stream.str("");
     stream << gpu_full_time;
-    cout << setw(14) << stream.str();
+    cout << setw(10) << stream.str();
 
     stream.str("");
     stream << "x" << setprecision(3) << fullspeedup;
-    cout << setw(14) << stream.str();
+    cout << setw(10) << stream.str();
 
-    cout << cur_subtest_description_.str();
+    cout << cur_subtest_description.str();
     cout << resetiosflags(ios_base::left) << endl;
 }
 
+void TestSystem::printMetrics(int is_accurate, double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
+{
+    cout << setiosflags(ios_base::left);
+    stringstream stream;
+
+    std::stringstream &cur_subtest_description = getCurSubtestDescription();
+   
+#if GTEST_OS_WINDOWS&&!GTEST_OS_WINDOWS_MOBILE
+    
+    WORD color;
+    const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+    // Gets the current text color.
+    CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+    GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+    const WORD old_color_attrs = buffer_info.wAttributes;
+    // We need to flush the stream buffers into the console before each
+    // SetConsoleTextAttribute call lest it affect the text that is already
+    // printed but has not yet reached the console.
+    fflush(stdout);
+
+    if(is_accurate == 1||is_accurate == -1)
+    {
+        color = old_color_attrs;
+        printMetricsUti(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, stream, cur_subtest_description);
+
+    }else
+    {
+        color = GetColorAttribute(COLOR_RED);
+        SetConsoleTextAttribute(stdout_handle,
+            color| FOREGROUND_INTENSITY);
+
+        printMetricsUti(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, stream, cur_subtest_description);
+        fflush(stdout);
+        // Restores the text color.
+        SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+    }
+#else
+    GTestColor color = COLOR_RED;
+    if(is_accurate == 1|| is_accurate == -1)
+    {
+        printMetricsUti(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, stream, cur_subtest_description);
+
+    }else
+    {
+        printf("\033[0;3%sm", GetAnsiColorCode(color));
+        printMetricsUti(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, stream, cur_subtest_description);
+        printf("\033[m");  // Resets the terminal to default.
+    }
+#endif
+}
+
 void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
 {
     if (!record_)
@@ -288,10 +387,27 @@ void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_
         record_ = fopen(recordname_.c_str(), "w");
     }
 
-    fprintf(record_, "%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
-            cur_subtest_description_.str().c_str(),
-            cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
-            gpu_min, gpu_max, std_dev);
+    string _is_accurate_;
+
+    if(is_accurate_ == 1)
+        _is_accurate_ = "Pass";
+    else if(is_accurate_ == 0)
+        _is_accurate_ = "Fail";
+    else if(is_accurate_ == -1)
+        _is_accurate_ = " ";
+    else
+    {
+        std::cout<<"is_accurate errer: "<<is_accurate_<<"\n";
+        exit(-1);
+    }
+
+    fprintf(record_, "%s,%s,%s,%.2f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", 
+        itname_changed_ ? itname_.c_str() : "",
+        cur_subtest_description_.str().c_str(),
+        _is_accurate_.c_str(), 
+        accurate_diff_,
+        cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
+        gpu_min, gpu_max, std_dev);
 
     if (itname_changed_)
     {
@@ -310,31 +426,31 @@ void TestSystem::writeSummary()
     }
 
     fprintf(record_, "\nAverage GPU speedup: %.3f\n"
-            "exceeded: %d (%.3f%%)\n"
-            "passed: %d (%.3f%%)\n"
-            "failed: %d (%.3f%%)\n"
-            "\nAverage GPUTOTAL speedup: %.3f\n"
-            "exceeded: %d (%.3f%%)\n"
-            "passed: %d (%.3f%%)\n"
-            "failed: %d (%.3f%%)\n",
-            speedup_total_ / std::max(1, num_subtests_called_),
-            speedup_faster_count_, (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_equal_count_, (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_slower_count_, (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_full_total_ / std::max(1, num_subtests_called_),
-            speedup_full_faster_count_, (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_full_equal_count_, (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_full_slower_count_, (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
-           );
+        "exceeded: %d (%.3f%%)\n"
+        "passed: %d (%.3f%%)\n"
+        "failed: %d (%.3f%%)\n"
+        "\nAverage GPUTOTAL speedup: %.3f\n"
+        "exceeded: %d (%.3f%%)\n"
+        "passed: %d (%.3f%%)\n"
+        "failed: %d (%.3f%%)\n",
+        speedup_total_ / std::max(1, num_subtests_called_),
+        speedup_faster_count_, (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100,
+        speedup_equal_count_, (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100,
+        speedup_slower_count_, (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100,
+        speedup_full_total_ / std::max(1, num_subtests_called_),
+        speedup_full_faster_count_, (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100,
+        speedup_full_equal_count_, (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100,
+        speedup_full_slower_count_, (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
+        );
     fflush(record_);
 }
 
 void TestSystem::printError(const std::string &msg)
 {
-	if(msg != "CL_INVALID_BUFFER_SIZE")
-	{
-		cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
-	}
+    if(msg != "CL_INVALID_BUFFER_SIZE")
+    {
+        cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
+    }
 }
 
 void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
@@ -344,7 +460,6 @@ void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
     rng.fill(mat, RNG::UNIFORM, low, high);
 }
 
-
 string abspath(const string &relpath)
 {
     return TestSystem::instance().workingDir() + relpath;
@@ -352,11 +467,30 @@ string abspath(const string &relpath)
 
 
 int CV_CDECL cvErrorCallback(int /*status*/, const char * /*func_name*/,
-                             const char *err_msg, const char * /*file_name*/,
-                             int /*line*/, void * /*userdata*/)
+    const char *err_msg, const char * /*file_name*/,
+    int /*line*/, void * /*userdata*/)
 {
     TestSystem::instance().printError(err_msg);
     return 0;
 }
 
+double checkNorm(const Mat &m)
+{
+    return norm(m, NORM_INF);
+}
+
+double checkNorm(const Mat &m1, const Mat &m2)
+{
+    return norm(m1, m2, NORM_INF);
+}
+
+double checkSimilarity(const Mat &m1, const Mat &m2)
+{
+    Mat diff;
+    matchTemplate(m1, m2, diff, CV_TM_CCORR_NORMED);
+    return std::abs(diff.at<float>(0, 0) - 1.f);
+}
+
+
+
 
diff --git a/modules/ocl/perf/precomp.hpp b/modules/ocl/perf/precomp.hpp
index c2cf1238ef..97e3d7e5c6 100644
--- a/modules/ocl/perf/precomp.hpp
+++ b/modules/ocl/perf/precomp.hpp
@@ -50,10 +50,15 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/highgui/highgui.hpp"
+#include "opencv2/calib3d/calib3d.hpp"
 #include "opencv2/video/video.hpp"
 #include "opencv2/objdetect/objdetect.hpp"
 #include "opencv2/features2d/features2d.hpp"
 #include "opencv2/ocl/ocl.hpp"
+#include "opencv2/ts/ts.hpp"
+#include "opencv2/ts/ts_perf.hpp"
+#include "opencv2/ts/ts_gtest.h"
+
 
 #define Min_Size 1000
 #define Max_Size 4000
@@ -64,6 +69,8 @@ using namespace std;
 using namespace cv;
 
 void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high);
+void gen(Mat &mat, int rows, int cols, int type, int low, int high, int n);
+
 string abspath(const string &relpath);
 int CV_CDECL cvErrorCallback(int, const char *, const char *, const char *, int, void *);
 typedef struct
@@ -76,6 +83,50 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep,
 void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi,
                     int sp, int sr, cv::TermCriteria crit);
 
+
+template<class T1, class T2>
+int ExpectedEQ(T1 expected, T2 actual)
+{
+    if(expected == actual)
+        return 1;
+
+    return 0;
+}
+
+template<class T1>
+int EeceptDoubleEQ(T1 expected, T1 actual)
+{
+    testing::internal::Double lhs(expected);
+    testing::internal::Double rhs(actual);
+
+    if (lhs.AlmostEquals(rhs)) 
+    {
+        return 1;
+    }
+
+    return 0;
+}
+
+template<class T>
+int AssertEQ(T expected, T actual)
+{
+    if(expected == actual)
+    {
+        return 1;
+    }
+    return 0;
+}
+
+int ExceptDoubleNear(double val1, double val2, double abs_error);
+bool match_rect(cv::Rect r1, cv::Rect r2, int threshold);
+
+double checkNorm(const cv::Mat &m);
+double checkNorm(const cv::Mat &m1, const cv::Mat &m2);
+double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2);
+
+int ExpectedMatNear(cv::Mat dst, cv::Mat cpu_dst, double eps);
+int ExceptedMatSimilar(cv::Mat dst, cv::Mat cpu_dst, double eps);
+
 class Runnable
 {
 public:
@@ -171,6 +222,16 @@ public:
         return cur_iter_idx_ >= cpu_num_iters_;
     }
 
+    int get_cur_iter_idx()
+    {
+        return cur_iter_idx_;
+    }
+
+    int get_cpu_num_iters()
+    {
+        return cpu_num_iters_;
+    }
+
     bool warmupStop()
     {
         return cur_warmup_idx_++ >= gpu_warmup_iters_;
@@ -252,6 +313,53 @@ public:
         itname_changed_ = true;
     }
 
+    void setAccurate(int accurate, double diff)
+    {
+        is_accurate_ = accurate;
+        accurate_diff_ = diff;
+    }
+
+    void ExpectMatsNear(vector<Mat>& dst, vector<Mat>& cpu_dst, vector<double>& eps)
+    {
+        assert(dst.size() == cpu_dst.size());
+        assert(cpu_dst.size() == eps.size());
+        is_accurate_ = 1;
+        for(size_t i=0; i<dst.size(); i++)
+        {
+            double cur_diff = checkNorm(dst[i], cpu_dst[i]);
+            accurate_diff_ = max(accurate_diff_, cur_diff);
+            if(cur_diff > eps[i])
+                is_accurate_ = 0;
+        }
+    }
+
+    void ExpectedMatNear(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
+    {
+        assert(dst.type() == cpu_dst.type());
+        assert(dst.size() == cpu_dst.size());
+        accurate_diff_ = checkNorm(dst, cpu_dst);
+        if(accurate_diff_ <= eps)
+            is_accurate_ = 1;
+        else
+            is_accurate_ = 0;
+    }
+
+    void ExceptedMatSimilar(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
+    {
+        assert(dst.type() == cpu_dst.type());
+        assert(dst.size() == cpu_dst.size());
+        accurate_diff_ = checkSimilarity(cpu_dst, dst);
+        if(accurate_diff_ <= eps)
+            is_accurate_ = 1;
+        else
+            is_accurate_ = 0;    
+    }
+
+    std::stringstream &getCurSubtestDescription()
+    {
+        return cur_subtest_description_;
+    }
+
 private:
     TestSystem():
         cur_subtest_is_empty_(true), cpu_elapsed_(0),
@@ -261,7 +369,8 @@ private:
         speedup_full_faster_count_(0), speedup_full_slower_count_(0), speedup_full_equal_count_(0), is_list_mode_(false),
         num_iters_(10), cpu_num_iters_(2),
         gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
-        record_(0), recordname_("performance"), itname_changed_(true)
+        record_(0), recordname_("performance"), itname_changed_(true), 
+        is_accurate_(-1), accurate_diff_(0.)
     {
         cpu_times_.reserve(num_iters_);
         gpu_times_.reserve(num_iters_);
@@ -277,16 +386,19 @@ private:
         cur_subtest_description_.str("");
         cur_subtest_is_empty_ = true;
         cur_iter_idx_ = 0;
+        cur_warmup_idx_ = 0;
         cpu_times_.clear();
         gpu_times_.clear();
         gpu_full_times_.clear();
+        is_accurate_ = -1;
+        accurate_diff_ = 0.;
     }
 
     double meanTime(const std::vector<int64> &samples);
 
     void printHeading();
     void printSummary();
-    void printMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f, double speedup = 0.0f, double fullspeedup = 0.0f);
+    void printMetrics(int is_accurate, double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f, double speedup = 0.0f, double fullspeedup = 0.0f);
 
     void writeHeading();
     void writeSummary();
@@ -340,6 +452,9 @@ private:
     std::string recordname_;
     std::string itname_;
     bool itname_changed_;
+
+    int is_accurate_;
+    double accurate_diff_;
 };
 
 
@@ -353,7 +468,7 @@ struct name##_init: Runnable { \
 	void name##_init::run()
 
 
-#define TEST(name) \
+#define PERFTEST(name) \
 struct name##_test: Runnable { \
 	name##_test(): Runnable(#name) { \
 	TestSystem::instance().addTest(this); \
@@ -375,7 +490,7 @@ struct name##_test: Runnable { \
 	while (!TestSystem::instance().stop()) { \
 	TestSystem::instance().gpuOn()
 #define GPU_OFF \
-    ocl::finish(); \
+	ocl::finish();\
 	TestSystem::instance().gpuOff(); \
 	} TestSystem::instance().gpuComplete()
 
@@ -389,5 +504,5 @@ struct name##_test: Runnable { \
 #define WARMUP_ON \
 	while (!TestSystem::instance().warmupStop()) {
 #define WARMUP_OFF \
-        ocl::finish(); \
+	ocl::finish();\
 	} TestSystem::instance().warmupComplete()
diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index d679a93480..49a56ceabb 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -22,6 +22,7 @@
 //    Jiang Liyuan, jlyuan001.good@163.com
 //    Rock Li, Rock.Li@amd.com
 //    Zailong Wu, bullet@yeah.net
+//    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -286,6 +287,7 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub
     else
         arithmetic_run<float>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
 }
+
 void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
 {
 
@@ -411,11 +413,11 @@ static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelN
     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 
+    float f_scalar = (float)scalar;
     if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
         args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
     else
     {
-        float f_scalar = (float)scalar;
         args.push_back( make_pair( sizeof(cl_float), (void *)&f_scalar));
     }
 
@@ -468,6 +470,11 @@ void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, cons
     const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
     arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1);
 }
+void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst)
+{
+    string kernelName = "arithm_muls";
+    arithmetic_scalar_run( src, dst, kernelName, &arithm_mul, scalar);
+}
 void cv::ocl::divide(double scalar, const oclMat &src,  oclMat &dst)
 {
     if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
@@ -775,45 +782,55 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
     }
 }
 
-template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
+template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
+                                             const oclMat &mask, oclMat &buf)
 {
     size_t groupnum = src.clCxt->computeUnits();
     CV_Assert(groupnum != 0);
     groupnum = groupnum * 2;
     int vlen = 8;
     int dbsize = groupnum * 2 * vlen * sizeof(T) ;
-    Context *clCxt = src.clCxt;
-    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
-    *minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
+
+    ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
+
+    cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
+
     if (mask.empty())
     {
-        arithmetic_minMax_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax");
+        arithmetic_minMax_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax");
     }
     else
     {
-        arithmetic_minMax_mask_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax_mask");
+        arithmetic_minMax_mask_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax_mask");
     }
-    T *p = new T[groupnum * vlen * 2];
-    memset(p, 0, dbsize);
-    openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
-    if(minVal != NULL){
+
+    Mat matbuf = Mat(buf);
+    T *p = matbuf.ptr<T>();
+    if(minVal != NULL)
+    {
+        *minVal = std::numeric_limits<double>::max();
         for(int i = 0; i < vlen * (int)groupnum; i++)
         {
             *minVal = *minVal < p[i] ? *minVal : p[i];
         }
     }
-    if(maxVal != NULL){
+    if(maxVal != NULL)
+    {
+        *maxVal = -std::numeric_limits<double>::max();
         for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
         {
             *maxVal = *maxVal > p[i] ? *maxVal : p[i];
         }
     }
-    delete[] p;
-    openCLFree(dstBuffer);
 }
 
-typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask);
+typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf);
 void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
+{
+    oclMat buf;
+    minMax_buf(src, minVal, maxVal, mask, buf);
+}
+void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf)
 {
     CV_Assert(src.oclchannels() == 1);
     if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
@@ -833,7 +850,7 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc
     };
     minMaxFunc func;
     func = functab[src.depth()];
-    func(src, minVal, maxVal, mask);
+    func(src, minVal, maxVal, mask, buf);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1680,10 +1697,11 @@ void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string ker
     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 
+    T scalar;
     if(_scalar != NULL)
     {
         double scalar1 = *((double *)_scalar);
-        T scalar = (T)scalar1;
+        scalar = (T)scalar1;
         args.push_back( make_pair( sizeof(T), (void *)&scalar ));
     }
 
@@ -2300,9 +2318,9 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
+    float pf = p;
     if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
     {
-        float pf = p;
         args.push_back( make_pair( sizeof(cl_float), (void *)&pf ));
     }
     else
diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp
index c12fa73064..74da6ddd06 100644
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@@ -245,11 +245,12 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, const oclM
 {
     const oclMat zeroMask;
     const oclMat &tempMask = mask.data ? mask : zeroMask;
+    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
     if (query.cols <= 64)
     {
         matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType);
     }
-    else if (query.cols <= 128)
+    else if (query.cols <= 128 && !is_cpu)
     {
         matchUnrolledCached<16, 128>(query, train, tempMask, trainIdx,  distance, distType);
     }
@@ -264,11 +265,12 @@ static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, co
 {
     const oclMat zeroMask;
     const oclMat &tempMask = mask.data ? mask : zeroMask;
+    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
     if (query.cols <= 64)
     {
         matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
     }
-    else if (query.cols <= 128)
+    else if (query.cols <= 128 && !is_cpu)
     {
         matchUnrolledCached<16, 128>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
     }
@@ -284,11 +286,12 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, float maxD
 {
     const oclMat zeroMask;
     const oclMat &tempMask = mask.data ? mask : zeroMask;
+    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
     if (query.cols <= 64)
     {
         matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
     }
-    else if (query.cols <= 128)
+    else if (query.cols <= 128 && !is_cpu)
     {
         matchUnrolledCached<16, 128>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
     }
@@ -466,11 +469,12 @@ static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, con
 static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                       const oclMat &trainIdx, const oclMat &distance, int distType)
 {
+    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
     if (query.cols <= 64)
     {
         knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType);
     }
-    else if (query.cols <= 128)
+    else if (query.cols <= 128 && !is_cpu)
     {
         knn_matchUnrolledCached<16, 128>(query, train, mask, trainIdx, distance, distType);
     }
diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp
index cc7e60e0d9..82bb01bfdc 100644
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@@ -87,7 +87,7 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size)
             filterDY = createDerivFilter_GPU(CV_8U, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
         }
     }
-    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf);
+    ensureSizeIsEnough(2 * (image_size.height + 2), image_size.width + 2, CV_32FC1, edgeBuf);
 
     ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
     ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
@@ -141,13 +141,16 @@ namespace
     void CannyCaller(CannyBuf &buf, oclMat &dst, float low_thresh, float high_thresh)
     {
         using namespace ::cv::ocl::canny;
-        calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
+        oclMat magBuf = buf.edgeBuf(Rect(0, 0, buf.edgeBuf.cols, buf.edgeBuf.rows / 2));
+        oclMat mapBuf = buf.edgeBuf(Rect(0, buf.edgeBuf.rows / 2, buf.edgeBuf.cols, buf.edgeBuf.rows / 2));
 
-        edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1, buf.counter, dst.rows, dst.cols);
+        calcMap_gpu(buf.dx, buf.dy, magBuf, mapBuf, dst.rows, dst.cols, low_thresh, high_thresh);
 
-        edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1, buf.trackBuf2, buf.counter, dst.rows, dst.cols);
+        edgesHysteresisLocal_gpu(mapBuf, buf.trackBuf1, buf.counter, dst.rows, dst.cols);
 
-        getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
+        edgesHysteresisGlobal_gpu(mapBuf, buf.trackBuf1, buf.trackBuf2, buf.counter, dst.rows, dst.cols);
+
+        getEdges_gpu(mapBuf, dst, dst.rows, dst.cols);
     }
 }
 
@@ -172,18 +175,20 @@ void cv::ocl::Canny(const oclMat &src, CannyBuf &buf, oclMat &dst, double low_th
     buf.create(src.size(), apperture_size);
     buf.edgeBuf.setTo(Scalar::all(0));
 
+    oclMat magBuf = buf.edgeBuf(Rect(0, 0, buf.edgeBuf.cols, buf.edgeBuf.rows / 2));
+
     if (apperture_size == 3)
     {
         calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
 
-        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, magBuf, src.rows, src.cols, L2gradient);
     }
     else
     {
         buf.filterDX->apply(src, buf.dx);
         buf.filterDY->apply(src, buf.dy);
 
-        calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+        calcMagnitude_gpu(buf.dx, buf.dy, magBuf, src.rows, src.cols, L2gradient);
     }
     CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }
@@ -209,7 +214,10 @@ void cv::ocl::Canny(const oclMat &dx, const oclMat &dy, CannyBuf &buf, oclMat &d
     buf.dy = dy;
     buf.create(dx.size(), -1);
     buf.edgeBuf.setTo(Scalar::all(0));
-    calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
+
+    oclMat magBuf = buf.edgeBuf(Rect(0, 0, buf.edgeBuf.cols, buf.edgeBuf.rows / 2));
+
+    calcMagnitude_gpu(buf.dx, buf.dy, magBuf, dx.rows, dx.cols, L2gradient);
 
     CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }
@@ -234,7 +242,7 @@ void canny::calcSobelRowPass_gpu(const oclMat &src, oclMat &dx_buf, oclMat &dy_b
 
     size_t globalThreads[3] = {cols, rows, 1};
     size_t localThreads[3]  = {16, 16, 1};
-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
 
 void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat &dx, oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
@@ -264,12 +272,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat
     size_t globalThreads[3] = {cols, rows, 1};
     size_t localThreads[3]  = {16, 16, 1};
 
-    char build_options [15] = "";
-    if(L2Grad)
-    {
-        strcat(build_options, "-D L2GRAD");
-    }
-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+    const char * build_options = L2Grad ? "-D L2GRAD":"";
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
 }
 void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
 {
@@ -292,12 +296,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, i
     size_t globalThreads[3] = {cols, rows, 1};
     size_t localThreads[3]  = {16, 16, 1};
 
-    char build_options [15] = "";
-    if(L2Grad)
-    {
-        strcat(build_options, "-D L2GRAD");
-    }
-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+    const char * build_options = L2Grad ? "-D L2GRAD":"";
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
 }
 
 void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh)
@@ -328,7 +328,7 @@ void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int ro
     string kernelName = "calcMap";
     size_t localThreads[3]  = {16, 16, 1};
 
-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
 
 void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols)
@@ -348,7 +348,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in
     size_t globalThreads[3] = {cols, rows, 1};
     size_t localThreads[3]  = {16, 16, 1};
 
-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
 
 void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
@@ -378,7 +378,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
         args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
         args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
 
-        openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE);
+        openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
         openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
         std::swap(st1, st2);
     }
@@ -403,5 +403,5 @@ void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols)
     size_t globalThreads[3] = {cols, rows, 1};
     size_t localThreads[3]  = {16, 16, 1};
 
-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp
index cc07209b15..f35a26e332 100644
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -356,8 +356,7 @@ static void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
     char compile_option[128];
     sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s %s", 
         anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], 
-        rectKernel?"-D RECTKERNEL":"",
-        s);
+        s, rectKernel?"-D RECTKERNEL":"");
     vector< pair<size_t, const void *> > args;
     args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
     args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
@@ -646,7 +645,11 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
     args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
     args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
 
-    openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth);
+    const int buffer_size = 100;
+    char opt_buffer [buffer_size] = "";
+    sprintf(opt_buffer, "-DANCHOR=%d -DANX=%d -DANY=%d", ksize.width, anchor.x, anchor.y);
+
+    openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth, opt_buffer);
 }
 Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
         Point anchor, int borderType)
@@ -657,7 +660,7 @@ Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const
 
     oclMat gpu_krnl;
     int nDivisor;
-    normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
+    normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, false);
     normalizeAnchor(anchor, ksize);
 
     return Ptr<BaseFilter_GPU>(new LinearFilter_GPU(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)],
@@ -1173,7 +1176,7 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel
     args.push_back(make_pair(sizeof(cl_int), (void *)&ridusy));
     args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
 
-    openCLExecuteKernel2(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option, CLFLUSH);
+    openCLExecuteKernel(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option);
 }
 
 Ptr<BaseRowFilter_GPU> cv::ocl::getLinearRowFilter_GPU(int srcType, int /*bufType*/, const Mat &rowKernel, int anchor, int bordertype)
diff --git a/modules/ocl/src/gfft.cpp b/modules/ocl/src/gfft.cpp
new file mode 100644
index 0000000000..7fd5e3a174
--- /dev/null
+++ b/modules/ocl/src/gfft.cpp
@@ -0,0 +1,352 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include <iomanip>
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+
+static bool use_cpu_sorter = true;
+
+namespace cv
+{
+    namespace ocl
+    {
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        extern const char *imgproc_gfft;
+    }
+}
+
+namespace
+{
+enum SortMethod
+{
+    CPU_STL,
+    BITONIC,
+    SELECTION
+};
+
+const int GROUP_SIZE = 256;
+
+template<SortMethod method>
+struct Sorter
+{
+    //typedef EigType;
+};
+
+//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
+template<>
+struct Sorter<CPU_STL>
+{
+    typedef oclMat EigType;
+    static cv::Mutex cs;
+    static Mat mat_eig;
+
+    //prototype
+    static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
+    {
+        float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
+        float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
+        return v1 > v2;
+    }
+    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    {
+        cv::AutoLock lock(cs);
+        //temporarily use STL's sort function
+        Mat mat_corners = corners;
+        mat_eig = eig_tex;
+        std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
+        corners = mat_corners;
+    }
+};
+cv::Mutex Sorter<CPU_STL>::cs;
+cv::Mat   Sorter<CPU_STL>::mat_eig;
+
+template<>
+struct Sorter<BITONIC>
+{
+    typedef TextureCL EigType;
+
+    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    {
+        Context * cxt = Context::getContext();
+        size_t globalThreads[3] = {count / 2, 1, 1};
+        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+
+        // 2^numStages should be equal to count or the output is invalid
+        int numStages = 0;
+        for(int i = count; i > 1; i >>= 1)
+        {
+            ++numStages;
+        }
+        const int argc = 5;
+        std::vector< std::pair<size_t, const void *> > args(argc);
+        std::string kernelname = "sortCorners_bitonicSort";
+        args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
+        args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
+        args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
+        for(int stage = 0; stage < numStages; ++stage)
+        {
+            args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
+            for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
+            {
+                args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
+                openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+            }
+        }
+    }
+};
+
+template<>
+struct Sorter<SELECTION>
+{
+    typedef TextureCL EigType;
+
+    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    {
+        Context * cxt = Context::getContext();
+        
+        size_t globalThreads[3] = {count, 1, 1};
+        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+
+        std::vector< std::pair<size_t, const void *> > args;
+        //local
+        std::string kernelname = "sortCorners_selectionSortLocal";
+        int lds_size = GROUP_SIZE * sizeof(cl_float2);
+        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
+        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
+        args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
+        args.push_back( std::make_pair( lds_size,       (void*)NULL) );
+
+        openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+
+        //final
+        kernelname = "sortCorners_selectionSortFinal";
+        args.pop_back();
+        openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+    }
+};
+
+int findCorners_caller(
+    const TextureCL& eig, 
+    const float threshold,
+    const oclMat& mask,
+    oclMat& corners,
+    const int max_count)
+{
+    std::vector<int> k;
+    Context * cxt = Context::getContext();
+
+    std::vector< std::pair<size_t, const void*> > args;
+    std::string kernelname = "findCorners";
+
+    const int mask_strip = mask.step / mask.elemSize1();
+
+    oclMat g_counter(1, 1, CV_32SC1);
+    g_counter.setTo(0);
+
+    args.push_back(make_pair( sizeof(cl_mem),   (void*)&eig  ));
+    args.push_back(make_pair( sizeof(cl_mem),   (void*)&mask.data ));
+    args.push_back(make_pair( sizeof(cl_mem),   (void*)&corners.data ));
+    args.push_back(make_pair( sizeof(cl_int),   (void*)&mask_strip));
+    args.push_back(make_pair( sizeof(cl_float), (void*)&threshold ));
+    args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows ));
+    args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols ));
+    args.push_back(make_pair( sizeof(cl_int), (void*)&max_count ));
+    args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
+
+    size_t globalThreads[3] = {eig.cols, eig.rows, 1};
+    size_t localThreads[3]  = {16, 16, 1};
+
+    const char * opt = mask.empty() ? "" : "-D WITH_MASK";
+    openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1, opt);
+    return std::min(Mat(g_counter).at<int>(0), max_count);
+}
+}//unnamed namespace
+
+void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
+{
+    CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
+    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
+
+    CV_DbgAssert(support_image2d());
+
+    ensureSizeIsEnough(image.size(), CV_32F, eig_);
+
+    if (useHarrisDetector)
+        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
+    else
+        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
+
+    double maxVal = 0;
+    minMax_buf(eig_, 0, &maxVal, oclMat(), minMaxbuf_);
+
+    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
+
+    Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
+    int total = findCorners_caller(
+        *eig_tex,
+        static_cast<float>(maxVal * qualityLevel),
+        mask,
+        tmpCorners_,
+        tmpCorners_.cols);
+
+    if (total == 0)
+    {
+        corners.release();
+        return;
+    }
+    if(use_cpu_sorter)
+    {
+        Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
+    }
+    else
+    {
+        //if total is power of 2
+        if(((total - 1) & (total)) == 0)
+        {
+            Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
+        }
+        else
+        {
+            Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
+        }
+    }
+    
+    if (minDistance < 1)
+    {
+        Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1);
+        tmpCorners_(roi_range).copyTo(corners);
+    }
+    else
+    {
+        vector<Point2f> tmp(total);
+        downloadPoints(tmpCorners_, tmp);
+
+        vector<Point2f> tmp2;
+        tmp2.reserve(total);
+
+        const int cell_size = cvRound(minDistance);
+        const int grid_width = (image.cols + cell_size - 1) / cell_size;
+        const int grid_height = (image.rows + cell_size - 1) / cell_size;
+
+        std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
+
+        for (int i = 0; i < total; ++i)
+        {
+            Point2f p = tmp[i];
+
+            bool good = true;
+
+            int x_cell = static_cast<int>(p.x / cell_size);
+            int y_cell = static_cast<int>(p.y / cell_size);
+
+            int x1 = x_cell - 1;
+            int y1 = y_cell - 1;
+            int x2 = x_cell + 1;
+            int y2 = y_cell + 1;
+
+            // boundary check
+            x1 = std::max(0, x1);
+            y1 = std::max(0, y1);
+            x2 = std::min(grid_width - 1, x2);
+            y2 = std::min(grid_height - 1, y2);
+
+            for (int yy = y1; yy <= y2; yy++)
+            {
+                for (int xx = x1; xx <= x2; xx++)
+                {
+                    vector<Point2f>& m = grid[yy * grid_width + xx];
+
+                    if (!m.empty())
+                    {
+                        for(size_t j = 0; j < m.size(); j++)
+                        {
+                            float dx = p.x - m[j].x;
+                            float dy = p.y - m[j].y;
+
+                            if (dx * dx + dy * dy < minDistance * minDistance)
+                            {
+                                good = false;
+                                goto break_out;
+                            }
+                        }
+                    }
+                }
+            }
+
+            break_out:
+
+            if(good)
+            {
+                grid[y_cell * grid_width + x_cell].push_back(p);
+
+                tmp2.push_back(p);
+
+                if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
+                    break;
+            }
+        }
+
+        corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
+    }
+}
+void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector<Point2f> &points_v)
+{
+    CV_DbgAssert(points.type() == CV_32FC2);
+    points_v.resize(points.cols);
+    openCLSafeCall(clEnqueueReadBuffer(
+        *reinterpret_cast<cl_command_queue*>(getoclCommandQueue()), 
+        reinterpret_cast<cl_mem>(points.data), 
+        CL_TRUE,                                    
+        0, 
+        points.cols * sizeof(Point2f), 
+        &points_v[0], 
+        0, 
+        NULL, 
+        NULL));
+}
+
+
diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp
index 5afe5423ed..6283ac8d9f 100644
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@ -137,47 +137,22 @@ struct CvHidHaarClassifierCascade
 };
 typedef struct
 {
-    //int rows;
-    //int ystep;
     int width_height;
-    //int height;
     int grpnumperline_totalgrp;
-    //int totalgrp;
     int imgoff;
     float factor;
 } detect_piramid_info;
-
-#if defined WIN32 && !defined __MINGW__ && !defined __MINGW32__
+#ifdef _MSC_VER
 #define _ALIGNED_ON(_ALIGNMENT) __declspec(align(_ALIGNMENT))
-typedef _ALIGNED_ON(128) struct  GpuHidHaarFeature
-{
-    _ALIGNED_ON(32) struct
-    {
-        _ALIGNED_ON(4)  int    p0 ;
-        _ALIGNED_ON(4)  int    p1 ;
-        _ALIGNED_ON(4)  int    p2 ;
-        _ALIGNED_ON(4)  int    p3 ;
-        _ALIGNED_ON(4)  float weight  ;
-    }
-    /*_ALIGNED_ON(32)*/ rect[CV_HAAR_FEATURE_MAX] ;
-}
-GpuHidHaarFeature;
-
 
 typedef _ALIGNED_ON(128) struct  GpuHidHaarTreeNode
 {
     _ALIGNED_ON(64) int p[CV_HAAR_FEATURE_MAX][4];
-    //_ALIGNED_ON(16) int p1[CV_HAAR_FEATURE_MAX] ;
-    //_ALIGNED_ON(16) int p2[CV_HAAR_FEATURE_MAX] ;
-    //_ALIGNED_ON(16) int p3[CV_HAAR_FEATURE_MAX] ;
-    /*_ALIGNED_ON(16)*/
     float weight[CV_HAAR_FEATURE_MAX] ;
-    /*_ALIGNED_ON(4)*/
     float threshold ;
-    _ALIGNED_ON(8) float alpha[2] ;
+    _ALIGNED_ON(16) float alpha[3] ;
     _ALIGNED_ON(4) int left ;
     _ALIGNED_ON(4) int right ;
-    // GpuHidHaarFeature feature __attribute__((aligned (128)));
 }
 GpuHidHaarTreeNode;
 
@@ -185,7 +160,6 @@ GpuHidHaarTreeNode;
 typedef  _ALIGNED_ON(32) struct  GpuHidHaarClassifier
 {
     _ALIGNED_ON(4) int count;
-    //CvHaarFeature* orig_feature;
     _ALIGNED_ON(8) GpuHidHaarTreeNode *node ;
     _ALIGNED_ON(8) float *alpha ;
 }
@@ -220,32 +194,16 @@ typedef _ALIGNED_ON(64) struct  GpuHidHaarClassifierCascade
     _ALIGNED_ON(4) int p2 ;
     _ALIGNED_ON(4) int p3 ;
     _ALIGNED_ON(4) float inv_window_area ;
-    // GpuHidHaarStageClassifier* stage_classifier __attribute__((aligned (8)));
 } GpuHidHaarClassifierCascade;
 #else
 #define _ALIGNED_ON(_ALIGNMENT) __attribute__((aligned(_ALIGNMENT) ))
 
-typedef struct _ALIGNED_ON(128) GpuHidHaarFeature
-{
-    struct _ALIGNED_ON(32)
-{
-    int    p0 _ALIGNED_ON(4);
-    int    p1 _ALIGNED_ON(4);
-    int    p2 _ALIGNED_ON(4);
-    int    p3 _ALIGNED_ON(4);
-    float weight  _ALIGNED_ON(4);
-}
-rect[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(32);
-}
-GpuHidHaarFeature;
-
-
 typedef struct _ALIGNED_ON(128) GpuHidHaarTreeNode
 {
     int p[CV_HAAR_FEATURE_MAX][4] _ALIGNED_ON(64);
     float weight[CV_HAAR_FEATURE_MAX];// _ALIGNED_ON(16);
     float threshold;// _ALIGNED_ON(4);
-    float alpha[2] _ALIGNED_ON(8);
+    float alpha[3] _ALIGNED_ON(16);
     int left _ALIGNED_ON(4);
     int right _ALIGNED_ON(4);
 }
@@ -288,7 +246,6 @@ typedef struct _ALIGNED_ON(64) GpuHidHaarClassifierCascade
     int p2 _ALIGNED_ON(4);
     int p3 _ALIGNED_ON(4);
     float inv_window_area _ALIGNED_ON(4);
-    // GpuHidHaarStageClassifier* stage_classifier __attribute__((aligned (8)));
 } GpuHidHaarClassifierCascade;
 #endif
 
@@ -296,36 +253,6 @@ const int icv_object_win_border = 1;
 const float icv_stage_threshold_bias = 0.0001f;
 double globaltime = 0;
 
-
-// static CvHaarClassifierCascade * gpuCreateHaarClassifierCascade( int stage_count )
-// {
-//     CvHaarClassifierCascade *cascade = 0;
-
-//     int block_size = sizeof(*cascade) + stage_count * sizeof(*cascade->stage_classifier);
-
-//     if( stage_count <= 0 )
-//         CV_Error( CV_StsOutOfRange, "Number of stages should be positive" );
-
-//     cascade = (CvHaarClassifierCascade *)cvAlloc( block_size );
-//     memset( cascade, 0, block_size );
-
-//     cascade->stage_classifier = (CvHaarStageClassifier *)(cascade + 1);
-//     cascade->flags = CV_HAAR_MAGIC_VAL;
-//     cascade->count = stage_count;
-
-//     return cascade;
-// }
-
-//static int globalcounter = 0;
-
-// static void gpuReleaseHidHaarClassifierCascade( GpuHidHaarClassifierCascade **_cascade )
-// {
-//     if( _cascade && *_cascade )
-//     {
-//         cvFree( _cascade );
-//     }
-// }
-
 /* create more efficient internal representation of haar classifier cascade */
 static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarClassifierCascade *cascade, int *size, int *totalclassifier)
 {
@@ -441,24 +368,12 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
         hid_stage_classifier->two_rects = 1;
         haar_classifier_ptr += stage_classifier->count;
 
-        /*
-        hid_stage_classifier->parent = (stage_classifier->parent == -1)
-        ? NULL : stage_classifier_ptr + stage_classifier->parent;
-        hid_stage_classifier->next = (stage_classifier->next == -1)
-        ? NULL : stage_classifier_ptr + stage_classifier->next;
-        hid_stage_classifier->child = (stage_classifier->child == -1)
-        ? NULL : stage_classifier_ptr + stage_classifier->child;
-
-        out->is_tree |= hid_stage_classifier->next != NULL;
-        */
-
         for( j = 0; j < stage_classifier->count; j++ )
         {
             CvHaarClassifier *classifier         = stage_classifier->classifier + j;
             GpuHidHaarClassifier *hid_classifier = hid_stage_classifier->classifier + j;
             int node_count = classifier->count;
 
-            //   float* alpha_ptr = (float*)(haar_node_ptr + node_count);
             float *alpha_ptr = &haar_node_ptr->alpha[0];
 
             hid_classifier->count = node_count;
@@ -485,16 +400,12 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
                     node->p[2][3] = 0;
                     node->weight[2] = 0;
                 }
-                //   memset( &(node->feature.rect[2]), 0, sizeof(node->feature.rect[2]) );
                 else
                     hid_stage_classifier->two_rects = 0;
+
+                memcpy( node->alpha, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
+                haar_node_ptr = haar_node_ptr + 1;
             }
-
-            memcpy( alpha_ptr, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
-            haar_node_ptr = haar_node_ptr + 1;
-            // (GpuHidHaarTreeNode*)cvAlignPtr(alpha_ptr+node_count+1, sizeof(void*));
-            //   (GpuHidHaarTreeNode*)(alpha_ptr+node_count+1);
-
             out->is_stump_based &= node_count == 1;
         }
     }
@@ -507,25 +418,19 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
 
 
 #define sum_elem_ptr(sum,row,col)  \
-	((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
+    ((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
 
 #define sqsum_elem_ptr(sqsum,row,col)  \
-	((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
+    ((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
 
 #define calc_sum(rect,offset) \
-	((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
+    ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
 
 
 static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_cascade,
-                                      /*   const CvArr* _sum,
-                                      const CvArr* _sqsum,
-                                      const CvArr* _tilted_sum,*/
                                       double scale,
                                       int step)
 {
-    //   CvMat sum_stub, *sum = (CvMat*)_sum;
-    //   CvMat sqsum_stub, *sqsum = (CvMat*)_sqsum;
-    //   CvMat tilted_stub, *tilted = (CvMat*)_tilted_sum;
     GpuHidHaarClassifierCascade *cascade;
     int coi0 = 0, coi1 = 0;
     int i;
@@ -541,61 +446,25 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
     if( scale <= 0 )
         CV_Error( CV_StsOutOfRange, "Scale must be positive" );
 
-    //   sum = cvGetMat( sum, &sum_stub, &coi0 );
-    //   sqsum = cvGetMat( sqsum, &sqsum_stub, &coi1 );
-
     if( coi0 || coi1 )
         CV_Error( CV_BadCOI, "COI is not supported" );
 
-    //   if( !CV_ARE_SIZES_EQ( sum, sqsum ))
-    //       CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
-
-    //   if( CV_MAT_TYPE(sqsum->type) != CV_64FC1 ||
-    //       CV_MAT_TYPE(sum->type) != CV_32SC1 )
-    //       CV_Error( CV_StsUnsupportedFormat,
-    //       "Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
-
     if( !_cascade->hid_cascade )
         gpuCreateHidHaarClassifierCascade(_cascade, &datasize, &total);
 
     cascade = (GpuHidHaarClassifierCascade *) _cascade->hid_cascade;
     stage_classifier = (GpuHidHaarStageClassifier *) (cascade + 1);
 
-    if( cascade->has_tilted_features )
-    {
-        //    tilted = cvGetMat( tilted, &tilted_stub, &coi1 );
-
-        //    if( CV_MAT_TYPE(tilted->type) != CV_32SC1 )
-        //        CV_Error( CV_StsUnsupportedFormat,
-        //        "Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
-
-        //    if( sum->step != tilted->step )
-        //        CV_Error( CV_StsUnmatchedSizes,
-        //        "Sum and tilted_sum must have the same stride (step, widthStep)" );
-
-        //    if( !CV_ARE_SIZES_EQ( sum, tilted ))
-        //        CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
-        //  cascade->tilted = *tilted;
-    }
-
     _cascade->scale = scale;
     _cascade->real_window_size.width = cvRound( _cascade->orig_window_size.width * scale );
     _cascade->real_window_size.height = cvRound( _cascade->orig_window_size.height * scale );
 
-    //cascade->sum = *sum;
-    //cascade->sqsum = *sqsum;
-
     equRect.x = equRect.y = cvRound(scale);
     equRect.width = cvRound((_cascade->orig_window_size.width - 2) * scale);
     equRect.height = cvRound((_cascade->orig_window_size.height - 2) * scale);
     weight_scale = 1. / (equRect.width * equRect.height);
     cascade->inv_window_area = weight_scale;
 
-    //	cascade->pq0 = equRect.y * step + equRect.x;
-    //	cascade->pq1 = equRect.y * step + equRect.x + equRect.width ;
-    //	cascade->pq2 = (equRect.y + equRect.height)*step + equRect.x;
-    //	cascade->pq3 = (equRect.y + equRect.height)*step + equRect.x + equRect.width ;
-
     cascade->pq0 = equRect.x;
     cascade->pq1 = equRect.y;
     cascade->pq2 = equRect.x + equRect.width;
@@ -618,10 +487,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
             {
                 CvHaarFeature *feature =
                     &_cascade->stage_classifier[i].classifier[j].haar_feature[l];
-                /*  GpuHidHaarClassifier* classifier =
-                cascade->stage_classifier[i].classifier + j; */
-                //GpuHidHaarFeature* hidfeature =
-                //    &cascade->stage_classifier[i].classifier[j].node[l].feature;
                 GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
                 double sum0 = 0, area0 = 0;
                 CvRect r[3];
@@ -636,8 +501,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
                 /* align blocks */
                 for( k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
                 {
-                    //if( !hidfeature->rect[k].p0 )
-                    //    break;
                     if(!hidnode->p[k][0])
                         break;
                     r[k] = feature->rect[k].r;
@@ -717,15 +580,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
 
                     if( !feature->tilted )
                     {
-                        /*     hidfeature->rect[k].p0 = tr.y * sum->cols + tr.x;
-                        hidfeature->rect[k].p1 = tr.y * sum->cols + tr.x + tr.width;
-                        hidfeature->rect[k].p2 = (tr.y + tr.height) * sum->cols + tr.x;
-                        hidfeature->rect[k].p3 = (tr.y + tr.height) * sum->cols + tr.x + tr.width;
-                        */
-                        /*hidnode->p0[k] = tr.y * step + tr.x;
-                        hidnode->p1[k] = tr.y * step + tr.x + tr.width;
-                        hidnode->p2[k] = (tr.y + tr.height) * step + tr.x;
-                        hidnode->p3[k] = (tr.y + tr.height) * step + tr.x + tr.width;*/
                         hidnode->p[k][0] = tr.x;
                         hidnode->p[k][1] = tr.y;
                         hidnode->p[k][2] = tr.x + tr.width;
@@ -733,37 +587,24 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
                     }
                     else
                     {
-                        /*    hidfeature->rect[k].p2 = (tr.y + tr.width) * tilted->cols + tr.x + tr.width;
-                        hidfeature->rect[k].p3 = (tr.y + tr.width + tr.height) * tilted->cols + tr.x + tr.width - tr.height;
-                        hidfeature->rect[k].p0 = tr.y * tilted->cols + tr.x;
-                        hidfeature->rect[k].p1 = (tr.y + tr.height) * tilted->cols + tr.x - tr.height;
-                        */
-
                         hidnode->p[k][2] = (tr.y + tr.width) * step + tr.x + tr.width;
                         hidnode->p[k][3] = (tr.y + tr.width + tr.height) * step + tr.x + tr.width - tr.height;
                         hidnode->p[k][0] = tr.y * step + tr.x;
                         hidnode->p[k][1] = (tr.y + tr.height) * step + tr.x - tr.height;
                     }
-
-                    //hidfeature->rect[k].weight = (float)(feature->rect[k].weight * correction_ratio);
                     hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
                     if( k == 0 )
                         area0 = tr.width * tr.height;
                     else
-                        //sum0 += hidfeature->rect[k].weight * tr.width * tr.height;
                         sum0 += hidnode->weight[k] * tr.width * tr.height;
                 }
-
-                // hidfeature->rect[0].weight = (float)(-sum0/area0);
                 hidnode->weight[0] = (float)(-sum0 / area0);
             } /* l */
         } /* j */
     }
 }
 
-static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
-                             /*double scale=0.0,*/
-                             /*int step*/)
+static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade)
 {
     GpuHidHaarClassifierCascade *cascade;
     int i;
@@ -817,11 +658,7 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
                     if(!hidnode->p[k][0])
                         break;
                     r[k] = feature->rect[k].r;
-                    // 					base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].width-1) );
-                    // 					base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].x - r[0].x-1) );
-                    // 					base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].height-1) );
-                    // 					base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].y - r[0].y-1) );
-                }
+               }
 
                 nr = k;
                 for( k = 0; k < nr; k++ )
@@ -839,7 +676,6 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
                     hidnode->p[k][3] = tr.height;
                     hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
                 }
-                //hidnode->weight[0]=(float)(-sum0/area0);
             } /* l */
         } /* j */
     }
@@ -852,7 +688,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
 
     const double GROUP_EPS = 0.2;
     CvSeq *result_seq = 0;
-    cv::Ptr<CvMemStorage> temp_storage;
 
     cv::ConcurrentRectVector allCandidates;
     std::vector<cv::Rect> rectList;
@@ -910,6 +745,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
     if( gimg.cols < minSize.width || gimg.rows < minSize.height )
         CV_Error(CV_StsError, "Image too small");
 
+    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
     if( (flags & CV_HAAR_SCALE_IMAGE) )
     {
         CvSize winSize0 = cascade->orig_window_size;
@@ -952,7 +788,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
 
         size_t blocksize = 8;
         size_t localThreads[3] = { blocksize, blocksize , 1 };
-        size_t globalThreads[3] = { grp_per_CU * gsum.clCxt->computeUnits() *localThreads[0],
+        size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->computeUnits()) *localThreads[0],
                                     localThreads[1], 1
                                   };
         int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -997,7 +833,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
         gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
 
         stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
-        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
         openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
 
         nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
@@ -1044,7 +879,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
         args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
         args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
 
-        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
+        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+
+        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
 
         openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
 
@@ -1059,6 +896,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
         openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
         openCLSafeCall(clReleaseMemObject(nodebuffer));
         openCLSafeCall(clReleaseMemObject(candidatebuffer));
+
     }
     else
     {
@@ -1118,7 +956,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
                        sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
         nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
                                         nodenum * sizeof(GpuHidHaarTreeNode));
-        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
         openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0,
                                             nodenum * sizeof(GpuHidHaarTreeNode),
                                             node, 0, NULL, NULL));
@@ -1160,7 +997,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
             args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum ));
 
             size_t globalThreads2[3] = {nodenum, 1, 1};
-
             openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
         }
 
@@ -1195,8 +1031,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
         args.push_back ( make_pair(sizeof(cl_mem) , (void *)&pbuffer ));
         args.push_back ( make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
         args.push_back ( make_pair(sizeof(cl_int) , (void *)&nodenum ));
-
-        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
+        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
 
         candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);
 
@@ -1284,7 +1120,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
     int blocksize = 8;
     int grp_per_CU = 12;
     size_t localThreads[3] = { blocksize, blocksize, 1 };
-    size_t globalThreads[3] = { grp_per_CU * Context::getContext()->computeUnits() * localThreads[0],
+    size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->computeUnits() *localThreads[0],
         localThreads[1],
         1 };
     int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -1300,8 +1136,6 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
     CvHaarClassifierCascade      *cascade = oldCascade;
     GpuHidHaarClassifierCascade  *gcascade;
     GpuHidHaarStageClassifier    *stage;
-    GpuHidHaarClassifier         *classifier;
-    GpuHidHaarTreeNode           *node;
 
     if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
         CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );
@@ -1314,7 +1148,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
     }
 
     int *candidate;
-
+    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
     if( (flags & CV_HAAR_SCALE_IMAGE) )
     {
         int indexy = 0;
@@ -1340,19 +1174,6 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
 
         gcascade   = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
         stage      = (GpuHidHaarStageClassifier *)(gcascade + 1);
-        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
-        node       = (GpuHidHaarTreeNode *)(classifier->node);
-
-        gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
-
-        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
-        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
-                                            sizeof(GpuHidHaarStageClassifier) * gcascade->count,
-                                            stage, 0, NULL, NULL));
-
-        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
-                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
-                                            node, 0, NULL, NULL));
 
         int startstage = 0;
         int endstage = gcascade->count;
@@ -1389,17 +1210,23 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
         args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
         args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
 
-        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
+        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+
+        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
 
         candidate = (int *)malloc(4 * sizeof(int) * outputsz);
         memset(candidate, 0, 4 * sizeof(int) * outputsz);
+
         openCLReadBuffer( gsum.clCxt, ((OclBuffers *)buffers)->candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
 
         for(int i = 0; i < outputsz; i++)
+        {
             if(candidate[4 * i + 2] != 0)
+            {
                 allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
                 candidate[4 * i + 2], candidate[4 * i + 3]));
-
+            }
+        }
         free((void *)candidate);
         candidate = NULL;
     }
@@ -1407,6 +1234,132 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
     {
         cv::ocl::integral(gimg, gsum, gsqsum);
 
+        gcascade   = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
+
+        int step = gsum.step / 4;
+        int startnode = 0;
+        int splitstage = 3;
+
+        int startstage = 0;
+        int endstage = gcascade->count;
+
+        vector<pair<size_t, const void *> > args;
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
+
+        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
+
+        candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
+
+        for(int i = 0; i < outputsz; i++)
+        {
+            if(candidate[4 * i + 2] != 0)
+                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
+                candidate[4 * i + 2], candidate[4 * i + 3]));
+        }
+        clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
+    }
+    rectList.resize(allCandidates.size());
+    if(!allCandidates.empty())
+        std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
+
+    if( minNeighbors != 0 || findBiggestObject )
+        groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
+    else
+        rweights.resize(rectList.size(), 0);
+
+    GenResult(faces, rectList, rweights);
+}
+
+void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
+    double scaleFactor, int flags,
+    const int outputsz, const size_t localThreads[],
+    CvSize minSize, CvSize maxSize)
+{
+    if(initialized)
+    {
+        return; // we only allow one time initialization
+    }
+    CvHaarClassifierCascade      *cascade = oldCascade;
+
+    if( !CV_IS_HAAR_CLASSIFIER(cascade) )
+        CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
+
+    if( scaleFactor <= 1 )
+        CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
+
+    if( cols < minSize.width || rows < minSize.height )
+        CV_Error(CV_StsError, "Image too small");
+
+    int datasize=0;
+    int totalclassifier=0;
+
+    if( !cascade->hid_cascade )
+    {
+        gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
+    }
+
+    if( maxSize.height == 0 || maxSize.width == 0 )
+    {
+        maxSize.height = rows;
+        maxSize.width = cols;
+    }
+
+    findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
+    if( findBiggestObject )
+        flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
+
+    CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
+    CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
+
+    m_scaleFactor = scaleFactor;
+    m_rows = rows;
+    m_cols = cols;
+    m_flags = flags;
+    m_minSize = minSize;
+    m_maxSize = maxSize;
+
+    // initialize nodes
+    GpuHidHaarClassifierCascade  *gcascade;
+    GpuHidHaarStageClassifier    *stage;
+    GpuHidHaarClassifier         *classifier;
+    GpuHidHaarTreeNode           *node;
+    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    if( (flags & CV_HAAR_SCALE_IMAGE) )
+    {
+        gcascade   = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
+        stage      = (GpuHidHaarStageClassifier *)(gcascade + 1);
+        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
+        node       = (GpuHidHaarTreeNode *)(classifier->node);
+
+        gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
+
+        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
+            sizeof(GpuHidHaarStageClassifier) * gcascade->count,
+            stage, 0, NULL, NULL));
+
+        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
+                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
+                                            node, 0, NULL, NULL));
+    }
+    else
+    {
         gpuSetHaarClassifierCascade(cascade);
 
         gcascade   = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
@@ -1414,15 +1367,12 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
         classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
         node       = (GpuHidHaarTreeNode *)(classifier->node);
 
-        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
         openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
-                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
-                                            node, 0, NULL, NULL));
+            m_nodenum * sizeof(GpuHidHaarTreeNode),
+            node, 0, NULL, NULL));
 
         cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * m_loopcount);
         float *correction = (float *)malloc(sizeof(float) * m_loopcount);
-        int startstage = 0;
-        int endstage = gcascade->count;
         double factor;
         for(int i = 0; i < m_loopcount; i++)
         {
@@ -1448,105 +1398,15 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
 
             size_t globalThreads2[3] = {m_nodenum, 1, 1};
 
-            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
+            openCLExecuteKernel(Context::getContext(), &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
         }
-
-        int step = gsum.step / 4;
-        int startnode = 0;
-        int splitstage = 3;
         openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
         openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, p, 0, NULL, NULL));
         openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correction, 0, NULL, NULL));
 
-        vector<pair<size_t, const void *> > args;
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
-
-        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
-
-        candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
-
-        for(int i = 0; i < outputsz; i++)
-        {
-            if(candidate[4 * i + 2] != 0)
-                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
-                candidate[4 * i + 2], candidate[4 * i + 3]));
-        }
-
         free(p);
         free(correction);
-        clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
     }
-
-    rectList.resize(allCandidates.size());
-    if(!allCandidates.empty())
-        std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
-
-    if( minNeighbors != 0 || findBiggestObject )
-        groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
-    else
-        rweights.resize(rectList.size(), 0);
-
-    GenResult(faces, rectList, rweights);
-}
-
-void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
-    double scaleFactor, int flags,
-    const int outputsz, const size_t localThreads[],
-    CvSize minSize, CvSize maxSize)
-{
-    CvHaarClassifierCascade      *cascade = oldCascade;
-
-    if( !CV_IS_HAAR_CLASSIFIER(cascade) )
-        CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
-
-    if( scaleFactor <= 1 )
-        CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
-
-    if( cols < minSize.width || rows < minSize.height )
-        CV_Error(CV_StsError, "Image too small");
-
-    int datasize=0;
-    int totalclassifier=0;
-
-    if( !cascade->hid_cascade )
-        gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
-
-    if( maxSize.height == 0 || maxSize.width == 0 )
-    {
-        maxSize.height = rows;
-        maxSize.width = cols;
-    }
-
-    findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
-    if( findBiggestObject )
-        flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
-
-    CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
-    CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
-
-    m_scaleFactor = scaleFactor;
-    m_rows = rows;
-    m_cols = cols;
-    m_flags = flags;
-    m_minSize = minSize;
-    m_maxSize = maxSize;
-
     initialized = true;
 }
 
@@ -1645,6 +1505,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
     CvSize sz;
     CvSize winSize0 = oldCascade->orig_window_size;
     detect_piramid_info *scaleinfo;
+    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
     if (flags & CV_HAAR_SCALE_IMAGE)
     {
         for(factor = 1.f;; factor *= scaleFactor)
@@ -1746,7 +1607,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
         ((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
     }
 
-    openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(), ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
+    openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
         sizeof(detect_piramid_info)*loopcount,
         scaleinfo, 0, NULL, NULL));
     free(scaleinfo);
@@ -1758,7 +1619,8 @@ void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& f
                                                  const std::vector<cv::Rect> &rectList,
                                                  const std::vector<int> &rweights)
 {
-    CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), cvCreateMemStorage(0) );
+    MemStorage tempStorage(cvCreateMemStorage(0));
+    CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), tempStorage );
 
     if( findBiggestObject && rectList.size() )
     {
@@ -1794,167 +1656,30 @@ void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& f
 
 void cv::ocl::OclCascadeClassifierBuf::release()
 {
-    openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
-    openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
-    openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
-    openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
-
-    if( (m_flags & CV_HAAR_SCALE_IMAGE) )
+    if(initialized)
     {
-        cvFree(&oldCascade->hid_cascade);
-    }
-    else
-    {
-        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
-        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
-        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
-    }
+        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
+        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
+        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
+        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
 
-    free(buffers);
-    buffers = NULL;
+        if( (m_flags & CV_HAAR_SCALE_IMAGE) )
+        {
+            cvFree(&oldCascade->hid_cascade);
+        }
+        else
+        {
+            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
+            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
+            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
+        }
+
+        free(buffers);
+        buffers = NULL;
+        initialized = false;
+    }
 }
 
 #ifndef _MAX_PATH
 #define _MAX_PATH 1024
 #endif
-
-
-/****************************************************************************************\
-*                                  Persistence functions                                 *
-\****************************************************************************************/
-
-/* field names */
-
-#define ICV_HAAR_SIZE_NAME            "size"
-#define ICV_HAAR_STAGES_NAME          "stages"
-#define ICV_HAAR_TREES_NAME             "trees"
-#define ICV_HAAR_FEATURE_NAME             "feature"
-#define ICV_HAAR_RECTS_NAME                 "rects"
-#define ICV_HAAR_TILTED_NAME                "tilted"
-#define ICV_HAAR_THRESHOLD_NAME           "threshold"
-#define ICV_HAAR_LEFT_NODE_NAME           "left_node"
-#define ICV_HAAR_LEFT_VAL_NAME            "left_val"
-#define ICV_HAAR_RIGHT_NODE_NAME          "right_node"
-#define ICV_HAAR_RIGHT_VAL_NAME           "right_val"
-#define ICV_HAAR_STAGE_THRESHOLD_NAME   "stage_threshold"
-#define ICV_HAAR_PARENT_NAME            "parent"
-#define ICV_HAAR_NEXT_NAME              "next"
-
-static int gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade, CvPoint pt, int start_stage */)
-{
-    return 1;
-}
-
-namespace cv
-{
-namespace ocl
-{
-
-struct gpuHaarDetectObjects_ScaleImage_Invoker
-{
-    gpuHaarDetectObjects_ScaleImage_Invoker( const CvHaarClassifierCascade *_cascade,
-            int _stripSize, double _factor,
-            const Mat &_sum1, const Mat &_sqsum1, Mat *_norm1,
-            Mat *_mask1, Rect _equRect, ConcurrentRectVector &_vec )
-    {
-        cascade = _cascade;
-        stripSize = _stripSize;
-        factor = _factor;
-        sum1 = _sum1;
-        sqsum1 = _sqsum1;
-        norm1 = _norm1;
-        mask1 = _mask1;
-        equRect = _equRect;
-        vec = &_vec;
-    }
-
-    void operator()( const BlockedRange &range ) const
-    {
-        Size winSize0 = cascade->orig_window_size;
-        Size winSize(cvRound(winSize0.width * factor), cvRound(winSize0.height * factor));
-        int y1 = range.begin() * stripSize, y2 = min(range.end() * stripSize, sum1.rows - 1 - winSize0.height);
-        Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1);
-        int x, y, ystep = factor > 2 ? 1 : 2;
-
-        for( y = y1; y < y2; y += ystep )
-            for( x = 0; x < ssz.width; x += ystep )
-            {
-                if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 )
-                    vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor),
-                                        winSize.width, winSize.height));
-            }
-    }
-
-    const CvHaarClassifierCascade *cascade;
-    int stripSize;
-    double factor;
-    Mat sum1, sqsum1, *norm1, *mask1;
-    Rect equRect;
-    ConcurrentRectVector *vec;
-};
-
-
-struct gpuHaarDetectObjects_ScaleCascade_Invoker
-{
-    gpuHaarDetectObjects_ScaleCascade_Invoker( const CvHaarClassifierCascade *_cascade,
-            Size _winsize, const Range &_xrange, double _ystep,
-            size_t _sumstep, const int **_p, const int **_pq,
-            ConcurrentRectVector &_vec )
-    {
-        cascade = _cascade;
-        winsize = _winsize;
-        xrange = _xrange;
-        ystep = _ystep;
-        sumstep = _sumstep;
-        p = _p;
-        pq = _pq;
-        vec = &_vec;
-    }
-
-    void operator()( const BlockedRange &range ) const
-    {
-        int iy, startY = range.begin(), endY = range.end();
-        const int *p0 = p[0], *p1 = p[1], *p2 = p[2], *p3 = p[3];
-        const int *pq0 = pq[0], *pq1 = pq[1], *pq2 = pq[2], *pq3 = pq[3];
-        bool doCannyPruning = p0 != 0;
-        int sstep = (int)(sumstep / sizeof(p0[0]));
-
-        for( iy = startY; iy < endY; iy++ )
-        {
-            int ix, y = cvRound(iy * ystep), ixstep = 1;
-            for( ix = xrange.start; ix < xrange.end; ix += ixstep )
-            {
-                int x = cvRound(ix * ystep); // it should really be ystep, not ixstep
-
-                if( doCannyPruning )
-                {
-                    int offset = y * sstep + x;
-                    int s = p0[offset] - p1[offset] - p2[offset] + p3[offset];
-                    int sq = pq0[offset] - pq1[offset] - pq2[offset] + pq3[offset];
-                    if( s < 100 || sq < 20 )
-                    {
-                        ixstep = 2;
-                        continue;
-                    }
-                }
-
-                int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */);
-                if( result > 0 )
-                    vec->push_back(Rect(x, y, winsize.width, winsize.height));
-                ixstep = result != 0 ? 1 : 2;
-            }
-        }
-    }
-
-    const CvHaarClassifierCascade *cascade;
-    double ystep;
-    size_t sumstep;
-    Size winsize;
-    Range xrange;
-    const int **p;
-    const int **pq;
-    ConcurrentRectVector *vec;
-};
-
-}
-}
diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
index 7a13324077..3533cce69a 100644
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Wenju He, wenju@multicorewareinc.com
+//     Wenju He, wenju@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -48,13 +48,107 @@ using namespace cv;
 using namespace cv::ocl;
 using namespace std;
 
-
 #define CELL_WIDTH 8
 #define CELL_HEIGHT 8
 #define CELLS_PER_BLOCK_X 2
 #define CELLS_PER_BLOCK_Y 2
 #define NTHREADS 256
 
+static oclMat gauss_w_lut;
+static bool hog_device_cpu;
+/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
+static const float gaussian_interp_lut[] = 
+{
+    /* gaussian lut */
+    0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f, 
+    0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f, 
+    0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f, 
+    0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f, 
+    0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f, 
+    0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, 
+    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, 
+    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, 
+    0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 
+    0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 
+    0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f, 
+    0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f, 
+    0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f, 
+    0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, 
+    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, 
+    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, 
+    0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 
+    0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 
+    0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f, 
+    0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f, 
+    0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f, 
+    0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f, 
+    0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f, 
+    0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f, 
+    0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 
+    0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 
+    0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f, 
+    0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f, 
+    0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f, 
+    0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, 
+    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, 
+    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, 
+    0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 
+    0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 
+    0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f, 
+    0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f, 
+    0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f, 
+    0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, 
+    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, 
+    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, 
+    0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 
+    0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 
+    0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f, 
+    /* interp_weight lut */
+    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, 
+    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, 
+    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f, 
+    0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 
+    0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 
+    0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f, 
+    0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f, 
+    0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f, 
+    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, 
+    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, 
+    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f, 
+    0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 
+    0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 
+    0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f, 
+    0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f, 
+    0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f, 
+    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, 
+    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, 
+    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f, 
+    0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 
+    0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 
+    0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f, 
+    0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f, 
+    0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f, 
+    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, 
+    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, 
+    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f, 
+    0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 
+    0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 
+    0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f, 
+    0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f, 
+    0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f, 
+    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, 
+    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, 
+    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f, 
+    0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 
+    0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 
+    0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f, 
+    0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f, 
+    0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f, 
+    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, 
+    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, 
+    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
+};
+
 namespace cv
 {
     namespace ocl
@@ -78,38 +172,43 @@ namespace cv
                 int cnblocks_win_x;
                 int cnblocks_win_y;
                 int cblock_hist_size;
-                int cblock_hist_size_2up;
                 int cdescr_size;
                 int cdescr_width;
+                int cdescr_height;
 
                 void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
                                       int nblocks_win_x, int nblocks_win_y);
 
                 void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                                   int height, int width, const cv::ocl::oclMat &grad,
-                                   const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists);
+                                   int height, int width, float sigma, const cv::ocl::oclMat &grad,
+                                   const cv::ocl::oclMat &qangle, 
+                                   const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
 
                 void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                                     int height, int width, cv::ocl::oclMat &block_hists, float threshold);
+                                     int height, int width, cv::ocl::oclMat &block_hists, 
+                                     float threshold);
 
                 void classify_hists(int win_height, int win_width, int block_stride_y,
-                                    int block_stride_x, int win_stride_y, int win_stride_x, int height,
-                                    int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
+                                    int block_stride_x, int win_stride_y, int win_stride_x, 
+                                    int height, int width, const cv::ocl::oclMat &block_hists, 
+                                    const cv::ocl::oclMat &coefs, float free_coef,
                                     float threshold, cv::ocl::oclMat &labels);
 
-                void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                            int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
+                void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, 
+                                            int block_stride_x, int win_stride_y, int win_stride_x, 
+                                            int height, int width, const cv::ocl::oclMat &block_hists,
                                             cv::ocl::oclMat &descriptors);
-                void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                            int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
+                void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, 
+                                            int block_stride_x, int win_stride_y, int win_stride_x, 
+                                            int height, int width, const cv::ocl::oclMat &block_hists,
                                             cv::ocl::oclMat &descriptors);
 
                 void compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
-                                            float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
+                                            float angle_scale, cv::ocl::oclMat &grad, 
+                                            cv::ocl::oclMat &qangle, bool correct_gamma);
                 void compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
-                                            float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
-
-                void resize( const oclMat &src, oclMat &dst, const Size sz);
+                                            float angle_scale, cv::ocl::oclMat &grad, 
+                                            cv::ocl::oclMat &qangle, bool correct_gamma);
             }
         }
     }
@@ -117,8 +216,14 @@ namespace cv
 
 using namespace ::cv::ocl::device;
 
-cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
-                                      int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
+static inline int divUp(int total, int grain)
+{
+    return (total + grain - 1) / grain;
+}
+
+cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, 
+                                      Size cell_size_, int nbins_, double win_sigma_, 
+                                      double threshold_L2hys_, bool gamma_correction_, int nlevels_)
     : win_size(win_size_),
       block_size(block_size_),
       block_stride(block_stride_),
@@ -132,19 +237,27 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
     CV_Assert((win_size.width  - block_size.width ) % block_stride.width  == 0 &&
               (win_size.height - block_size.height) % block_stride.height == 0);
 
-    CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0);
+    CV_Assert(block_size.width % cell_size.width == 0 && 
+        block_size.height % cell_size.height == 0);
 
     CV_Assert(block_stride == cell_size);
 
     CV_Assert(cell_size == Size(8, 8));
 
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+    Size cells_per_block(block_size.width / cell_size.width, 
+        block_size.height / cell_size.height);
     CV_Assert(cells_per_block == Size(2, 2));
 
     cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
-    hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
+    hog::set_up_constants(nbins, block_stride.width, block_stride.height, 
+        blocks_per_win.width, blocks_per_win.height);
 
     effect_size = Size(0, 0);
+
+	if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
+        hog_device_cpu = true;
+    else
+        hog_device_cpu = false;
 }
 
 size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
@@ -154,7 +267,8 @@ size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
 
 size_t cv::ocl::HOGDescriptor::getBlockHistogramSize() const
 {
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+    Size cells_per_block = Size(block_size.width / cell_size.width, 
+        block_size.height / cell_size.height);
     return (size_t)(nbins * cells_per_block.area());
 }
 
@@ -167,7 +281,8 @@ bool cv::ocl::HOGDescriptor::checkDetectorSize() const
 {
     size_t detector_size = detector.rows * detector.cols;
     size_t descriptor_size = getDescriptorSize();
-    return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
+    return detector_size == 0 || detector_size == descriptor_size || 
+        detector_size == descriptor_size + 1;
 }
 
 void cv::ocl::HOGDescriptor::setSVMDetector(const vector<float> &_detector)
@@ -207,10 +322,16 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
 
     const size_t block_hist_size = getBlockHistogramSize();
     const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
-    block_hists.create(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F);
+    block_hists.create(1, 
+        static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
 
     Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
     labels.create(1, wins_per_img.area(), CV_8U);
+
+    vector<float> v_lut = vector<float>(gaussian_interp_lut, gaussian_interp_lut + 
+        sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
+    Mat m_lut(v_lut);
+    gauss_w_lut.upload(m_lut.reshape(1,1));
 }
 
 void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
@@ -221,29 +342,34 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc
     switch (img.type())
     {
     case CV_8UC1:
-        hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
+        hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, 
+            angleScale, grad, qangle, gamma_correction);
         break;
     case CV_8UC4:
-        hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
+        hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, 
+            angleScale, grad, qangle, gamma_correction);
         break;
     }
 }
 
+
 void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
 {
-    computeGradient(img, grad, qangle);
+    computeGradient(img, this->grad, this->qangle);
 
-    hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
-                       grad, qangle, (float)getWinSigma(), block_hists);
+    hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, 
+        effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
 
-    hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
-                         block_hists, (float)threshold_L2hys);
+    hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, 
+        effect_size.width, block_hists, (float)threshold_L2hys);
 }
 
 
-void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, oclMat &descriptors, int descr_format)
+void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, 
+                                            oclMat &descriptors, int descr_format)
 {
-    CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+    CV_Assert(win_stride.width % block_stride.width == 0 && 
+        win_stride.height % block_stride.height == 0);
 
     init_buffer(img, win_stride);
 
@@ -253,17 +379,20 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
     Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
     Size wins_per_img   = numPartsWithin(effect_size, win_size, win_stride);
 
-    descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
+    descriptors.create(wins_per_img.area(), 
+        static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
 
     switch (descr_format)
     {
     case DESCR_FORMAT_ROW_BY_ROW:
-        hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
+        hog::extract_descrs_by_rows(win_size.height, win_size.width, 
+            block_stride.height, block_stride.width, win_stride.height, win_stride.width, 
+            effect_size.height, effect_size.width, block_hists, descriptors);
         break;
     case DESCR_FORMAT_COL_BY_COL:
-        hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
+        hog::extract_descrs_by_cols(win_size.height, win_size.width, 
+            block_stride.height, block_stride.width, win_stride.height, win_stride.width, 
+            effect_size.height, effect_size.width, block_hists, descriptors);
         break;
     default:
         CV_Error(CV_StsBadArg, "Unknown descriptor format");
@@ -271,7 +400,8 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
 }
 
 
-void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, double hit_threshold, Size win_stride, Size padding)
+void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, 
+                                    double hit_threshold, Size win_stride, Size padding)
 {
     CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
     CV_Assert(padding == Size(0, 0));
@@ -283,14 +413,16 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
     if (win_stride == Size())
         win_stride = block_stride;
     else
-        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+        CV_Assert(win_stride.width % block_stride.width == 0 && 
+            win_stride.height % block_stride.height == 0);
     init_buffer(img, win_stride);
 
     computeBlockHistograms(img);
 
-    hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                        win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists,
-                        detector, (float)free_coef, (float)hit_threshold, labels);
+    hog::classify_hists(win_size.height, win_size.width, block_stride.height, 
+        block_stride.width, win_stride.height, win_stride.width, 
+        effect_size.height, effect_size.width, block_hists, detector, 
+        (float)free_coef, (float)hit_threshold, labels);
 
     labels.download(labels_host);
     unsigned char *vec = labels_host.ptr();
@@ -306,8 +438,9 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
 
 
 
-void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations, double hit_threshold,
-        Size win_stride, Size padding, double scale0, int group_threshold)
+void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations, 
+                                              double hit_threshold, Size win_stride, Size padding, 
+                                              double scale0, int group_threshold)
 {
     CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
     CV_Assert(scale0 > 1);
@@ -333,7 +466,8 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
     if (win_stride == Size())
         win_stride = block_stride;
     else
-        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+        CV_Assert(win_stride.width % block_stride.width == 0 && 
+            win_stride.height % block_stride.height == 0);
     init_buffer(img, win_stride);
     image_scale.create(img.size(), img.type());
 
@@ -347,16 +481,18 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
         }
         else
         {
-            hog::resize( img, image_scale, effect_size);
+            resize(img, image_scale, effect_size);
             detect(image_scale, locations, hit_threshold, win_stride, padding);
         }
-        Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
+        Size scaled_win_size(cvRound(win_size.width * scale), 
+            cvRound(win_size.height * scale));
         for (size_t j = 0; j < locations.size(); j++)
-            all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, scaled_win_size));
+            all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, 
+              scaled_win_size));
     }
 
     found_locations.assign(all_candidates.begin(), all_candidates.end());
-    groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
+    groupRectangles(found_locations, group_threshold, 0.2);
 }
 
 int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
@@ -364,9 +500,11 @@ int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
     return (size - part_size + stride) / stride;
 }
 
-cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
+cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, 
+                                                cv::Size stride)
 {
-    return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
+    return Size(numPartsWithin(size.width, part_size.width, stride.width), 
+        numPartsWithin(size.height, part_size.height, stride.height));
 }
 
 std::vector<float> cv::ocl::HOGDescriptor::getDefaultPeopleDetector()
@@ -1547,8 +1685,9 @@ static int power_2up(unsigned int n)
     return -1; // Input is too big
 }
 
-void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-        int nblocks_win_x, int nblocks_win_y)
+void cv::ocl::device::hog::set_up_constants(int nbins, 
+                                            int block_stride_x, int block_stride_y, 
+                                            int nblocks_win_x, int nblocks_win_y)
 {
     cnbins = nbins;
     cblock_stride_x = block_stride_x;
@@ -1559,52 +1698,32 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b
     int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
     cblock_hist_size = block_hist_size;
 
-    int block_hist_size_2up = power_2up(block_hist_size);
-    cblock_hist_size_2up = block_hist_size_2up;
-
     int descr_width = nblocks_win_x * block_hist_size;
     cdescr_width = descr_width;
+    cdescr_height = nblocks_win_y;
 
     int descr_size = descr_width * nblocks_win_y;
     cdescr_size = descr_size;
 }
 
-static inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-
-static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string kernelName, 
-                                    size_t globalThreads[3], size_t localThreads[3], 
-                                    vector< pair<size_t, const void *> > &args)
-{
-    size_t wave_size = 0;
-    queryDeviceInfo(WAVEFRONT_SIZE, &wave_size);
-    if (wave_size <= 16)
-    {
-        char build_options[64];
-        sprintf(build_options, (wave_size == 16) ? "-D WAVE_SIZE_16" : "-D WAVE_SIZE_1");
-        openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
-    }
-    else
-        openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y,
-        int height, int width, const cv::ocl::oclMat &grad,
-        const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists)
+void cv::ocl::device::hog::compute_hists(int nbins, 
+                                         int block_stride_x, int block_stride_y,
+                                         int height, int width, float sigma, 
+                                         const cv::ocl::oclMat &grad, 
+                                         const cv::ocl::oclMat &qangle, 
+                                         const cv::ocl::oclMat &gauss_w_lut, 
+                                         cv::ocl::oclMat &block_hists)
 {
     Context *clCxt = Context::getContext();
-    string kernelName = "compute_hists_kernel";
     vector< pair<size_t, const void *> > args;
+    string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" : 
+        "compute_hists_kernel";
 
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
-
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) 
+        / block_stride_x;
+    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) 
+        / block_stride_y;
     int blocks_total = img_block_width * img_block_height;
-    int blocks_in_group = 4;
-    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
-    size_t globalThreads[3] = { divUp(blocks_total, blocks_in_group) * localThreads[0], 2, 1 };
 
     int grad_quadstep = grad.step >> 2;
     int qangle_step = qangle.step;
@@ -1612,6 +1731,11 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
     // Precompute gaussian spatial window parameter
     float scale = 1.f / (2.f * sigma * sigma);
 
+    int blocks_in_group = 4;
+    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
+    size_t globalThreads[3] = { 
+        divUp(img_block_width * img_block_height, blocks_in_group) * localThreads[0], 2, 1 };
+
     int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
     int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
     int smem = (hists_size + final_hists_size) * blocks_in_group;
@@ -1627,62 +1751,120 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
     args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
-    args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
+    if (kernelName.compare("compute_hists_lut_kernel") == 0)
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
+    else
+        args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( smem, (void *)NULL));
 
-    openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-        int height, int width, cv::ocl::oclMat &block_hists, float threshold)
+void cv::ocl::device::hog::normalize_hists(int nbins, 
+                                           int block_stride_x, int block_stride_y,
+                                           int height, int width, 
+                                           cv::ocl::oclMat &block_hists, 
+                                           float threshold)
 {
     Context *clCxt = Context::getContext();
-    string kernelName = "normalize_hists_kernel";
     vector< pair<size_t, const void *> > args;
+    string kernelName;
 
     int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
-    int nthreads = power_2up(block_hist_size);
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) 
+        / block_stride_x;
+    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) 
+        / block_stride_y;
+    int nthreads;
+    size_t globalThreads[3] = { 1, 1, 1  };
+    size_t localThreads[3] = { 1, 1, 1  };
+    
+    if ( nbins == 9 )
+    {
+        /* optimized for the case of 9 bins */
+        kernelName = "normalize_hists_36_kernel";
+        int blocks_in_group = NTHREADS / block_hist_size;
+        nthreads = blocks_in_group * block_hist_size;
+        int num_groups = divUp( img_block_width * img_block_height, blocks_in_group);
+        globalThreads[0] = nthreads * num_groups;
+        localThreads[0] = nthreads;
+    }
+    else
+    {
+        kernelName = "normalize_hists_kernel";
+        nthreads = power_2up(block_hist_size);
+        globalThreads[0] = img_block_width * nthreads;
+        globalThreads[1] = img_block_height;
+        localThreads[0] = nthreads;
 
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
-    size_t globalThreads[3] = { img_block_width * nthreads, img_block_height, 1 };
-    size_t localThreads[3] = { nthreads, 1, 1  };
+        if ((nthreads < 32) || (nthreads > 512) )
+            cv::ocl::error("normalize_hists: histogram's size is too small or too big", 
+                __FILE__, __LINE__, "normalize_hists");
 
-    if ((nthreads < 32) || (nthreads > 512) )
-        cv::ocl::error("normalize_hists: histogram's size is too small or too big", __FILE__, __LINE__, "normalize_hists");
+        args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
+    }
 
-    args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
     args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL));
 
-    openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
+    if(hog_device_cpu)
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+                             localThreads, args, -1, -1, "-D CPU");
+    else
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+                             localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y,
-        int block_stride_x, int win_stride_y, int win_stride_x, int height,
-        int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
-        float threshold, cv::ocl::oclMat &labels)
+void cv::ocl::device::hog::classify_hists(int win_height, int win_width, 
+                                          int block_stride_y, int block_stride_x, 
+                                          int win_stride_y, int win_stride_x, 
+                                          int height, int width, 
+                                          const cv::ocl::oclMat &block_hists, 
+                                          const cv::ocl::oclMat &coefs, 
+                                          float free_coef, float threshold, 
+                                          cv::ocl::oclMat &labels)
 {
     Context *clCxt = Context::getContext();
-    string kernelName = "classify_hists_kernel";
     vector< pair<size_t, const void *> > args;
 
+    int nthreads;
+    string kernelName;
+    switch (cdescr_width)
+    {
+    case 180:
+        nthreads = 180;
+        kernelName = "classify_hists_180_kernel";
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
+        break;
+    case 252:
+        nthreads = 256;
+        kernelName = "classify_hists_252_kernel";
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
+        break;
+    default:
+        nthreads = 256;
+        kernelName = "classify_hists_kernel";
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
+    }
+
     int win_block_stride_x = win_stride_x / block_stride_x;
     int win_block_stride_y = win_stride_y / block_stride_y;
     int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-
-    size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
-    size_t localThreads[3] = { NTHREADS, 1, 1 };
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
+        block_stride_x;
 
+    size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
+    size_t localThreads[3] = { nthreads, 1, 1 };
     args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
     args.push_back( make_pair( sizeof(cl_int), (void *)&img_win_width));
     args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
     args.push_back( make_pair( sizeof(cl_int), (void *)&win_block_stride_x));
@@ -1693,12 +1875,20 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo
     args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data));
 
-    openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
+    if(hog_device_cpu)
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+                             localThreads, args, -1, -1, "-D CPU");
+    else
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+                             localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-        int win_stride_y, int win_stride_x, int height, int width,
-        const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
+void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, 
+                                                  int block_stride_y, int block_stride_x,
+                                                  int win_stride_y, int win_stride_x, 
+                                                  int height, int width,
+                                                  const cv::ocl::oclMat &block_hists, 
+                                                  cv::ocl::oclMat &descriptors)
 {
     Context *clCxt = Context::getContext();
     string kernelName = "extract_descrs_by_rows_kernel";
@@ -1708,7 +1898,8 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
     int win_block_stride_y = win_stride_y / block_stride_y;
     int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
+        block_stride_x;
     int descriptors_quadstep = descriptors.step >> 2;
 
     size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
@@ -1724,12 +1915,16 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
 
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-        int win_stride_y, int win_stride_x, int height, int width,
-        const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
+void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, 
+                                                  int block_stride_y, int block_stride_x,
+                                                  int win_stride_y, int win_stride_x, 
+                                                  int height, int width,
+                                                  const cv::ocl::oclMat &block_hists, 
+                                                  cv::ocl::oclMat &descriptors)
 {
     Context *clCxt = Context::getContext();
     string kernelName = "extract_descrs_by_cols_kernel";
@@ -1739,7 +1934,8 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
     int win_block_stride_y = win_stride_y / block_stride_y;
     int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
+        block_stride_x;
     int descriptors_quadstep = descriptors.step >> 2;
 
     size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
@@ -1756,11 +1952,16 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
 
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
-        float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
+void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, 
+                                                  const cv::ocl::oclMat &img,
+                                                  float angle_scale, 
+                                                  cv::ocl::oclMat &grad, 
+                                                  cv::ocl::oclMat &qangle, 
+                                                  bool correct_gamma)
 {
     Context *clCxt = Context::getContext();
     string kernelName = "compute_gradients_8UC1_kernel";
@@ -1785,11 +1986,16 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c
     args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
     args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
 
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
-        float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
+void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, 
+                                                  const cv::ocl::oclMat &img,
+                                                  float angle_scale, 
+                                                  cv::ocl::oclMat &grad, 
+                                                  cv::ocl::oclMat &qangle, 
+                                                  bool correct_gamma)
 {
     Context *clCxt = Context::getContext();
     string kernelName = "compute_gradients_8UC4_kernel";
@@ -1815,39 +2021,6 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
     args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
     args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
 
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz)
-{
-    CV_Assert( (src.channels() == dst.channels()) );
-    Context *clCxt = Context::getContext();
-
-    string kernelName = (src.type() == CV_8UC1) ? "resize_8UC1_kernel" : "resize_8UC4_kernel";
-    size_t blkSizeX = 16, blkSizeY = 16;
-    size_t glbSizeX = sz.width % blkSizeX == 0 ? sz.width : (sz.width / blkSizeX + 1) * blkSizeX;
-    size_t glbSizeY = sz.height % blkSizeY == 0 ? sz.height : (sz.height / blkSizeY + 1) * blkSizeY;
-    size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
-    size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
-
-    float ifx = (float)src.cols / sz.width;
-    float ify = (float)src.rows / sz.height;
-    int src_step = static_cast<int>(src.step);
-    int dst_step = static_cast<int>(dst.step);
-
-    vector< pair<size_t, const void *> > args;
-    args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&sz.width));
-    args.push_back( make_pair(sizeof(cl_int), (void *)&sz.height));
-    args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
-    args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
-
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
-}
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+        localThreads, args, -1, -1);
+}
\ No newline at end of file
diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index ee1e92a712..15c1539c0e 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -25,6 +25,7 @@
 //    Xu Pang, pangxu010@163.com
 //    Wu Zailong, bullet@yeah.net
 //    Wenju He, wenju@multicorewareinc.com
+//    Sen Liu, swjtuls1987@126.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -80,6 +81,7 @@ namespace cv
         extern const char *imgproc_calcHarris;
         extern const char *imgproc_calcMinEigenVal;
         extern const char *imgproc_convolve;
+        extern const char *imgproc_clahe;
         ////////////////////////////////////OpenCL call wrappers////////////////////////////
 
         template <typename T> struct index_and_sizeof;
@@ -269,7 +271,7 @@ namespace cv
             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
 
-
+            float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
             vector< pair<size_t, const void *> > args;
             if(map1.channels() == 2)
             {
@@ -289,9 +291,8 @@ namespace cv
                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
-                float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
-
-               if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                
+                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
                 {
                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
                 }
@@ -325,7 +326,6 @@ namespace cv
                 }
                 else
                 {
-                    float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
                     args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
                 }
             }
@@ -1207,30 +1207,41 @@ namespace cv
         void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
                           double k, int borderType)
         {
-            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
-            {
-                CV_Error(CV_GpuNotSupported, "select device don't support double");
-            }
-            CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
-            oclMat Dx, Dy;
-            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-            extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
-            dst.create(src.size(), CV_32F);
-            corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), Dx, Dy, dst, borderType);
+            oclMat dx, dy;
+            cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
         }
 
-        void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
+        void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
+                          double k, int borderType)
         {
             if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
             {
                 CV_Error(CV_GpuNotSupported, "select device don't support double");
             }
             CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
-            oclMat Dx, Dy;
             CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-            extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
+            extractCovData(src, dx, dy, blockSize, ksize, borderType);
             dst.create(src.size(), CV_32F);
-            corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, Dx, Dy, dst, borderType);
+            corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
+        }
+
+        void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
+        {
+            oclMat dx, dy;
+            cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
+        }
+        
+        void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
+        {
+            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+            {
+                CV_Error(CV_GpuNotSupported, "select device don't support double");
+            }
+            CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
+            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
+            extractCovData(src, dx, dy, blockSize, ksize, borderType);
+            dst.create(src.size(), CV_32F);
+            corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
         }
         /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
         static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
@@ -1502,6 +1513,194 @@ namespace cv
             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
             LUT(mat_src, lut, mat_dst);
         }
+
+        ////////////////////////////////////////////////////////////////////////
+        // CLAHE
+        namespace clahe
+        {
+            inline int divUp(int total, int grain)
+            {
+                return (total + grain - 1) / grain * grain;
+            }
+
+            static void calcLut(const oclMat &src, oclMat &dst,
+                const int tilesX, const int tilesY, const cv::Size tileSize,
+                const int clipLimit, const float lutScale)
+            {
+                cl_int2 tile_size;
+                tile_size.s[0] = tileSize.width;
+                tile_size.s[1] = tileSize.height;
+
+                std::vector<pair<size_t , const void *> > args;
+                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
+                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
+                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit ));
+                args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale ));
+
+                String kernelName = "calcLut";
+                size_t localThreads[3]  = { 32, 8, 1 };
+                size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
+                bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+                if (is_cpu)
+                {
+                    openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
+                }
+                else
+                {
+                    cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
+                    int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+                    openCLSafeCall(clReleaseKernel(kernel));
+
+                    static char opt[20] = {0};
+                    sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
+                    openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt);
+                }
+            }
+
+            static void transform(const oclMat &src, oclMat &dst, const oclMat &lut,
+                const int tilesX, const int tilesY, const cv::Size tileSize)
+            {
+                cl_int2 tile_size;
+                tile_size.s[0] = tileSize.width;
+                tile_size.s[1] = tileSize.height;
+
+                std::vector<pair<size_t , const void *> > args;
+                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
+                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.step ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
+                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY ));
+
+                String kernelName = "transform";
+                size_t localThreads[3]  = { 32, 8, 1 };
+                size_t globalThreads[3] = { divUp(src.cols, localThreads[0]), divUp(src.rows, localThreads[1]), 1 };
+
+                openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1);
+            }
+        }
+
+        namespace
+        {
+            class CLAHE_Impl : public cv::CLAHE
+            {
+            public:
+                CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
+
+                cv::AlgorithmInfo* info() const;
+
+                void apply(cv::InputArray src, cv::OutputArray dst);
+
+                void setClipLimit(double clipLimit);
+                double getClipLimit() const;
+
+                void setTilesGridSize(cv::Size tileGridSize);
+                cv::Size getTilesGridSize() const;
+
+                void collectGarbage();
+
+            private:
+                double clipLimit_;
+                int tilesX_;
+                int tilesY_;
+
+                oclMat srcExt_;
+                oclMat lut_;
+            };
+            CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
+            clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
+            {
+            }
+
+            CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE_OCL",
+                obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
+                obj.info()->addParam(obj, "tilesX", obj.tilesX_);
+                obj.info()->addParam(obj, "tilesY", obj.tilesY_))
+            void CLAHE_Impl::apply(cv::InputArray src_raw, cv::OutputArray dst_raw)
+            {
+                oclMat& src = getOclMatRef(src_raw);
+                oclMat& dst = getOclMatRef(dst_raw);
+                CV_Assert( src.type() == CV_8UC1 );
+
+                dst.create( src.size(), src.type() );
+
+                const int histSize = 256;
+
+                ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
+
+                cv::Size tileSize;
+                oclMat srcForLut;
+
+                if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
+                {
+                    tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
+                    srcForLut = src;
+                }
+                else
+                {
+                    cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar());
+
+                    tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
+                    srcForLut = srcExt_;
+                }
+
+                const int tileSizeTotal = tileSize.area();
+                const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
+
+                int clipLimit = 0;
+                if (clipLimit_ > 0.0)
+                {
+                    clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
+                    clipLimit = std::max(clipLimit, 1);
+                }
+
+                clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale);
+                //finish();
+                clahe::transform(src, dst, lut_, tilesX_, tilesY_, tileSize);
+            }
+
+            void CLAHE_Impl::setClipLimit(double clipLimit)
+            {
+                clipLimit_ = clipLimit;
+            }
+
+            double CLAHE_Impl::getClipLimit() const
+            {
+                return clipLimit_;
+            }
+
+            void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
+            {
+                tilesX_ = tileGridSize.width;
+                tilesY_ = tileGridSize.height;
+            }
+
+            cv::Size CLAHE_Impl::getTilesGridSize() const
+            {
+                return cv::Size(tilesX_, tilesY_);
+            }
+
+            void CLAHE_Impl::collectGarbage()
+            {
+                srcExt_.release();
+                lut_.release();
+            }
+        }
+
+        cv::Ptr<cv::CLAHE> createCLAHE(double clipLimit, cv::Size tileGridSize)
+        {
+            return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
+        }
+
         //////////////////////////////////bilateralFilter////////////////////////////////////////////////////
         static void
         oclbilateralFilter_8u( const oclMat &src, oclMat &dst, int d,
diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
index 799c49c50c..d4841fcfd4 100644
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -124,7 +124,8 @@ namespace cv
             cacheSize = 0;
         }
 
-
+        // not to be exported to dynamic lib
+        void setBinaryDiskCacheImpl(int mode, String path, Info::Impl * impl);
         struct Info::Impl
         {
             cl_platform_id oclplatform;
@@ -142,22 +143,12 @@ namespace cv
             char extra_options[512];
             int  double_support;
             int unified_memory; //1 means integrated GPU, otherwise this value is 0
+            bool enable_disk_cache; 
+            bool update_disk_cache;
             string binpath;
             int refcounter;
 
-            Impl()
-            {
-                refcounter = 1;
-                oclplatform = 0;
-                oclcontext = 0;
-                clCmdQueue = 0;
-                devnum = -1;
-                maxComputeUnits = 0;
-                maxWorkGroupSize = 0;
-                memset(extra_options, 0, 512);
-                double_support = 0;
-                unified_memory = 0;
-            }
+            Impl();
 
             void setDevice(void *ctx, void *q, int devnum);
 
@@ -182,6 +173,25 @@ namespace cv
             void releaseResources();
         };
 
+        Info::Impl::Impl()
+            :oclplatform(0),
+            oclcontext(0),
+            clCmdQueue(0),
+            devnum(-1),
+            maxWorkGroupSize(0),
+            maxDimensions(0),
+            maxComputeUnits(0),
+            double_support(0),
+            unified_memory(0),
+            enable_disk_cache(false),
+            update_disk_cache(false),
+            binpath("./"),
+            refcounter(1)
+        {
+            memset(extra_options, 0, 512);
+            setBinaryDiskCacheImpl(CACHE_RELEASE, String("./"), this);
+        }
+
         void Info::Impl::releaseResources()
         {
             devnum = -1;
@@ -333,6 +343,10 @@ namespace cv
                     oclinfo.push_back(ocltmpinfo);
                 }
             }
+            if(devcienums > 0)
+            {
+                setDevice(oclinfo[0]);
+            }
             return devcienums;
         }
 
@@ -363,64 +377,43 @@ namespace cv
             clFinish(Context::getContext()->impl->clCmdQueue);
         }
 
-        void queryDeviceInfo(DEVICE_INFO info_type, void* info)
+        //template specializations of queryDeviceInfo
+        template<>
+        bool queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel)
         {
-            static Info::Impl* impl = Context::getContext()->impl;
-            switch(info_type)
-            {
-            case WAVEFRONT_SIZE:
-                {
-                    bool is_cpu = false;
-                    queryDeviceInfo(IS_CPU_DEVICE, &is_cpu);
-                    if(is_cpu)
-                    {
-                        *(int*)info = 1;
-                        return;
-                    }
-#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
-                    try
-                    {
-                        openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], 
-                            CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(size_t), info, 0));
-                    }
-                    catch(const cv::Exception&)
-#elif defined (CL_DEVICE_WARP_SIZE_NV)
-                    const int EXT_LEN = 4096 + 1 ;
-                    char extends_set[EXT_LEN];
-                    size_t extends_size;
-                    openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
-                    extends_set[EXT_LEN - 1] = 0;
-                    if(std::string(extends_set).find("cl_nv_device_attribute_query") != std::string::npos)
-                    {
-                        openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], 
-                            CL_DEVICE_WARP_SIZE_NV, sizeof(size_t), info, 0));
-                    }
-                    else
-#endif
-                    {
-                        // if no way left for us to query the warp size, we can get it from kernel group info
-                        static const char * _kernel_string = "__kernel void test_func() {}";
-                        cl_kernel kernel;
-                        kernel = openCLGetKernelFromSource(Context::getContext(), &_kernel_string, "test_func");
-                        openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
-                            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), info, NULL));
-                    }
+            Info::Impl* impl = Context::getContext()->impl;
+            cl_device_type devicetype;
+            openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum],
+                CL_DEVICE_TYPE, sizeof(cl_device_type),
+                &devicetype, NULL));
+            return (devicetype == CVCL_DEVICE_TYPE_CPU);
+        }
 
-                }
-                break;
-            case IS_CPU_DEVICE:
-                {
-                    cl_device_type devicetype;
-                    openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], 
-                                    CL_DEVICE_TYPE, sizeof(cl_device_type), 
-                                    &devicetype, NULL));
-                    *(bool*)info = (devicetype == CVCL_DEVICE_TYPE_CPU);
-                }
-                break;
-            default:
-                CV_Error(-1, "Invalid device info type");
-                break;
+        template<typename _ty>
+        static _ty queryWavesize(cl_kernel kernel)
+        {
+            size_t info = 0;
+            Info::Impl* impl = Context::getContext()->impl;
+            bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+            if(is_cpu)
+            {
+                return 1;
             }
+            CV_Assert(kernel != NULL);
+            openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
+                CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &info, NULL));
+            return static_cast<_ty>(info);
+        }
+
+        template<>
+        size_t queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel)
+        {
+            return queryWavesize<size_t>(kernel);
+        }
+        template<>
+        int queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel)
+        {
+            return queryWavesize<int>(kernel);
         }
 
         void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
@@ -511,6 +504,24 @@ namespace cv
             return openCLGetKernelFromSource(clCxt, source, kernelName, NULL);
         }
 
+        void setBinaryDiskCacheImpl(int mode, String path, Info::Impl * impl)
+        {
+            impl->update_disk_cache = (mode & CACHE_UPDATE) == CACHE_UPDATE;
+            impl->enable_disk_cache = 
+#ifdef _DEBUG 
+                (mode & CACHE_DEBUG)   == CACHE_DEBUG;
+#else
+                (mode & CACHE_RELEASE) == CACHE_RELEASE;
+#endif
+            if(impl->enable_disk_cache && !path.empty())
+            {
+                impl->binpath = path;
+            }
+        }
+        void setBinaryDiskCache(int mode, cv::String path)
+        {
+            setBinaryDiskCacheImpl(mode, path, Context::getContext()->impl);
+        }
 
         void setBinpath(const char *path)
         {
@@ -590,8 +601,8 @@ namespace cv
                     filename = clCxt->impl->binpath  + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb";
                 }
 
-                FILE *fp = fopen(filename.c_str(), "rb");
-                if(fp == NULL || clCxt->impl->binpath.size() == 0)    //we should generate a binary file for the first time.
+                FILE *fp = clCxt->impl->enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL;
+                if(fp == NULL || clCxt->impl->update_disk_cache)
                 {
                     if(fp != NULL)
                         fclose(fp);
@@ -600,7 +611,7 @@ namespace cv
                                   clCxt->impl->oclcontext, 1, source, NULL, &status);
                     openCLVerifyCall(status);
                     status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
-                    if(status == CL_SUCCESS && clCxt->impl->binpath.size())
+                    if(status == CL_SUCCESS && clCxt->impl->enable_disk_cache)
                         savetofile(clCxt, program, filename.c_str());
                 }
                 else
@@ -934,6 +945,14 @@ namespace cv
         int Context::val = 0;
         static Mutex cs;
         static volatile int context_tear_down = 0;
+
+        bool initialized()
+        {
+            return *((volatile int*)&Context::val) != 0 && 
+                Context::clCxt->impl->clCmdQueue != NULL&& 
+                Context::clCxt->impl->oclcontext != NULL;
+        }
+
         Context* Context::getContext()
         {
             if(*((volatile int*)&val) != 1)
@@ -947,8 +966,6 @@ namespace cv
                         clCxt.reset(new Context);
                     std::vector<Info> oclinfo;
                     CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
-                    oclinfo[0].impl->setDevice(0, 0, 0);
-                    clCxt.get()->impl = oclinfo[0].impl->copy();
 
                     *((volatile int*)&val) = 1;
                 }
@@ -1073,7 +1090,7 @@ BOOL WINAPI DllMain( HINSTANCE, DWORD  fdwReason, LPVOID )
         Context* cv_ctx = Context::getContext();
         if(cv_ctx)
         {
-            cl_context ctx = (cl_context)&(cv_ctx->impl->oclcontext);
+            cl_context ctx = cv_ctx->impl->oclcontext;
             if(ctx)
                 openCLSafeCall(clReleaseContext(ctx));
         }
diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp
index 268a1fe9b5..1ff963a5cd 100644
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -74,6 +74,7 @@ namespace cv
     }
 }
 
+
 ////////////////////////////////////////////////////////////////////////
 // convert_C3C4
 static void convert_C3C4(const cl_mem &src, oclMat &dst)
@@ -227,6 +228,34 @@ void cv::ocl::oclMat::upload(const Mat &m)
     //download_channels = m.channels();
 }
 
+cv::ocl::oclMat::operator cv::_InputArray()
+{
+    _InputArray newInputArray;
+    newInputArray.flags = cv::_InputArray::OCL_MAT;
+    newInputArray.obj   = reinterpret_cast<void *>(this);
+    return newInputArray;
+}
+
+cv::ocl::oclMat::operator cv::_OutputArray()
+{
+    _OutputArray newOutputArray;
+    newOutputArray.flags = cv::_InputArray::OCL_MAT;
+    newOutputArray.obj   = reinterpret_cast<void *>(this);
+    return newOutputArray;
+}
+
+cv::ocl::oclMat& cv::ocl::getOclMatRef(InputArray src)
+{
+    CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
+    return *reinterpret_cast<oclMat*>(src.obj);
+}
+
+cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
+{
+    CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
+    return *reinterpret_cast<oclMat*>(src.obj);
+}
+
 void cv::ocl::oclMat::download(cv::Mat &m) const
 {
     CV_DbgAssert(!this->empty());
@@ -394,7 +423,7 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
     if( rtype < 0 )
         rtype = type();
     else
-        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), oclchannels());
+        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
 
     //int scn = channels();
     int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp
index 3bcb8700b7..4292a1f877 100644
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -43,9 +43,28 @@
 //
 //M*/
 
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #include "precomp.hpp"
 
+#ifdef __GNUC__
+#if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 402
+#define GCC_DIAG_STR(s) #s
+#define GCC_DIAG_JOINSTR(x,y) GCC_DIAG_STR(x ## y)
+# define GCC_DIAG_DO_PRAGMA(x) _Pragma (#x)
+# define GCC_DIAG_PRAGMA(x) GCC_DIAG_DO_PRAGMA(GCC diagnostic x)
+# if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 406
+#  define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(push) \
+GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
+#  define GCC_DIAG_ON(x) GCC_DIAG_PRAGMA(pop)
+# else
+#  define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
+#  define GCC_DIAG_ON(x)  GCC_DIAG_PRAGMA(warning GCC_DIAG_JOINSTR(-W,x))
+# endif
+#else
+# define GCC_DIAG_OFF(x)
+# define GCC_DIAG_ON(x)
+#endif
+#endif /* __GNUC__ */
+
 using namespace std;
 
 namespace cv
@@ -121,13 +140,16 @@ namespace cv
                                   build_options, finish_mode);
         }
 
+#ifdef __GNUC__
+        GCC_DIAG_OFF(deprecated-declarations)
+#endif
         cl_mem bindTexture(const oclMat &mat)
         {
             cl_mem texture;
             cl_image_format format;
             int err;
             int depth    = mat.depth();
-            int channels = mat.channels();
+            int channels = mat.oclchannels();
 
             switch(depth)
             {
@@ -156,7 +178,7 @@ namespace cv
                 format.image_channel_order     = CL_RGBA;
                 break;
             default:
-                CV_Error(-1, "Image forma is not supported");
+                CV_Error(-1, "Image format is not supported");
                 break;
             }
 #ifdef CL_VERSION_1_2
@@ -180,10 +202,6 @@ namespace cv
             else
 #endif
             {
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
                 texture = clCreateImage2D(
                     (cl_context)mat.clCxt->oclContext(),
                     CL_MEM_READ_WRITE,
@@ -193,9 +211,6 @@ namespace cv
                     0,
                     NULL,
                     &err);
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
             }
             size_t origin[] = { 0, 0, 0 };
             size_t region[] = { mat.cols, mat.rows, 1 };
@@ -225,6 +240,14 @@ namespace cv
             openCLSafeCall(err);
             return texture;
         }
+#ifdef __GNUC__
+        GCC_DIAG_ON(deprecated-declarations)
+#endif
+
+        Ptr<TextureCL> bindTexturePtr(const oclMat &mat)
+        {
+            return Ptr<TextureCL>(new TextureCL(bindTexture(mat), mat.rows, mat.cols, mat.type()));
+        }
         void releaseTexture(cl_mem& texture)
         {
             openCLFree(texture);
diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp
index d6baba207c..cb16fb136d 100644
--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@@ -16,7 +16,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Sen Liu, sen@multicorewareinc.com
+//    Sen Liu, swjtuls1987@126.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -277,8 +277,8 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
         blocky = size.height/TILE_SIZE;
     else
         blocky = size.height/TILE_SIZE + 1;
-    cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1);
-    cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
+    oclMat dst_m(blocky * 10, blockx, CV_64FC1);
+    oclMat sum(1, 10, CV_64FC1);
     int tile_width  = std::min(size.width,TILE_SIZE);
     int tile_height = std::min(size.height,TILE_SIZE);
     size_t localThreads[3]  = { tile_height, 1, 1};
@@ -288,19 +288,16 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
     args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
     args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&type ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&depth ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&cn ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&coi ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&binary ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
-    openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
 
     size_t localThreadss[3]  = { 128, 1, 1};
     size_t globalThreadss[3] = { 128, 1, 1};
@@ -309,25 +306,23 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
     args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height ));
     args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width ));
     args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum ));
+    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
     args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
     args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
-    openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
-    double* dstsum = new double[10];
-    memset(dstsum,0,10*sizeof(double));
-    openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double));
-    mom->m00 = dstsum[0];
-    mom->m10 = dstsum[1];
-    mom->m01 = dstsum[2];
-    mom->m20 = dstsum[3];
-    mom->m11 = dstsum[4];
-    mom->m02 = dstsum[5];
-    mom->m30 = dstsum[6];
-    mom->m21 = dstsum[7];
-    mom->m12 = dstsum[8];
-    mom->m03 = dstsum[9];
-    delete [] dstsum;
-    openCLSafeCall(clReleaseMemObject(sum));
+    openCLExecuteKernel(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
+
+    Mat dstsum(sum);
+    mom->m00 = dstsum.at<double>(0, 0);
+    mom->m10 = dstsum.at<double>(0, 1);
+    mom->m01 = dstsum.at<double>(0, 2);
+    mom->m20 = dstsum.at<double>(0, 3);
+    mom->m11 = dstsum.at<double>(0, 4);
+    mom->m02 = dstsum.at<double>(0, 5);
+    mom->m30 = dstsum.at<double>(0, 6);
+    mom->m21 = dstsum.at<double>(0, 7);
+    mom->m12 = dstsum.at<double>(0, 8);
+    mom->m03 = dstsum.at<double>(0, 9);
+
     icvCompleteMomentState( mom );
 }
 
diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl
index 7d4b0a7653..070ced4731 100644
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -127,7 +127,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 3)
+#define dst_align ((dst_offset / 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -165,7 +165,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 3)
+#define dst_align ((dst_offset / 2) & 3)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
 
@@ -335,7 +335,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -375,7 +375,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -507,7 +507,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
index fdf65923cd..3dbd376ecf 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -126,7 +126,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global   ushort *src1, int src1_st
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -164,7 +164,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global   short *src1, int src1_ste
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
@@ -288,7 +288,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global   uchar *src1, int src1_ste
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
         int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
         int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
 
diff --git a/modules/ocl/src/opencl/arithm_mul.cl b/modules/ocl/src/opencl/arithm_mul.cl
index e1cc9f6ab4..40988f5fed 100644
--- a/modules/ocl/src/opencl/arithm_mul.cl
+++ b/modules/ocl/src/opencl/arithm_mul.cl
@@ -277,9 +277,15 @@ __kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offs
 }
 #endif
 
+#ifdef DOUBLE_SUPPORT
+#define SCALAR_TYPE double
+#else
+#define SCALAR_TYPE float
+#endif
+
 __kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
                               __global float *dst,  int dst_step,  int dst_offset,
-                              int rows, int cols, int dst_step1, float scalar)
+                              int rows, int cols, int dst_step1, SCALAR_TYPE scalar)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
diff --git a/modules/ocl/src/opencl/filtering_laplacian.cl b/modules/ocl/src/opencl/filtering_laplacian.cl
index 96a2f51ef4..8535eb1a54 100644
--- a/modules/ocl/src/opencl/filtering_laplacian.cl
+++ b/modules/ocl/src/opencl/filtering_laplacian.cl
@@ -82,9 +82,9 @@
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////Macro for define elements number per thread/////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-#define ANCHOR                  3
-#define ANX                     1
-#define ANY                     1
+//#define ANCHOR                  3
+//#define ANX                     1
+//#define ANY                     1
 
 #define ROWS_PER_GROUP          4
 #define ROWS_PER_GROUP_BITS     2
@@ -185,7 +185,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
 
         for(int i = 0; i < ANCHOR; i++)
         {
-#pragma unroll 3
+#pragma unroll
             for(int j = 0; j < ANCHOR; j++)
             {
                 if(dst_rows_index < dst_rows_end)
@@ -295,7 +295,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
 
         for(int i = 0; i < ANCHOR; i++)
         {
-#pragma unroll 3
+#pragma unroll
             for(int j = 0; j < ANCHOR; j++)
             {
                 if(dst_rows_index < dst_rows_end)
@@ -410,7 +410,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
 
         for(int i = 0; i < ANCHOR; i++)
         {
-#pragma unroll 3
+#pragma unroll
             for(int j = 0; j < ANCHOR; j++)
             {
                 if(dst_rows_index < dst_rows_end)
diff --git a/modules/ocl/src/opencl/filtering_morph.cl b/modules/ocl/src/opencl/filtering_morph.cl
index 49640008f4..e659a59f51 100644
--- a/modules/ocl/src/opencl/filtering_morph.cl
+++ b/modules/ocl/src/opencl/filtering_morph.cl
@@ -120,7 +120,7 @@ __kernel void morph_C1_D0(__global const uchar * restrict src,
     int gidy = get_global_id(1);
     int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
 
-    if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3)==0)
+    if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0))
     {
         *(__global uchar4*)&dst[out_addr] = res;
     }
diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl
index e0ab8603b7..4873298af0 100644
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@@ -10,6 +10,7 @@
 //    Wang Weiyan, wangweiyanster@gmail.com
 //    Jia Haipeng, jiahaipeng95@gmail.com
 //    Nathan, liujun@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -45,27 +46,16 @@
 typedef int   sumtype;
 typedef float sqsumtype;
 
-typedef struct  __attribute__((aligned (128)))  GpuHidHaarFeature
-{
-    struct __attribute__((aligned (32)))
-{
-    int p0 __attribute__((aligned (4)));
-    int p1 __attribute__((aligned (4)));
-    int p2 __attribute__((aligned (4)));
-    int p3 __attribute__((aligned (4)));
-    float weight __attribute__((aligned (4)));
-}
-rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
-}
-GpuHidHaarFeature;
-
+#ifndef STUMP_BASED 
+#define STUMP_BASED 1
+#endif
 
 typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
 {
     int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
-    float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
-    float threshold /*__attribute__((aligned (4)))*/;
-    float alpha[2] __attribute__((aligned (8)));
+    float weight[CV_HAAR_FEATURE_MAX];
+    float threshold;
+    float alpha[3] __attribute__((aligned (16)));
     int left __attribute__((aligned (4)));
     int right __attribute__((aligned (4)));
 }
@@ -111,7 +101,6 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
     float inv_window_area __attribute__((aligned (4)));
 } GpuHidHaarClassifierCascade;
 
-
 __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
     global GpuHidHaarStageClassifier * stagecascadeptr,
     global int4 * info,
@@ -234,7 +223,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                 float stage_sum = 0.f;
                 int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
                 float stagethreshold = as_float(stageinfo.y);
-                for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
+                for(int nodeloop = 0; nodeloop < stageinfo.x; )
                 {
                     __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
 
@@ -242,7 +231,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                     int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
                     int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
                     float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
-                    float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
+                    float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
+
                     float nodethreshold  = w.w * variance_norm_factor;
 
                     info1.x +=lcl_off;
@@ -261,8 +251,34 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                     classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
                                     lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
 
-                    stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
+                    bool passThres = classsum >= nodethreshold;
+#if STUMP_BASED
+                    stage_sum += passThres ? alpha3.y : alpha3.x;
                     nodecounter++;
+                    nodeloop++;
+#else
+                    bool isRootNode = (nodecounter & 1) == 0;
+                    if(isRootNode)
+                    {
+                        if( (passThres && currentnodeptr->right) ||
+                            (!passThres && currentnodeptr->left))
+                        {
+                            nodecounter ++;
+                        }
+                        else
+                        {
+                            stage_sum += alpha3.x;
+                            nodecounter += 2;
+                            nodeloop ++;
+                        }
+                    }
+                    else
+                    {
+                        stage_sum += passThres ? alpha3.z : alpha3.y;
+                        nodecounter ++;
+                        nodeloop ++;
+                    }
+#endif
                 }
 
                 result = (stage_sum >= stagethreshold);
@@ -301,18 +317,20 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
 
                     if(lcl_compute_win_id < queuecount)
                     {
-
                         int tempnodecounter = lcl_compute_id;
                         float part_sum = 0.f;
-                        for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
+                        const int stump_factor = STUMP_BASED ? 1 : 2;
+                        int root_offset = 0;
+                        for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;)
                         {
-                            __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
+                            __global GpuHidHaarTreeNode* currentnodeptr = 
+                                nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset;
 
                             int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
                             int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
                             int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
                             float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
-                            float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
+                            float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
                             float nodethreshold  = w.w * variance_norm_factor;
 
                             info1.x +=queue_pixel;
@@ -332,8 +350,34 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                             classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
                                             lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
 
-                            part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
-                            tempnodecounter +=lcl_compute_win;
+                            bool passThres = classsum >= nodethreshold;
+#if STUMP_BASED
+                            part_sum += passThres ? alpha3.y : alpha3.x;
+                            tempnodecounter += lcl_compute_win;
+                            lcl_loop++;
+#else
+                            if(root_offset == 0)
+                            {
+                                if( (passThres && currentnodeptr->right) ||
+                                    (!passThres && currentnodeptr->left))
+                                {
+                                    root_offset = 1;
+                                }
+                                else
+                                {
+                                    part_sum += alpha3.x;
+                                    tempnodecounter += lcl_compute_win;
+                                    lcl_loop++;
+                                }
+                            }
+                            else
+                            {
+                                part_sum += passThres ? alpha3.z : alpha3.y;
+                                tempnodecounter += lcl_compute_win;
+                                lcl_loop++;
+                                root_offset = 0;
+                            }
+#endif
                         }//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
                         partialsum[lcl_id]=part_sum;
                     }
@@ -379,157 +423,3 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
 }
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-/*
-if(stagecascade->two_rects)
-{
-    #pragma unroll
-    for( n = 0; n < stagecascade->count; n++ )
-    {
-        t1 = *(node + counter);
-        t = t1.threshold * variance_norm_factor;
-        classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];
-
-        classsum  += calc_sum1(t1, p_offset,1) * t1.weight[1];
-        stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];
-
-        counter++;
-    }
-}
-else
-{
-    #pragma unroll
-    for( n = 0; n < stagecascade->count; n++ )
-    {
-        t = node[counter].threshold*variance_norm_factor;
-        classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
-        classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];
-
-        if( node[counter].p0[2] )
-            classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];
-
-        stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify
-
-        counter++;
-    }
-}
-*/
-/*
-__kernel void gpuRunHaarClassifierCascade_ScaleWindow(
-                          constant GpuHidHaarClassifierCascade * _cascade,
-                          global GpuHidHaarStageClassifier * stagecascadeptr,
-                          //global GpuHidHaarClassifier * classifierptr,
-                          global GpuHidHaarTreeNode * nodeptr,
-                          global int * sum,
-                          global float * sqsum,
-                          global int * _candidate,
-                          int pixel_step,
-                          int cols,
-                          int rows,
-                          int start_stage,
-                          int end_stage,
-                          //int counts,
-                          int nodenum,
-                          int ystep,
-                          int detect_width,
-                          //int detect_height,
-                          int loopcount,
-                          int outputstep)
-                          //float scalefactor)
-{
-unsigned int x1 = get_global_id(0);
-unsigned int y1 = get_global_id(1);
-int p_offset;
-int m, n;
-int result;
-int counter;
-float mean, variance_norm_factor;
-for(int i=0;i<loopcount;i++)
-{
-constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
-global int * candidate = _candidate + i*outputstep;
-int window_width = cascade->p1 - cascade->p0;
-int window_height = window_width;
-result = 1;
-counter = 0;
-unsigned int x = mul24(x1,ystep);
-unsigned int y = mul24(y1,ystep);
-if((x < cols - window_width - 1) && (y < rows - window_height -1))
-{
-global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
-//global GpuHidHaarClassifier      *classifier   = classifierptr;
-global GpuHidHaarTreeNode        *node         = nodeptr + nodenum*i;
-
-p_offset = mad24(y, pixel_step, x);// modify
-
-mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
-    *(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
-    *cascade->inv_window_area;
-
-variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
-                    *(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
-variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
-variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
-
-// if( cascade->is_stump_based )
-//{
-for( m = start_stage; m < end_stage; m++ )
-{
-float stage_sum = 0.f;
-float t,  classsum;
-GpuHidHaarTreeNode t1;
-
-//#pragma unroll
-for( n = 0; n < stagecascade->count; n++ )
-{
-     t1 = *(node + counter);
-     t  = t1.threshold * variance_norm_factor;
-     classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1];
-
-     if((t1.p0[2]) && (!stagecascade->two_rects))
-         classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2];
-
-     stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
-     counter++;
-}
-
-if (stage_sum < stagecascade->threshold)
-{
-    result = 0;
-    break;
-}
-
-stagecascade++;
-
-}
-if(result)
-{
-    candidate[4 * (y1 * detect_width + x1)]     = x;
-    candidate[4 * (y1 * detect_width + x1) + 1] = y;
-    candidate[4 * (y1 * detect_width + x1)+2]     = window_width;
-    candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
-}
-//}
-}
-}
-}
-*/
-
-
-
-
diff --git a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
index 44877f3860..8507972ff2 100644
--- a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
@@ -17,7 +17,7 @@
 // @Authors
 //    Wu Xinglong, wxl370@126.com
 //    Sen Liu, swjtuls1987@126.com
-//
+//    Peng Xiao, pengxiao@outlook.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -49,25 +49,13 @@
 #define CV_HAAR_FEATURE_MAX           3
 typedef int   sumtype;
 typedef float sqsumtype;
-typedef struct  __attribute__((aligned(128)))  GpuHidHaarFeature
-{
-    struct __attribute__((aligned(32)))
-{
-    int p0 __attribute__((aligned(4)));
-    int p1 __attribute__((aligned(4)));
-    int p2 __attribute__((aligned(4)));
-    int p3 __attribute__((aligned(4)));
-    float weight __attribute__((aligned(4)));
-}
-rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned(32)));
-}
-GpuHidHaarFeature;
+
 typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
 {
     int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
     float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
     float threshold /*__attribute__((aligned (4)))*/;
-    float alpha[2] __attribute__((aligned(8)));
+    float alpha[3] __attribute__((aligned(16)));
     int left __attribute__((aligned(4)));
     int right __attribute__((aligned(4)));
 }
@@ -174,45 +162,83 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                 const int p_offset = mad24(y, step, x);
                 cascadeinfo.x += p_offset;
                 cascadeinfo.z += p_offset;
-                mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
-                        sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
+                mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
+                - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
+                        sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+                + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
                        * correction_t;
-                variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
-                                       sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
+                variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
+                - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
+                                       sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+                + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
                 variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
                 variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
                 bool result = true;
                 nodecounter = startnode + nodecount * scalei;
-
                 for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
                 {
                     float stage_sum = 0.f;
                     int   stagecount = stagecascadeptr[stageloop].count;
-                    for (int nodeloop = 0; nodeloop < stagecount; nodeloop++)
+                    for (int nodeloop = 0; nodeloop < stagecount;)
                     {
                         __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
                         int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
                         int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
                         int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
                         float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
-                        float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
+                        float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
                         float nodethreshold  = w.w * variance_norm_factor;
+
                         info1.x += p_offset;
                         info1.z += p_offset;
                         info2.x += p_offset;
                         info2.z += p_offset;
-                        float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
-                                          sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
-                        classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
-                                     sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
                         info3.x += p_offset;
                         info3.z += p_offset;
-                        classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
-                                     sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
-                        stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
+                        float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
+                        - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
+                                          sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
+                        + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
+                        classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
+                        - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
+                                     sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
+                        + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
+                        classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
+                        - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
+                                     sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
+                        + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
+                        
+                        bool passThres = classsum >= nodethreshold;
+
+#if STUMP_BASED
+                        stage_sum += passThres ? alpha3.y : alpha3.x;
                         nodecounter++;
+                        nodeloop++;
+#else
+                        bool isRootNode = (nodecounter & 1) == 0;
+                        if(isRootNode)
+                        {
+                            if( (passThres && currentnodeptr->right) ||
+                                (!passThres && currentnodeptr->left))
+                            {
+                                nodecounter ++;
+                            }
+                            else
+                            {
+                                stage_sum += alpha3.x;
+                                nodecounter += 2;
+                                nodeloop ++;
+                            }
+                        }
+                        else
+                        {
+                            stage_sum += (passThres ? alpha3.z : alpha3.y);
+                            nodecounter ++;
+                            nodeloop ++;
+                        }
+#endif
                     }
-                    result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold);
+                    result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
                 }
 
                 barrier(CLK_LOCAL_MEM_FENCE);
@@ -222,7 +248,6 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                     int queueindex = atomic_inc(lclcount);
                     lcloutindex[queueindex] = (y << 16) | x;
                 }
-
                 barrier(CLK_LOCAL_MEM_FENCE);
                 int queuecount = lclcount[0];
 
@@ -277,5 +302,6 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
     newnode[counter].threshold = t1.threshold;
     newnode[counter].alpha[0] = t1.alpha[0];
     newnode[counter].alpha[1] = t1.alpha[1];
+    newnode[counter].alpha[2] = t1.alpha[2];
 }
 
diff --git a/modules/ocl/src/opencl/imgproc_calcHarris.cl b/modules/ocl/src/opencl/imgproc_calcHarris.cl
index 15742d6c5e..1911a72016 100644
--- a/modules/ocl/src/opencl/imgproc_calcHarris.cl
+++ b/modules/ocl/src/opencl/imgproc_calcHarris.cl
@@ -130,28 +130,29 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
         data[2][i] = dy_data[i] * dy_data[i];
     }
 #else
-   for(int i=0; i < ksY+1; i++)
-   {
+    int clamped_col = min(dst_cols, col);
+    for(int i=0; i < ksY+1; i++)
+    {
         int dx_selected_row;
         int dx_selected_col;
         dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
         dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
-        dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
-        dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
+        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
+        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
         dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
 
         int dy_selected_row;
         int dy_selected_col;
         dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
         dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
-        dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
-        dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
+        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
+        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
         dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
 
         data[0][i] = dx_data[i] * dx_data[i];
         data[1][i] = dx_data[i] * dy_data[i];
         data[2][i] = dy_data[i] * dy_data[i];
-   }
+    }
 #endif
     float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
     for(int i=1; i < ksY; i++)
diff --git a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
index 662fbb07b9..462ec77925 100644
--- a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
+++ b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
@@ -130,28 +130,30 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
         data[2][i] = dy_data[i] * dy_data[i];
     }
 #else
-   for(int i=0; i < ksY+1; i++)
-   {
+    int clamped_col = min(dst_cols, col);
+
+    for(int i=0; i < ksY+1; i++)
+    {
         int dx_selected_row;
         int dx_selected_col;
         dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
         dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
-        dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
-        dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
+        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
+        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
         dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
 
         int dy_selected_row;
         int dy_selected_col;
         dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
         dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
-        dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
-        dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
+        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
+        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
         dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
 
         data[0][i] = dx_data[i] * dx_data[i];
         data[1][i] = dx_data[i] * dy_data[i];
         data[2][i] = dy_data[i] * dy_data[i];
-   }
+    }
 #endif
     float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
     for(int i=1; i < ksY; i++)
diff --git a/modules/ocl/src/opencl/imgproc_canny.cl b/modules/ocl/src/opencl/imgproc_canny.cl
index ceaaed1eb6..5402759e3c 100644
--- a/modules/ocl/src/opencl/imgproc_canny.cl
+++ b/modules/ocl/src/opencl/imgproc_canny.cl
@@ -297,6 +297,9 @@ calcMap
     map_step   /= sizeof(*map);
     map_offset /= sizeof(*map);
 
+    mag += mag_offset;
+    map += map_offset;
+
     __local float smem[18][18];
 
     int gidx = get_global_id(0);
@@ -389,7 +392,7 @@ edgesHysteresisLocal
 (
     __global int * map,
     __global ushort2 * st,
-    volatile __global unsigned int * counter,
+    __global unsigned int * counter,
     int rows,
     int cols,
     int map_step,
@@ -399,6 +402,8 @@ edgesHysteresisLocal
     map_step   /= sizeof(*map);
     map_offset /= sizeof(*map);
 
+    map += map_offset;
+
     __local int smem[18][18];
 
     int gidx = get_global_id(0);
@@ -416,12 +421,12 @@ edgesHysteresisLocal
     if(ly < 14)
     {
         smem[ly][lx] =
-            map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step + map_offset];
+            map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
     }
     if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
     {
         smem[ly + 14][lx] =
-            map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step + map_offset];
+            map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -482,14 +487,17 @@ edgesHysteresisLocal
 __constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
 __constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 
+
 #define stack_size 512
 __kernel
-void edgesHysteresisGlobal
+void
+__attribute__((reqd_work_group_size(128,1,1)))
+edgesHysteresisGlobal
 (
     __global int * map,
     __global ushort2 * st1,
     __global ushort2 * st2,
-    volatile __global int * counter,
+    __global int * counter,
     int rows,
     int cols,
     int count,
@@ -501,6 +509,8 @@ void edgesHysteresisGlobal
     map_step   /= sizeof(*map);
     map_offset /= sizeof(*map);
 
+    map += map_offset;
+
     int gidx = get_global_id(0);
     int gidy = get_global_id(1);
 
@@ -510,7 +520,7 @@ void edgesHysteresisGlobal
     int grp_idx = get_group_id(0);
     int grp_idy = get_group_id(1);
 
-    volatile __local unsigned int s_counter;
+    __local unsigned int s_counter;
     __local unsigned int s_ind;
 
     __local ushort2 s_st[stack_size];
@@ -564,9 +574,9 @@ void edgesHysteresisGlobal
                     pos.x += c_dx[lidx & 7];
                     pos.y += c_dy[lidx & 7];
 
-                    if (map[pos.x + map_offset + pos.y * map_step] == 1)
+                    if (map[pos.x + pos.y * map_step] == 1)
                     {
-                        map[pos.x + map_offset + pos.y * map_step] = 2;
+                        map[pos.x + pos.y * map_step] = 2;
 
                         ind = atomic_inc(&s_counter);
 
@@ -621,6 +631,6 @@ void getEdges
 
     if(gidy < rows && gidx < cols)
     {
-        dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step] >> 1));
+        dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1));
     }
 }
diff --git a/modules/ocl/src/opencl/imgproc_clahe.cl b/modules/ocl/src/opencl/imgproc_clahe.cl
new file mode 100644
index 0000000000..0d010f7a5b
--- /dev/null
+++ b/modules/ocl/src/opencl/imgproc_clahe.cl
@@ -0,0 +1,275 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Sen Liu, swjtuls1987@126.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+
+int calc_lut(__local int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid == 0)
+    {
+        for (int i = 1; i < 256; ++i)
+        {
+            smem[i] += smem[i - 1];
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    return smem[tid];
+}
+
+#ifdef CPU
+void reduce(volatile __local int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128)
+    { 
+        smem[tid] = val += smem[tid + 128];
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+    { 
+        smem[tid] = val += smem[tid + 64];
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem[tid] += smem[tid + 32];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 16)
+    {
+        smem[tid] += smem[tid + 16];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 8)
+    {
+        smem[tid] += smem[tid + 8];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 4)
+    {
+        smem[tid] += smem[tid + 4];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 2)
+    {
+        smem[tid] += smem[tid + 2];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 1)
+    {
+        smem[256] = smem[tid] + smem[tid + 1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+#else
+void reduce(__local volatile int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128)
+    { 
+        smem[tid] = val += smem[tid + 128];
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+    { 
+        smem[tid] = val += smem[tid + 64];
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+    } barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) {
+#endif
+        smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+    } barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8) {
+#endif
+        smem[tid] += smem[tid + 8];
+        smem[tid] += smem[tid + 4];
+        smem[tid] += smem[tid + 2];
+        smem[tid] += smem[tid + 1];
+    }
+}
+#endif
+
+__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
+                      const int srcStep, const int dstStep,
+                      const int2 tileSize, const int tilesX,
+                      const int clipLimit, const float lutScale)
+{
+    __local int smem[512];
+
+    const int tx = get_group_id(0);
+    const int ty = get_group_id(1);
+    const unsigned int tid = get_local_id(1) * get_local_size(0)
+                             + get_local_id(0);
+
+    smem[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
+    {
+        __global const uchar* srcPtr = src + mad24( ty * tileSize.y + i,
+                                                    srcStep, tx * tileSize.x );
+        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
+        {
+            const int data = srcPtr[j];
+            atomic_inc(&smem[data]);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int tHistVal = smem[tid];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (clipLimit > 0)
+    {
+        // clip histogram bar
+
+        int clipped = 0;
+        if (tHistVal > clipLimit)
+        {
+            clipped = tHistVal - clipLimit;
+            tHistVal = clipLimit;
+        }
+
+        // find number of overall clipped samples
+
+        reduce(smem, clipped, tid);
+        barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef CPU
+        clipped = smem[256];
+#else
+        clipped = smem[0];
+#endif
+
+        // broadcast evaluated value
+
+        __local int totalClipped;
+
+        if (tid == 0)
+            totalClipped = clipped;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // redistribute clipped samples evenly
+
+        int redistBatch = totalClipped / 256;
+        tHistVal += redistBatch;
+
+        int residual = totalClipped - redistBatch * 256;
+        if (tid < residual)
+            ++tHistVal;
+    }
+
+    const int lutVal = calc_lut(smem, tHistVal, tid);
+    uint ires = (uint)convert_int_rte(lutScale * lutVal);
+    lut[(ty * tilesX + tx) * dstStep + tid] =
+        convert_uchar(clamp(ires, (uint)0, (uint)255));
+}
+
+__kernel void transform(__global __const uchar * src,
+                        __global uchar * dst,
+                        __global uchar * lut,
+                        const int srcStep, const int dstStep, const int lutStep,
+                        const int cols, const int rows,
+                        const int2 tileSize,
+                        const int tilesX, const int tilesY)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (x >= cols || y >= rows)
+        return;
+
+    const float tyf = (convert_float(y) / tileSize.y) - 0.5f;
+    int ty1 = convert_int_rtn(tyf);
+    int ty2 = ty1 + 1;
+    const float ya = tyf - ty1;
+    ty1 = max(ty1, 0);
+    ty2 = min(ty2, tilesY - 1);
+
+    const float txf = (convert_float(x) / tileSize.x) - 0.5f;
+    int tx1 = convert_int_rtn(txf);
+    int tx2 = tx1 + 1;
+    const float xa = txf - tx1;
+    tx1 = max(tx1, 0);
+    tx2 = min(tx2, tilesX - 1);
+
+    const int srcVal = src[mad24(y, srcStep, x)];
+
+    float res = 0;
+
+    res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (1.0f - ya));
+    res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (1.0f - ya));
+    res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (ya));
+    res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (ya));
+
+    uint ires = (uint)convert_int_rte(res);
+    dst[mad24(y, dstStep, x)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
+}
diff --git a/modules/ocl/src/opencl/imgproc_gfft.cl b/modules/ocl/src/opencl/imgproc_gfft.cl
new file mode 100644
index 0000000000..5fa27ffc1b
--- /dev/null
+++ b/modules/ocl/src/opencl/imgproc_gfft.cl
@@ -0,0 +1,276 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef WITH_MASK
+#define WITH_MASK 0
+#endif
+
+__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+inline float ELEM_INT2(image2d_t _eig, int _x, int _y) 
+{
+    return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
+}
+
+inline float ELEM_FLT2(image2d_t _eig, float2 pt) 
+{
+    return read_imagef(_eig, sampler, pt).x;
+}
+
+__kernel
+    void findCorners
+    (
+        image2d_t eig,
+        __global const char * mask,
+        __global float2 * corners,
+        const int mask_strip,// in pixels
+        const float threshold,
+        const int rows,
+        const int cols,
+        const int max_count,
+        __global int * g_counter
+    )
+{
+    const int j = get_global_id(0);
+    const int i = get_global_id(1);
+
+    if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1
+#if WITH_MASK
+        && mask[i * mask_strip + j] != 0
+#endif
+        )
+    {
+        const float val = ELEM_INT2(eig, j, i);
+
+        if (val > threshold)
+        {
+            float maxVal = val;
+
+            maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j    , i - 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
+
+            maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
+
+            maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j    , i + 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
+
+            if (val == maxVal)
+            {
+                const int ind = atomic_inc(g_counter);
+
+                if (ind < max_count)
+                    corners[ind] = (float2)(j, i);
+            }
+        }
+    }
+}
+
+//bitonic sort
+__kernel
+    void sortCorners_bitonicSort
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count,
+        const int stage,
+        const int passOfStage
+    )
+{
+    const int threadId = get_global_id(0);
+    if(threadId >= count / 2)
+    {
+        return;
+    }
+
+    const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent
+
+    const int pairDistance = 1 << (stage - passOfStage);
+    const int blockWidth   = 2 * pairDistance;
+
+    const int leftId = min( (threadId % pairDistance) 
+                   + (threadId / pairDistance) * blockWidth, count );
+
+    const int rightId = min( leftId + pairDistance, count );
+
+    const float2 leftPt  = corners[leftId];
+    const float2 rightPt = corners[rightId];
+
+    const float leftVal  = ELEM_FLT2(eig, leftPt);
+    const float rightVal = ELEM_FLT2(eig, rightPt);
+
+    const bool compareResult = leftVal > rightVal;
+
+    float2 greater = compareResult ? leftPt:rightPt;
+    float2 lesser  = compareResult ? rightPt:leftPt;
+    
+    corners[leftId]  = sortOrder ? lesser : greater;
+    corners[rightId] = sortOrder ? greater : lesser;
+}
+
+//selection sort for gfft
+//kernel is ported from Bolt library:
+//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
+//  Local sort will firstly sort elements of each workgroup using selection sort
+//  its performance is O(n)
+__kernel
+    void sortCorners_selectionSortLocal
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count,
+        __local float2 * scratch
+    )
+{
+    int          i  = get_local_id(0); // index in workgroup
+    int numOfGroups = get_num_groups(0); // index in workgroup
+    int groupID     = get_group_id(0);
+    int         wg  = get_local_size(0); // workgroup size = block size
+    int n; // number of elements to be processed for this work group
+
+    int offset   = groupID * wg;
+    int same     = 0;
+    corners      += offset;
+    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
+    float2 pt1, pt2;
+
+    pt1 = corners[min(i, n)];
+    scratch[i] = pt1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(i >= n)
+    {
+        return;
+    }
+
+    float val1 = ELEM_FLT2(eig, pt1);
+    float val2;
+
+    int pos = 0;
+    for (int j=0;j<n;++j)
+    {
+        pt2  = scratch[j];
+        val2 = ELEM_FLT2(eig, pt2);
+        if(val2 > val1) 
+            pos++;//calculate the rank of this element in this work group
+        else 
+        {
+            if(val1 > val2)
+                continue;
+            else 
+            {
+                // val1 and val2 are same
+                same++;
+            }
+        }
+    }
+    for (int j=0; j< same; j++)      
+        corners[pos + j] = pt1;
+}
+__kernel
+    void sortCorners_selectionSortFinal
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count
+    )
+{
+    const int          i  = get_local_id(0); // index in workgroup
+    const int numOfGroups = get_num_groups(0); // index in workgroup
+    const int groupID     = get_group_id(0);
+    const int         wg  = get_local_size(0); // workgroup size = block size
+    int pos = 0, same = 0;
+    const int offset = get_group_id(0) * wg;
+    const int remainder = count - wg*(numOfGroups-1);
+
+    if((offset + i ) >= count)
+        return;
+    float2 pt1, pt2;
+    pt1 = corners[groupID*wg + i];
+
+    float val1 = ELEM_FLT2(eig, pt1);
+    float val2;
+
+    for(int j=0; j<numOfGroups-1; j++ )
+    {
+        for(int k=0; k<wg; k++)
+        {
+            pt2  = corners[j*wg + k];
+            val2 = ELEM_FLT2(eig, pt2); 
+            if(val1 > val2)
+                break;
+            else
+            {
+                //Increment only if the value is not the same. 
+                if( val2 > val1 )
+                    pos++;
+                else 
+                    same++;
+            }
+        }
+    }
+
+    for(int k=0; k<remainder; k++)
+    {
+        pt2  = corners[(numOfGroups-1)*wg + k];
+        val2 = ELEM_FLT2(eig, pt2); 
+        if(val1 > val2)
+            break;
+        else
+        {
+            //Don't increment if the value is the same. 
+            //Two elements are same if (*userComp)(jData, iData)  and (*userComp)(iData, jData) are both false
+            if(val2 > val1)
+                pos++;
+            else 
+                same++;
+        }
+    }  
+    for (int j=0; j< same; j++)      
+        corners[pos + j] = pt1;
+}
+
diff --git a/modules/ocl/src/opencl/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl
index 8ad501f7c1..9162abb7ef 100644
--- a/modules/ocl/src/opencl/imgproc_threshold.cl
+++ b/modules/ocl/src/opencl/imgproc_threshold.cl
@@ -143,7 +143,7 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
         int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
         float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
         int4 con = dpos >= 0 && dpos < dst_cols;
-        ddata = convert_float4(con) != 0 ? ddata : dVal;
+        ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
         if(dstart < dst_cols)
         {
             *(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
diff --git a/modules/ocl/src/opencl/moments.cl b/modules/ocl/src/opencl/moments.cl
index 2378f4f849..71313017a9 100644
--- a/modules/ocl/src/opencl/moments.cl
+++ b/modules/ocl/src/opencl/moments.cl
@@ -173,10 +173,10 @@ __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_widt
             sum[i] = dst_sum[i][0];
 }
 
-__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
                            __global F* dst_m,
                            int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, int TILE_SIZE)
 {
     uchar tmp_coi[16]; // get the coi data
     uchar16 tmp[16];
@@ -192,35 +192,43 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
     int x = wgidx*TILE_SIZE;  // vector length of uchar
     int kcn = (cn==2)?2:4;
     int rstep = min(src_step, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols - x);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if ( y+lidy < src_rows )
+    {
+        if( tileSize_width < TILE_SIZE )
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
+
+        if( coi > 0 )	//channel of interest
+            for(int i = 0; i < tileSize_width; i += VLEN_C)
+            {
+                for(int j=0; j<VLEN_C; j++)
+                    tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
+                                          tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_C)
+                tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
+    }
 
-    if( tileSize_width < TILE_SIZE )
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
-    if( coi > 0 )	//channel of interest
-        for(int i = 0; i < tileSize_width; i += VLEN_C)
-        {
-            for(int j=0; j<VLEN_C; j++)
-                tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
-                                      tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_C)
-            tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
     uchar16 zero = (uchar16)(0);
     uchar16 full = (uchar16)(255);
     if( binary )
         for(int i=0; i < tileSize_width; i+=VLEN_C)
             tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
+
     F mom[10];
     __local int m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
+    {
         for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
+
     int lm[10] = {0};
     int16 x0 = (int16)(0);
     int16 x1 = (int16)(0);
@@ -281,6 +289,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
                 m[i][lidy-j/2] = lm[i];
         barrier(CLK_LOCAL_MEM_FENCE);
     }
+
     if(lidy == 0&&lidx == 0)
     {
         for( int mt = 0; mt < 10; mt++ )
@@ -328,10 +337,10 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
     }
 }
 
-__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
                            __global F* dst_m,
                            int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
     ushort tmp_coi[8]; // get the coi data
     ushort8 tmp[32];
@@ -346,21 +355,26 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
     int x = wgidx*TILE_SIZE;  // real X index of pixel
     int kcn = (cn==2)?2:4;
     int rstep = min(src_step/2, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols -x);
-    if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
-        for(int i=tileSize_width; i < rstep; i++ )
-            *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_US)
-        {
-            for(int j=0; j<VLEN_US; j++)
-                tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_US)
-            tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
+
+    if ( y+lidy < src_rows )
+    {
+        if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
+            for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_US)
+            {
+                for(int j=0; j<VLEN_US; j++)
+                    tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_US)
+                tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
+    }
+
     ushort8 zero = (ushort8)(0);
     ushort8 full = (ushort8)(255);
     if( binary )
@@ -368,11 +382,11 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
             tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
     F mom[10];
     __local long m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
         for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
     barrier(CLK_LOCAL_MEM_FENCE);
+
     long lm[10] = {0};
     int8 x0 = (int8)(0);
     int8 x1 = (int8)(0);
@@ -422,17 +436,22 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
         lm[0] = x0.s0;             // m00
     }
     barrier(CLK_LOCAL_MEM_FENCE);
+
     for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
     {
         if(lidy < j)
             for( int i = 0; i < 10; i++ )
                 lm[i] = lm[i] + m[i][lidy];
-        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
         if(lidy >= j/2&&lidy < j)
             for( int i = 0; i < 10; i++ )
                 m[i][lidy-j/2] = lm[i];
-        barrier(CLK_LOCAL_MEM_FENCE);
     }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
     if(lidy == 0&&lidx == 0)
     {
         for(int mt = 0; mt < 10; mt++ )
@@ -482,10 +501,10 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
     }
 }
 
-__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
                            __global F* dst_m,
                            int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
     short tmp_coi[8]; // get the coi data
     short8 tmp[32];
@@ -500,21 +519,26 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
     int x = wgidx*TILE_SIZE;  // real X index of pixel
     int kcn = (cn==2)?2:4;
     int rstep = min(src_step/2, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols -x);
-    if(tileSize_width < TILE_SIZE)
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_S)
-        {
-            for(int j=0; j<VLEN_S; j++)
-                tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_S)
-            tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_S)
+            {
+                for(int j=0; j<VLEN_S; j++)
+                    tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_S)
+                tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
+    }
+
     short8 zero = (short8)(0);
     short8 full = (short8)(255);
     if( binary )
@@ -523,10 +547,9 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
 
     F mom[10];
     __local long m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
         for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
+            m[i][lidy]=0;
     barrier(CLK_LOCAL_MEM_FENCE);
     long lm[10] = {0};
     int8 x0 = (int8)(0);
@@ -637,10 +660,10 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
     }
 }
 
-__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
                             __global F* dst_m,
                             int dst_cols, int dst_step, int blocky,
-                            int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+                            int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
     float tmp_coi[4]; // get the coi data
     float4 tmp[64] ;
@@ -654,33 +677,30 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
     int y = wgidy*TILE_SIZE;  // real Y index of pixel
     int x = wgidx*TILE_SIZE;  // real X index of pixel
     int kcn = (cn==2)?2:4;
-    src_step /= sizeof(*src_data);
-    int rstep = min(src_step, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE, src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols -x);
+    int rstep = min(src_step/4, TILE_SIZE);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
     int maxIdx = mul24(src_rows, src_cols);
     int yOff = (y+lidy)*src_step;
     int index;
-    if(tileSize_width < TILE_SIZE && yOff < src_rows)
-        for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ )
-            *(src_data+yOff+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_F)
-        {
-#pragma unroll
-            for(int j=0; j<4; j++)
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_F)
             {
-                index = yOff+(x+i+j)*kcn+coi-1;
-                if (index < maxIdx)
-                    tmp_coi[j] = *(src_data+index);
-                else
-                    tmp_coi[j] = 0;
+                for(int j=0; j<4; j++)
+                    tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
             }
-            tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
-        }
-    else
-        for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
-            tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_F)
+                tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
+    }
+
     float4 zero = (float4)(0);
     float4 full = (float4)(255);
     if( binary )
@@ -688,10 +708,9 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
             tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
     F mom[10];
     __local F m[10][128];
-    if(lidy == 0)
+    if(lidy < 128)
         for(int i = 0; i < 10; i ++)
-            for(int j = 0; j < 128; j ++)
-                m[i][j] = 0;
+            m[i][lidy] = 0;
     barrier(CLK_LOCAL_MEM_FENCE);
     F lm[10] = {0};
     F4 x0 = (F4)(0);
@@ -729,185 +748,6 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
         m[0][lidy-bheight] = x0.s0;             // m00
     }
 
-    else if(lidy < bheight)
-    {
-        lm[9] = ((F)py) * sy;  // m03
-        lm[8] = ((F)x1.s0) * sy;  // m12
-        lm[7] = ((F)x2.s0) * lidy;  // m21
-        lm[6] = x3.s0;             // m30
-        lm[5] = x0.s0 * sy;        // m02
-        lm[4] = x1.s0 * lidy;         // m11
-        lm[3] = x2.s0;             // m20
-        lm[2] = py;             // m01
-        lm[1] = x1.s0;             // m10
-        lm[0] = x0.s0;             // m00
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
-    {
-        if(lidy < j)
-            for( int i = 0; i < 10; i++ )
-                lm[i] = lm[i] + m[i][lidy];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lidy >= j/2&&lidy < j)
-            for( int i = 0; i < 10; i++ )
-                m[i][lidy-j/2] = lm[i];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if(lidy == 0&&lidx == 0)
-    {
-        for( int mt = 0; mt < 10; mt++ )
-            mom[mt] = (F)lm[mt];
-        if(binary)
-        {
-            F s = 1./255;
-            for( int mt = 0; mt < 10; mt++ )
-                mom[mt] *= s;
-        }
-
-        F xm = x * mom[0], ym = y * mom[0];
-
-        // accumulate moments computed in each tile
-        dst_step /= sizeof(F);
-
-        int dst_x_off = mad24(wgidy, dst_cols, wgidx);
-        int dst_off = 0;
-        int max_dst_index = 10 * blocky * get_global_size(1);
-
-        // + m00 ( = m00' )
-        dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[0];
-
-        // + m10 ( = m10' + x*m00' )
-        dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[1] + xm;
-
-        // + m01 ( = m01' + y*m00' )
-        dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[2] + ym;
-
-        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);
-
-        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];
-
-        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);
-
-        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
-
-        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
-
-        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
-
-        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off);
-        if (dst_off < max_dst_index)
-            *(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
-    }
-}
-
-__kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
-                           __global F* dst_m,
-                           int dst_cols, int dst_step, int blocky,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
-{
-    F tmp_coi[4]; // get the coi data
-    F4 tmp[64];
-    int VLEN_D = 4; // length of vetor
-    int gidy = get_global_id(0);
-    int gidx = get_global_id(1);
-    int wgidy = get_group_id(0);
-    int wgidx = get_group_id(1);
-    int lidy = get_local_id(0);
-    int lidx = get_local_id(1);
-    int y = wgidy*TILE_SIZE;  // real Y index of pixel
-    int x = wgidx*TILE_SIZE;  // real X index of pixel
-    int kcn = (cn==2)?2:4;
-    int rstep = min(src_step/8, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE,  src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols - x);
-
-    if(tileSize_width < TILE_SIZE)
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-        {
-            for(int j=0; j<4; j++)
-                tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-            tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
-    F4 zero = (F4)(0);
-    F4 full = (F4)(255);
-    if( binary )
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
-    F mom[10];
-    __local F m[10][128];
-    if(lidy == 0)
-        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    F lm[10] = {0};
-    F4 x0 = (F4)(0);
-    F4 x1 = (F4)(0);
-    F4 x2 = (F4)(0);
-    F4 x3 = (F4)(0);
-    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
-    {
-        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
-        F4 p = tmp[xt/VLEN_D];
-        F4 xp = v_xt * p, xxp = xp * v_xt;
-        x0 += p;
-        x1 += xp;
-        x2 += xxp;
-        x3 += xxp *v_xt;
-    }
-    x0.s0 += x0.s1 + x0.s2 + x0.s3;
-    x1.s0 += x1.s1 + x1.s2 + x1.s3;
-    x2.s0 += x2.s1 + x2.s2 + x2.s3;
-    x3.s0 += x3.s1 + x3.s2 + x3.s3;
-
-    F py = lidy * x0.s0, sy = lidy*lidy;
-    int bheight = min(tileSize_height, TILE_SIZE/2);
-    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
-    {
-        m[9][lidy-bheight] = ((F)py) * sy;  // m03
-        m[8][lidy-bheight] = ((F)x1.s0) * sy;  // m12
-        m[7][lidy-bheight] = ((F)x2.s0) * lidy;  // m21
-        m[6][lidy-bheight] = x3.s0;             // m30
-        m[5][lidy-bheight] = x0.s0 * sy;        // m02
-        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
-        m[3][lidy-bheight] = x2.s0;             // m20
-        m[2][lidy-bheight] = py;             // m01
-        m[1][lidy-bheight] = x1.s0;             // m10
-        m[0][lidy-bheight] = x0.s0;             // m00
-    }
-
     else if(lidy < bheight)
     {
         lm[9] = ((F)py) * sy;  // m03
@@ -922,6 +762,164 @@ __kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, in
         lm[0] = x0.s0;             // m00
     }
     barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (F)lm[mt];
+        if(binary)
+        {
+            F s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        F xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+        dst_step /= sizeof(F);
+
+        // + m00 ( = m00' )
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+
+__kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, int src_step,
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    F tmp_coi[4]; // get the coi data
+    F4 tmp[64];
+    int VLEN_D = 4; // length of vetor
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/8, TILE_SIZE);
+    int tileSize_height = min(TILE_SIZE,  src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_D)
+            {
+                for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
+                    tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+            }
+        else
+            for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
+                tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
+    }
+
+    F4 zero = (F4)(0);
+    F4 full = (F4)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
+    F mom[10];
+    __local F m[10][128];
+    if(lidy < 128)
+        for(int i=0; i<10; i++)
+            m[i][lidy]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    F lm[10] = {0};
+    F4 x0 = (F4)(0);
+    F4 x1 = (F4)(0);
+    F4 x2 = (F4)(0);
+    F4 x3 = (F4)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
+    {
+        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
+        F4 p = tmp[xt/VLEN_D];
+        F4 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp *v_xt;
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3;
+
+    F py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((F)py) * sy;  // m03
+        m[8][lidy-bheight] = ((F)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((F)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((F)py) * sy;  // m03
+        lm[8] = ((F)x1.s0) * sy;  // m12
+        lm[7] = ((F)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
     for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
     {
         if(lidy < j)
diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl
index 8852facae8..05d538330f 100644
--- a/modules/ocl/src/opencl/objdetect_hog.cl
+++ b/modules/ocl/src/opencl/objdetect_hog.cl
@@ -43,7 +43,6 @@
 //
 //M*/
 
-
 #define CELL_WIDTH 8
 #define CELL_HEIGHT 8
 #define CELLS_PER_BLOCK_X 2
@@ -51,6 +50,100 @@
 #define NTHREADS 256
 #define CV_PI_F 3.1415926535897932384626433832795f
 
+//----------------------------------------------------------------------------
+// Histogram computation
+// 12 threads for a cell, 12x4 threads per block
+// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
+__kernel void compute_hists_lut_kernel(
+    const int cblock_stride_x, const int cblock_stride_y,
+    const int cnbins, const int cblock_hist_size, const int img_block_width, 
+    const int blocks_in_group, const int blocks_total,
+    const int grad_quadstep, const int qangle_step,
+    __global const float* grad, __global const uchar* qangle,
+    __global const float* gauss_w_lut,
+    __global float* block_hists, __local float* smem)
+{
+    const int lx = get_local_id(0);
+    const int lp = lx / 24; /* local group id */
+    const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
+    const int gidY = gid / img_block_width;
+    const int gidX = gid - gidY * img_block_width;
+
+    const int lidX = lx - lp * 24;
+    const int lidY = get_local_id(1);
+
+    const int cell_x = lidX / 12;
+    const int cell_y = lidY;
+    const int cell_thread_x = lidX - cell_x * 12;
+
+    __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * 
+        CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
+    __local float* final_hist = hists + cnbins * 
+        (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
+
+    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
+    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
+
+    __global const float* grad_ptr = (gid < blocks_total) ? 
+        grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
+    __global const uchar* qangle_ptr = (gid < blocks_total) ?
+        qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
+
+    __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + 
+        cell_thread_x;
+    for (int bin_id = 0; bin_id < cnbins; ++bin_id)
+        hist[bin_id * 48] = 0.f;
+
+    const int dist_x = -4 + cell_thread_x - 4 * cell_x;
+    const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
+
+    const int dist_y_begin = -4 - 4 * lidY;
+    for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
+    {
+        float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
+        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+
+        grad_ptr += grad_quadstep;
+        qangle_ptr += qangle_step;
+
+        int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
+
+        int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
+        float gaussian = gauss_w_lut[idx];
+        idx = (dist_y + 8) * 16 + (dist_x + 8);
+        float interp_weight = gauss_w_lut[256+idx];
+
+        hist[bin.x * 48] += gaussian * interp_weight * vote.x;
+        hist[bin.y * 48] += gaussian * interp_weight * vote.y;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    volatile __local float* hist_ = hist;
+    for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
+    {
+        if (cell_thread_x < 6)
+            hist_[0] += hist_[6];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (cell_thread_x < 3)
+            hist_[0] += hist_[3];
+#ifdef CPU
+        barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+        if (cell_thread_x == 0)
+            final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = 
+                hist_[0] + hist_[1] + hist_[2];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
+    if ((tid < cblock_hist_size) && (gid < blocks_total))
+    {
+        __global float* block_hist = block_hists + 
+            (gidY * img_block_width + gidX) * cblock_hist_size;
+        block_hist[tid] = final_hist[tid];
+    }
+}
+
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
@@ -125,16 +218,14 @@ __kernel void compute_hists_kernel(
         barrier(CLK_LOCAL_MEM_FENCE);
         if (cell_thread_x < 3)
             hist_[0] += hist_[3];
-#ifdef WAVE_SIZE_1
+#ifdef CPU
         barrier(CLK_LOCAL_MEM_FENCE);
 #endif
         if (cell_thread_x == 0)
             final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = 
                 hist_[0] + hist_[1] + hist_[2];
     }
-#ifdef WAVE_SIZE_1
     barrier(CLK_LOCAL_MEM_FENCE);
-#endif
 
     int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
     if ((tid < cblock_hist_size) && (gid < blocks_total))
@@ -145,6 +236,57 @@ __kernel void compute_hists_kernel(
     }
 }
 
+//-------------------------------------------------------------
+//  Normalization of histograms via L2Hys_norm
+//  optimized for the case of 9 bins
+__kernel void normalize_hists_36_kernel(__global float* block_hists, 
+                                        const float threshold, __local float *squares)
+{
+    const int tid = get_local_id(0);
+    const int gid = get_global_id(0);
+    const int bid = tid / 36;      /* block-hist id, (0 - 6) */
+    const int boffset = bid * 36;  /* block-hist offset in the work-group */
+    const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
+
+    float elem = block_hists[gid];
+    squares[tid] = elem * elem;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local float* smem = squares + boffset;
+    float sum = smem[hid];
+    if (hid < 18)
+        smem[hid] = sum = sum + smem[hid + 18];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 9)
+        smem[hid] = sum = sum + smem[hid + 9];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 4)
+        smem[hid] = sum + smem[hid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
+
+    elem = elem / (sqrt(sum) + 3.6f);
+    elem = min(elem, threshold);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    squares[tid] = elem * elem;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    sum = smem[hid];
+    if (hid < 18)
+      smem[hid] = sum = sum + smem[hid + 18];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 9)
+        smem[hid] = sum = sum + smem[hid + 9];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 4)
+        smem[hid] = sum + smem[hid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
+
+    block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
+}
+
 //-------------------------------------------------------------
 //  Normalization of histograms via L2Hys_norm
 //
@@ -153,76 +295,50 @@ float reduce_smem(volatile __local float* smem, int size)
     unsigned int tid = get_local_id(0);
     float sum = smem[tid];
 
-    if (size >= 512)
-    {
-        if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (size >= 256)
-    {
-        if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (size >= 128)
-    {
-        if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
+    if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+#ifdef CPU
+    if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }	
+    if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }		
+    if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+#else
     if (tid < 32)
     {
         if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
-#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16)
-    {
-#endif
         if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
-    {
-#endif
         if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 4)
-    {
-#endif
         if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 2)
-    {
-#endif
         if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 1)
-    {
-#endif
         if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
     }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    sum = smem[0];
+#endif
 
     return sum;
 }
 
-__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
-                                     __global float* block_hists, const float threshold, __local float *squares)
+__kernel void normalize_hists_kernel(
+    const int nthreads, const int block_hist_size, const int img_block_width,
+    __global float* block_hists, const float threshold, __local float *squares)
 {
     const int tid = get_local_id(0);
     const int gidX = get_group_id(0);
     const int gidY = get_group_id(1);
 
-    __global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
+    __global float* hist = block_hists + (gidY * img_block_width + gidX) * 
+        block_hist_size + tid;
 
     float elem = 0.f;
     if (tid < block_hist_size)
@@ -249,25 +365,98 @@ __kernel void normalize_hists_kernel(const int nthreads, const int block_hist_si
 
 //---------------------------------------------------------------------
 //  Linear SVM based classification
-//
-__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
-                                    const int img_win_width, const int img_block_width,
-                                    const int win_block_stride_x, const int win_block_stride_y,
-                                    __global const float * block_hists, __global const float* coefs,
-                                    float free_coef, float threshold, __global uchar* labels)
+//  48x96 window, 9 bins and default parameters
+//  180 threads, each thread corresponds to a bin in a row
+__kernel void classify_hists_180_kernel(
+    const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
 {
     const int tid = get_local_id(0);
     const int gidX = get_group_id(0);
     const int gidY = get_group_id(1);
 
-    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 
     float product = 0.f;
-    for (int i = tid; i < cdescr_size; i += NTHREADS)
+
+    for (int i = 0; i < cdescr_height; i++)
     {
-        int offset_y = i / cdescr_width;
-        int offset_x = i - offset_y * cdescr_width;
-        product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+        product += coefs[i * cdescr_width + tid] * 
+            hist[i * img_block_width * cblock_hist_size + tid];
+    }
+
+    __local float products[180];
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 90) products[tid] = product = product + products[tid + 90];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 45) products[tid] = product = product + products[tid + 45];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    volatile __local float* smem = products;
+#ifdef CPU
+    if (tid < 13) smem[tid] = product = product + smem[tid + 32];
+	barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) smem[tid] = product = product + smem[tid + 16];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<8) smem[tid] = product = product + smem[tid + 8];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<4) smem[tid] = product = product + smem[tid + 4];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<2) smem[tid] = product = product + smem[tid + 2];
+	barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 13)
+    {
+        smem[tid] = product = product + smem[tid + 32];
+    }
+    if (tid < 16)
+    {
+        smem[tid] = product = product + smem[tid + 16];
+        smem[tid] = product = product + smem[tid + 8];
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2];
+    }
+#endif
+
+    if (tid == 0){
+		product = product + smem[tid + 1];
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+	}
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//  64x128 window, 9 bins and default parameters
+//  256 threads, 252 of them are used
+__kernel void classify_hists_252_kernel(
+    const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+    if (tid < cdescr_width)
+    {
+        for (int i = 0; i < cdescr_height; i++)
+            product += coefs[i * cdescr_width + tid] * 
+                hist[i * img_block_width * cblock_hist_size + tid];
     }
 
     __local float products[NTHREADS];
@@ -282,67 +471,120 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
     if (tid < 64) products[tid] = product = product + products[tid + 64];
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    volatile __local float* smem = products;
+	volatile __local float* smem = products;
+#ifdef CPU
+	if(tid<32) smem[tid] = product = product + smem[tid + 32];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<16) smem[tid] = product = product + smem[tid + 16];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<8) smem[tid] = product = product + smem[tid + 8];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<4) smem[tid] = product = product + smem[tid + 4];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<2) smem[tid] = product = product + smem[tid + 2];
+	barrier(CLK_LOCAL_MEM_FENCE);
+#else
     if (tid < 32)
-    {
+    {      
         smem[tid] = product = product + smem[tid + 32];
-#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16)
-    {
-#endif
         smem[tid] = product = product + smem[tid + 16];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
-    {
-#endif
         smem[tid] = product = product + smem[tid + 8];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 4)
-    {
-#endif
         smem[tid] = product = product + smem[tid + 4];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 2)
-    {
-#endif
         smem[tid] = product = product + smem[tid + 2];
-#ifdef WAVE_SIZE_1
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 1)
-    {
 #endif
-        smem[tid] = product = product + smem[tid + 1];
+    if (tid == 0){
+		product = product + smem[tid + 1];
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+	}
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//  256 threads
+__kernel void classify_hists_kernel(
+    const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    {
+        int offset_y = i / cdescr_width;
+        int offset_x = i - offset_y * cdescr_width;
+        product += coefs[i] * 
+            hist[offset_y * img_block_width * cblock_hist_size + offset_x];
     }
 
-    if (tid == 0)
+    __local float products[NTHREADS];
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128) products[tid] = product = product + products[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64) products[tid] = product = product + products[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+	volatile __local float* smem = products;
+#ifdef CPU
+	if(tid<32) smem[tid] = product = product + smem[tid + 32];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<16) smem[tid] = product = product + smem[tid + 16];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<8) smem[tid] = product = product + smem[tid + 8];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<4) smem[tid] = product = product + smem[tid + 4];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<2) smem[tid] = product = product + smem[tid + 2];
+	barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 32)
+    {       
+        smem[tid] = product = product + smem[tid + 32];
+        smem[tid] = product = product + smem[tid + 16];
+        smem[tid] = product = product + smem[tid + 8];
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2];
+    }
+#endif
+    if (tid == 0){
+		smem[tid] = product = product + smem[tid + 1];
         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+	}
 }
 
 //----------------------------------------------------------------------------
 // Extract descriptors
 
-__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
-        const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
-        __global const float* block_hists, __global float* descriptors)
+__kernel void extract_descrs_by_rows_kernel(
+    const int cblock_hist_size, const int descriptors_quadstep, 
+    const int cdescr_size, const int cdescr_width, const int img_block_width, 
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float* block_hists, __global float* descriptors)
 {
     int tid = get_local_id(0);
     int gidX = get_group_id(0);
     int gidY = get_group_id(1);
 
     // Get left top corner of the window in src
-    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 
     // Get left top corner of the window in dst
-    __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+    __global float* descriptor = descriptors + 
+        (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 
     // Copy elements from src to dst
     for (int i = tid; i < cdescr_size; i += NTHREADS)
@@ -353,19 +595,23 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
     }
 }
 
-__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
-        const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
-        const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
+__kernel void extract_descrs_by_cols_kernel(
+    const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
+    const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, 
+    const int win_block_stride_x, const int win_block_stride_y, 
+    __global const float* block_hists, __global float* descriptors)
 {
     int tid = get_local_id(0);
     int gidX = get_group_id(0);
     int gidY = get_group_id(1);
 
     // Get left top corner of the window in src
-    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+    __global const float* hist = block_hists +  (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 
     // Get left top corner of the window in dst
-    __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+    __global float* descriptor = descriptors + 
+        (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 
     // Copy elements from src to dst
     for (int i = tid; i < cdescr_size; i += NTHREADS)
@@ -376,16 +622,19 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
         int y = block_idx / cnblocks_win_x;
         int x = block_idx - y * cnblocks_win_x;
 
-        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
+        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = 
+            hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
     }
 }
 
 //----------------------------------------------------------------------------
 // Gradients computation
 
-__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
-        const __global uchar4 * img, __global float * grad, __global uchar * qangle,
-        const float angle_scale, const char correct_gamma, const int cnbins)
+__kernel void compute_gradients_8UC4_kernel(
+    const int height, const int width, 
+    const int img_step, const int grad_quadstep, const int qangle_step,
+    const __global uchar4 * img, __global float * grad, __global uchar * qangle,
+    const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
     const int tid = get_local_id(0);
@@ -426,8 +675,10 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
     barrier(CLK_LOCAL_MEM_FENCE);
     if (x < width)
     {
-        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
-        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
+        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], 
+            sh_row[tid + 2 * (NTHREADS + 2)]);
+        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], 
+            sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
 
         float3 dx;
         if (correct_gamma == 1)
@@ -482,9 +733,11 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
     }
 }
 
-__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
-        __global const uchar * img, __global float * grad, __global uchar * qangle,
-        const float angle_scale, const char correct_gamma, const int cnbins)
+__kernel void compute_gradients_8UC1_kernel(
+    const int height, const int width, 
+    const int img_step, const int grad_quadstep, const int qangle_step,
+    __global const uchar * img, __global float * grad, __global uchar * qangle,
+    const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
     const int tid = get_local_id(0);
@@ -539,43 +792,4 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
         grad[ (gidY * grad_quadstep + x) << 1 ]       = mag * (1.f - ang);
         grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
     }
-}
-
-//----------------------------------------------------------------------------
-// Resize
-
-__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
-                                 int dst_offset, int src_offset, int dst_step, int src_step,
-                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    int sx = (int)floor(dx*ifx+0.5f);
-    int sy = (int)floor(dy*ify+0.5f);
-    sx = min(sx, src_cols-1);
-    sy = min(sy, src_rows-1);
-    int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
-    int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
-
-    if(dx<dst_cols && dy<dst_rows)
-        dst[dpos] = src[spos];
-}
-
-__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
-                                 int dst_offset, int src_offset, int dst_step, int src_step,
-                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    int sx = (int)floor(dx*ifx+0.5f);
-    int sy = (int)floor(dy*ify+0.5f);
-    sx = min(sx, src_cols-1);
-    sy = min(sy, src_rows-1);
-    int dpos = dst_offset + dy * dst_step + dx;
-    int spos = src_offset + sy * src_step + sx;
-
-    if(dx<dst_cols && dy<dst_rows)
-        dst[dpos] = src[spos];
 }
\ No newline at end of file
diff --git a/modules/ocl/src/opencl/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl
index 0b7f0c9025..4afa7b7104 100644
--- a/modules/ocl/src/opencl/pyr_up.cl
+++ b/modules/ocl/src/opencl/pyr_up.cl
@@ -18,6 +18,7 @@
 //    Zhang Chunpeng	chunpeng@multicorewareinc.com
 //    Dachuan Zhao, dachuan@multicorewareinc.com
 //    Yao Wang, yao@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -47,7 +48,7 @@
 
 //#pragma OPENCL EXTENSION cl_amd_printf : enable
 
-uchar get_valid_uchar(uchar data)
+uchar get_valid_uchar(float data)
 {
     return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0);
 }
@@ -142,7 +143,7 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
     sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];
 
     if ((x < dstCols) && (y < dstRows))
-        dst[x + y * dstStep] = (float)(4.0f * sum);
+        dst[x + y * dstStep] = convert_uchar_sat_rte(4.0f * sum);
 
 }
 
@@ -244,7 +245,7 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
     sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
 
     if ((x < dstCols) && (y < dstRows))
-        dst[x + y * dstStep] = (float)(4.0f * sum);
+        dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum);
 
 }
 
@@ -351,31 +352,6 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////  CV_8UC4  //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
-float4 covert_uchar4_to_float4(uchar4 data)
-{
-    float4 f4Data = {0,0,0,0};
-
-    f4Data.x = (float)data.x;
-    f4Data.y = (float)data.y;
-    f4Data.z = (float)data.z;
-    f4Data.w = (float)data.w;
-
-    return f4Data;
-}
-
-
-uchar4 convert_float4_to_uchar4(float4 data)
-{
-    uchar4 u4Data;
-
-    u4Data.x = get_valid_uchar(data.x);
-    u4Data.y = get_valid_uchar(data.y);
-    u4Data.z = get_valid_uchar(data.z);
-    u4Data.w = get_valid_uchar(data.w);
-
-    return u4Data;
-}
-
 __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
                           int srcRows,int dstRows,int srcCols,int dstCols,
                           int srcOffset,int dstOffset,int srcStep,int dstStep)
@@ -406,15 +382,15 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
         srcy = abs(srcy);
         srcy = min(srcRows -1 ,srcy);
 
-        s_srcPatch[tidy][tidx] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
+        s_srcPatch[tidy][tidx] = convert_float4(src[srcx + srcy * srcStep]);
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
     float4 sum = (float4)(0,0,0,0);
 
-    const int evenFlag = (int)((tidx & 1) == 0);
-    const int oddFlag = (int)((tidx & 1) != 0);
+    const float4 evenFlag = (float4)((tidx & 1) == 0);
+    const float4 oddFlag = (float4)((tidx & 1) != 0);
     const bool  eveny = ((tidy & 1) == 0);
 
     float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
@@ -476,38 +452,13 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
 
     if ((x < dstCols) && (y < dstRows))
     {
-        dst[x + y * dstStep] = convert_float4_to_uchar4(4.0f * sum);
+        dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum);
     }
 }
+
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////  CV_16UC4 //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
-float4 covert_ushort4_to_float4(ushort4 data)
-{
-    float4 f4Data = {0,0,0,0};
-
-    f4Data.x = (float)data.x;
-    f4Data.y = (float)data.y;
-    f4Data.z = (float)data.z;
-    f4Data.w = (float)data.w;
-
-    return f4Data;
-}
-
-
-ushort4 convert_float4_to_ushort4(float4 data)
-{
-    ushort4 u4Data;
-
-    u4Data.x = (float)data.x;
-    u4Data.y = (float)data.y;
-    u4Data.z = (float)data.z;
-    u4Data.w = (float)data.w;
-
-    return u4Data;
-}
-
-
 __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
                           int srcRows,int dstRows,int srcCols,int dstCols,
                           int srcOffset,int dstOffset,int srcStep,int dstStep)
@@ -535,15 +486,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
         srcy = abs(srcy);
         srcy = min(srcRows -1 ,srcy);
 
-        s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_ushort4_to_float4(src[srcx + srcy * srcStep]);
+        s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]);
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
     float4 sum = (float4)(0,0,0,0);
 
-    const int evenFlag = (int)((get_local_id(0) & 1) == 0);
-    const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+    const float4 evenFlag = (float4)((get_local_id(0) & 1) == 0);
+    const float4 oddFlag = (float4)((get_local_id(0) & 1) != 0);
     const bool  eveny = ((get_local_id(1) & 1) == 0);
     const int tidx = get_local_id(0);
 
@@ -570,11 +521,11 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
 
         if (eveny)
         {
-            sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * co2  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+            sum = sum + (oddFlag * co2  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
             sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * co2  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+            sum = sum + (oddFlag * co2  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
         }
 
         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
@@ -610,7 +561,7 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
 
     if ((x < dstCols) && (y < dstRows))
     {
-        dst[x + y * dstStep] = convert_float4_to_ushort4(4.0f * sum);
+        dst[x + y * dstStep] = convert_ushort4_sat_rte(4.0f * sum);
     }
 }
 
@@ -654,8 +605,8 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
 
     float4 sum = (float4)(0,0,0,0);
 
-    const int evenFlag = (int)((tidx & 1) == 0);
-    const int oddFlag = (int)((tidx & 1) != 0);
+    const float4 evenFlag = (float4)((tidx & 1) == 0);
+    const float4 oddFlag = (float4)((tidx & 1) != 0);
     const bool  eveny = ((tidy & 1) == 0);
 
     float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
@@ -681,11 +632,11 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
 
         if (eveny)
         {
-            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * co2  ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
+            sum = sum + (oddFlag * co2  ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
             sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * co2  ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
+            sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
         }
 
         s_dstPatch[tidy][tidx] = sum;
@@ -719,4 +670,4 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
     {
         dst[x + y * dstStep] = 4.0f * sum;
     }
-}
\ No newline at end of file
+}
diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl
index 1043b8410b..40a1993952 100644
--- a/modules/ocl/src/opencl/pyrlk.cl
+++ b/modules/ocl/src/opencl/pyrlk.cl
@@ -46,145 +46,10 @@
 
 //#pragma OPENCL EXTENSION cl_amd_printf : enable
 
-__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (y < rows && x < cols * cn)
-    {
-        const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
-        const uchar src_val1 = (src + y * srcStep)[x];
-        const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];
-
-        ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
-        ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
-    }
-}
-
-__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (y < rows && x < cols * cn)
-    {
-        const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
-        const uchar src_val1 = (src + y * srcStep)[x];
-        const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];
-
-        ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
-        ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
-    }
-}
-
-__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const int colsn = cols * cn;
-
-    if (y < rows && x < colsn)
-    {
-        __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
-        __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
-
-        const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
-        const int xl = x - cn >= 0 ? x - cn : cn + x;
-
-        ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
-        ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
-    }
-}
-
-__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const int colsn = cols * cn;
-
-    if (y < rows && x < colsn)
-    {
-        __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
-        __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
-
-        const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
-        const int xl = x - cn >= 0 ? x - cn : cn + x;
-
-        ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
-        ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
-    }
-}
-
-#define W_BITS 14
-#define W_BITS1 14
-
-#define  CV_DESCALE(x, n)     (((x) + (1 << ((n)-1))) >> (n))
-
-int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y)
-{
-    int2 ipt;
-    ipt.x = convert_int_sat_rtn(pt.x);
-    ipt.y = convert_int_sat_rtn(pt.y);
-
-    float a = pt.x - ipt.x;
-    float b = pt.y - ipt.y;
-
-    int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
-    int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
-    int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
-    int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
-    __global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
-    __global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
-
-    return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5);
-}
-
-int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y)
-{
-    int2 ipt;
-    ipt.x = convert_int_sat_rtn(pt.x);
-    ipt.y = convert_int_sat_rtn(pt.y);
-
-    float a = pt.x - ipt.x;
-    float b = pt.y - ipt.y;
-
-    int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
-    int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
-    int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
-    int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
-    __global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
-    __global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
-
-    return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1);
-}
-
-float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y)
-{
-    int2 ipt;
-    ipt.x = convert_int_sat_rtn(pt.x);
-    ipt.y = convert_int_sat_rtn(pt.y);
-
-    float a = pt.x - ipt.x;
-    float b = pt.y - ipt.y;
-
-    float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS));
-    float iw01 = (a * (1.0f - b) * (1 << W_BITS));
-    float iw10 = ((1.0f - a) * b * (1 << W_BITS));
-    float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
-    __global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn;
-    __global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn;
-
-    return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5;
-}
-
 #define	BUFFER	64
-
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
 #ifdef CPU
 void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)
 {
@@ -193,71 +58,51 @@ void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local
     smem3[tid] = val3;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-        smem2[tid] = val2 += smem2[tid + 128];
-        smem3[tid] = val3 += smem3[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-        smem2[tid] = val2 += smem2[tid + 64];
-        smem3[tid] = val3 += smem3[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        smem1[tid] = val1 += smem1[tid + 32];
-        smem2[tid] = val2 += smem2[tid + 32];
-        smem3[tid] = val3 += smem3[tid + 32];
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+        smem3[tid] += smem3[tid + 32];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 16)
     {
-        smem1[tid] = val1 += smem1[tid + 16];
-        smem2[tid] = val2 += smem2[tid + 16];
-        smem3[tid] = val3 += smem3[tid + 16];
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+        smem3[tid] += smem3[tid + 16];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 8)
     {
-        smem1[tid] = val1 += smem1[tid + 8];
-        smem2[tid] = val2 += smem2[tid + 8];
-        smem3[tid] = val3 += smem3[tid + 8];
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+        smem3[tid] += smem3[tid + 8];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 4)
     {
-        smem1[tid] = val1 += smem1[tid + 4];
-        smem2[tid] = val2 += smem2[tid + 4];
-        smem3[tid] = val3 += smem3[tid + 4];
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+        smem3[tid] += smem3[tid + 4];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 2)
     {
-        smem1[tid] = val1 += smem1[tid + 2];
-        smem2[tid] = val2 += smem2[tid + 2];
-        smem3[tid] = val3 += smem3[tid + 2];
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+        smem3[tid] += smem3[tid + 2];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 1)
     {
-        smem1[BUFFER] = val1 += smem1[tid + 1];
-        smem2[BUFFER] = val2 += smem2[tid + 1];
-        smem3[BUFFER] = val3 += smem3[tid + 1];
+        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
+        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
+        smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -268,63 +113,45 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
     smem2[tid] = val2;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = (val1 += smem1[tid + 128]);
-        smem2[tid] = (val2 += smem2[tid + 128]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = (val1 += smem1[tid + 64]);
-        smem2[tid] = (val2 += smem2[tid + 64]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        smem1[tid] = (val1 += smem1[tid + 32]);
-        smem2[tid] = (val2 += smem2[tid + 32]);
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 16)
     {
-        smem1[tid] = (val1 += smem1[tid + 16]);
-        smem2[tid] = (val2 += smem2[tid + 16]);
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 8)
     {
-        smem1[tid] = (val1 += smem1[tid + 8]);
-        smem2[tid] = (val2 += smem2[tid + 8]);
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 4)
     {
-        smem1[tid] = (val1 += smem1[tid + 4]);
-        smem2[tid] = (val2 += smem2[tid + 4]);
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 2)
     {
-        smem1[tid] = (val1 += smem1[tid + 2]);
-        smem2[tid] = (val2 += smem2[tid + 2]);
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 1)
     {
-        smem1[BUFFER] = (val1 += smem1[tid + 1]);
-        smem2[BUFFER] = (val2 += smem2[tid + 1]);
+        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
+        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -334,205 +161,146 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
     smem1[tid] = val1;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = (val1 += smem1[tid + 128]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = (val1 += smem1[tid + 64]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        smem1[tid] = (val1 += smem1[tid + 32]);
+        smem1[tid] += smem1[tid + 32];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 16)
     {
-        smem1[tid] = (val1 += smem1[tid + 16]);
+        smem1[tid] += smem1[tid + 16];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 8)
     {
-        smem1[tid] = (val1 += smem1[tid + 8]);
+        smem1[tid] += smem1[tid + 8];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 4)
     {
-        smem1[tid] = (val1 += smem1[tid + 4]);
+        smem1[tid] += smem1[tid + 4];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 2)
     {
-        smem1[tid] = (val1 += smem1[tid + 2]);
+        smem1[tid] += smem1[tid + 2];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 1)
     {
-        smem1[BUFFER] = (val1 += smem1[tid + 1]);
+        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
 #else
-void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
+void reduce3(float val1, float val2, float val3, 
+__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
 {
     smem1[tid] = val1;
     smem2[tid] = val2;
     smem3[tid] = val3;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-        smem2[tid] = val2 += smem2[tid + 128];
-        smem3[tid] = val3 += smem3[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-        smem2[tid] = val2 += smem2[tid + 64];
-        smem3[tid] = val3 += smem3[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        volatile __local float* vmem1 = smem1;
-        volatile __local float* vmem2 = smem2;
-        volatile __local float* vmem3 = smem3;
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+        smem3[tid] += smem3[tid + 32];
+#if WAVE_SIZE < 32
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 16) {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+        smem3[tid] += smem3[tid + 16];
+#if WAVE_SIZE <16
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 8) {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+        smem3[tid] += smem3[tid + 8];
 
-        vmem1[tid] = val1 += vmem1[tid + 32];
-        vmem2[tid] = val2 += vmem2[tid + 32];
-        vmem3[tid] = val3 += vmem3[tid + 32];
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+        smem3[tid] += smem3[tid + 4];
 
-        vmem1[tid] = val1 += vmem1[tid + 16];
-        vmem2[tid] = val2 += vmem2[tid + 16];
-        vmem3[tid] = val3 += vmem3[tid + 16];
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+        smem3[tid] += smem3[tid + 2];
 
-        vmem1[tid] = val1 += vmem1[tid + 8];
-        vmem2[tid] = val2 += vmem2[tid + 8];
-        vmem3[tid] = val3 += vmem3[tid + 8];
-
-        vmem1[tid] = val1 += vmem1[tid + 4];
-        vmem2[tid] = val2 += vmem2[tid + 4];
-        vmem3[tid] = val3 += vmem3[tid + 4];
-
-        vmem1[tid] = val1 += vmem1[tid + 2];
-        vmem2[tid] = val2 += vmem2[tid + 2];
-        vmem3[tid] = val3 += vmem3[tid + 2];
-
-        vmem1[tid] = val1 += vmem1[tid + 1];
-        vmem2[tid] = val2 += vmem2[tid + 1];
-        vmem3[tid] = val3 += vmem3[tid + 1];
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
+        smem3[tid] += smem3[tid + 1];
     }
 }
 
-void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid)
+void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
 {
     smem1[tid] = val1;
     smem2[tid] = val2;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-        smem2[tid] = val2 += smem2[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-        smem2[tid] = val2 += smem2[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        volatile __local float* vmem1 = smem1;
-        volatile __local float* vmem2 = smem2;
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+#if WAVE_SIZE < 32
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 16) {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+#if WAVE_SIZE <16
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 8) {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
 
-        vmem1[tid] = val1 += vmem1[tid + 32];
-        vmem2[tid] = val2 += vmem2[tid + 32];
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
 
-        vmem1[tid] = val1 += vmem1[tid + 16];
-        vmem2[tid] = val2 += vmem2[tid + 16];
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
 
-        vmem1[tid] = val1 += vmem1[tid + 8];
-        vmem2[tid] = val2 += vmem2[tid + 8];
-
-        vmem1[tid] = val1 += vmem1[tid + 4];
-        vmem2[tid] = val2 += vmem2[tid + 4];
-
-        vmem1[tid] = val1 += vmem1[tid + 2];
-        vmem2[tid] = val2 += vmem2[tid + 2];
-
-        vmem1[tid] = val1 += vmem1[tid + 1];
-        vmem2[tid] = val2 += vmem2[tid + 1];
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
     }
 }
 
-void reduce1(float val1, __local float* smem1, int tid)
+void reduce1(float val1, __local volatile float* smem1, int tid)
 {
     smem1[tid] = val1;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
     if (tid < 32)
     {
-        volatile __local float* vmem1 = smem1;
-
-        vmem1[tid] = val1 += vmem1[tid + 32];
-        vmem1[tid] = val1 += vmem1[tid + 16];
-        vmem1[tid] = val1 += vmem1[tid + 8];
-        vmem1[tid] = val1 += vmem1[tid + 4];
-        vmem1[tid] = val1 += vmem1[tid + 2];
-        vmem1[tid] = val1 += vmem1[tid + 1];
+        smem1[tid] += smem1[tid + 32];
+#if WAVE_SIZE < 32
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 16) {
+#endif
+        smem1[tid] += smem1[tid + 16];
+#if WAVE_SIZE <16
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 8) {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem1[tid] += smem1[tid + 4];
+        smem1[tid] += smem1[tid + 2];
+        smem1[tid] += smem1[tid + 1];
     }
 }
 #endif
 
 #define SCALE (1.0f / (1 << 20))
 #define	THRESHOLD	0.01f
-#define	DIMENSION	21
 
 // Image read mode
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl
index bd86a7f3fb..552874d427 100644
--- a/modules/ocl/src/opencl/stereobm.cl
+++ b/modules/ocl/src/opencl/stereobm.cl
@@ -162,8 +162,8 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
     int y_tex;
     int x_tex = X - radius;
 
-    if (x_tex >= cwidth)
-        return;
+    //if (x_tex >= cwidth)
+    //    return;
 
     for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
     {
diff --git a/modules/ocl/src/opencl/tvl1flow.cl b/modules/ocl/src/opencl/tvl1flow.cl
new file mode 100644
index 0000000000..e0ff7307b1
--- /dev/null
+++ b/modules/ocl/src/opencl/tvl1flow.cl
@@ -0,0 +1,407 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void centeredGradientKernel(__global const float* src, int src_col, int src_row, int src_step, 
+__global float* dx, __global float* dy, int dx_step)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if((x < src_col)&&(y < src_row))
+    {
+		int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);
+     	int src_x2 = (x - 1) > 0 ? (x -1) : 0;
+        
+        //if(src[y * src_step + src_x1] == src[y * src_step+ src_x2])
+        //{
+        //    printf("y = %d\n", y);
+        //    printf("src_x1 = %d\n", src_x1);
+        //    printf("src_x2 = %d\n", src_x2);
+        //}
+        dx[y * dx_step+ x] = 0.5f * (src[y * src_step + src_x1] - src[y * src_step+ src_x2]);
+        
+		int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);
+        int src_y2 = (y - 1) > 0 ? (y - 1) : 0;
+        dy[y * dx_step+ x] = 0.5f * (src[src_y1 * src_step + x] - src[src_y2 * src_step+ x]);
+    }
+
+}
+
+float bicubicCoeff(float x_)
+{
+
+    float x = fabs(x_);
+    if (x <= 1.0f)
+    {
+        return x * x * (1.5f * x - 2.5f) + 1.0f;
+    }
+    else if (x < 2.0f)
+    {
+        return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
+    }
+    else
+    {
+        return 0.0f;
+    }
+
+}
+
+__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,
+	image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,  
+    __global const float* u1, int u1_step, 
+    __global const float* u2,
+    __global float* I1w,
+	__global float* I1wx, /*int I1wx_step,*/
+	__global float* I1wy, /*int I1wy_step,*/
+	__global float* grad, /*int grad_step,*/
+	__global float* rho,
+	int I1w_step,
+	int u2_step,
+	int u1_offset_x,
+	int u1_offset_y,
+	int u2_offset_x,
+	int u2_offset_y)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if(x < I0_col&&y < I0_row)
+    {
+        //const float u1Val = u1(y, x);
+        const float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+        //const float u2Val = u2(y, x);
+        const float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+
+        const float wx = x + u1Val;
+        const float wy = y + u2Val;
+
+        const int xmin = ceil(wx - 2.0f);
+        const int xmax = floor(wx + 2.0f);
+
+        const int ymin = ceil(wy - 2.0f);
+        const int ymax = floor(wy + 2.0f);
+
+        float sum  = 0.0f;
+        float sumx = 0.0f;
+        float sumy = 0.0f;
+        float wsum = 0.0f;
+        sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
+
+                //sum  += w * tex2D(tex_I1 , cx, cy);
+				int2 cood = (int2)(cx, cy);
+                sum += w * read_imagef(tex_I1, sampleri, cood).x;
+                //sumx += w * tex2D(tex_I1x, cx, cy);
+                sumx += w * read_imagef(tex_I1x, sampleri, cood).x;
+                //sumy += w * tex2D(tex_I1y, cx, cy);
+                sumy += w * read_imagef(tex_I1y, sampleri, cood).x;
+
+                wsum += w;
+            }
+        }
+
+        const float coeff = 1.0f / wsum;
+
+        const float I1wVal  = sum  * coeff;
+        const float I1wxVal = sumx * coeff;
+        const float I1wyVal = sumy * coeff;
+
+        I1w[y * I1w_step + x]  = I1wVal;
+        I1wx[y * I1w_step + x] = I1wxVal;
+        I1wy[y * I1w_step + x] = I1wyVal;
+
+        const float Ix2 = I1wxVal * I1wxVal;
+        const float Iy2 = I1wyVal * I1wyVal;
+
+        // store the |Grad(I1)|^2
+        grad[y * I1w_step + x] = Ix2 + Iy2;
+
+        // compute the constant part of the rho function
+        const float I0Val = I0[y * I0_step + x];
+        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
+    }
+
+}
+
+float readImage(__global const float *image,  const int x,  const int y,  const int rows,  const int cols, const int elemCntPerRow)
+{
+    int i0 = clamp(x, 0, cols - 1);
+    int j0 = clamp(y, 0, rows - 1);
+    int i1 = clamp(x + 1, 0, cols - 1);
+    int j1 = clamp(y + 1, 0, rows - 1);
+
+    return image[j0 * elemCntPerRow + i0];
+}
+
+__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,
+	__global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,  
+    __global const float* u1, int u1_step, 
+    __global const float* u2,
+    __global float* I1w,
+	__global float* I1wx, /*int I1wx_step,*/
+	__global float* I1wy, /*int I1wy_step,*/
+	__global float* grad, /*int grad_step,*/
+	__global float* rho,
+	int I1w_step,
+	int u2_step,
+	int I1_step,
+	int I1x_step)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if(x < I0_col&&y < I0_row)
+    {
+        //const float u1Val = u1(y, x);
+        const float u1Val = u1[y * u1_step + x];
+        //const float u2Val = u2(y, x);
+        const float u2Val = u2[y * u2_step + x];
+
+        const float wx = x + u1Val;
+        const float wy = y + u2Val;
+
+        const int xmin = ceil(wx - 2.0f);
+        const int xmax = floor(wx + 2.0f);
+
+        const int ymin = ceil(wy - 2.0f);
+        const int ymax = floor(wy + 2.0f);
+
+        float sum  = 0.0f;
+        float sumx = 0.0f;
+        float sumy = 0.0f;
+        float wsum = 0.0f;
+
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
+
+				int2 cood = (int2)(cx, cy);
+                sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);
+                sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);
+                sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);
+                wsum += w;
+            }
+        }
+
+        const float coeff = 1.0f / wsum;
+
+        const float I1wVal  = sum  * coeff;
+        const float I1wxVal = sumx * coeff;
+        const float I1wyVal = sumy * coeff;
+
+        I1w[y * I1w_step + x]  = I1wVal;
+        I1wx[y * I1w_step + x] = I1wxVal;
+        I1wy[y * I1w_step + x] = I1wyVal;
+
+        const float Ix2 = I1wxVal * I1wxVal;
+        const float Iy2 = I1wyVal * I1wyVal;
+
+        // store the |Grad(I1)|^2
+        grad[y * I1w_step + x] = Ix2 + Iy2;
+
+        // compute the constant part of the rho function
+        const float I0Val = I0[y * I0_step + x];
+        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
+    }
+
+}
+
+
+__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step, 
+    __global const float* u2, 
+    __global float* p11, int p11_step, 
+    __global float* p12,
+    __global float* p21,
+    __global float* p22, 
+    const float taut,
+	int u2_step,
+	int u1_offset_x,
+	int u1_offset_y,
+	int u2_offset_x,
+	int u2_offset_y)
+{
+
+    //const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    //const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if(x < u1_col && y < u1_row)
+    {
+		int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
+        const float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+        
+		int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);
+        const float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+
+		int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
+        const float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+
+		int src_y2 = (y + 1) <  (u1_row - 1) ? (y + 1) : (u1_row - 1);
+        const float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+
+        const float g1 = hypot(u1x, u1y);
+        const float g2 = hypot(u2x, u2y);
+
+        const float ng1 = 1.0f + taut * g1;
+        const float ng2 = 1.0f + taut * g2;
+
+        p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;
+        p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;
+        p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;
+        p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;
+    }
+
+}
+
+float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)
+{
+
+    if (x > 0 && y > 0)
+    {
+        const float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1];
+        const float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];
+        return v1x + v2y;
+    }
+    else
+    {
+        if (y > 0)
+            return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];
+        else
+        {
+            if (x > 0)
+                return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];
+            else
+                return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];
+        }
+    }
+
+}
+
+__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,
+    __global const float* I1wy, /*int I1wy_step,*/
+    __global const float* grad, /*int grad_step,*/ 
+    __global const float* rho_c, /*int rho_c_step,*/
+    __global const float* p11, /*int p11_step,*/
+    __global const float* p12, /*int p12_step,*/
+    __global const float* p21, /*int p21_step,*/
+    __global const float* p22, /*int p22_step,*/
+    __global float* u1, int u1_step, 
+    __global float* u2, 
+    __global float* error, const float l_t, const float theta, int u2_step,
+	int u1_offset_x,
+	int u1_offset_y,
+	int u2_offset_x,
+	int u2_offset_y)
+{
+
+    //const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    //const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+
+    if(x < I1wx_col && y < I1wx_row)
+    {
+        const float I1wxVal = I1wx[y * I1wx_step + x];
+        const float I1wyVal = I1wy[y * I1wx_step + x];
+        const float gradVal = grad[y * I1wx_step + x];
+        const float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
+        const float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
+
+        const float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
+
+        // estimate the values of the variable (v1, v2) (thresholding operator TH)
+
+        float d1 = 0.0f;
+        float d2 = 0.0f;
+
+        if (rho < -l_t * gradVal)
+        {
+            d1 = l_t * I1wxVal;
+            d2 = l_t * I1wyVal;
+        }
+        else if (rho > l_t * gradVal)
+        {
+            d1 = -l_t * I1wxVal;
+            d2 = -l_t * I1wyVal;
+        }
+        else if (gradVal > 1.192092896e-07f)
+        {
+            const float fi = -rho / gradVal;
+            d1 = fi * I1wxVal;
+            d2 = fi * I1wyVal;
+        }
+
+        const float v1 = u1OldVal + d1;
+        const float v2 = u2OldVal + d2;
+
+        // compute the divergence of the dual variable (p1, p2)
+
+        const float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);
+        const float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);
+
+        // estimate the values of the optical flow (u1, u2)
+
+        const float u1NewVal = v1 + theta * div_p1;
+        const float u2NewVal = v2 + theta * div_p2;
+
+        u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;
+        u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;
+
+        const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
+        const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
+        error[y * I1wx_step + x] = n1 + n2;
+    }
+
+}
diff --git a/modules/ocl/src/precomp.hpp b/modules/ocl/src/precomp.hpp
index b2a3e41c6f..4f93eac420 100644
--- a/modules/ocl/src/precomp.hpp
+++ b/modules/ocl/src/precomp.hpp
@@ -78,6 +78,7 @@
 
 #if defined (HAVE_OPENCL)
 
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #include "opencv2/ocl/private/util.hpp"
 #include "safe_call.hpp"
 
diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp
index 4a6ce1c790..8e9420480c 100644
--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@@ -15,8 +15,8 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Dachuan Zhao, dachuan@multicorewareinc.com
-//		Yao Wang, bitwangyaoyao@gmail.com
+//      Dachuan Zhao, dachuan@multicorewareinc.com
+//      Yao Wang, bitwangyaoyao@gmail.com
 //      Nathan, liujun@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -56,31 +56,16 @@ namespace cv
 {
 namespace ocl
 {
-///////////////////////////OpenCL kernel strings///////////////////////////
 extern const char *pyrlk;
 extern const char *pyrlk_no_image;
-extern const char *arithm_mul;
 }
 }
-
 struct dim3
 {
     unsigned int x, y, z;
 };
 
-struct float2
-{
-    float x, y;
-};
-
-struct int2
-{
-    int x, y;
-};
-
-namespace
-{
-void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
+static void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
 {
     winSize.width *= cn;
 
@@ -100,45 +85,6 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe
 
     block.z = patch.z = 1;
 }
-}
-
-static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar)
-{
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
-        return;
-    }
-
-    CV_Assert(src1.cols == dst.cols &&
-              src1.rows == dst.rows);
-
-    CV_Assert(src1.type() == dst.type());
-    CV_Assert(src1.depth() != CV_8S);
-
-    Context  *clCxt = src1.clCxt;
-
-    size_t localThreads[3]  = { 16, 16, 1 };
-    size_t globalThreads[3] = { src1.cols,
-                                src1.rows,
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    vector<pair<size_t , const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset ));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-    args.push_back( make_pair( sizeof(float), (float *)&scalar ));
-
-    openCLExecuteKernel(clCxt, &arithm_mul, "arithm_muls", globalThreads, localThreads, args, -1, src1.depth());
-}
 
 static void lkSparse_run(oclMat &I, oclMat &J,
                          const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount,
@@ -151,15 +97,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
     size_t localThreads[3]  = { 8, isImageSupported ? 8 : 32, 1 };
     size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
     int cn = I.oclchannels();
-    char calcErr;
-    if (level == 0)
-    {
-        calcErr = 1;
-    }
-    else
-    {
-        calcErr = 0;
-    }
+    char calcErr = level==0?1:0;
 
     vector<pair<size_t , const void *> > args;
 
@@ -187,8 +125,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
     args.push_back( make_pair( sizeof(cl_int), (void *)&iters ));
     args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr ));
 
-    bool is_cpu;
-    queryDeviceInfo(IS_CPU_DEVICE, &is_cpu);
+    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
     if (is_cpu)
     {
         openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU");
@@ -199,7 +136,17 @@ static void lkSparse_run(oclMat &I, oclMat &J,
     {
         if(isImageSupported)
         {
-            openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth());
+            stringstream idxStr;
+            idxStr << kernelName << "_C" << I.oclchannels() << "_D" << I.depth();
+            cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str());
+            int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+            openCLSafeCall(clReleaseKernel(kernel));
+
+            static char opt[32] = {0};
+            sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
+
+            openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, 
+                                args, I.oclchannels(), I.depth(), opt);
             releaseTexture(ITex);
             releaseTexture(JTex);
         }
@@ -242,8 +189,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
 
     oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
     oclMat temp2 = nextPts.reshape(1);
-    multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f);
-    //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
+    multiply(1.0f/(1<<maxLevel)/2.0f, temp1, temp2);
 
     ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
     status.setTo(Scalar::all(1));
@@ -258,7 +204,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
         ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
 
     // build the image pyramids.
-
     prevPyr_.resize(maxLevel + 1);
     nextPyr_.resize(maxLevel + 1);
 
@@ -275,7 +220,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
     }
 
     // dI/dx ~ Ix, dI/dy ~ Iy
-
     for (int level = maxLevel; level >= 0; level--)
     {
         lkSparse_run(prevPyr_[level], nextPyr_[level],
diff --git a/modules/ocl/src/safe_call.hpp b/modules/ocl/src/safe_call.hpp
index 441495f860..ba36cabd32 100644
--- a/modules/ocl/src/safe_call.hpp
+++ b/modules/ocl/src/safe_call.hpp
@@ -47,7 +47,7 @@
 #define __OPENCV_OPENCL_SAFE_CALL_HPP__
 
 #if defined __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/modules/ocl/src/tvl1flow.cpp b/modules/ocl/src/tvl1flow.cpp
new file mode 100644
index 0000000000..a322f62a4e
--- /dev/null
+++ b/modules/ocl/src/tvl1flow.cpp
@@ -0,0 +1,479 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//		Jin Ma, jin@multicorewareinc.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "precomp.hpp"
+using namespace std;
+using namespace cv;
+using namespace cv::ocl;
+
+namespace cv
+{
+    namespace ocl
+    {
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        extern const char* tvl1flow;
+    }
+}
+
+cv::ocl::OpticalFlowDual_TVL1_OCL::OpticalFlowDual_TVL1_OCL()
+{
+    tau            = 0.25;
+    lambda         = 0.15;
+    theta          = 0.3;
+    nscales        = 5;
+    warps          = 5;
+    epsilon        = 0.01;
+    iterations     = 300;
+    useInitialFlow = false;
+}
+
+void cv::ocl::OpticalFlowDual_TVL1_OCL::operator()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy)
+{
+    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
+    CV_Assert( I0.size() == I1.size() );
+    CV_Assert( I0.type() == I1.type() );
+    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
+    CV_Assert( nscales > 0 );
+
+    // allocate memory for the pyramid structure
+    I0s.resize(nscales);
+    I1s.resize(nscales);
+    u1s.resize(nscales);
+    u2s.resize(nscales);
+    //I0s_step == I1s_step
+    I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
+    I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
+
+
+    if (!useInitialFlow)
+    {
+        flowx.create(I0.size(), CV_32FC1);
+        flowy.create(I0.size(), CV_32FC1);
+    }
+    //u1s_step != u2s_step
+    u1s[0] = flowx;
+    u2s[0] = flowy;
+
+    I1x_buf.create(I0.size(), CV_32FC1);
+    I1y_buf.create(I0.size(), CV_32FC1);
+
+    I1w_buf.create(I0.size(), CV_32FC1);
+    I1wx_buf.create(I0.size(), CV_32FC1);
+    I1wy_buf.create(I0.size(), CV_32FC1);
+
+    grad_buf.create(I0.size(), CV_32FC1);
+    rho_c_buf.create(I0.size(), CV_32FC1);
+
+    p11_buf.create(I0.size(), CV_32FC1);
+    p12_buf.create(I0.size(), CV_32FC1);
+    p21_buf.create(I0.size(), CV_32FC1);
+    p22_buf.create(I0.size(), CV_32FC1);
+
+    diff_buf.create(I0.size(), CV_32FC1);
+
+    // create the scales
+    for (int s = 1; s < nscales; ++s)
+    {
+        ocl::pyrDown(I0s[s - 1], I0s[s]);
+        ocl::pyrDown(I1s[s - 1], I1s[s]);
+
+        if (I0s[s].cols < 16 || I0s[s].rows < 16)
+        {
+            nscales = s;
+            break;
+        }
+
+        if (useInitialFlow)
+        {
+            ocl::pyrDown(u1s[s - 1], u1s[s]);
+            ocl::pyrDown(u2s[s - 1], u2s[s]);
+
+            //ocl::multiply(u1s[s], Scalar::all(0.5), u1s[s]);
+            multiply(0.5, u1s[s], u1s[s]);
+            //ocl::multiply(u2s[s], Scalar::all(0.5), u2s[s]);
+            multiply(0.5, u1s[s], u2s[s]);
+        }
+    }
+
+    // pyramidal structure for computing the optical flow
+    for (int s = nscales - 1; s >= 0; --s)
+    {
+        // compute the optical flow at the current scale
+        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);
+
+        // if this was the last scale, finish now
+        if (s == 0)
+            break;
+
+        // otherwise, upsample the optical flow
+
+        // zoom the optical flow for the next finer scale
+        ocl::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
+        ocl::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
+
+        // scale the optical flow with the appropriate zoom factor
+        multiply(2, u1s[s - 1], u1s[s - 1]);
+        multiply(2, u2s[s - 1], u2s[s - 1]);
+
+    }
+
+}
+
+namespace ocl_tvl1flow
+{
+    void centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy);
+
+    void warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y, 
+        oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy, 
+        oclMat &grad, oclMat &rho);
+
+    void estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad, 
+        oclMat &rho_c, oclMat &p11, oclMat &p12, 
+        oclMat &p21, oclMat &p22, oclMat &u1, 
+        oclMat &u2, oclMat &error, float l_t, float theta);
+
+    void estimateDualVariables(oclMat &u1, oclMat &u2, 
+        oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut);
+}
+
+void cv::ocl::OpticalFlowDual_TVL1_OCL::procOneScale(const oclMat &I0, const oclMat &I1, oclMat &u1, oclMat &u2)
+{
+    using namespace ocl_tvl1flow;
+
+    const double scaledEpsilon = epsilon * epsilon * I0.size().area();
+
+    CV_DbgAssert( I1.size() == I0.size() );
+    CV_DbgAssert( I1.type() == I0.type() );
+    CV_DbgAssert( u1.empty() || u1.size() == I0.size() );
+    CV_DbgAssert( u2.size() == u1.size() );
+
+    if (u1.empty())
+    {
+        u1.create(I0.size(), CV_32FC1);
+        u1.setTo(Scalar::all(0));
+
+        u2.create(I0.size(), CV_32FC1);
+        u2.setTo(Scalar::all(0));
+    }
+
+    oclMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
+    oclMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    centeredGradient(I1, I1x, I1y);
+
+    oclMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
+    oclMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
+    oclMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    oclMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
+    oclMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    oclMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
+    oclMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
+    oclMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
+    oclMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
+    p11.setTo(Scalar::all(0));
+    p12.setTo(Scalar::all(0));
+    p21.setTo(Scalar::all(0));
+    p22.setTo(Scalar::all(0));
+
+    oclMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    const float l_t = static_cast<float>(lambda * theta);
+    const float taut = static_cast<float>(tau / theta);
+
+    for (int warpings = 0; warpings < warps; ++warpings)
+    {
+        warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
+
+        double error = numeric_limits<double>::max();
+        for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
+        {
+            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, 
+                u1, u2, diff, l_t, static_cast<float>(theta));
+
+            error = ocl::sum(diff)[0];
+
+            estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
+
+        }
+    }
+
+}
+
+void cv::ocl::OpticalFlowDual_TVL1_OCL::collectGarbage()
+{
+    I0s.clear();
+    I1s.clear();
+    u1s.clear();
+    u2s.clear();
+
+    I1x_buf.release();
+    I1y_buf.release();
+
+    I1w_buf.release();
+    I1wx_buf.release();
+    I1wy_buf.release();
+
+    grad_buf.release();
+    rho_c_buf.release();
+
+    p11_buf.release();
+    p12_buf.release();
+    p21_buf.release();
+    p22_buf.release();
+
+    diff_buf.release();
+    norm_buf.release();
+}
+
+void ocl_tvl1flow::centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy)
+{
+    Context  *clCxt = src.clCxt;
+    size_t localThreads[3] = {32, 8, 1};
+    size_t globalThreads[3] = {src.cols, src.rows, 1};
+
+    int srcElementSize = src.elemSize();
+    int src_step = src.step/srcElementSize;
+
+    int dElememntSize = dx.elemSize();
+    int dx_step = dx.step/dElememntSize;
+
+    string kernelName = "centeredGradientKernel";
+    vector< pair<size_t, const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&src.data));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&src.cols));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&src.rows));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&src_step));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&dx.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&dy.data));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&dx_step));
+    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThreads, localThreads, args, -1, -1);
+
+}
+
+void ocl_tvl1flow::estimateDualVariables(oclMat &u1, oclMat &u2, oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut)
+{
+    Context *clCxt = u1.clCxt;
+
+    size_t localThread[] = {32, 8, 1};
+    size_t globalThread[] = 
+    {
+        u1.cols, 
+        u1.rows,
+        1
+    };
+
+    int u1_element_size = u1.elemSize();
+    int u1_step = u1.step/u1_element_size;
+
+    int u2_element_size = u2.elemSize();
+    int u2_step = u2.step/u2_element_size;
+
+    int p11_element_size = p11.elemSize();
+    int p11_step = p11.step/p11_element_size;
+
+    int u1_offset_y = u1.offset/u1.step;
+    int u1_offset_x = u1.offset%u1.step;
+    u1_offset_x = u1_offset_x/u1.elemSize();
+
+    int u2_offset_y = u2.offset/u2.step;
+    int u2_offset_x = u2.offset%u2.step;
+    u2_offset_x = u2_offset_x/u2.elemSize();
+
+    string kernelName = "estimateDualVariablesKernel";
+    vector< pair<size_t, const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1.cols));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1.rows));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&p11_step));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&p21.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data));
+    args.push_back( make_pair( sizeof(cl_float), (void*)&taut));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
+
+    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
+}
+
+void ocl_tvl1flow::estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad, 
+    oclMat &rho_c, oclMat &p11, oclMat &p12, 
+    oclMat &p21, oclMat &p22, oclMat &u1, 
+    oclMat &u2, oclMat &error, float l_t, float theta)
+{
+    Context* clCxt = I1wx.clCxt;
+
+    size_t localThread[] = {32, 8, 1};
+    size_t globalThread[] = 
+    {
+        I1wx.cols, 
+        I1wx.rows,
+        1
+    };
+
+    int I1wx_element_size = I1wx.elemSize();
+    int I1wx_step = I1wx.step/I1wx_element_size;
+
+    int u1_element_size = u1.elemSize();
+    int u1_step = u1.step/u1_element_size;
+
+    int u2_element_size = u2.elemSize();
+    int u2_step = u2.step/u2_element_size;
+
+    int u1_offset_y = u1.offset/u1.step;
+    int u1_offset_x = u1.offset%u1.step;
+    u1_offset_x = u1_offset_x/u1.elemSize();
+
+    int u2_offset_y = u2.offset/u2.step;
+    int u2_offset_x = u2.offset%u2.step;
+    u2_offset_x = u2_offset_x/u2.elemSize();
+
+    string kernelName = "estimateUKernel";
+    vector< pair<size_t, const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.cols));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.rows));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx_step));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&rho_c.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&p21.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&error.data));
+    args.push_back( make_pair( sizeof(cl_float), (void*)&l_t));
+    args.push_back( make_pair( sizeof(cl_float), (void*)&theta));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
+
+    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
+}
+
+void ocl_tvl1flow::warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y, oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy, oclMat &grad, oclMat &rho)
+{
+    Context* clCxt = I0.clCxt;
+    const bool isImgSupported = support_image2d(clCxt);
+    
+    CV_Assert(isImgSupported);
+
+    int u1ElementSize = u1.elemSize();
+    int u1Step = u1.step/u1ElementSize;
+
+    int u2ElementSize = u2.elemSize();
+    int u2Step = u2.step/u2ElementSize;
+
+    int I0ElementSize = I0.elemSize();
+    int I0Step = I0.step/I0ElementSize;
+
+    int I1w_element_size = I1w.elemSize();
+    int I1w_step = I1w.step/I1w_element_size;
+
+    int u1_offset_y = u1.offset/u1.step;
+    int u1_offset_x = u1.offset%u1.step;
+    u1_offset_x = u1_offset_x/u1.elemSize();
+
+    int u2_offset_y = u2.offset/u2.step;
+    int u2_offset_x = u2.offset%u2.step;
+    u2_offset_x = u2_offset_x/u2.elemSize();
+
+    size_t localThread[] = {32, 8, 1};
+    size_t globalThread[] = 
+    {
+        I0.cols, 
+        I0.rows,
+        1
+    };
+
+    cl_mem I1_tex;
+    cl_mem I1x_tex;
+    cl_mem I1y_tex;
+    I1_tex = bindTexture(I1);
+    I1x_tex = bindTexture(I1x);
+    I1y_tex = bindTexture(I1y);
+
+    string kernelName = "warpBackwardKernel";
+    vector< pair<size_t, const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&I0.data));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&I0Step));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&I0.cols));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&I0.rows));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1_tex));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1x_tex));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1y_tex));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1Step));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1w.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void*)&rho.data));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&I1w_step));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u2Step));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
+    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
+
+    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
+
+    releaseTexture(I1_tex);
+    releaseTexture(I1x_tex);
+    releaseTexture(I1y_tex);
+}
\ No newline at end of file
diff --git a/modules/ocl/test/test_canny.cpp b/modules/ocl/test/test_canny.cpp
index cac6b66f51..10032e897c 100644
--- a/modules/ocl/test/test_canny.cpp
+++ b/modules/ocl/test/test_canny.cpp
@@ -45,7 +45,6 @@
 
 #include "precomp.hpp"
 #ifdef HAVE_OPENCL
-#define SHOW_RESULT 0
 
 ////////////////////////////////////////////////////////
 // Canny
@@ -59,13 +58,10 @@ PARAM_TEST_CASE(Canny, AppertureSize, L2gradient)
     bool useL2gradient;
 
     cv::Mat edges_gold;
-    //std::vector<cv::ocl::Info> oclinfo;
     virtual void SetUp()
     {
         apperture_size = GET_PARAM(0);
         useL2gradient = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
     }
 };
 
@@ -77,32 +73,18 @@ TEST_P(Canny, Accuracy)
     double low_thresh = 50.0;
     double high_thresh = 100.0;
 
-    cv::resize(img, img, cv::Size(512, 384));
     cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);
 
     cv::ocl::oclMat edges;
     cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient);
 
-    char filename [100];
-    sprintf(filename, "G:/Valve_edges_a%d_L2Grad%d.jpg", apperture_size, (int)useL2gradient);
-
     cv::Mat edges_gold;
     cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, useL2gradient);
 
-#if SHOW_RESULT
-    cv::Mat edges_x2, ocl_edges(edges);
-    edges_x2.create(edges.rows, edges.cols * 2, edges.type());
-    edges_x2.setTo(0);
-    cv::add(edges_gold, cv::Mat(edges_x2, cv::Rect(0, 0, edges_gold.cols, edges_gold.rows)), cv::Mat(edges_x2, cv::Rect(0, 0, edges_gold.cols, edges_gold.rows)));
-    cv::add(ocl_edges, cv::Mat(edges_x2, cv::Rect(edges_gold.cols, 0, edges_gold.cols, edges_gold.rows)), cv::Mat(edges_x2, cv::Rect(edges_gold.cols, 0, edges_gold.cols, edges_gold.rows)));
-    cv::namedWindow("Canny result (left: cpu, right: ocl)");
-    cv::imshow("Canny result (left: cpu, right: ocl)", edges_x2);
-    cv::waitKey();
-#endif //OUTPUT_RESULT
     EXPECT_MAT_SIMILAR(edges_gold, edges, 1e-2);
 }
 
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Canny, testing::Combine(
                             testing::Values(AppertureSize(3), AppertureSize(5)),
                             testing::Values(L2gradient(false), L2gradient(true))));
 #endif
\ No newline at end of file
diff --git a/modules/ocl/test/test_columnsum.cpp b/modules/ocl/test/test_columnsum.cpp
deleted file mode 100644
index 231f0657b0..0000000000
--- a/modules/ocl/test/test_columnsum.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//	   Chunpeng Zhang chunpeng@multicorewareinc.com
-//
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-
-PARAM_TEST_CASE(ColumnSum, cv::Size)
-{
-    cv::Size size;
-    cv::Mat src;
-
-    virtual void SetUp()
-    {
-        size = GET_PARAM(0);
-    }
-};
-
-TEST_P(ColumnSum, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_32FC1);
-    cv::ocl::oclMat d_dst;
-    cv::ocl::oclMat d_src(src);
-
-    cv::ocl::columnSum(d_src, d_dst);
-
-    cv::Mat dst(d_dst);
-
-    for (int j = 0; j < src.cols; ++j)
-    {
-        float gold = src.at<float>(0, j);
-        float res = dst.at<float>(0, j);
-        ASSERT_NEAR(res, gold, 1e-5);
-    }
-
-    for (int i = 1; i < src.rows; ++i)
-    {
-        for (int j = 0; j < src.cols; ++j)
-        {
-            float gold = src.at<float>(i, j) += src.at<float>(i - 1, j);
-            float res = dst.at<float>(i, j);
-            ASSERT_NEAR(res, gold, 1e-5);
-        }
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, ColumnSum, DIFFERENT_SIZES);
-
-
-#endif
diff --git a/modules/ocl/test/test_gemm.cpp b/modules/ocl/test/test_gemm.cpp
index a5d90ff01c..5548456568 100644
--- a/modules/ocl/test/test_gemm.cpp
+++ b/modules/ocl/test/test_gemm.cpp
@@ -74,7 +74,7 @@ TEST_P(Gemm, Accuracy)
     cv::gemm(a, b, 1.0, c, 1.0, dst, flags);
     cv::ocl::gemm(cv::ocl::oclMat(a), cv::ocl::oclMat(b), 1.0, cv::ocl::oclMat(c), 1.0, ocl_dst, flags);
 
-    EXPECT_MAT_NEAR(dst, ocl_dst, mat_size.area() * 1e-4, "");
+    EXPECT_MAT_NEAR(dst, ocl_dst, mat_size.area() * 1e-4);
 }
 
 INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
diff --git a/modules/ocl/test/test_haar.cpp b/modules/ocl/test/test_haar.cpp
deleted file mode 100644
index 96f721146b..0000000000
--- a/modules/ocl/test/test_haar.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Sen Liu, swjutls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/objdetect/objdetect.hpp"
-#include "precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv;
-extern string workdir;
-struct getRect
-{
-    Rect operator ()(const CvAvgComp &e) const
-    {
-        return e.rect;
-    }
-};
-
-PARAM_TEST_CASE(Haar, double, int)
-{
-    cv::ocl::OclCascadeClassifier cascade, nestedCascade;
-    cv::ocl::OclCascadeClassifierBuf cascadebuf;
-    cv::CascadeClassifier cpucascade, cpunestedCascade;
-
-    double scale;
-    int flags;
-
-    virtual void SetUp()
-    {
-        scale = GET_PARAM(0);
-        flags = GET_PARAM(1);
-        string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml";
-
-        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) || (!cascadebuf.load( cascadeName )))
-        {
-            cout << "ERROR: Could not load classifier cascade" << endl;
-            return;
-        }
-    }
-};
-
-////////////////////////////////faceDetect/////////////////////////////////////////////////
-TEST_P(Haar, FaceDetect)
-{
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
-
-    if(img.empty())
-    {
-        std::cout << "Couldn't read " << imgName << std::endl;
-        return ;
-    }
-
-    vector<Rect> faces, oclfaces;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    MemStorage storage(cvCreateMemStorage(0));
-    cvtColor( img, gray, CV_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
-
-    cv::ocl::oclMat image;
-    CvSeq *_objects;
-    image.upload(smallImg);
-    _objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
-                   3, flags, Size(30, 30), Size(0, 0) );
-    vector<CvAvgComp> vecAvgComp;
-    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-    oclfaces.resize(vecAvgComp.size());
-    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
-
-    cpucascade.detectMultiScale( smallImg, faces,  1.1, 3,
-                                 flags,
-                                 Size(30, 30), Size(0, 0) );
-    EXPECT_EQ(faces.size(), oclfaces.size());
-}
-
-TEST_P(Haar, FaceDetectUseBuf)
-{
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
-
-    if(img.empty())
-    {
-        std::cout << "Couldn't read " << imgName << std::endl;
-        return ;
-    }
-
-    vector<Rect> faces, oclfaces;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    MemStorage storage(cvCreateMemStorage(0));
-    cvtColor( img, gray, CV_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
-
-    cv::ocl::oclMat image;
-    image.upload(smallImg);
-
-    cascadebuf.detectMultiScale( image, oclfaces,  1.1, 3,
-                                 flags,
-                                 Size(30, 30), Size(0, 0) );
-    cascadebuf.release();
-
-    cpucascade.detectMultiScale( smallImg, faces,  1.1, 3,
-                                 flags,
-                                 Size(30, 30), Size(0, 0) );
-    EXPECT_EQ(faces.size(), oclfaces.size());
-}
-
-INSTANTIATE_TEST_CASE_P(FaceDetect, Haar,
-    Combine(Values(1.0),
-            Values(CV_HAAR_SCALE_IMAGE, 0)));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp
index 664f8a3919..3228b6c0cf 100644
--- a/modules/ocl/test/test_imgproc.cpp
+++ b/modules/ocl/test/test_imgproc.cpp
@@ -23,6 +23,7 @@
 //    Rock Li, Rock.Li@amd.com
 //    Wu Zailong, bullet@yeah.net
 //    Xu Pang, pangxu010@163.com
+//    Sen Liu, swjtuls1987@126.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -1393,6 +1394,46 @@ TEST_P(calcHist, Mat)
         EXPECT_MAT_NEAR(dst_hist, cpu_hist, 0.0);
     }
 }
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// CLAHE
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(ClipLimit, double)
+}
+
+PARAM_TEST_CASE(CLAHE, cv::Size, ClipLimit)
+{
+    cv::Size size;
+    double clipLimit;
+
+    cv::Mat src;
+    cv::Mat dst_gold;
+
+    cv::ocl::oclMat g_src;
+    cv::ocl::oclMat g_dst;
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+        clipLimit = GET_PARAM(1);
+
+        cv::RNG &rng = TS::ptr()->get_rng();
+        src = randomMat(rng, size, CV_8UC1, 0, 256, false);
+        g_src.upload(src);
+    }
+};
+
+TEST_P(CLAHE, Accuracy)
+{
+    cv::Ptr<cv::CLAHE> clahe = cv::ocl::createCLAHE(clipLimit);
+    clahe->apply(g_src, g_dst);
+    cv::Mat dst(g_dst);
+
+    cv::Ptr<cv::CLAHE> clahe_gold = cv::createCLAHE(clipLimit);
+    clahe_gold->apply(src, dst_gold);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
+}
 
 ///////////////////////////Convolve//////////////////////////////////
 PARAM_TEST_CASE(ConvolveTestBase, MatType, bool)
@@ -1532,6 +1573,47 @@ TEST_P(Convolve, Mat)
     }
 }
 
+//////////////////////////////// ColumnSum //////////////////////////////////////
+PARAM_TEST_CASE(ColumnSum, cv::Size)
+{
+    cv::Size size;
+    cv::Mat src;
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+    }
+};
+
+TEST_P(ColumnSum, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_32FC1);
+    cv::ocl::oclMat d_dst;
+    cv::ocl::oclMat d_src(src);
+
+    cv::ocl::columnSum(d_src, d_dst);
+
+    cv::Mat dst(d_dst);
+
+    for (int j = 0; j < src.cols; ++j)
+    {
+        float gold = src.at<float>(0, j);
+        float res = dst.at<float>(0, j);
+        ASSERT_NEAR(res, gold, 1e-5);
+    }
+
+    for (int i = 1; i < src.rows; ++i)
+    {
+        for (int j = 0; j < src.cols; ++j)
+        {
+            float gold = src.at<float>(i, j) += src.at<float>(i - 1, j);
+            float res = dst.at<float>(i, j);
+            ASSERT_NEAR(res, gold, 1e-5);
+        }
+    }
+}
+/////////////////////////////////////////////////////////////////////////////////////
+
 INSTANTIATE_TEST_CASE_P(ImgprocTestBase, equalizeHist, Combine(
                             ONE_TYPE(CV_8UC1),
                             NULL_TYPE,
@@ -1643,7 +1725,10 @@ INSTANTIATE_TEST_CASE_P(histTestBase, calcHist, Combine(
                             ONE_TYPE(CV_32SC1) //no use
                         ));
 
-//INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine(
-//                            Values(CV_32FC1, CV_32FC1),
-//                            Values(false))); // Values(false) is the reserved parameter
+INSTANTIATE_TEST_CASE_P(ImgProc, CLAHE, Combine(
+                        Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(1300, 1300)),
+                        Values(0.0, 40.0)));
+
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, ColumnSum, DIFFERENT_SIZES);
+
 #endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_moments.cpp b/modules/ocl/test/test_moments.cpp
index 98c66def31..86f4779d68 100644
--- a/modules/ocl/test/test_moments.cpp
+++ b/modules/ocl/test/test_moments.cpp
@@ -45,12 +45,12 @@ TEST_P(MomentsTest, Mat)
     {
         if(test_contours)
         {
-            Mat src = imread( workdir + "../cpp/pic3.png", 1 );
-            Mat src_gray, canny_output;
-            cvtColor( src, src_gray, CV_BGR2GRAY );
+            Mat src = imread( workdir + "../cpp/pic3.png", IMREAD_GRAYSCALE );
+            ASSERT_FALSE(src.empty());
+            Mat canny_output;
             vector<vector<Point> > contours;
             vector<Vec4i> hierarchy;
-            Canny( src_gray, canny_output, 100, 200, 3 );
+            Canny( src, canny_output, 100, 200, 3 );
             findContours( canny_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) );
             for( size_t i = 0; i < contours.size(); i++ )
             {
diff --git a/modules/ocl/test/test_hog.cpp b/modules/ocl/test/test_objdetect.cpp
similarity index 51%
rename from modules/ocl/test/test_hog.cpp
rename to modules/ocl/test/test_objdetect.cpp
index cfc4e3963f..86590f7981 100644
--- a/modules/ocl/test/test_hog.cpp
+++ b/modules/ocl/test/test_objdetect.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Wenju He, wenju@multicorewareinc.com
+//		Yao Wang, bitwangyaoyao@gmail.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,51 +45,58 @@
 
 #include "precomp.hpp"
 #include "opencv2/core/core.hpp"
-using namespace std;
+#include "opencv2/objdetect/objdetect.hpp"
+
+using namespace cv;
+using namespace testing;
 #ifdef HAVE_OPENCL
 
 extern string workdir;
-PARAM_TEST_CASE(HOG, cv::Size, int)
+
+///////////////////// HOG /////////////////////////////
+PARAM_TEST_CASE(HOG, Size, int)
 {
-    cv::Size winSize;
+    Size winSize;
     int type;
+    Mat img_rgb;
     virtual void SetUp()
     {
         winSize = GET_PARAM(0);
         type = GET_PARAM(1);
+        img_rgb = readImage(workdir + "../gpu/road.png");
+        if(img_rgb.empty())
+        {
+            std::cout << "Couldn't read road.png" << std::endl;
+        }
     }
 };
 
 TEST_P(HOG, GetDescriptors)
 {
-    // Load image
-    cv::Mat img_rgb = readImage(workdir + "lena.jpg");
-    ASSERT_FALSE(img_rgb.empty());
-
     // Convert image
-    cv::Mat img;
+    Mat img;
     switch (type)
     {
     case CV_8UC1:
-        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+        cvtColor(img_rgb, img, CV_BGR2GRAY);
         break;
     case CV_8UC4:
     default:
-        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+        cvtColor(img_rgb, img, CV_BGR2BGRA);
         break;
     }
-    cv::ocl::oclMat d_img(img);
+    ocl::oclMat d_img(img);
 
     // HOGs
-    cv::ocl::HOGDescriptor ocl_hog;
+    ocl::HOGDescriptor ocl_hog;
     ocl_hog.gamma_correction = true;
-    cv::HOGDescriptor hog;
+    HOGDescriptor hog;
     hog.gammaCorrection = true;
 
     // Compute descriptor
-    cv::ocl::oclMat d_descriptors;
+    ocl::oclMat d_descriptors;
     ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL);
-    cv::Mat down_descriptors;
+    Mat down_descriptors;
     d_descriptors.download(down_descriptors);
     down_descriptors = down_descriptors.reshape(0, down_descriptors.cols * down_descriptors.rows);
 
@@ -105,45 +112,34 @@ TEST_P(HOG, GetDescriptors)
         hog.compute(img_rgb, descriptors, ocl_hog.win_size);
         break;
     }
-    cv::Mat cpu_descriptors(descriptors);
+    Mat cpu_descriptors(descriptors);
 
     EXPECT_MAT_SIMILAR(down_descriptors, cpu_descriptors, 1e-2);
 }
 
-
-bool match_rect(cv::Rect r1, cv::Rect r2, int threshold)
-{
-    return ((abs(r1.x - r2.x) < threshold) && (abs(r1.y - r2.y) < threshold) &&
-            (abs(r1.width - r2.width) < threshold) && (abs(r1.height - r2.height) < threshold));
-}
-
 TEST_P(HOG, Detect)
 {
-    // Load image
-    cv::Mat img_rgb = readImage(workdir + "lena.jpg");
-    ASSERT_FALSE(img_rgb.empty());
-
     // Convert image
-    cv::Mat img;
+    Mat img;
     switch (type)
     {
     case CV_8UC1:
-        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+        cvtColor(img_rgb, img, CV_BGR2GRAY);
         break;
     case CV_8UC4:
     default:
-        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+        cvtColor(img_rgb, img, CV_BGR2BGRA);
         break;
     }
-    cv::ocl::oclMat d_img(img);
+    ocl::oclMat d_img(img);
 
     // HOGs
-    if ((winSize != cv::Size(48, 96)) && (winSize != cv::Size(64, 128)))
-        winSize = cv::Size(64, 128);
-    cv::ocl::HOGDescriptor ocl_hog(winSize);
+    if ((winSize != Size(48, 96)) && (winSize != Size(64, 128)))
+        winSize = Size(64, 128);
+    ocl::HOGDescriptor ocl_hog(winSize);
     ocl_hog.gamma_correction = true;
 
-    cv::HOGDescriptor hog;
+    HOGDescriptor hog;
     hog.winSize = winSize;
     hog.gammaCorrection = true;
 
@@ -165,88 +161,117 @@ TEST_P(HOG, Detect)
     }
 
     // OpenCL detection
-    std::vector<cv::Rect> d_found;
-    ocl_hog.detectMultiScale(d_img, d_found, 0, cv::Size(8, 8), cv::Size(0, 0), 1.05, 2);
+    std::vector<Rect> d_found;
+    ocl_hog.detectMultiScale(d_img, d_found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
 
     // CPU detection
-    std::vector<cv::Rect> found;
+    std::vector<Rect> found;
     switch (type)
     {
     case CV_8UC1:
-        hog.detectMultiScale(img, found, 0, cv::Size(8, 8), cv::Size(0, 0), 1.05, 2);
+        hog.detectMultiScale(img, found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
         break;
     case CV_8UC4:
     default:
-        hog.detectMultiScale(img_rgb, found, 0, cv::Size(8, 8), cv::Size(0, 0), 1.05, 2);
+        hog.detectMultiScale(img_rgb, found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
         break;
     }
 
-    // Ground-truth rectangular people window
-    cv::Rect win1_64x128(231, 190, 72, 144);
-    cv::Rect win2_64x128(621, 156, 97, 194);
-    cv::Rect win1_48x96(238, 198, 63, 126);
-    cv::Rect win2_48x96(619, 161, 92, 185);
-    cv::Rect win3_48x96(488, 136, 56, 112);
-
-    // Compare whether ground-truth windows are detected and compare the number of windows detected.
-    std::vector<int> d_comp(4);
-    std::vector<int> comp(4);
-    for(int i = 0; i < (int)d_comp.size(); i++)
-    {
-        d_comp[i] = 0;
-        comp[i] = 0;
-    }
-
-    int threshold = 10;
-    int val = 32;
-    d_comp[0] = (int)d_found.size();
-    comp[0] = (int)found.size();
-    if (winSize == cv::Size(48, 96))
-    {
-        for(int i = 0; i < (int)d_found.size(); i++)
-        {
-            if (match_rect(d_found[i], win1_48x96, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found[i], win2_48x96, threshold))
-                d_comp[2] = val;
-            if (match_rect(d_found[i], win3_48x96, threshold))
-                d_comp[3] = val;
-        }
-        for(int i = 0; i < (int)found.size(); i++)
-        {
-            if (match_rect(found[i], win1_48x96, threshold))
-                comp[1] = val;
-            if (match_rect(found[i], win2_48x96, threshold))
-                comp[2] = val;
-            if (match_rect(found[i], win3_48x96, threshold))
-                comp[3] = val;
-        }
-    }
-    else if (winSize == cv::Size(64, 128))
-    {
-        for(int i = 0; i < (int)d_found.size(); i++)
-        {
-            if (match_rect(d_found[i], win1_64x128, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found[i], win2_64x128, threshold))
-                d_comp[2] = val;
-        }
-        for(int i = 0; i < (int)found.size(); i++)
-        {
-            if (match_rect(found[i], win1_64x128, threshold))
-                comp[1] = val;
-            if (match_rect(found[i], win2_64x128, threshold))
-                comp[2] = val;
-        }
-    }
-
-    EXPECT_MAT_NEAR(cv::Mat(d_comp), cv::Mat(comp), 3);
+    EXPECT_LT(checkRectSimilarity(img.size(), found, d_found), 1.0);
 }
 
 
 INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
-                            testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
+                            testing::Values(Size(64, 128), Size(48, 96)),
                             testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
 
+///////////////////////////// Haar //////////////////////////////
+IMPLEMENT_PARAM_CLASS(CascadeName, std::string);
+CascadeName cascade_frontalface_alt(std::string("haarcascade_frontalface_alt.xml"));
+CascadeName cascade_frontalface_alt2(std::string("haarcascade_frontalface_alt2.xml"));
+struct getRect
+{
+    Rect operator ()(const CvAvgComp &e) const
+    {
+        return e.rect;
+    }
+};
 
-#endif //HAVE_OPENCL
+PARAM_TEST_CASE(Haar, int, CascadeName)
+{
+    ocl::OclCascadeClassifier cascade, nestedCascade;
+    CascadeClassifier cpucascade, cpunestedCascade;
+
+    int flags;
+    std::string cascadeName;
+    vector<Rect> faces, oclfaces;
+    Mat img;
+    ocl::oclMat d_img;
+
+    virtual void SetUp()
+    {
+        flags = GET_PARAM(0);
+        cascadeName = (workdir + "../../data/haarcascades/").append(GET_PARAM(1));
+        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) )
+        {
+            std::cout << "ERROR: Could not load classifier cascade" << std::endl;
+            return;
+        }
+        img = readImage(workdir + "lena.jpg", IMREAD_GRAYSCALE);
+        if(img.empty())
+        {
+            std::cout << "Couldn't read lena.jpg" << std::endl;
+            return ;
+        }
+        equalizeHist(img, img);
+        d_img.upload(img);
+    }
+};
+
+TEST_P(Haar, FaceDetect)
+{
+    MemStorage storage(cvCreateMemStorage(0));
+    CvSeq *_objects;
+    _objects = cascade.oclHaarDetectObjects(d_img, storage, 1.1, 3, 
+                                            flags, Size(30, 30), Size(0, 0));
+    vector<CvAvgComp> vecAvgComp;
+    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
+    oclfaces.resize(vecAvgComp.size());
+    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
+    
+    cpucascade.detectMultiScale(img, faces,  1.1, 3,
+                                flags,
+                                Size(30, 30), Size(0, 0));
+
+    EXPECT_LT(checkRectSimilarity(img.size(), faces, oclfaces), 1.0);
+}
+
+TEST_P(Haar, FaceDetectUseBuf)
+{
+    ocl::OclCascadeClassifierBuf cascadebuf;
+    if(!cascadebuf.load(cascadeName))
+    {
+        std::cout << "ERROR: Could not load classifier cascade for FaceDetectUseBuf!" << std::endl;
+        return;
+    }
+    cascadebuf.detectMultiScale(d_img, oclfaces,  1.1, 3,
+                                flags,
+                                Size(30, 30), Size(0, 0));
+    cpucascade.detectMultiScale(img, faces,  1.1, 3,
+                                flags,
+                                Size(30, 30), Size(0, 0));
+
+    // intentionally run ocl facedetect again and check if it still works after the first run
+    cascadebuf.detectMultiScale(d_img, oclfaces,  1.1, 3,
+                                flags,
+                                Size(30, 30));
+    cascadebuf.release();
+
+    EXPECT_LT(checkRectSimilarity(img.size(), faces, oclfaces), 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, Haar,
+    Combine(Values(CV_HAAR_SCALE_IMAGE, 0), 
+            Values(cascade_frontalface_alt/*, cascade_frontalface_alt2*/)));
+
+#endif //HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/ocl/test/test_pyrlk.cpp b/modules/ocl/test/test_optflow.cpp
similarity index 54%
rename from modules/ocl/test/test_pyrlk.cpp
rename to modules/ocl/test/test_optflow.cpp
index 064cb30bd8..0121be8f9e 100644
--- a/modules/ocl/test/test_pyrlk.cpp
+++ b/modules/ocl/test/test_optflow.cpp
@@ -1,4 +1,4 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -7,12 +7,16 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
+// @Authors
+//
+//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,9 +25,9 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other oclMaterials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
@@ -52,6 +56,124 @@ using namespace std;
 
 extern string workdir;
 
+
+//////////////////////////////////////////////////////
+// GoodFeaturesToTrack
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(MinDistance, double)
+}
+PARAM_TEST_CASE(GoodFeaturesToTrack, MinDistance)
+{
+    double minDistance;
+
+    virtual void SetUp()
+    {
+        minDistance = GET_PARAM(0);
+    }
+};
+
+TEST_P(GoodFeaturesToTrack, Accuracy)
+{
+    cv::Mat frame = readImage(workdir + "../gpu/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame.empty());
+
+    int maxCorners = 1000;
+    double qualityLevel = 0.01;
+
+    cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
+
+    cv::ocl::oclMat d_pts;
+    detector(oclMat(frame), d_pts);
+
+    ASSERT_FALSE(d_pts.empty());
+
+    std::vector<cv::Point2f> pts(d_pts.cols);
+    
+    detector.downloadPoints(d_pts, pts);
+
+    std::vector<cv::Point2f> pts_gold;
+    cv::goodFeaturesToTrack(frame, pts_gold, maxCorners, qualityLevel, minDistance);
+
+    ASSERT_EQ(pts_gold.size(), pts.size());
+
+    size_t mistmatch = 0;
+    for (size_t i = 0; i < pts.size(); ++i)
+    {
+        cv::Point2i a = pts_gold[i];
+        cv::Point2i b = pts[i];
+
+        bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
+
+        if (!eq)
+            ++mistmatch;
+    }
+
+    double bad_ratio = static_cast<double>(mistmatch) / pts.size();
+
+    ASSERT_LE(bad_ratio, 0.01);
+}
+
+TEST_P(GoodFeaturesToTrack, EmptyCorners)
+{
+    int maxCorners = 1000;
+    double qualityLevel = 0.01;
+
+    cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
+
+    cv::ocl::oclMat src(100, 100, CV_8UC1, cv::Scalar::all(0));
+    cv::ocl::oclMat corners(1, maxCorners, CV_32FC2);
+
+    detector(src, corners);
+
+    ASSERT_TRUE(corners.empty());
+}
+
+INSTANTIATE_TEST_CASE_P(OCL_Video, GoodFeaturesToTrack, 
+    testing::Values(MinDistance(0.0), MinDistance(3.0)));
+
+//////////////////////////////////////////////////////////////////////////
+PARAM_TEST_CASE(TVL1, bool)
+{
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        useRoi = GET_PARAM(0);
+    }
+
+};
+
+TEST_P(TVL1, Accuracy)
+{
+    cv::Mat frame0 = readImage(workdir + "../gpu/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage(workdir + "../gpu/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
+    cv::RNG &rng = TS::ptr()->get_rng();
+    cv::Mat flowx = randomMat(rng, frame0.size(), CV_32FC1, 0, 0, useRoi);
+    cv::Mat flowy = randomMat(rng, frame0.size(), CV_32FC1, 0, 0, useRoi);
+    cv::ocl::oclMat d_flowx(flowx), d_flowy(flowy);
+    d_alg(oclMat(frame0), oclMat(frame1), d_flowx, d_flowy);
+
+    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+    cv::Mat flow;
+    alg->calc(frame0, frame1, flow);
+    cv::Mat gold[2];
+    cv::split(flow, gold);
+
+    EXPECT_MAT_SIMILAR(gold[0], d_flowx, 3e-3);
+    EXPECT_MAT_SIMILAR(gold[1], d_flowy, 3e-3);
+}
+INSTANTIATE_TEST_CASE_P(OCL_Video, TVL1, Values(true, false));
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// PyrLKOpticalFlow
+
 PARAM_TEST_CASE(Sparse, bool, bool)
 {
     bool useGray;
@@ -60,7 +182,7 @@ PARAM_TEST_CASE(Sparse, bool, bool)
     virtual void SetUp()
     {
         UseSmart = GET_PARAM(0);
-        useGray = GET_PARAM(0);
+        useGray = GET_PARAM(1);
     }
 };
 
@@ -147,9 +269,9 @@ TEST_P(Sparse, Mat)
 
 }
 
-INSTANTIATE_TEST_CASE_P(Video, Sparse, Combine(
-                            Values(false, true),
-                            Values(false)));
+INSTANTIATE_TEST_CASE_P(OCL_Video, Sparse, Combine(
+    Values(false, true),
+    Values(false, true)));
 
 #endif // HAVE_OPENCL
 
diff --git a/modules/ocl/test/test_pyrdown.cpp b/modules/ocl/test/test_pyramids.cpp
similarity index 75%
rename from modules/ocl/test/test_pyrdown.cpp
rename to modules/ocl/test/test_pyramids.cpp
index 6d00fb5e45..1bd188dea6 100644
--- a/modules/ocl/test/test_pyrdown.cpp
+++ b/modules/ocl/test/test_pyramids.cpp
@@ -15,7 +15,6 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Dachuan Zhao, dachuan@multicorewareinc.com
 //    Yao Wang yao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -56,11 +55,12 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;
 
-PARAM_TEST_CASE(PyrDown, MatType, int)
+PARAM_TEST_CASE(PyrBase, MatType, int)
 {
     int type;
     int channels;
-
+    Mat dst_cpu;
+    oclMat gdst;
     virtual void SetUp()
     {
         type = GET_PARAM(0);
@@ -69,19 +69,19 @@ PARAM_TEST_CASE(PyrDown, MatType, int)
 
 };
 
+/////////////////////// PyrDown //////////////////////////
+struct PyrDown : PyrBase {};
 
 TEST_P(PyrDown, Mat)
 {
     for(int j = 0; j < LOOP_TIMES; j++)
     {
-        cv::Size size(MWIDTH, MHEIGHT);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Mat src = randomMat(rng, size, CV_MAKETYPE(type, channels), 0, 100, false);
-
-        cv::ocl::oclMat gsrc(src), gdst;
-        cv::Mat dst_cpu;
-        cv::pyrDown(src, dst_cpu);
-        cv::ocl::pyrDown(gsrc, gdst);
+        Size size(MWIDTH, MHEIGHT);
+        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+        oclMat gsrc(src);
+        
+        pyrDown(src, dst_cpu);
+        pyrDown(gsrc, gdst);
 
         EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), type == CV_32F ? 1e-4f : 1.0f);
     }
@@ -90,5 +90,27 @@ TEST_P(PyrDown, Mat)
 INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrDown, Combine(
                             Values(CV_8U, CV_32F), Values(1, 3, 4)));
 
+/////////////////////// PyrUp //////////////////////////
 
+struct PyrUp : PyrBase {};
+
+TEST_P(PyrUp, Accuracy)
+{
+    for(int j = 0; j < LOOP_TIMES; j++)
+    {
+        Size size(MWIDTH, MHEIGHT);
+        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+        oclMat gsrc(src);
+
+        pyrUp(src, dst_cpu);
+        pyrUp(gsrc, gdst);
+
+        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), (type == CV_32F ? 1e-4f : 1.0));
+    }
+
+}
+
+
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine(
+                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
 #endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_pyrup.cpp b/modules/ocl/test/test_pyrup.cpp
deleted file mode 100644
index afd3e8b1b8..0000000000
--- a/modules/ocl/test/test_pyrup.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Zhang Chunpeng chunpeng@multicorewareinc.com
-//    Yao Wang yao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencv2/core/core.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(PyrUp, MatType, int)
-{
-    int type;
-    int channels;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-    }
-};
-
-TEST_P(PyrUp, Accuracy)
-{
-    for(int j = 0; j < LOOP_TIMES; j++)
-    {
-        Size size(MWIDTH, MHEIGHT);
-        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
-        Mat dst_gold;
-        pyrUp(src, dst_gold);
-        ocl::oclMat dst;
-        ocl::oclMat srcMat(src);
-        ocl::pyrUp(srcMat, dst);
-
-        EXPECT_MAT_NEAR(dst_gold, Mat(dst), (type == CV_32F ? 1e-4f : 1.0));
-    }
-
-}
-
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine(
-                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
-
-
-#endif // HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/ocl/test/utility.cpp b/modules/ocl/test/utility.cpp
index 4b21081a8b..27f9cec079 100644
--- a/modules/ocl/test/utility.cpp
+++ b/modules/ocl/test/utility.cpp
@@ -100,12 +100,6 @@ Mat randomMat(Size size, int type, double minVal, double maxVal)
     return randomMat(TS::ptr()->get_rng(), size, type, minVal, maxVal, false);
 }
 
-
-
-
-
-
-
 /*
 void showDiff(InputArray gold_, InputArray actual_, double eps)
 {
@@ -137,58 +131,7 @@ void showDiff(InputArray gold_, InputArray actual_, double eps)
 }
 */
 
-/*
-bool supportFeature(const DeviceInfo& info, FeatureSet feature)
-{
-    return TargetArchs::builtWith(feature) && info.supports(feature);
-}
 
-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
-
-vector<DeviceInfo> devices(FeatureSet feature)
-{
-    const vector<DeviceInfo>& d = devices();
-
-    vector<DeviceInfo> devs_filtered;
-
-    if (TargetArchs::builtWith(feature))
-    {
-        devs_filtered.reserve(d.size());
-
-        for (size_t i = 0, size = d.size(); i < size; ++i)
-        {
-            const DeviceInfo& info = d[i];
-
-            if (info.supports(feature))
-                devs_filtered.push_back(info);
-        }
-    }
-
-    return devs_filtered;
-}
-*/
 
 vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
 {
@@ -264,3 +207,48 @@ void PrintTo(const Inverse &inverse, std::ostream *os)
         (*os) << "direct";
 }
 
+double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& ob2)
+{
+    double final_test_result = 0.0;
+    size_t sz1 = ob1.size();
+    size_t sz2 = ob2.size();
+
+    if(sz1 != sz2)
+    {
+        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+    }
+    else
+    {
+        if(sz1==0 && sz2==0)
+            return 0;
+        cv::Mat cpu_result(sz, CV_8UC1);
+        cpu_result.setTo(0);
+
+        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
+        {      
+            cv::Mat cpu_result_roi(cpu_result, *r);
+            cpu_result_roi.setTo(1);
+            cpu_result.copyTo(cpu_result);
+        }
+        int cpu_area = cv::countNonZero(cpu_result > 0);
+
+        cv::Mat gpu_result(sz, CV_8UC1);
+        gpu_result.setTo(0);
+        for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
+        {
+            cv::Mat gpu_result_roi(gpu_result, *r2);
+            gpu_result_roi.setTo(1);
+            gpu_result.copyTo(gpu_result);
+        }
+
+        cv::Mat result_;
+        multiply(cpu_result, gpu_result, result_);
+        int result = cv::countNonZero(result_ > 0);
+        if(cpu_area!=0 && result!=0)
+            final_test_result = 1.0 - (double)result/(double)cpu_area;
+        else if(cpu_area==0 && result!=0)
+            final_test_result = -1;
+    }
+    return final_test_result;
+}
+
diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp
index 42fa69384d..0b101ec50b 100644
--- a/modules/ocl/test/utility.hpp
+++ b/modules/ocl/test/utility.hpp
@@ -55,13 +55,12 @@ cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal =
 
 void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);
 
-//! return true if device supports specified feature and gpu module was built with support the feature.
-//bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
+// This function test if gpu_rst matches cpu_rst.
+// If the two vectors are not equal, it will return the difference in vector size
+// Else it will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
+// The smaller, the better matched
+double checkRectSimilarity(cv::Size sz, std::vector<cv::Rect>& ob1, std::vector<cv::Rect>& ob2);
 
-//! return all devices compatible with current gpu module build.
-//const std::vector<cv::ocl::DeviceInfo>& devices();
-//! return all devices compatible with current gpu module build which support specified feature.
-//std::vector<cv::ocl::DeviceInfo> devices(cv::gpu::FeatureSet feature);
 
 //! read image from testdata folder.
 cv::Mat readImage(const std::string &fileName, int flags = cv::IMREAD_COLOR);
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 02d7a6f620..191926ccb7 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -59,17 +59,17 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
 
     switch (src.type()) {
         case CV_8U:
-            parallel_for(cv::BlockedRange(0, src.rows),
+            parallel_for_(cv::Range(0, src.rows),
                 FastNlMeansDenoisingInvoker<uchar>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
-            parallel_for(cv::BlockedRange(0, src.rows),
+            parallel_for_(cv::Range(0, src.rows),
                 FastNlMeansDenoisingInvoker<cv::Vec2b>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
-            parallel_for(cv::BlockedRange(0, src.rows),
+            parallel_for_(cv::Range(0, src.rows),
                 FastNlMeansDenoisingInvoker<cv::Vec3b>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
@@ -159,19 +159,19 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
 
     switch (srcImgs[0].type()) {
         case CV_8U:
-            parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
                 FastNlMeansMultiDenoisingInvoker<uchar>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
-            parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
                 FastNlMeansMultiDenoisingInvoker<cv::Vec2b>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
-            parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
                 FastNlMeansMultiDenoisingInvoker<cv::Vec3b>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index c4f13826d2..8824f17c0d 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -55,12 +55,12 @@ using namespace std;
 using namespace cv;
 
 template <typename T>
-struct FastNlMeansDenoisingInvoker {
+struct FastNlMeansDenoisingInvoker : ParallelLoopBody {
     public:
         FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst,
             int template_window_size, int search_window_size, const float h);
 
-        void operator() (const BlockedRange& range) const;
+        void operator() (const Range& range) const;
 
     private:
         void operator= (const FastNlMeansDenoisingInvoker&);
@@ -156,9 +156,9 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
 }
 
 template <class T>
-void FastNlMeansDenoisingInvoker<T>::operator() (const BlockedRange& range) const {
-    int row_from = range.begin();
-    int row_to = range.end() - 1;
+void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const {
+    int row_from = range.start;
+    int row_to = range.end - 1;
 
     Array2d<int> dist_sums(search_window_size_, search_window_size_);
 
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 2ae5054e00..8b32eded18 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -55,13 +55,13 @@ using namespace std;
 using namespace cv;
 
 template <typename T>
-struct FastNlMeansMultiDenoisingInvoker {
+struct FastNlMeansMultiDenoisingInvoker : ParallelLoopBody {
     public:
         FastNlMeansMultiDenoisingInvoker(
             const std::vector<Mat>& srcImgs, int imgToDenoiseIndex, int temporalWindowSize,
             Mat& dst, int template_window_size, int search_window_size, const float h);
 
-        void operator() (const BlockedRange& range) const;
+        void operator() (const Range& range) const;
 
     private:
         void operator= (const FastNlMeansMultiDenoisingInvoker&);
@@ -175,9 +175,9 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
 }
 
 template <class T>
-void FastNlMeansMultiDenoisingInvoker<T>::operator() (const BlockedRange& range) const {
-    int row_from = range.begin();
-    int row_to = range.end() - 1;
+void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const {
+    int row_from = range.start;
+    int row_to = range.end - 1;
 
     Array3d<int> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
 
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index 9bab58c52f..d918cfff29 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -66,21 +66,17 @@ struct DistIdxPair
 };
 
 
-struct MatchPairsBody
+struct MatchPairsBody : ParallelLoopBody
 {
-    MatchPairsBody(const MatchPairsBody& other)
-            : matcher(other.matcher), features(other.features),
-              pairwise_matches(other.pairwise_matches), near_pairs(other.near_pairs) {}
-
     MatchPairsBody(FeaturesMatcher &_matcher, const vector<ImageFeatures> &_features,
                    vector<MatchesInfo> &_pairwise_matches, vector<pair<int,int> > &_near_pairs)
             : matcher(_matcher), features(_features),
               pairwise_matches(_pairwise_matches), near_pairs(_near_pairs) {}
 
-    void operator ()(const BlockedRange &r) const
+    void operator ()(const Range &r) const
     {
         const int num_images = static_cast<int>(features.size());
-        for (int i = r.begin(); i < r.end(); ++i)
+        for (int i = r.start; i < r.end; ++i)
         {
             int from = near_pairs[i].first;
             int to = near_pairs[i].second;
@@ -526,9 +522,9 @@ void FeaturesMatcher::operator ()(const vector<ImageFeatures> &features, vector<
     MatchPairsBody body(*this, features, pairwise_matches, near_pairs);
 
     if (is_thread_safe_)
-        parallel_for(BlockedRange(0, static_cast<int>(near_pairs.size())), body);
+        parallel_for_(Range(0, static_cast<int>(near_pairs.size())), body);
     else
-        body(BlockedRange(0, static_cast<int>(near_pairs.size())));
+        body(Range(0, static_cast<int>(near_pairs.size())));
     LOGLN_CHAT("");
 }
 
diff --git a/modules/stitching/src/motion_estimators.cpp b/modules/stitching/src/motion_estimators.cpp
index ab27a46a2a..c873bc721a 100644
--- a/modules/stitching/src/motion_estimators.cpp
+++ b/modules/stitching/src/motion_estimators.cpp
@@ -69,13 +69,13 @@ struct CalcRotation
         K_from(0,0) = cameras[edge.from].focal;
         K_from(1,1) = cameras[edge.from].focal * cameras[edge.from].aspect;
         K_from(0,2) = cameras[edge.from].ppx;
-        K_from(0,2) = cameras[edge.from].ppy;
+        K_from(1,2) = cameras[edge.from].ppy;
 
         Mat_<double> K_to = Mat::eye(3, 3, CV_64F);
         K_to(0,0) = cameras[edge.to].focal;
         K_to(1,1) = cameras[edge.to].focal * cameras[edge.to].aspect;
         K_to(0,2) = cameras[edge.to].ppx;
-        K_to(0,2) = cameras[edge.to].ppy;
+        K_to(1,2) = cameras[edge.to].ppy;
 
         Mat R = K_from.inv() * pairwise_matches[pair_idx].H.inv() * K_to;
         cameras[edge.to].R = cameras[edge.from].R * R;
diff --git a/modules/superres/perf/perf_main.cpp b/modules/superres/perf/perf_main.cpp
index adc69e6e8b..0a8ab5deaa 100644
--- a/modules/superres/perf/perf_main.cpp
+++ b/modules/superres/perf/perf_main.cpp
@@ -44,4 +44,11 @@
 
 using namespace perf;
 
-CV_PERF_TEST_MAIN(superres, printCudaInfo())
+static const char * impls[] = {
+#ifdef HAVE_CUDA
+    "cuda",
+#endif
+    "plain"
+};
+
+CV_PERF_TEST_MAIN_WITH_IMPLS(superres, impls, printCudaInfo())
diff --git a/modules/ts/include/opencv2/ts/ts_gtest.h b/modules/ts/include/opencv2/ts/ts_gtest.h
index 2d1227ecdc..80b410bb3c 100644
--- a/modules/ts/include/opencv2/ts/ts_gtest.h
+++ b/modules/ts/include/opencv2/ts/ts_gtest.h
@@ -17566,6 +17566,9 @@ GTEST_DECLARE_string_(color);
 // the tests to run. If the filter is not given all tests are executed.
 GTEST_DECLARE_string_(filter);
 
+// OpenCV extension: same as filter, but for the parameters string.
+GTEST_DECLARE_string_(param_filter);
+
 // This flag causes the Google Test to list tests. None of the tests listed
 // are actually run if the flag is provided.
 GTEST_DECLARE_bool_(list_tests);
diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp b/modules/ts/include/opencv2/ts/ts_perf.hpp
index fe57655157..1e68cd49b0 100644
--- a/modules/ts/include/opencv2/ts/ts_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ts_perf.hpp
@@ -210,18 +210,13 @@ private:
 #define SANITY_CHECK_KEYPOINTS(array, ...) ::perf::Regression::addKeypoints(this, #array, array , ## __VA_ARGS__)
 #define SANITY_CHECK_MATCHES(array, ...) ::perf::Regression::addMatches(this, #array, array , ## __VA_ARGS__)
 
-#ifdef HAVE_CUDA
 class CV_EXPORTS GpuPerf
 {
 public:
   static bool targetDevice();
 };
 
-# define PERF_RUN_GPU()  ::perf::GpuPerf::targetDevice()
-#else
-# define PERF_RUN_GPU()  false
-#endif
-
+#define PERF_RUN_GPU()  ::perf::GpuPerf::targetDevice()
 
 /*****************************************************************************************\
 *                            Container for performance metrics                            *
@@ -263,7 +258,11 @@ public:
     TestBase();
 
     static void Init(int argc, const char* const argv[]);
+    static void Init(const std::vector<std::string> & availableImpls,
+                     int argc, const char* const argv[]);
+    static void RecordRunParameters();
     static std::string getDataPath(const std::string& relativePath);
+    static std::string getSelectedImpl();
 
 protected:
     virtual void PerfTestBody() = 0;
@@ -477,15 +476,29 @@ CV_EXPORTS void PrintTo(const Size& sz, ::std::ostream* os);
     void fixture##_##name::PerfTestBody()
 
 
-#define CV_PERF_TEST_MAIN(testsuitname, ...) \
-int main(int argc, char **argv)\
-{\
+#define CV_PERF_TEST_MAIN_INTERNALS(modulename, impls, ...) \
     while (++argc >= (--argc,-1)) {__VA_ARGS__; break;} /*this ugly construction is needed for VS 2005*/\
-    ::perf::Regression::Init(#testsuitname);\
-    ::perf::TestBase::Init(argc, argv);\
+    ::perf::Regression::Init(#modulename);\
+    ::perf::TestBase::Init(std::vector<std::string>(impls, impls + sizeof impls / sizeof *impls),\
+                           argc, argv);\
     ::testing::InitGoogleTest(&argc, argv);\
     cvtest::printVersionInfo();\
-    return RUN_ALL_TESTS();\
+    ::testing::Test::RecordProperty("cv_module_name", #modulename);\
+    ::perf::TestBase::RecordRunParameters();\
+    return RUN_ALL_TESTS();
+
+// impls must be an array, not a pointer; "plain" should always be one of the implementations
+#define CV_PERF_TEST_MAIN_WITH_IMPLS(modulename, impls, ...) \
+int main(int argc, char **argv)\
+{\
+    CV_PERF_TEST_MAIN_INTERNALS(modulename, impls, __VA_ARGS__)\
+}
+
+#define CV_PERF_TEST_MAIN(modulename, ...) \
+int main(int argc, char **argv)\
+{\
+    const char * plain_only[] = { "plain" };\
+    CV_PERF_TEST_MAIN_INTERNALS(modulename, plain_only, __VA_ARGS__)\
 }
 
 #define TEST_CYCLE_N(n) for(declare.iterations(n); startTimer(), next(); stopTimer())
diff --git a/modules/ts/misc/run.py b/modules/ts/misc/run.py
index 4351713715..a64127f0d4 100755
--- a/modules/ts/misc/run.py
+++ b/modules/ts/misc/run.py
@@ -288,6 +288,16 @@ class TestSuite(object):
             if self.adb:
                 # construct name for aapt tool
                 self.aapt = [os.path.join(os.path.dirname(self.adb[0]), ("aapt","aapt.exe")[hostos == 'nt'])]
+                if not os.path.isfile(self.aapt[0]):
+                    # it's moved in SDK r22
+                    sdk_dir = os.path.dirname( os.path.dirname(self.adb[0]) )
+                    aapt_fn = ("aapt", "aapt.exe")[hostos == 'nt']
+                    for r, ds, fs in os.walk( os.path.join(sdk_dir, 'build-tools') ):
+                        if aapt_fn in fs:
+                            self.aapt = [ os.path.join(r, aapt_fn) ]
+                            break
+                    else:
+                        self.error = "Can't find '%s' tool!" % aapt_fn
 
         # fix has_perf_tests param
         self.has_perf_tests = self.has_perf_tests == "ON"
diff --git a/modules/ts/misc/testlog_parser.py b/modules/ts/misc/testlog_parser.py
index 7ae6aa5980..5d478645b2 100755
--- a/modules/ts/misc/testlog_parser.py
+++ b/modules/ts/misc/testlog_parser.py
@@ -1,6 +1,9 @@
 #!/usr/bin/env python
 
-import sys, re, os.path
+import collections
+import re
+import os.path
+import sys
 from xml.dom.minidom import parse
 
 class TestInfo(object):
@@ -100,34 +103,39 @@ class TestInfo(object):
     def dump(self, units="ms"):
         print "%s ->\t\033[1;31m%s\033[0m = \t%.2f%s" % (str(self), self.status, self.get("gmean", units), units)
 
-    def shortName(self):
+
+    def getName(self):
         pos = self.name.find("/")
         if pos > 0:
-            name = self.name[:pos]
-        else:
-            name = self.name
-        if self.fixture.endswith(name):
-            fixture = self.fixture[:-len(name)]
+            return self.name[:pos]
+        return self.name
+
+
+    def getFixture(self):
+        if self.fixture.endswith(self.getName()):
+            fixture = self.fixture[:-len(self.getName())]
         else:
             fixture = self.fixture
         if fixture.endswith("_"):
             fixture = fixture[:-1]
+        return fixture
+
+
+    def param(self):
+        return '::'.join(filter(None, [self.type_param, self.value_param]))
+
+    def shortName(self):
+        name = self.getName()
+        fixture = self.getFixture()
         return '::'.join(filter(None, [name, fixture]))
 
+
     def __str__(self):
-        pos = self.name.find("/")
-        if pos > 0:
-            name = self.name[:pos]
-        else:
-            name = self.name
-        if self.fixture.endswith(name):
-            fixture = self.fixture[:-len(name)]
-        else:
-            fixture = self.fixture
-        if fixture.endswith("_"):
-            fixture = fixture[:-1]
+        name = self.getName()
+        fixture = self.getFixture()
         return '::'.join(filter(None, [name, fixture, self.type_param, self.value_param]))
 
+
     def __cmp__(self, other):
         r = cmp(self.fixture, other.fixture);
         if r != 0:
@@ -154,12 +162,31 @@ class TestInfo(object):
                 return 1
         return 0
 
+# This is a Sequence for compatibility with old scripts,
+# which treat parseLogFile's return value as a list.
+class TestRunInfo(collections.Sequence):
+    def __init__(self, properties, tests):
+        self.properties = properties
+        self.tests = tests
+
+    def __len__(self):
+        return len(self.tests)
+
+    def __getitem__(self, key):
+        return self.tests[key]
+
 def parseLogFile(filename):
-    tests = []
     log = parse(filename)
-    for case in log.getElementsByTagName("testcase"):
-        tests.append(TestInfo(case))
-    return tests
+
+    properties = {
+        attr_name[3:]: attr_value
+        for (attr_name, attr_value) in log.documentElement.attributes.items()
+        if attr_name.startswith('cv_')
+    }
+
+    tests = map(TestInfo, log.getElementsByTagName("testcase"))
+
+    return TestRunInfo(properties, tests)
 
 
 if __name__ == "__main__":
@@ -168,8 +195,18 @@ if __name__ == "__main__":
         exit(0)
 
     for arg in sys.argv[1:]:
-        print "Tests found in", arg
-        tests = parseLogFile(arg)
-        for t in sorted(tests):
+        print "Processing {}...".format(arg)
+
+        run = parseLogFile(arg)
+
+        print "Properties:"
+
+        for (prop_name, prop_value) in run.properties.items():
+          print "\t{} = {}".format(prop_name, prop_value)
+
+        print "Tests:"
+
+        for t in sorted(run.tests):
             t.dump()
+
         print
diff --git a/modules/ts/misc/xls-report.py b/modules/ts/misc/xls-report.py
new file mode 100755
index 0000000000..e911314e92
--- /dev/null
+++ b/modules/ts/misc/xls-report.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python
+
+"""
+    This script can generate XLS reports from OpenCV tests' XML output files.
+
+    To use it, first, create a directory for each machine you ran tests on.
+    Each such directory will become a sheet in the report. Put each XML file
+    into the corresponding directory.
+
+    Then, create your configuration file(s). You can have a global configuration
+    file (specified with the -c option), and per-sheet configuration files, which
+    must be called sheet.conf and placed in the directory corresponding to the sheet.
+    The settings in the per-sheet configuration file will override those in the
+    global configuration file, if both are present.
+
+    A configuration file must consist of a Python dictionary. The following keys
+    will be recognized:
+
+    * 'comparisons': [{'from': string, 'to': string}]
+        List of configurations to compare performance between. For each item,
+        the sheet will have a column showing speedup from configuration named
+        'from' to configuration named "to".
+
+    * 'configuration_matchers': [{'properties': {string: object}, 'name': string}]
+        Instructions for matching test run property sets to configuration names.
+
+        For each found XML file:
+
+        1) All attributes of the root element starting with the prefix 'cv_' are
+           placed in a dictionary, with the cv_ prefix stripped and the cv_module_name
+           element deleted.
+
+        2) The first matcher for which the XML's file property set contains the same
+           keys with equal values as its 'properties' dictionary is searched for.
+           A missing property can be matched by using None as the value.
+
+           Corollary 1: you should place more specific matchers before less specific
+           ones.
+
+           Corollary 2: an empty 'properties' dictionary matches every property set.
+
+        3) If a matching matcher is found, its 'name' string is presumed to be the name
+           of the configuration the XML file corresponds to. Otherwise, a warning is
+           printed. A warning is also printed if two different property sets match to the
+           same configuration name.
+
+    * 'configurations': [string]
+        List of names for compile-time and runtime configurations of OpenCV.
+        Each item will correspond to a column of the sheet.
+
+    * 'module_colors': {string: string}
+        Mapping from module name to color name. In the sheet, cells containing module
+        names from this mapping will be colored with the corresponding color. You can
+        find the list of available colors here:
+        <http://www.simplistix.co.uk/presentations/python-excel.pdf>.
+
+    * 'sheet_name': string
+        Name for the sheet. If this parameter is missing, the name of sheet's directory
+        will be used.
+
+    Note that all keys are optional, although to get useful results, you'll want to
+    specify at least 'configurations' and 'configuration_matchers'.
+
+    Finally, run the script. Use the --help option for usage information.
+"""
+
+from __future__ import division
+
+import ast
+import fnmatch
+import logging
+import numbers
+import os, os.path
+import re
+
+from argparse import ArgumentParser
+from collections import OrderedDict
+from glob import glob
+from itertools import ifilter
+
+import xlwt
+
+from testlog_parser import parseLogFile
+
+re_image_size = re.compile(r'^ \d+ x \d+$', re.VERBOSE)
+re_data_type = re.compile(r'^ (?: 8 | 16 | 32 | 64 ) [USF] C [1234] $', re.VERBOSE)
+
+time_style = xlwt.easyxf(num_format_str='#0.00')
+no_time_style = xlwt.easyxf('pattern: pattern solid, fore_color gray25')
+
+speedup_style = time_style
+good_speedup_style = xlwt.easyxf('font: color green', num_format_str='#0.00')
+bad_speedup_style = xlwt.easyxf('font: color red', num_format_str='#0.00')
+no_speedup_style = no_time_style
+error_speedup_style = xlwt.easyxf('pattern: pattern solid, fore_color orange')
+header_style = xlwt.easyxf('font: bold true; alignment: horizontal centre, vertical top, wrap True')
+
+class Collector(object):
+    def __init__(self, config_match_func):
+        self.__config_cache = {}
+        self.config_match_func = config_match_func
+        self.tests = {}
+
+    # Format a sorted sequence of pairs as if it was a dictionary.
+    # We can't just use a dictionary instead, since we want to preserve the sorted order of the keys.
+    @staticmethod
+    def __format_config_cache_key(pairs):
+        return '{' + ', '.join(repr(k) + ': ' + repr(v) for (k, v) in pairs) + '}'
+
+    def collect_from(self, xml_path):
+        run = parseLogFile(xml_path)
+
+        module = run.properties['module_name']
+
+        properties = run.properties.copy()
+        del properties['module_name']
+
+        props_key = tuple(sorted(properties.iteritems())) # dicts can't be keys
+
+        if props_key in self.__config_cache:
+            configuration = self.__config_cache[props_key]
+        else:
+            configuration = self.config_match_func(properties)
+
+            if configuration is None:
+                logging.warning('failed to match properties to a configuration: %s',
+                    Collector.__format_config_cache_key(props_key))
+            else:
+                same_config_props = [it[0] for it in self.__config_cache.iteritems() if it[1] == configuration]
+                if len(same_config_props) > 0:
+                    logging.warning('property set %s matches the same configuration %r as property set %s',
+                        Collector.__format_config_cache_key(props_key),
+                        configuration,
+                        Collector.__format_config_cache_key(same_config_props[0]))
+
+            self.__config_cache[props_key] = configuration
+
+        if configuration is None: return
+
+        module_tests = self.tests.setdefault(module, OrderedDict())
+
+        for test in run.tests:
+            test_results = module_tests.setdefault((test.shortName(), test.param()), {})
+            test_results[configuration] = test.get("gmean") if test.status == 'run' else test.status
+
+def make_match_func(matchers):
+    def match_func(properties):
+        for matcher in matchers:
+            if all(properties.get(name) == value
+                   for (name, value) in matcher['properties'].iteritems()):
+                return matcher['name']
+
+        return None
+
+    return match_func
+
+def main():
+    arg_parser = ArgumentParser(description='Build an XLS performance report.')
+    arg_parser.add_argument('sheet_dirs', nargs='+', metavar='DIR', help='directory containing perf test logs')
+    arg_parser.add_argument('-o', '--output', metavar='XLS', default='report.xls', help='name of output file')
+    arg_parser.add_argument('-c', '--config', metavar='CONF', help='global configuration file')
+
+    args = arg_parser.parse_args()
+
+    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)
+
+    if args.config is not None:
+        with open(args.config) as global_conf_file:
+            global_conf = ast.literal_eval(global_conf_file.read())
+    else:
+        global_conf = {}
+
+    wb = xlwt.Workbook()
+
+    for sheet_path in args.sheet_dirs:
+        try:
+            with open(os.path.join(sheet_path, 'sheet.conf')) as sheet_conf_file:
+                sheet_conf = ast.literal_eval(sheet_conf_file.read())
+        except Exception:
+            sheet_conf = {}
+            logging.debug('no sheet.conf for %s', sheet_path)
+
+        sheet_conf = dict(global_conf.items() + sheet_conf.items())
+
+        config_names = sheet_conf.get('configurations', [])
+        config_matchers = sheet_conf.get('configuration_matchers', [])
+
+        collector = Collector(make_match_func(config_matchers))
+
+        for root, _, filenames in os.walk(sheet_path):
+            logging.info('looking in %s', root)
+            for filename in fnmatch.filter(filenames, '*.xml'):
+                collector.collect_from(os.path.join(root, filename))
+
+        sheet = wb.add_sheet(sheet_conf.get('sheet_name', os.path.basename(os.path.abspath(sheet_path))))
+
+        sheet.row(0).height = 800
+        sheet.panes_frozen = True
+        sheet.remove_splits = True
+        sheet.horz_split_pos = 1
+        sheet.horz_split_first_visible = 1
+
+        sheet_comparisons = sheet_conf.get('comparisons', [])
+
+        for i, w in enumerate([2000, 15000, 2500, 2000, 15000]
+                + (len(config_names) + 1 + len(sheet_comparisons)) * [3000]):
+            sheet.col(i).width = w
+
+        for i, caption in enumerate(['Module', 'Test', 'Image\nsize', 'Data\ntype', 'Parameters']
+                + config_names + [None]
+                + [comp['to'] + '\nvs\n' + comp['from'] for comp in sheet_comparisons]):
+            sheet.row(0).write(i, caption, header_style)
+
+        row = 1
+
+        module_colors = sheet_conf.get('module_colors', {})
+        module_styles = {module: xlwt.easyxf('pattern: pattern solid, fore_color {}'.format(color))
+                         for module, color in module_colors.iteritems()}
+
+        for module, tests in sorted(collector.tests.iteritems()):
+            for ((test, param), configs) in tests.iteritems():
+                sheet.write(row, 0, module, module_styles.get(module, xlwt.Style.default_style))
+                sheet.write(row, 1, test)
+
+                param_list = param[1:-1].split(", ")
+                sheet.write(row, 2, next(ifilter(re_image_size.match, param_list), None))
+                sheet.write(row, 3, next(ifilter(re_data_type.match, param_list), None))
+
+                sheet.row(row).write(4, param)
+                for i, c in enumerate(config_names):
+                    if c in configs:
+                        sheet.write(row, 5 + i, configs[c], time_style)
+                    else:
+                        sheet.write(row, 5 + i, None, no_time_style)
+
+                for i, comp in enumerate(sheet_comparisons):
+                    cmp_from = configs.get(comp["from"])
+                    cmp_to = configs.get(comp["to"])
+                    col = 5 + len(config_names) + 1 + i
+
+                    if isinstance(cmp_from, numbers.Number) and isinstance(cmp_to, numbers.Number):
+                        try:
+                            speedup = cmp_from / cmp_to
+                            sheet.write(row, col, speedup, good_speedup_style if speedup > 1.1 else
+                                                           bad_speedup_style  if speedup < 0.9 else
+                                                           speedup_style)
+                        except ArithmeticError as e:
+                            sheet.write(row, col, None, error_speedup_style)
+                    else:
+                        sheet.write(row, col, None, no_speedup_style)
+
+                row += 1
+                if row % 1000 == 0: sheet.flush_row_data()
+
+    wb.save(args.output)
+
+if __name__ == '__main__':
+    main()
diff --git a/modules/ts/src/precomp.hpp b/modules/ts/src/precomp.hpp
index 10acd7ad8f..a74417da47 100644
--- a/modules/ts/src/precomp.hpp
+++ b/modules/ts/src/precomp.hpp
@@ -1,4 +1,5 @@
 #include "opencv2/core/core_c.h"
+#include "opencv2/core/internal.hpp"
 #include "opencv2/ts/ts.hpp"
 
 #ifdef GTEST_LINKED_AS_SHARED_LIBRARY
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 1d636e6746..38a23706dd 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -2,6 +2,10 @@
 #include <float.h>
 #include <limits.h>
 
+#ifdef HAVE_TEGRA_OPTIMIZATION
+#include "tegra.hpp"
+#endif
+
 using namespace cv;
 
 namespace cvtest
@@ -2936,28 +2940,75 @@ MatComparator::operator()(const char* expr1, const char* expr2,
 
 void printVersionInfo(bool useStdOut)
 {
-    ::testing::Test::RecordProperty("CV_VERSION", CV_VERSION);
+    ::testing::Test::RecordProperty("cv_version", CV_VERSION);
     if(useStdOut) std::cout << "OpenCV version: " << CV_VERSION << std::endl;
 
     std::string buildInfo( cv::getBuildInformation() );
 
     size_t pos1 = buildInfo.find("Version control");
-    size_t pos2 = buildInfo.find("\n", pos1);\
+    size_t pos2 = buildInfo.find('\n', pos1);
     if(pos1 != std::string::npos && pos2 != std::string::npos)
     {
-        std::string ver( buildInfo.substr(pos1, pos2-pos1) );
-        ::testing::Test::RecordProperty("Version_control", ver);
-        if(useStdOut) std::cout << ver << std::endl;
+        size_t value_start = buildInfo.rfind(' ', pos2) + 1;
+        std::string ver( buildInfo.substr(value_start, pos2 - value_start) );
+        ::testing::Test::RecordProperty("cv_vcs_version", ver);
+        if (useStdOut) std::cout << "OpenCV VCS version: " << ver << std::endl;
     }
 
     pos1 = buildInfo.find("inner version");
-    pos2 = buildInfo.find("\n", pos1);\
+    pos2 = buildInfo.find('\n', pos1);
     if(pos1 != std::string::npos && pos2 != std::string::npos)
     {
-        std::string ver( buildInfo.substr(pos1, pos2-pos1) );
-        ::testing::Test::RecordProperty("inner_version", ver);
-        if(useStdOut) std::cout << ver << std::endl;
+        size_t value_start = buildInfo.rfind(' ', pos2) + 1;
+        std::string ver( buildInfo.substr(value_start, pos2 - value_start) );
+        ::testing::Test::RecordProperty("cv_inner_vcs_version", ver);
+        if(useStdOut) std::cout << "Inner VCS version: " << ver << std::endl;
     }
+
+    const char* parallel_framework = currentParallelFramework();
+
+    if (parallel_framework) {
+        ::testing::Test::RecordProperty("cv_parallel_framework", parallel_framework);
+        if (useStdOut) std::cout << "Parallel framework: " << parallel_framework << std::endl;
+    }
+
+    std::string cpu_features;
+
+#if CV_SSE
+    if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
+#endif
+#if CV_SSE2
+    if (checkHardwareSupport(CV_CPU_SSE2)) cpu_features += " sse2";
+#endif
+#if CV_SSE3
+    if (checkHardwareSupport(CV_CPU_SSE3)) cpu_features += " sse3";
+#endif
+#if CV_SSSE3
+    if (checkHardwareSupport(CV_CPU_SSSE3)) cpu_features += " ssse3";
+#endif
+#if CV_SSE4_1
+    if (checkHardwareSupport(CV_CPU_SSE4_1)) cpu_features += " sse4.1";
+#endif
+#if CV_SSE4_2
+    if (checkHardwareSupport(CV_CPU_SSE4_2)) cpu_features += " sse4.2";
+#endif
+#if CV_AVX
+    if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
+#endif
+#if CV_NEON
+    cpu_features += " neon"; // NEON is currently not checked at runtime
+#endif
+
+    cpu_features.erase(0, 1); // erase initial space
+
+    ::testing::Test::RecordProperty("cv_cpu_features", cpu_features);
+    if (useStdOut) std::cout << "CPU features: " << cpu_features << std::endl;
+
+#ifdef HAVE_TEGRA_OPTIMIZATION
+    const char * tegra_optimization = tegra::isDeviceSupported() ? "enabled" : "disabled";
+    ::testing::Test::RecordProperty("cv_tegra_optimization", tegra_optimization);
+    if (useStdOut) std::cout << "Tegra optimization: " << tegra_optimization << std::endl;
+#endif
 }
 
 } //namespace cvtest
diff --git a/modules/ts/src/ts_gtest.cpp b/modules/ts/src/ts_gtest.cpp
index 7c388cbd4a..48870913c3 100644
--- a/modules/ts/src/ts_gtest.cpp
+++ b/modules/ts/src/ts_gtest.cpp
@@ -497,6 +497,7 @@ const char kBreakOnFailureFlag[] = "break_on_failure";
 const char kCatchExceptionsFlag[] = "catch_exceptions";
 const char kColorFlag[] = "color";
 const char kFilterFlag[] = "filter";
+const char kParamFilterFlag[] = "param_filter";
 const char kListTestsFlag[] = "list_tests";
 const char kOutputFlag[] = "output";
 const char kPrintTimeFlag[] = "print_time";
@@ -575,6 +576,7 @@ class GTestFlagSaver {
     death_test_style_ = GTEST_FLAG(death_test_style);
     death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
     filter_ = GTEST_FLAG(filter);
+    param_filter_ = GTEST_FLAG(param_filter);
     internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
     list_tests_ = GTEST_FLAG(list_tests);
     output_ = GTEST_FLAG(output);
@@ -596,6 +598,7 @@ class GTestFlagSaver {
     GTEST_FLAG(death_test_style) = death_test_style_;
     GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
     GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(param_filter) = param_filter_;
     GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
     GTEST_FLAG(list_tests) = list_tests_;
     GTEST_FLAG(output) = output_;
@@ -617,6 +620,7 @@ class GTestFlagSaver {
   std::string death_test_style_;
   bool death_test_use_fork_;
   std::string filter_;
+  std::string param_filter_;
   std::string internal_run_death_test_;
   bool list_tests_;
   std::string output_;
@@ -1699,6 +1703,12 @@ GTEST_DEFINE_string_(
     "exclude).  A test is run if it matches one of the positive "
     "patterns and does not match any of the negative patterns.");
 
+GTEST_DEFINE_string_(
+    param_filter,
+    internal::StringFromGTestEnv("param_filter", kUniversalFilter),
+    "Same syntax and semantics as for param, but these patterns "
+    "have to match the test's parameters.");
+
 GTEST_DEFINE_bool_(list_tests, false,
                    "List all tests without running them.");
 
@@ -4188,6 +4198,14 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart(
                   "Note: %s filter = %s\n", GTEST_NAME_, filter);
   }
 
+  const char* const param_filter = GTEST_FLAG(param_filter).c_str();
+
+  // Ditto.
+  if (!String::CStringEquals(param_filter, kUniversalFilter)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: %s parameter filter = %s\n", GTEST_NAME_, param_filter);
+  }
+
   if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
     const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
     ColoredPrintf(COLOR_YELLOW,
@@ -5873,9 +5891,15 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
                                                    kDisableTestFilter);
       test_info->is_disabled_ = is_disabled;
 
+      const std::string value_param(test_info->value_param() == NULL ?
+                                    "" : test_info->value_param());
+
       const bool matches_filter =
           internal::UnitTestOptions::FilterMatchesTest(test_case_name,
-                                                       test_name);
+                                                       test_name) &&
+          internal::UnitTestOptions::MatchesFilter(value_param,
+                                                   GTEST_FLAG(param_filter).c_str());
+
       test_info->matches_filter_ = matches_filter;
 
       const bool is_runnable =
@@ -6223,6 +6247,12 @@ static const char kColorEncodedHelpMessage[] =
 "      Run only the tests whose name matches one of the positive patterns but\n"
 "      none of the negative patterns. '?' matches any single character; '*'\n"
 "      matches any substring; ':' separates two patterns.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "param_filter=@YPOSITIVE_PATTERNS"
+    "[@G-@YNEGATIVE_PATTERNS]@D\n"
+"      Like @G--" GTEST_FLAG_PREFIX_
+                      "filter@D, but applies to the test's parameter. If a\n"
+"      test is not parameterized, its parameter is considered to be the\n"
+"      empty string.\n"
 "  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
 "      Run all disabled tests too.\n"
 "\n"
@@ -6300,6 +6330,7 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
         ParseBoolFlag(arg, kDeathTestUseFork,
                       &GTEST_FLAG(death_test_use_fork)) ||
         ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+        ParseStringFlag(arg, kParamFilterFlag, &GTEST_FLAG(param_filter)) ||
         ParseStringFlag(arg, kInternalRunDeathTestFlag,
                         &GTEST_FLAG(internal_run_death_test)) ||
         ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp
index c375e7c388..c2c1ee6bd2 100644
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@@ -14,30 +14,10 @@ int64 TestBase::timeLimitDefault = 0;
 unsigned int TestBase::iterationsLimitDefault = (unsigned int)(-1);
 int64 TestBase::_timeadjustment = 0;
 
-const std::string command_line_keys =
-    "{   |perf_max_outliers   |8        |percent of allowed outliers}"
-    "{   |perf_min_samples    |10       |minimal required numer of samples}"
-    "{   |perf_force_samples  |100      |force set maximum number of samples for all tests}"
-    "{   |perf_seed           |809564   |seed for random numbers generator}"
-    "{   |perf_threads        |-1       |the number of worker threads, if parallel execution is enabled}"
-    "{   |perf_write_sanity   |false    |create new records for sanity checks}"
-    "{   |perf_verify_sanity  |false    |fail tests having no regression data for sanity checks}"
-#ifdef ANDROID
-    "{   |perf_time_limit     |6.0      |default time limit for a single test (in seconds)}"
-    "{   |perf_affinity_mask  |0        |set affinity mask for the main thread}"
-    "{   |perf_log_power_checkpoints  | |additional xml logging for power measurement}"
-#else
-    "{   |perf_time_limit     |3.0      |default time limit for a single test (in seconds)}"
-#endif
-    "{   |perf_max_deviation  |1.0      |}"
-    "{h  |help                |false    |print help info}"
-#ifdef HAVE_CUDA
-    "{   |perf_run_cpu        |false    |run GPU performance tests for analogical CPU functions}"
-    "{   |perf_cuda_device    |0        |run GPU test suite onto specific CUDA capable device}"
-    "{   |perf_cuda_info_only |false    |print an information about system and an available CUDA devices and then exit.}"
-#endif
-;
+// Item [0] will be considered the default implementation.
+static std::vector<std::string> available_impls;
 
+static std::string  param_impl;
 static double       param_max_outliers;
 static double       param_max_deviation;
 static unsigned int param_min_samples;
@@ -48,7 +28,6 @@ static int          param_threads;
 static bool         param_write_sanity;
 static bool         param_verify_sanity;
 #ifdef HAVE_CUDA
-static bool         param_run_cpu;
 static int          param_cuda_device;
 #endif
 
@@ -577,11 +556,12 @@ Regression& Regression::operator() (const std::string& name, cv::InputArray arra
 
     std::string nodename = getCurrentTestNodeName();
 
-#ifdef HAVE_CUDA
-    static const std::string prefix = (param_run_cpu)? "CPU_" : "GPU_";
+    // This is a hack for compatibility and it should eventually get removed.
+    // gpu's tests don't even have CPU sanity data anymore.
     if(suiteName == "gpu")
-        nodename = prefix + nodename;
-#endif
+    {
+        nodename = (PERF_RUN_GPU() ? "GPU_" : "CPU_") + nodename;
+    }
 
     cv::FileNode n = rootIn[nodename];
     if(n.isNone())
@@ -646,6 +626,43 @@ performance_metrics::performance_metrics()
 
 void TestBase::Init(int argc, const char* const argv[])
 {
+    std::vector<std::string> plain_only;
+    plain_only.push_back("plain");
+    TestBase::Init(plain_only, argc, argv);
+}
+
+void TestBase::Init(const std::vector<std::string> & availableImpls,
+                 int argc, const char* const argv[])
+{
+    available_impls = availableImpls;
+
+    const std::string command_line_keys =
+        "{   |perf_max_outliers           |8        |percent of allowed outliers}"
+        "{   |perf_min_samples            |10       |minimal required numer of samples}"
+        "{   |perf_force_samples          |100      |force set maximum number of samples for all tests}"
+        "{   |perf_seed                   |809564   |seed for random numbers generator}"
+        "{   |perf_threads                |-1       |the number of worker threads, if parallel execution is enabled}"
+        "{   |perf_write_sanity           |false    |create new records for sanity checks}"
+        "{   |perf_verify_sanity          |false    |fail tests having no regression data for sanity checks}"
+        "{   |perf_impl                   |" + available_impls[0] +
+                                                   "|the implementation variant of functions under test}"
+        "{   |perf_list_impls             |false    |list available implementation variants and exit}"
+        "{   |perf_run_cpu                |false    |deprecated, equivalent to --perf_impl=plain}"
+#ifdef ANDROID
+        "{   |perf_time_limit             |6.0      |default time limit for a single test (in seconds)}"
+        "{   |perf_affinity_mask          |0        |set affinity mask for the main thread}"
+        "{   |perf_log_power_checkpoints  |         |additional xml logging for power measurement}"
+#else
+        "{   |perf_time_limit             |3.0      |default time limit for a single test (in seconds)}"
+#endif
+        "{   |perf_max_deviation          |1.0      |}"
+        "{h  |help                        |false    |print help info}"
+#ifdef HAVE_CUDA
+        "{   |perf_cuda_device            |0        |run GPU test suite onto specific CUDA capable device}"
+        "{   |perf_cuda_info_only         |false    |print an information about system and an available CUDA devices and then exit.}"
+#endif
+    ;
+
     cv::CommandLineParser args(argc, argv, command_line_keys.c_str());
     if (args.get<bool>("help"))
     {
@@ -656,6 +673,7 @@ void TestBase::Init(int argc, const char* const argv[])
 
     ::testing::AddGlobalTestEnvironment(new PerfEnvironment);
 
+    param_impl          = args.get<bool>("perf_run_cpu") ? "plain" : args.get<std::string>("perf_impl");
     param_max_outliers  = std::min(100., std::max(0., args.get<double>("perf_max_outliers")));
     param_min_samples   = std::max(1u, args.get<unsigned int>("perf_min_samples"));
     param_max_deviation = std::max(0., args.get<double>("perf_max_deviation"));
@@ -670,19 +688,41 @@ void TestBase::Init(int argc, const char* const argv[])
     log_power_checkpoints = args.get<bool>("perf_log_power_checkpoints");
 #endif
 
+    bool param_list_impls = args.get<bool>("perf_list_impls");
+
+    if (param_list_impls)
+    {
+        fputs("Available implementation variants:", stdout);
+        for (size_t i = 0; i < available_impls.size(); ++i) {
+            putchar(' ');
+            fputs(available_impls[i].c_str(), stdout);
+        }
+        putchar('\n');
+        exit(0);
+    }
+
+    if (std::find(available_impls.begin(), available_impls.end(), param_impl) == available_impls.end())
+    {
+        printf("No such implementation: %s\n", param_impl.c_str());
+        exit(1);
+    }
+
 #ifdef HAVE_CUDA
 
     bool printOnly        = args.get<bool>("perf_cuda_info_only");
 
     if (printOnly)
         exit(0);
+#endif
+
+    if (available_impls.size() > 1)
+        printf("[----------]\n[   INFO   ] \tImplementation variant: %s.\n[----------]\n", param_impl.c_str()), fflush(stdout);
+
+#ifdef HAVE_CUDA
 
-    param_run_cpu         = args.get<bool>("perf_run_cpu");
     param_cuda_device      = std::max(0, std::min(cv::gpu::getCudaEnabledDeviceCount(), args.get<int>("perf_cuda_device")));
 
-    if (param_run_cpu)
-        printf("[----------]\n[ GPU INFO ] \tRun test suite on CPU.\n[----------]\n"), fflush(stdout);
-    else
+    if (param_impl == "cuda")
     {
         cv::gpu::DeviceInfo info(param_cuda_device);
         if (!info.isCompatible())
@@ -708,6 +748,18 @@ void TestBase::Init(int argc, const char* const argv[])
     _timeadjustment = _calibrate();
 }
 
+void TestBase::RecordRunParameters()
+{
+    ::testing::Test::RecordProperty("cv_implementation", param_impl);
+    ::testing::Test::RecordProperty("cv_num_threads", param_threads);
+}
+
+std::string TestBase::getSelectedImpl()
+{
+    return param_impl;
+}
+
+
 int64 TestBase::_calibrate()
 {
     class _helper : public ::perf::TestBase
@@ -1325,12 +1377,10 @@ void perf::sort(std::vector<cv::KeyPoint>& pts, cv::InputOutputArray descriptors
 /*****************************************************************************************\
 *                                  ::perf::GpuPerf
 \*****************************************************************************************/
-#ifdef HAVE_CUDA
 bool perf::GpuPerf::targetDevice()
 {
-    return !param_run_cpu;
+    return param_impl == "cuda";
 }
-#endif
 
 /*****************************************************************************************\
 *                                  ::perf::PrintTo
diff --git a/modules/video/perf/perf_optflowpyrlk.cpp b/modules/video/perf/perf_optflowpyrlk.cpp
index 12005f8ffa..8c53db03ae 100644
--- a/modules/video/perf/perf_optflowpyrlk.cpp
+++ b/modules/video/perf/perf_optflowpyrlk.cpp
@@ -165,7 +165,8 @@ PERF_TEST_P(Path_Idx_Cn_NPoints_WSize_Deriv, OpticalFlowPyrLK_self, testing::Com
     declare.in(pyramid1, pyramid2, inPoints).out(outPoints);
     declare.time(400);
 
-    TEST_CYCLE()
+    int runs = 3;
+    TEST_CYCLE_MULTIRUN(runs)
     {
         calcOpticalFlowPyrLK(pyramid1, pyramid2, inPoints, outPoints, status, err,
                              Size(winSize, winSize), maxLevel, criteria,
@@ -217,4 +218,4 @@ PERF_TEST_P(Path_Win_Deriv_Border_Reuse, OpticalFlowPyrLK_pyr, testing::Combine(
     }
 
     SANITY_CHECK(pyramid);
-}
\ No newline at end of file
+}
diff --git a/modules/video/src/bgfg_gaussmix2.cpp b/modules/video/src/bgfg_gaussmix2.cpp
index e532af2ae6..6bbb960482 100644
--- a/modules/video/src/bgfg_gaussmix2.cpp
+++ b/modules/video/src/bgfg_gaussmix2.cpp
@@ -248,7 +248,7 @@ detectShadowGMM(const float* data, int nchannels, int nmodes,
 //IEEE Trans. on Pattern Analysis and Machine Intelligence, vol.26, no.5, pages 651-656, 2004
 //http://www.zoranz.net/Publications/zivkovic2004PAMI.pdf
 
-struct MOG2Invoker
+struct MOG2Invoker : ParallelLoopBody
 {
     MOG2Invoker(const Mat& _src, Mat& _dst,
                 GMM* _gmm, float* _mean,
@@ -280,9 +280,9 @@ struct MOG2Invoker
         cvtfunc = src->depth() != CV_32F ? getConvertFunc(src->depth(), CV_32F) : 0;
     }
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int y0 = range.begin(), y1 = range.end();
+        int y0 = range.start, y1 = range.end;
         int ncols = src->cols, nchannels = src->channels();
         AutoBuffer<float> buf(src->cols*nchannels);
         float alpha1 = 1.f - alphaT;
@@ -562,15 +562,15 @@ void BackgroundSubtractorMOG2::operator()(InputArray _image, OutputArray _fgmask
     learningRate = learningRate >= 0 && nframes > 1 ? learningRate : 1./min( 2*nframes, history );
     CV_Assert(learningRate >= 0);
 
-    parallel_for(BlockedRange(0, image.rows),
-                 MOG2Invoker(image, fgmask,
-                             (GMM*)bgmodel.data,
-                             (float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
-                             bgmodelUsedModes.data, nmixtures, (float)learningRate,
-                             (float)varThreshold,
-                             backgroundRatio, varThresholdGen,
-                             fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
-                             bShadowDetection, nShadowDetection));
+    parallel_for_(Range(0, image.rows),
+                  MOG2Invoker(image, fgmask,
+                              (GMM*)bgmodel.data,
+                              (float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
+                              bgmodelUsedModes.data, nmixtures, (float)learningRate,
+                              (float)varThreshold,
+                              backgroundRatio, varThresholdGen,
+                              fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
+                              bShadowDetection, nShadowDetection));
 }
 
 void BackgroundSubtractorMOG2::getBackgroundImage(OutputArray backgroundImage) const
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 9e47eb8029..291cb86a26 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -156,7 +156,7 @@ cv::detail::LKTrackerInvoker::LKTrackerInvoker(
     minEigThreshold = _minEigThreshold;
 }
 
-void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
+void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 {
     Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
     const Mat& I = *prevImg;
@@ -170,7 +170,7 @@ void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
     Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
     Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);
 
-    for( int ptidx = range.begin(); ptidx < range.end(); ptidx++ )
+    for( int ptidx = range.start; ptidx < range.end; ptidx++ )
     {
         Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
         Point2f nextPt;
@@ -733,11 +733,11 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
         typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
 #endif
 
-        parallel_for(BlockedRange(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
-                                                                nextPyr[level * lvlStep2], prevPts, nextPts,
-                                                                status, err,
-                                                                winSize, criteria, level, maxLevel,
-                                                                flags, (float)minEigThreshold));
+        parallel_for_(Range(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
+                                                          nextPyr[level * lvlStep2], prevPts, nextPts,
+                                                          status, err,
+                                                          winSize, criteria, level, maxLevel,
+                                                          flags, (float)minEigThreshold));
     }
 }
 
diff --git a/modules/video/src/lkpyramid.hpp b/modules/video/src/lkpyramid.hpp
index 390e46bf99..4aff37ef84 100644
--- a/modules/video/src/lkpyramid.hpp
+++ b/modules/video/src/lkpyramid.hpp
@@ -7,7 +7,7 @@ namespace detail
 
     typedef short deriv_type;
 
-    struct LKTrackerInvoker
+    struct LKTrackerInvoker : ParallelLoopBody
     {
         LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
                           const Point2f* _prevPts, Point2f* _nextPts,
@@ -15,7 +15,7 @@ namespace detail
                           Size _winSize, TermCriteria _criteria,
                           int _level, int _maxLevel, int _flags, float _minEigThreshold );
 
-        void operator()(const BlockedRange& range) const;
+        void operator()(const Range& range) const;
 
         const Mat* prevImg;
         const Mat* nextImg;
diff --git a/modules/video/src/video_init.cpp b/modules/video/src/video_init.cpp
index 0f3cec144c..7ec860fbd3 100644
--- a/modules/video/src/video_init.cpp
+++ b/modules/video/src/video_init.cpp
@@ -60,7 +60,15 @@ CV_INIT_ALGORITHM(BackgroundSubtractorMOG2, "BackgroundSubtractor.MOG2",
     obj.info()->addParam(obj, "history", obj.history);
     obj.info()->addParam(obj, "nmixtures", obj.nmixtures);
     obj.info()->addParam(obj, "varThreshold", obj.varThreshold);
-    obj.info()->addParam(obj, "detectShadows", obj.bShadowDetection));
+    obj.info()->addParam(obj, "detectShadows", obj.bShadowDetection);
+    obj.info()->addParam(obj, "backgroundRatio", obj.backgroundRatio);
+    obj.info()->addParam(obj, "varThresholdGen", obj.varThresholdGen);
+    obj.info()->addParam(obj, "fVarInit", obj.fVarInit);
+    obj.info()->addParam(obj, "fVarMin", obj.fVarMin);
+    obj.info()->addParam(obj, "fVarMax", obj.fVarMax);
+    obj.info()->addParam(obj, "fCT", obj.fCT);
+    obj.info()->addParam(obj, "nShadowDetection", obj.nShadowDetection);
+    obj.info()->addParam(obj, "fTau", obj.fTau));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/modules/videostab/src/global_motion.cpp b/modules/videostab/src/global_motion.cpp
index 484b598dc1..de93d5c5ac 100644
--- a/modules/videostab/src/global_motion.cpp
+++ b/modules/videostab/src/global_motion.cpp
@@ -205,6 +205,9 @@ Mat estimateGlobalMotionRobust(
                             estimateGlobMotionLeastSquaresAffine };
 
     const int npoints = static_cast<int>(points0.size());
+    if (npoints < params.size)
+        return Mat::eye(3, 3, CV_32F);
+
     const int niters = static_cast<int>(ceil(log(1 - params.prob) /
                                              log(1 - pow(1 - params.eps, params.size))));
 
@@ -300,6 +303,8 @@ PyrLkRobustMotionEstimator::PyrLkRobustMotionEstimator()
 Mat PyrLkRobustMotionEstimator::estimate(const Mat &frame0, const Mat &frame1)
 {
     detector_->detect(frame0, keypointsPrev_);
+    if (keypointsPrev_.empty())
+        return Mat::eye(3, 3, CV_32F);
 
     pointsPrev_.resize(keypointsPrev_.size());
     for (size_t i = 0; i < keypointsPrev_.size(); ++i)
diff --git a/android/README.android b/platforms/android/README.android
similarity index 100%
rename from android/README.android
rename to platforms/android/README.android
diff --git a/android/android.toolchain.cmake b/platforms/android/android.toolchain.cmake
similarity index 88%
rename from android/android.toolchain.cmake
rename to platforms/android/android.toolchain.cmake
index 0f7e340678..d7f09c7888 100644
--- a/android/android.toolchain.cmake
+++ b/platforms/android/android.toolchain.cmake
@@ -289,6 +289,9 @@
 #   - March 2013
 #     [+] updated for NDK r8e (x86 version)
 #     [+] support x86_64 version of NDK
+#   - April 2013
+#     [+] support non-release NDK layouts (from Linaro git and Android git)
+#     [~] automatically detect if explicit link to crtbegin_*.o is needed
 # ------------------------------------------------------------------------------
 
 cmake_minimum_required( VERSION 2.6.3 )
@@ -516,24 +519,19 @@ if( NOT ANDROID_NDK )
   endif( ANDROID_NDK )
  endif( NOT ANDROID_STANDALONE_TOOLCHAIN )
 endif( NOT ANDROID_NDK )
+
 # remember found paths
 if( ANDROID_NDK )
  get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
- # try to detect change
- if( CMAKE_AR )
-  string( LENGTH "${ANDROID_NDK}" __length )
-  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
-  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK )
-   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
-   " )
-  endif()
-  unset( __androidNdkPreviousPath )
-  unset( __length )
- endif()
  set( ANDROID_NDK "${ANDROID_NDK}" CACHE INTERNAL "Path of the Android NDK" FORCE )
  set( BUILD_WITH_ANDROID_NDK True )
- file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
- string( REGEX MATCH r[0-9]+[a-z]? ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+ if( EXISTS "${ANDROID_NDK}/RELEASE.TXT" )
+  file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
+  string( REGEX MATCH r[0-9]+[a-z]? ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+ else()
+  set( ANDROID_NDK_RELEASE "r1x" )
+  set( ANDROID_NDK_RELEASE_FULL "unreleased" )
+ endif()
 elseif( ANDROID_STANDALONE_TOOLCHAIN )
  get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
  # try to detect change
@@ -560,6 +558,51 @@ else()
       sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
 endif()
 
+# android NDK layout
+if( BUILD_WITH_ANDROID_NDK )
+ if( NOT DEFINED ANDROID_NDK_LAYOUT )
+  # try to automatically detect the layout
+  if( EXISTS "${ANDROID_NDK}/RELEASE.TXT")
+   set( ANDROID_NDK_LAYOUT "RELEASE" )
+  elseif( EXISTS "${ANDROID_NDK}/../../linux-x86/toolchain/" )
+   set( ANDROID_NDK_LAYOUT "LINARO" )
+  elseif( EXISTS "${ANDROID_NDK}/../../gcc/" )
+   set( ANDROID_NDK_LAYOUT "ANDROID" )
+  endif()
+ endif()
+ set( ANDROID_NDK_LAYOUT "${ANDROID_NDK_LAYOUT}" CACHE STRING "The inner layout of NDK" )
+ mark_as_advanced( ANDROID_NDK_LAYOUT )
+ if( ANDROID_NDK_LAYOUT STREQUAL "LINARO" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../${ANDROID_NDK_HOST_SYSTEM_NAME}/toolchain" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ elseif( ANDROID_NDK_LAYOUT STREQUAL "ANDROID" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../gcc/${ANDROID_NDK_HOST_SYSTEM_NAME}/arm" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ else() # ANDROID_NDK_LAYOUT STREQUAL "RELEASE"
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/toolchains" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME2}" )
+ endif()
+ get_filename_component( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK_TOOLCHAINS_PATH}" ABSOLUTE )
+
+ # try to detect change of NDK
+ if( CMAKE_AR )
+  string( LENGTH "${ANDROID_NDK_TOOLCHAINS_PATH}" __length )
+  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
+  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK_TOOLCHAINS_PATH )
+   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
+   " )
+  endif()
+  unset( __androidNdkPreviousPath )
+  unset( __length )
+ endif()
+endif()
+
+
 # get all the details about standalone toolchain
 if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  __DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" )
@@ -587,17 +630,23 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  endif()
 endif()
 
-macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __host_system_name )
+macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __toolchain_subpath )
  foreach( __toolchain ${${__availableToolchainsLst}} )
-  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK}/toolchains/${__toolchain}/prebuilt/" )
+  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}${__toolchain_subpath}" )
    string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" )
   else()
    set( __gcc_toolchain "${__toolchain}" )
   endif()
-  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK}/toolchains/${__gcc_toolchain}/prebuilt/${__host_system_name}" )
+  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK_TOOLCHAINS_PATH}/${__gcc_toolchain}${__toolchain_subpath}" )
   if( __machine )
-   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9]+)?$" __version "${__gcc_toolchain}" )
-   string( REGEX MATCH "^[^-]+" __arch "${__gcc_toolchain}" )
+   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9x]+)?$" __version "${__gcc_toolchain}" )
+   if( __machine MATCHES i686 )
+    set( __arch "x86" )
+   elseif( __machine MATCHES arm )
+    set( __arch "arm" )
+   elseif( __machine MATCHES mipsel )
+    set( __arch "mipsel" )
+   endif()
    list( APPEND __availableToolchainMachines "${__machine}" )
    list( APPEND __availableToolchainArchs "${__arch}" )
    list( APPEND __availableToolchainCompilerVersions "${__version}" )
@@ -615,29 +664,29 @@ if( BUILD_WITH_ANDROID_NDK )
  set( __availableToolchainMachines "" )
  set( __availableToolchainArchs "" )
  set( __availableToolchainCompilerVersions "" )
- if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK}/toolchains/${ANDROID_TOOLCHAIN_NAME}/" )
+ if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_TOOLCHAIN_NAME}/" )
   # do not go through all toolchains if we know the name
   set( __availableToolchainsLst "${ANDROID_TOOLCHAIN_NAME}" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
    if( __availableToolchains )
-    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
    endif()
   endif()
  endif()
  if( NOT __availableToolchains )
-  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK}/toolchains" "${ANDROID_NDK}/toolchains/*" )
+  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK_TOOLCHAINS_PATH}" "${ANDROID_NDK_TOOLCHAINS_PATH}/*" )
   if( __availableToolchains )
    list(SORT __availableToolchainsLst) # we need clang to go after gcc
   endif()
   __LIST_FILTER( __availableToolchainsLst "^[.]" )
   __LIST_FILTER( __availableToolchainsLst "llvm" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
    if( __availableToolchains )
-    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
    endif()
   endif()
  endif()
@@ -768,6 +817,7 @@ else()
   list( GET __availableToolchainArchs ${__idx} __toolchainArch )
   if( __toolchainArch STREQUAL ANDROID_ARCH_FULLNAME )
    list( GET __availableToolchainCompilerVersions ${__idx} __toolchainVersion )
+   string( REPLACE "x" "99" __toolchainVersion "${__toolchainVersion}")
    if( __toolchainVersion VERSION_GREATER __toolchainMaxVersion )
     set( __toolchainMaxVersion "${__toolchainVersion}" )
     set( __toolchainIdx ${__idx} )
@@ -971,11 +1021,11 @@ if( "${ANDROID_TOOLCHAIN_NAME}" STREQUAL "standalone-clang" )
 elseif( "${ANDROID_TOOLCHAIN_NAME}" MATCHES "-clang3[.][0-9]?$" )
  string( REGEX MATCH "3[.][0-9]$" ANDROID_CLANG_VERSION "${ANDROID_TOOLCHAIN_NAME}")
  string( REGEX REPLACE "-clang${ANDROID_CLANG_VERSION}$" "-4.6" ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
- if( NOT EXISTS "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}/bin/clang${TOOL_OS_SUFFIX}" )
+ if( NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}/bin/clang${TOOL_OS_SUFFIX}" )
   message( FATAL_ERROR "Could not find the Clang compiler driver" )
  endif()
  set( ANDROID_COMPILER_IS_CLANG 1 )
- set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+ set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
 else()
  set( ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
  unset( ANDROID_COMPILER_IS_CLANG CACHE )
@@ -989,7 +1039,7 @@ endif()
 
 # setup paths and STL for NDK
 if( BUILD_WITH_ANDROID_NDK )
- set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+ set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
  set( ANDROID_SYSROOT "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}" )
 
  if( ANDROID_STL STREQUAL "none" )
@@ -1048,11 +1098,11 @@ if( BUILD_WITH_ANDROID_NDK )
  endif()
  # find libsupc++.a - rtti & exceptions
  if( ANDROID_STL STREQUAL "system_re" OR ANDROID_STL MATCHES "gnustl" )
-  if( ANDROID_NDK_RELEASE STRGREATER "r8" ) # r8b
-   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
-  elseif( NOT ANDROID_NDK_RELEASE STRLESS "r7" AND ANDROID_NDK_RELEASE STRLESS "r8b")
-   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
-  else( ANDROID_NDK_RELEASE STRLESS "r7" )
+  set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r8b or newer
+  if( NOT EXISTS "${__libsupcxx}" )
+   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r7-r8
+  endif()
+  if( NOT EXISTS "${__libsupcxx}" ) # before r7
    if( ARMEABI_V7A )
     if( ANDROID_FORCE_ARM_BUILD )
      set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" )
@@ -1102,7 +1152,7 @@ unset( _ndk_ccache )
 
 # setup the cross-compiler
 if( NOT CMAKE_C_COMPILER )
- if( NDK_CCACHE )
+ if( NDK_CCACHE AND NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
   set( CMAKE_C_COMPILER   "${NDK_CCACHE}" CACHE PATH "ccache as C compiler" )
   set( CMAKE_CXX_COMPILER "${NDK_CCACHE}" CACHE PATH "ccache as C++ compiler" )
   if( ANDROID_COMPILER_IS_CLANG )
@@ -1174,11 +1224,25 @@ set( CMAKE_ASM_SOURCE_FILE_EXTENSIONS s S asm )
 remove_definitions( -DANDROID )
 add_definitions( -DANDROID )
 
-if(ANDROID_SYSROOT MATCHES "[ ;\"]")
- set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+if( ANDROID_SYSROOT MATCHES "[ ;\"]" )
+ if( CMAKE_HOST_WIN32 )
+  # try to convert path to 8.3 form
+  file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "@echo %~s1" )
+  execute_process( COMMAND "$ENV{ComSpec}" /c "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "${ANDROID_SYSROOT}"
+                   OUTPUT_VARIABLE __path OUTPUT_STRIP_TRAILING_WHITESPACE
+                   RESULT_VARIABLE __result ERROR_QUIET )
+  if( __result EQUAL 0 )
+   file( TO_CMAKE_PATH "${__path}" ANDROID_SYSROOT )
+   set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
+  else()
+   set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+  endif()
+ else()
+  set( ANDROID_CXX_FLAGS "'--sysroot=${ANDROID_SYSROOT}'" )
+ endif()
  if( NOT _CMAKE_IN_TRY_COMPILE )
-  # quotes will break try_compile and compiler identification
-  message(WARNING "Your Android system root has non-alphanumeric symbols. It can break compiler features detection and the whole build.")
+  # quotes can break try_compile and compiler identification
+  message(WARNING "Path to your Android NDK (or toolchain) has non-alphanumeric symbols.\nThe build might be broken.\n")
  endif()
 else()
  set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
@@ -1249,22 +1313,18 @@ elseif( ARMEABI )
  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv5te -mtune=xscale -msoft-float" )
 endif()
 
+if( ANDROID_STL MATCHES "gnustl" AND (EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}") )
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+else()
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+endif()
+
 # STL
 if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
- if( ANDROID_STL MATCHES "gnustl" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
- else()
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
- endif()
- if ( X86 AND ANDROID_STL MATCHES "gnustl" AND ANDROID_NDK_RELEASE STREQUAL "r6" )
-  # workaround "undefined reference to `__dso_handle'" problem
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
- endif()
  if( EXISTS "${__libstl}" )
   set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libstl}\"" )
   set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libstl}\"" )
@@ -1283,9 +1343,12 @@ if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
   set( CMAKE_C_LINK_EXECUTABLE       "${CMAKE_C_LINK_EXECUTABLE} \"${__libsupcxx}\"" )
  endif()
  if( ANDROID_STL MATCHES "gnustl" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} -lm" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} -lm" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} -lm" )
+  if( NOT EXISTS "${ANDROID_LIBM_PATH}" )
+   set( ANDROID_LIBM_PATH -lm )
+  endif()
+  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} ${ANDROID_LIBM_PATH}" )
  endif()
 endif()
 
@@ -1321,7 +1384,14 @@ if( ARMEABI_V7A )
 endif()
 
 if( ANDROID_NO_UNDEFINED )
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+ if( MIPS )
+  # there is some sysroot-related problem in mips linker...
+  if( NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
+   set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined -Wl,-rpath-link,${ANDROID_SYSROOT}/usr/lib" )
+  endif()
+ else()
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+ endif()
 endif()
 
 if( ANDROID_SO_UNDEFINED )
@@ -1401,9 +1471,9 @@ set( CMAKE_MODULE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FL
 set( CMAKE_EXE_LINKER_FLAGS    "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )
 
 if( MIPS AND BUILD_WITH_ANDROID_NDK AND ANDROID_NDK_RELEASE STREQUAL "r8" )
- set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
- set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
- set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
+ set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
+ set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
+ set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
 endif()
 
 # configure rtti
@@ -1430,6 +1500,43 @@ endif()
 include_directories( SYSTEM "${ANDROID_SYSROOT}/usr/include" ${ANDROID_STL_INCLUDE_DIRS} )
 link_directories( "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
 
+# detect if need link crtbegin_so.o explicitly
+if( NOT DEFINED ANDROID_EXPLICIT_CRT_LINK )
+ set( __cmd "${CMAKE_CXX_CREATE_SHARED_LIBRARY}" )
+ string( REPLACE "<CMAKE_CXX_COMPILER>" "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_C_COMPILER>"   "${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}"   __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CXX_FLAGS>" "${CMAKE_CXX_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<LANGUAGE_COMPILE_FLAGS>" "" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_FLAGS>" "${CMAKE_SHARED_LINKER_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS>" "-shared" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET_SONAME>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET>" "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/toolchain_crtlink_test.so" __cmd "${__cmd}" )
+ string( REPLACE "<OBJECTS>" "\"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_LIBRARIES>" "" __cmd "${__cmd}" )
+ separate_arguments( __cmd )
+ foreach( __var ANDROID_NDK ANDROID_NDK_TOOLCHAINS_PATH ANDROID_STANDALONE_TOOLCHAIN )
+  if( ${__var} )
+   set( __tmp "${${__var}}" )
+   separate_arguments( __tmp )
+   string( REPLACE "${__tmp}" "${${__var}}" __cmd "${__cmd}")
+  endif()
+ endforeach()
+ string( REPLACE "'" "" __cmd "${__cmd}" )
+ string( REPLACE "\"" "" __cmd "${__cmd}" )
+ execute_process( COMMAND ${__cmd} RESULT_VARIABLE __cmd_result OUTPUT_QUIET ERROR_QUIET )
+ if( __cmd_result EQUAL 0 )
+  set( ANDROID_EXPLICIT_CRT_LINK ON )
+ else()
+  set( ANDROID_EXPLICIT_CRT_LINK OFF )
+ endif()
+endif()
+
+if( ANDROID_EXPLICIT_CRT_LINK )
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+endif()
+
 # setup output directories
 set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "root for library output, set this to change where android libs are installed to" )
 set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )
@@ -1521,6 +1628,7 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
  foreach( __var NDK_CCACHE  LIBRARY_OUTPUT_PATH_ROOT  ANDROID_FORBID_SYGWIN  ANDROID_SET_OBSOLETE_VARIABLES
                 ANDROID_NDK_HOST_X64
                 ANDROID_NDK
+                ANDROID_NDK_LAYOUT
                 ANDROID_STANDALONE_TOOLCHAIN
                 ANDROID_TOOLCHAIN_NAME
                 ANDROID_ABI
@@ -1534,6 +1642,8 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
                 ANDROID_GOLD_LINKER
                 ANDROID_NOEXECSTACK
                 ANDROID_RELRO
+                ANDROID_LIBM_PATH
+                ANDROID_EXPLICIT_CRT_LINK
                 )
   if( DEFINED ${__var} )
    if( "${__var}" MATCHES " ")
@@ -1577,6 +1687,7 @@ endif()
 #   ANDROID_STANDALONE_TOOLCHAIN
 #   ANDROID_TOOLCHAIN_NAME : the NDK name of compiler toolchain
 #   ANDROID_NDK_HOST_X64 : try to use x86_64 toolchain (default for x64 host systems)
+#   ANDROID_NDK_LAYOUT : the inner NDK structure (RELEASE, LINARO, ANDROID)
 #   LIBRARY_OUTPUT_PATH_ROOT : <any valid path>
 #   NDK_CCACHE : <path to your ccache executable>
 # Obsolete:
@@ -1622,6 +1733,7 @@ endif()
 #   ANDROID_EXCEPTIONS : if exceptions are enabled by the runtime
 #   ANDROID_GCC_TOOLCHAIN_NAME : read-only, differs from ANDROID_TOOLCHAIN_NAME only if clang is used
 #   ANDROID_CLANG_VERSION : version of clang compiler if clang is used
+#   ANDROID_LIBM_PATH : path to libm.so (set to something like $(TOP)/out/target/product/<product_name>/obj/lib/libm.so) to workaround unresolved `sincos`
 #
 # Defaults:
 #   ANDROID_DEFAULT_NDK_API_LEVEL
diff --git a/android/java.rst b/platforms/android/java.rst
similarity index 100%
rename from android/java.rst
rename to platforms/android/java.rst
diff --git a/android/libinfo/CMakeLists.txt b/platforms/android/libinfo/CMakeLists.txt
similarity index 100%
rename from android/libinfo/CMakeLists.txt
rename to platforms/android/libinfo/CMakeLists.txt
diff --git a/android/libinfo/info.c b/platforms/android/libinfo/info.c
similarity index 100%
rename from android/libinfo/info.c
rename to platforms/android/libinfo/info.c
diff --git a/android/package/AndroidManifest.xml b/platforms/android/package/AndroidManifest.xml
similarity index 100%
rename from android/package/AndroidManifest.xml
rename to platforms/android/package/AndroidManifest.xml
diff --git a/android/package/CMakeLists.txt b/platforms/android/package/CMakeLists.txt
similarity index 100%
rename from android/package/CMakeLists.txt
rename to platforms/android/package/CMakeLists.txt
diff --git a/android/package/res/drawable/icon.png b/platforms/android/package/res/drawable/icon.png
similarity index 100%
rename from android/package/res/drawable/icon.png
rename to platforms/android/package/res/drawable/icon.png
diff --git a/android/package/res/values/strings.xml b/platforms/android/package/res/values/strings.xml
similarity index 100%
rename from android/package/res/values/strings.xml
rename to platforms/android/package/res/values/strings.xml
diff --git a/android/refman.rst b/platforms/android/refman.rst
similarity index 100%
rename from android/refman.rst
rename to platforms/android/refman.rst
diff --git a/android/service/CMakeLists.txt b/platforms/android/service/CMakeLists.txt
similarity index 100%
rename from android/service/CMakeLists.txt
rename to platforms/android/service/CMakeLists.txt
diff --git a/android/service/all.py b/platforms/android/service/all.py
similarity index 100%
rename from android/service/all.py
rename to platforms/android/service/all.py
diff --git a/android/service/device.conf b/platforms/android/service/device.conf
similarity index 100%
rename from android/service/device.conf
rename to platforms/android/service/device.conf
diff --git a/android/service/doc/AndroidAppUsageModel.dia b/platforms/android/service/doc/AndroidAppUsageModel.dia
similarity index 100%
rename from android/service/doc/AndroidAppUsageModel.dia
rename to platforms/android/service/doc/AndroidAppUsageModel.dia
diff --git a/android/service/doc/BaseLoaderCallback.rst b/platforms/android/service/doc/BaseLoaderCallback.rst
similarity index 100%
rename from android/service/doc/BaseLoaderCallback.rst
rename to platforms/android/service/doc/BaseLoaderCallback.rst
diff --git a/android/service/doc/InstallCallbackInterface.rst b/platforms/android/service/doc/InstallCallbackInterface.rst
similarity index 100%
rename from android/service/doc/InstallCallbackInterface.rst
rename to platforms/android/service/doc/InstallCallbackInterface.rst
diff --git a/android/service/doc/Intro.rst b/platforms/android/service/doc/Intro.rst
similarity index 100%
rename from android/service/doc/Intro.rst
rename to platforms/android/service/doc/Intro.rst
diff --git a/android/service/doc/JavaHelper.rst b/platforms/android/service/doc/JavaHelper.rst
similarity index 100%
rename from android/service/doc/JavaHelper.rst
rename to platforms/android/service/doc/JavaHelper.rst
diff --git a/android/service/doc/LibInstallAproved.dia b/platforms/android/service/doc/LibInstallAproved.dia
similarity index 100%
rename from android/service/doc/LibInstallAproved.dia
rename to platforms/android/service/doc/LibInstallAproved.dia
diff --git a/android/service/doc/LibInstallCanceled.dia b/platforms/android/service/doc/LibInstallCanceled.dia
similarity index 100%
rename from android/service/doc/LibInstallCanceled.dia
rename to platforms/android/service/doc/LibInstallCanceled.dia
diff --git a/android/service/doc/LibInstalled.dia b/platforms/android/service/doc/LibInstalled.dia
similarity index 100%
rename from android/service/doc/LibInstalled.dia
rename to platforms/android/service/doc/LibInstalled.dia
diff --git a/android/service/doc/LoaderCallbackInterface.rst b/platforms/android/service/doc/LoaderCallbackInterface.rst
similarity index 100%
rename from android/service/doc/LoaderCallbackInterface.rst
rename to platforms/android/service/doc/LoaderCallbackInterface.rst
diff --git a/android/service/doc/NoService.dia b/platforms/android/service/doc/NoService.dia
similarity index 100%
rename from android/service/doc/NoService.dia
rename to platforms/android/service/doc/NoService.dia
diff --git a/android/service/doc/Structure.dia b/platforms/android/service/doc/Structure.dia
similarity index 100%
rename from android/service/doc/Structure.dia
rename to platforms/android/service/doc/Structure.dia
diff --git a/android/service/doc/UseCases.rst b/platforms/android/service/doc/UseCases.rst
similarity index 100%
rename from android/service/doc/UseCases.rst
rename to platforms/android/service/doc/UseCases.rst
diff --git a/android/service/doc/build_uml.py b/platforms/android/service/doc/build_uml.py
similarity index 100%
rename from android/service/doc/build_uml.py
rename to platforms/android/service/doc/build_uml.py
diff --git a/android/service/doc/img/AndroidAppUsageModel.png b/platforms/android/service/doc/img/AndroidAppUsageModel.png
similarity index 100%
rename from android/service/doc/img/AndroidAppUsageModel.png
rename to platforms/android/service/doc/img/AndroidAppUsageModel.png
diff --git a/android/service/doc/img/LibInstallAproved.png b/platforms/android/service/doc/img/LibInstallAproved.png
similarity index 100%
rename from android/service/doc/img/LibInstallAproved.png
rename to platforms/android/service/doc/img/LibInstallAproved.png
diff --git a/android/service/doc/img/LibInstallCanceled.png b/platforms/android/service/doc/img/LibInstallCanceled.png
similarity index 100%
rename from android/service/doc/img/LibInstallCanceled.png
rename to platforms/android/service/doc/img/LibInstallCanceled.png
diff --git a/android/service/doc/img/LibInstalled.png b/platforms/android/service/doc/img/LibInstalled.png
similarity index 100%
rename from android/service/doc/img/LibInstalled.png
rename to platforms/android/service/doc/img/LibInstalled.png
diff --git a/android/service/doc/img/NoService.png b/platforms/android/service/doc/img/NoService.png
similarity index 100%
rename from android/service/doc/img/NoService.png
rename to platforms/android/service/doc/img/NoService.png
diff --git a/android/service/doc/img/Structure.png b/platforms/android/service/doc/img/Structure.png
similarity index 100%
rename from android/service/doc/img/Structure.png
rename to platforms/android/service/doc/img/Structure.png
diff --git a/android/service/doc/index.rst b/platforms/android/service/doc/index.rst
similarity index 100%
rename from android/service/doc/index.rst
rename to platforms/android/service/doc/index.rst
diff --git a/android/service/engine/.classpath b/platforms/android/service/engine/.classpath
similarity index 100%
rename from android/service/engine/.classpath
rename to platforms/android/service/engine/.classpath
diff --git a/android/service/engine/.project b/platforms/android/service/engine/.project
similarity index 100%
rename from android/service/engine/.project
rename to platforms/android/service/engine/.project
diff --git a/android/service/engine/AndroidManifest.xml b/platforms/android/service/engine/AndroidManifest.xml
similarity index 100%
rename from android/service/engine/AndroidManifest.xml
rename to platforms/android/service/engine/AndroidManifest.xml
diff --git a/android/service/engine/CMakeLists.txt b/platforms/android/service/engine/CMakeLists.txt
similarity index 97%
rename from android/service/engine/CMakeLists.txt
rename to platforms/android/service/engine/CMakeLists.txt
index 8b88393942..852a028cab 100644
--- a/android/service/engine/CMakeLists.txt
+++ b/platforms/android/service/engine/CMakeLists.txt
@@ -24,7 +24,7 @@ else()
   message(WARNING "Can not automatically determine the value for ANDROID_PLATFORM_VERSION_CODE")
 endif()
 
-configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${ANDROID_MANIFEST_FILE}" "${OpenCV_BINARY_DIR}/android/service/engine/.build/${ANDROID_MANIFEST_FILE}"  @ONLY)
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${ANDROID_MANIFEST_FILE}" "${OpenCV_BINARY_DIR}/platforms/android/service/engine/.build/${ANDROID_MANIFEST_FILE}"  @ONLY)
 
 link_directories("${ANDROID_SOURCE_TREE}/out/target/product/generic/system/lib" "${ANDROID_SOURCE_TREE}/out/target/product/${ANDROID_PRODUCT}/system/lib" "${ANDROID_SOURCE_TREE}/bin/${ANDROID_ARCH_NAME}")
 
@@ -72,4 +72,3 @@ file(GLOB engine_test_files "jni/Tests/*.cpp")
 
 add_executable(opencv_test_engine ${engine_test_files} jni/Tests/gtest/gtest-all.cpp)
 target_link_libraries(opencv_test_engine z binder log utils android_runtime ${engine} ${engine}_jni)
-
diff --git a/android/service/engine/build.xml b/platforms/android/service/engine/build.xml
similarity index 100%
rename from android/service/engine/build.xml
rename to platforms/android/service/engine/build.xml
diff --git a/android/service/engine/jni/Android.mk b/platforms/android/service/engine/jni/Android.mk
similarity index 100%
rename from android/service/engine/jni/Android.mk
rename to platforms/android/service/engine/jni/Android.mk
diff --git a/android/service/engine/jni/Application.mk b/platforms/android/service/engine/jni/Application.mk
similarity index 100%
rename from android/service/engine/jni/Application.mk
rename to platforms/android/service/engine/jni/Application.mk
diff --git a/android/service/engine/jni/BinderComponent/BnOpenCVEngine.cpp b/platforms/android/service/engine/jni/BinderComponent/BnOpenCVEngine.cpp
similarity index 100%
rename from android/service/engine/jni/BinderComponent/BnOpenCVEngine.cpp
rename to platforms/android/service/engine/jni/BinderComponent/BnOpenCVEngine.cpp
diff --git a/android/service/engine/jni/BinderComponent/BnOpenCVEngine.h b/platforms/android/service/engine/jni/BinderComponent/BnOpenCVEngine.h
similarity index 100%
rename from android/service/engine/jni/BinderComponent/BnOpenCVEngine.h
rename to platforms/android/service/engine/jni/BinderComponent/BnOpenCVEngine.h
diff --git a/android/service/engine/jni/BinderComponent/BpOpenCVEngine.cpp b/platforms/android/service/engine/jni/BinderComponent/BpOpenCVEngine.cpp
similarity index 100%
rename from android/service/engine/jni/BinderComponent/BpOpenCVEngine.cpp
rename to platforms/android/service/engine/jni/BinderComponent/BpOpenCVEngine.cpp
diff --git a/android/service/engine/jni/BinderComponent/BpOpenCVEngine.h b/platforms/android/service/engine/jni/BinderComponent/BpOpenCVEngine.h
similarity index 100%
rename from android/service/engine/jni/BinderComponent/BpOpenCVEngine.h
rename to platforms/android/service/engine/jni/BinderComponent/BpOpenCVEngine.h
diff --git a/android/service/engine/jni/BinderComponent/HardwareDetector.cpp b/platforms/android/service/engine/jni/BinderComponent/HardwareDetector.cpp
similarity index 100%
rename from android/service/engine/jni/BinderComponent/HardwareDetector.cpp
rename to platforms/android/service/engine/jni/BinderComponent/HardwareDetector.cpp
diff --git a/android/service/engine/jni/BinderComponent/HardwareDetector.h b/platforms/android/service/engine/jni/BinderComponent/HardwareDetector.h
similarity index 100%
rename from android/service/engine/jni/BinderComponent/HardwareDetector.h
rename to platforms/android/service/engine/jni/BinderComponent/HardwareDetector.h
diff --git a/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp b/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
similarity index 100%
rename from android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
rename to platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
diff --git a/android/service/engine/jni/BinderComponent/OpenCVEngine.h b/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.h
similarity index 100%
rename from android/service/engine/jni/BinderComponent/OpenCVEngine.h
rename to platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.h
diff --git a/android/service/engine/jni/BinderComponent/ProcReader.cpp b/platforms/android/service/engine/jni/BinderComponent/ProcReader.cpp
similarity index 100%
rename from android/service/engine/jni/BinderComponent/ProcReader.cpp
rename to platforms/android/service/engine/jni/BinderComponent/ProcReader.cpp
diff --git a/android/service/engine/jni/BinderComponent/ProcReader.h b/platforms/android/service/engine/jni/BinderComponent/ProcReader.h
similarity index 100%
rename from android/service/engine/jni/BinderComponent/ProcReader.h
rename to platforms/android/service/engine/jni/BinderComponent/ProcReader.h
diff --git a/android/service/engine/jni/BinderComponent/StringUtils.cpp b/platforms/android/service/engine/jni/BinderComponent/StringUtils.cpp
similarity index 100%
rename from android/service/engine/jni/BinderComponent/StringUtils.cpp
rename to platforms/android/service/engine/jni/BinderComponent/StringUtils.cpp
diff --git a/android/service/engine/jni/BinderComponent/StringUtils.h b/platforms/android/service/engine/jni/BinderComponent/StringUtils.h
similarity index 100%
rename from android/service/engine/jni/BinderComponent/StringUtils.h
rename to platforms/android/service/engine/jni/BinderComponent/StringUtils.h
diff --git a/android/service/engine/jni/BinderComponent/TegraDetector.cpp b/platforms/android/service/engine/jni/BinderComponent/TegraDetector.cpp
similarity index 100%
rename from android/service/engine/jni/BinderComponent/TegraDetector.cpp
rename to platforms/android/service/engine/jni/BinderComponent/TegraDetector.cpp
diff --git a/android/service/engine/jni/BinderComponent/TegraDetector.h b/platforms/android/service/engine/jni/BinderComponent/TegraDetector.h
similarity index 100%
rename from android/service/engine/jni/BinderComponent/TegraDetector.h
rename to platforms/android/service/engine/jni/BinderComponent/TegraDetector.h
diff --git a/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp
similarity index 100%
rename from android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp
rename to platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp
diff --git a/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h
similarity index 100%
rename from android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h
rename to platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h
diff --git a/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.cpp b/platforms/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.cpp
similarity index 100%
rename from android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.cpp
rename to platforms/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.cpp
diff --git a/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.h b/platforms/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.h
similarity index 100%
rename from android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.h
rename to platforms/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.h
diff --git a/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp
similarity index 100%
rename from android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp
rename to platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp
diff --git a/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h
similarity index 100%
rename from android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h
rename to platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h
diff --git a/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp
similarity index 100%
rename from android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp
rename to platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp
diff --git a/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h
similarity index 100%
rename from android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h
rename to platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h
diff --git a/android/service/engine/jni/NativeClient/ClientMain.cpp b/platforms/android/service/engine/jni/NativeClient/ClientMain.cpp
similarity index 100%
rename from android/service/engine/jni/NativeClient/ClientMain.cpp
rename to platforms/android/service/engine/jni/NativeClient/ClientMain.cpp
diff --git a/android/service/engine/jni/NativeService/CommonPackageManager.cpp b/platforms/android/service/engine/jni/NativeService/CommonPackageManager.cpp
similarity index 100%
rename from android/service/engine/jni/NativeService/CommonPackageManager.cpp
rename to platforms/android/service/engine/jni/NativeService/CommonPackageManager.cpp
diff --git a/android/service/engine/jni/NativeService/CommonPackageManager.h b/platforms/android/service/engine/jni/NativeService/CommonPackageManager.h
similarity index 100%
rename from android/service/engine/jni/NativeService/CommonPackageManager.h
rename to platforms/android/service/engine/jni/NativeService/CommonPackageManager.h
diff --git a/android/service/engine/jni/NativeService/NativePackageManager.cpp b/platforms/android/service/engine/jni/NativeService/NativePackageManager.cpp
similarity index 100%
rename from android/service/engine/jni/NativeService/NativePackageManager.cpp
rename to platforms/android/service/engine/jni/NativeService/NativePackageManager.cpp
diff --git a/android/service/engine/jni/NativeService/NativePackageManager.h b/platforms/android/service/engine/jni/NativeService/NativePackageManager.h
similarity index 100%
rename from android/service/engine/jni/NativeService/NativePackageManager.h
rename to platforms/android/service/engine/jni/NativeService/NativePackageManager.h
diff --git a/android/service/engine/jni/NativeService/PackageInfo.cpp b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
similarity index 100%
rename from android/service/engine/jni/NativeService/PackageInfo.cpp
rename to platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
diff --git a/android/service/engine/jni/NativeService/PackageInfo.h b/platforms/android/service/engine/jni/NativeService/PackageInfo.h
similarity index 100%
rename from android/service/engine/jni/NativeService/PackageInfo.h
rename to platforms/android/service/engine/jni/NativeService/PackageInfo.h
diff --git a/android/service/engine/jni/NativeService/ServiceMain.cpp b/platforms/android/service/engine/jni/NativeService/ServiceMain.cpp
similarity index 100%
rename from android/service/engine/jni/NativeService/ServiceMain.cpp
rename to platforms/android/service/engine/jni/NativeService/ServiceMain.cpp
diff --git a/android/service/engine/jni/Tests/HardwareDetectionTest.cpp b/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp
similarity index 100%
rename from android/service/engine/jni/Tests/HardwareDetectionTest.cpp
rename to platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp
diff --git a/android/service/engine/jni/Tests/OpenCVEngineTest.cpp b/platforms/android/service/engine/jni/Tests/OpenCVEngineTest.cpp
similarity index 100%
rename from android/service/engine/jni/Tests/OpenCVEngineTest.cpp
rename to platforms/android/service/engine/jni/Tests/OpenCVEngineTest.cpp
diff --git a/android/service/engine/jni/Tests/PackageInfoTest.cpp b/platforms/android/service/engine/jni/Tests/PackageInfoTest.cpp
similarity index 99%
rename from android/service/engine/jni/Tests/PackageInfoTest.cpp
rename to platforms/android/service/engine/jni/Tests/PackageInfoTest.cpp
index 6cbb069431..36fdae764f 100644
--- a/android/service/engine/jni/Tests/PackageInfoTest.cpp
+++ b/platforms/android/service/engine/jni/Tests/PackageInfoTest.cpp
@@ -222,4 +222,3 @@ TEST(PackageInfo, Comparator3)
     EXPECT_EQ(info1, info2);
 }
 #endif
-
diff --git a/android/service/engine/jni/Tests/PackageManagerStub.cpp b/platforms/android/service/engine/jni/Tests/PackageManagerStub.cpp
similarity index 100%
rename from android/service/engine/jni/Tests/PackageManagerStub.cpp
rename to platforms/android/service/engine/jni/Tests/PackageManagerStub.cpp
diff --git a/android/service/engine/jni/Tests/PackageManagerStub.h b/platforms/android/service/engine/jni/Tests/PackageManagerStub.h
similarity index 100%
rename from android/service/engine/jni/Tests/PackageManagerStub.h
rename to platforms/android/service/engine/jni/Tests/PackageManagerStub.h
diff --git a/android/service/engine/jni/Tests/PackageManagmentTest.cpp b/platforms/android/service/engine/jni/Tests/PackageManagmentTest.cpp
similarity index 99%
rename from android/service/engine/jni/Tests/PackageManagmentTest.cpp
rename to platforms/android/service/engine/jni/Tests/PackageManagmentTest.cpp
index e21dcf7604..61d6e01c24 100644
--- a/android/service/engine/jni/Tests/PackageManagmentTest.cpp
+++ b/platforms/android/service/engine/jni/Tests/PackageManagmentTest.cpp
@@ -137,5 +137,3 @@ TEST(PackageManager, GetPackagePathForMips)
 //     string path = pm.GetPackagePathByVersion("240", PLATFORM_TEGRA2, 0);
 //     EXPECT_STREQ("/data/data/org.opencv.lib_v24_tegra2/lib", path.c_str());
 // }
-
-
diff --git a/android/service/engine/jni/Tests/TestMain.cpp b/platforms/android/service/engine/jni/Tests/TestMain.cpp
similarity index 100%
rename from android/service/engine/jni/Tests/TestMain.cpp
rename to platforms/android/service/engine/jni/Tests/TestMain.cpp
diff --git a/android/service/engine/jni/Tests/Tests.mk b/platforms/android/service/engine/jni/Tests/Tests.mk
similarity index 100%
rename from android/service/engine/jni/Tests/Tests.mk
rename to platforms/android/service/engine/jni/Tests/Tests.mk
diff --git a/android/service/engine/jni/Tests/gtest/gtest-all.cpp b/platforms/android/service/engine/jni/Tests/gtest/gtest-all.cpp
similarity index 100%
rename from android/service/engine/jni/Tests/gtest/gtest-all.cpp
rename to platforms/android/service/engine/jni/Tests/gtest/gtest-all.cpp
diff --git a/android/service/engine/jni/Tests/gtest/gtest.h b/platforms/android/service/engine/jni/Tests/gtest/gtest.h
similarity index 100%
rename from android/service/engine/jni/Tests/gtest/gtest.h
rename to platforms/android/service/engine/jni/Tests/gtest/gtest.h
diff --git a/android/service/engine/jni/include/EngineCommon.h b/platforms/android/service/engine/jni/include/EngineCommon.h
similarity index 100%
rename from android/service/engine/jni/include/EngineCommon.h
rename to platforms/android/service/engine/jni/include/EngineCommon.h
diff --git a/android/service/engine/jni/include/IOpenCVEngine.h b/platforms/android/service/engine/jni/include/IOpenCVEngine.h
similarity index 100%
rename from android/service/engine/jni/include/IOpenCVEngine.h
rename to platforms/android/service/engine/jni/include/IOpenCVEngine.h
diff --git a/android/service/engine/jni/include/IPackageManager.h b/platforms/android/service/engine/jni/include/IPackageManager.h
similarity index 100%
rename from android/service/engine/jni/include/IPackageManager.h
rename to platforms/android/service/engine/jni/include/IPackageManager.h
diff --git a/android/service/engine/jni/include/OpenCVEngineHelper.h b/platforms/android/service/engine/jni/include/OpenCVEngineHelper.h
similarity index 100%
rename from android/service/engine/jni/include/OpenCVEngineHelper.h
rename to platforms/android/service/engine/jni/include/OpenCVEngineHelper.h
diff --git a/android/service/engine/project.properties b/platforms/android/service/engine/project.properties
similarity index 100%
rename from android/service/engine/project.properties
rename to platforms/android/service/engine/project.properties
diff --git a/android/service/engine/res/drawable/icon.png b/platforms/android/service/engine/res/drawable/icon.png
similarity index 100%
rename from android/service/engine/res/drawable/icon.png
rename to platforms/android/service/engine/res/drawable/icon.png
diff --git a/android/service/engine/res/layout-small/info.xml b/platforms/android/service/engine/res/layout-small/info.xml
similarity index 100%
rename from android/service/engine/res/layout-small/info.xml
rename to platforms/android/service/engine/res/layout-small/info.xml
diff --git a/android/service/engine/res/layout-small/main.xml b/platforms/android/service/engine/res/layout-small/main.xml
similarity index 100%
rename from android/service/engine/res/layout-small/main.xml
rename to platforms/android/service/engine/res/layout-small/main.xml
diff --git a/android/service/engine/res/layout/info.xml b/platforms/android/service/engine/res/layout/info.xml
similarity index 100%
rename from android/service/engine/res/layout/info.xml
rename to platforms/android/service/engine/res/layout/info.xml
diff --git a/android/service/engine/res/layout/main.xml b/platforms/android/service/engine/res/layout/main.xml
similarity index 100%
rename from android/service/engine/res/layout/main.xml
rename to platforms/android/service/engine/res/layout/main.xml
diff --git a/android/service/engine/res/values/strings.xml b/platforms/android/service/engine/res/values/strings.xml
similarity index 100%
rename from android/service/engine/res/values/strings.xml
rename to platforms/android/service/engine/res/values/strings.xml
diff --git a/android/service/engine/src/org/opencv/engine/BinderConnector.java b/platforms/android/service/engine/src/org/opencv/engine/BinderConnector.java
similarity index 100%
rename from android/service/engine/src/org/opencv/engine/BinderConnector.java
rename to platforms/android/service/engine/src/org/opencv/engine/BinderConnector.java
diff --git a/android/service/engine/src/org/opencv/engine/HardwareDetector.java b/platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java
similarity index 100%
rename from android/service/engine/src/org/opencv/engine/HardwareDetector.java
rename to platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java
diff --git a/android/service/engine/src/org/opencv/engine/MarketConnector.java b/platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java
similarity index 100%
rename from android/service/engine/src/org/opencv/engine/MarketConnector.java
rename to platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java
diff --git a/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl b/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
similarity index 100%
rename from android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
rename to platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
diff --git a/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java b/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
similarity index 100%
rename from android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
rename to platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
diff --git a/android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java b/platforms/android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java
similarity index 100%
rename from android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java
rename to platforms/android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java
diff --git a/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java b/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
similarity index 100%
rename from android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
rename to platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
diff --git a/android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java b/platforms/android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java
similarity index 100%
rename from android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java
rename to platforms/android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java
diff --git a/android/service/engine_test/.classpath b/platforms/android/service/engine_test/.classpath
similarity index 100%
rename from android/service/engine_test/.classpath
rename to platforms/android/service/engine_test/.classpath
diff --git a/android/service/engine_test/.project b/platforms/android/service/engine_test/.project
similarity index 100%
rename from android/service/engine_test/.project
rename to platforms/android/service/engine_test/.project
diff --git a/android/service/engine_test/AndroidManifest.xml b/platforms/android/service/engine_test/AndroidManifest.xml
similarity index 100%
rename from android/service/engine_test/AndroidManifest.xml
rename to platforms/android/service/engine_test/AndroidManifest.xml
diff --git a/android/service/engine_test/build.xml b/platforms/android/service/engine_test/build.xml
similarity index 100%
rename from android/service/engine_test/build.xml
rename to platforms/android/service/engine_test/build.xml
diff --git a/android/service/engine_test/project.properties b/platforms/android/service/engine_test/project.properties
similarity index 100%
rename from android/service/engine_test/project.properties
rename to platforms/android/service/engine_test/project.properties
diff --git a/android/service/engine_test/res/drawable-hdpi/ic_launcher.png b/platforms/android/service/engine_test/res/drawable-hdpi/ic_launcher.png
similarity index 100%
rename from android/service/engine_test/res/drawable-hdpi/ic_launcher.png
rename to platforms/android/service/engine_test/res/drawable-hdpi/ic_launcher.png
diff --git a/android/service/engine_test/res/drawable-ldpi/ic_launcher.png b/platforms/android/service/engine_test/res/drawable-ldpi/ic_launcher.png
similarity index 100%
rename from android/service/engine_test/res/drawable-ldpi/ic_launcher.png
rename to platforms/android/service/engine_test/res/drawable-ldpi/ic_launcher.png
diff --git a/android/service/engine_test/res/drawable-mdpi/ic_launcher.png b/platforms/android/service/engine_test/res/drawable-mdpi/ic_launcher.png
similarity index 100%
rename from android/service/engine_test/res/drawable-mdpi/ic_launcher.png
rename to platforms/android/service/engine_test/res/drawable-mdpi/ic_launcher.png
diff --git a/android/service/engine_test/res/layout/main.xml b/platforms/android/service/engine_test/res/layout/main.xml
similarity index 100%
rename from android/service/engine_test/res/layout/main.xml
rename to platforms/android/service/engine_test/res/layout/main.xml
diff --git a/android/service/engine_test/res/values/strings.xml b/platforms/android/service/engine_test/res/values/strings.xml
similarity index 100%
rename from android/service/engine_test/res/values/strings.xml
rename to platforms/android/service/engine_test/res/values/strings.xml
diff --git a/android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java b/platforms/android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java
similarity index 100%
rename from android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java
rename to platforms/android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java
diff --git a/android/service/push_native.py b/platforms/android/service/push_native.py
similarity index 100%
rename from android/service/push_native.py
rename to platforms/android/service/push_native.py
diff --git a/android/service/readme.txt b/platforms/android/service/readme.txt
similarity index 100%
rename from android/service/readme.txt
rename to platforms/android/service/readme.txt
diff --git a/android/service/test_native.py b/platforms/android/service/test_native.py
similarity index 99%
rename from android/service/test_native.py
rename to platforms/android/service/test_native.py
index 9a39032b18..328b9a8a51 100755
--- a/android/service/test_native.py
+++ b/platforms/android/service/test_native.py
@@ -34,4 +34,3 @@ if (__name__ ==  "__main__"):
     os.system("adb %s shell mkdir -p \"%s\"" % (DEVICE_STR, DEVICE_LOG_PATH))
 
     RunTestApp("OpenCVEngineTestApp")
-
diff --git a/ios/Info.plist.in b/platforms/ios/Info.plist.in
similarity index 93%
rename from ios/Info.plist.in
rename to platforms/ios/Info.plist.in
index 89ef38625d..6bcfe862d0 100644
--- a/ios/Info.plist.in
+++ b/platforms/ios/Info.plist.in
@@ -5,7 +5,7 @@
     <key>CFBundleName</key>
     <string>OpenCV</string>
     <key>CFBundleIdentifier</key>
-    <string>com.itseez.opencv</string>
+    <string>org.opencv</string>
     <key>CFBundleVersion</key>
     <string>${VERSION}</string>
     <key>CFBundleShortVersionString</key>
diff --git a/ios/build_framework.py b/platforms/ios/build_framework.py
similarity index 95%
rename from ios/build_framework.py
rename to platforms/ios/build_framework.py
index ceef4b71d7..bc385bb1bb 100755
--- a/ios/build_framework.py
+++ b/platforms/ios/build_framework.py
@@ -38,7 +38,7 @@ def build_opencv(srcroot, buildroot, target, arch):
     # for some reason, if you do not specify CMAKE_BUILD_TYPE, it puts libs to "RELEASE" rather than "Release"
     cmakeargs = ("-GXcode " +
                 "-DCMAKE_BUILD_TYPE=Release " +
-                "-DCMAKE_TOOLCHAIN_FILE=%s/ios/cmake/Toolchains/Toolchain-%s_Xcode.cmake " +
+                "-DCMAKE_TOOLCHAIN_FILE=%s/platforms/ios/cmake/Toolchains/Toolchain-%s_Xcode.cmake " +
                 "-DBUILD_opencv_world=ON " +
                 "-DCMAKE_INSTALL_PREFIX=install") % (srcroot, target)
     # if cmake cache exists, just rerun cmake to update OpenCV.xproj if necessary
@@ -92,16 +92,13 @@ def put_framework_together(srcroot, dstroot):
     os.system("lipo -create " + wlist + " -o " + dstdir + "/opencv2")
 
     # form Info.plist
-    srcfile = open(srcroot + "/ios/Info.plist.in", "rt")
+    srcfile = open(srcroot + "/platforms/ios/Info.plist.in", "rt")
     dstfile = open(dstdir + "/Resources/Info.plist", "wt")
     for l in srcfile.readlines():
         dstfile.write(l.replace("${VERSION}", opencv_version))
     srcfile.close()
     dstfile.close()
 
-    # copy cascades
-    # TODO ...
-
     # make symbolic links
     os.symlink("A", "Versions/Current")
     os.symlink("Versions/Current/Headers", "Headers")
@@ -125,4 +122,4 @@ if __name__ == "__main__":
         print "Usage:\n\t./build_framework.py <outputdir>\n\n"
         sys.exit(0)
 
-    build_framework(os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "..")), os.path.abspath(sys.argv[1]))
\ No newline at end of file
+    build_framework(os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "../..")), os.path.abspath(sys.argv[1]))
\ No newline at end of file
diff --git a/ios/cmake/Modules/Platform/iOS.cmake b/platforms/ios/cmake/Modules/Platform/iOS.cmake
similarity index 100%
rename from ios/cmake/Modules/Platform/iOS.cmake
rename to platforms/ios/cmake/Modules/Platform/iOS.cmake
diff --git a/ios/cmake/Toolchains/Toolchain-iPhoneOS_Xcode.cmake b/platforms/ios/cmake/Toolchains/Toolchain-iPhoneOS_Xcode.cmake
similarity index 84%
rename from ios/cmake/Toolchains/Toolchain-iPhoneOS_Xcode.cmake
rename to platforms/ios/cmake/Toolchains/Toolchain-iPhoneOS_Xcode.cmake
index 67343253bd..6493deb459 100644
--- a/ios/cmake/Toolchains/Toolchain-iPhoneOS_Xcode.cmake
+++ b/platforms/ios/cmake/Toolchains/Toolchain-iPhoneOS_Xcode.cmake
@@ -4,12 +4,12 @@ set (IPHONEOS TRUE)
 # Standard settings
 set (CMAKE_SYSTEM_NAME iOS)
 # Include extra modules for the iOS platform files
-set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/ios/cmake/Modules")
+set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/platforms/ios/cmake/Modules")
 
-# Force the compilers to gcc for iOS
+# Force the compilers to clang for iOS
 include (CMakeForceCompiler)
-#CMAKE_FORCE_C_COMPILER (gcc gcc)
-#CMAKE_FORCE_CXX_COMPILER (g++ g++)
+#CMAKE_FORCE_C_COMPILER (clang GNU)
+#CMAKE_FORCE_CXX_COMPILER (clang++ GNU)
 
 set (CMAKE_C_SIZEOF_DATA_PTR 4)
 set (CMAKE_C_HAS_ISYSROOT 1)
diff --git a/ios/cmake/Toolchains/Toolchain-iPhoneSimulator_Xcode.cmake b/platforms/ios/cmake/Toolchains/Toolchain-iPhoneSimulator_Xcode.cmake
similarity index 85%
rename from ios/cmake/Toolchains/Toolchain-iPhoneSimulator_Xcode.cmake
rename to platforms/ios/cmake/Toolchains/Toolchain-iPhoneSimulator_Xcode.cmake
index 7ef8113edb..0056c8dbd4 100644
--- a/ios/cmake/Toolchains/Toolchain-iPhoneSimulator_Xcode.cmake
+++ b/platforms/ios/cmake/Toolchains/Toolchain-iPhoneSimulator_Xcode.cmake
@@ -4,12 +4,12 @@ set (IPHONESIMULATOR TRUE)
 # Standard settings
 set (CMAKE_SYSTEM_NAME iOS)
 # Include extra modules for the iOS platform files
-set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/ios/cmake/Modules")
+set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/platforms/ios/cmake/Modules")
 
-# Force the compilers to gcc for iOS
+# Force the compilers to clang for iOS
 include (CMakeForceCompiler)
-#CMAKE_FORCE_C_COMPILER (gcc gcc)
-#CMAKE_FORCE_CXX_COMPILER (g++ g++)
+#CMAKE_FORCE_C_COMPILER (clang GNU)
+#CMAKE_FORCE_CXX_COMPILER (clang++ GNU)
 
 set (CMAKE_C_SIZEOF_DATA_PTR 4)
 set (CMAKE_C_HAS_ISYSROOT 1)
diff --git a/platforms/ios/readme.txt b/platforms/ios/readme.txt
new file mode 100644
index 0000000000..8f1f206b03
--- /dev/null
+++ b/platforms/ios/readme.txt
@@ -0,0 +1,7 @@
+Building OpenCV from Source, using CMake and Command Line
+=========================================================
+
+cd ~/<my_working_directory>
+python opencv/platforms/ios/build_framework.py ios
+
+If everything's fine, a few minutes later you will get ~/<my_working_directory>/ios/opencv2.framework. You can add this framework to your Xcode projects.
\ No newline at end of file
diff --git a/platforms/linux/scripts/cmake_arm_gnueabi_hardfp.sh b/platforms/linux/scripts/cmake_arm_gnueabi_hardfp.sh
deleted file mode 100755
index f8df7859c3..0000000000
--- a/platforms/linux/scripts/cmake_arm_gnueabi_hardfp.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-cd `dirname $0`/..
-
-mkdir -p build_hardfp
-cd build_hardfp
-
-cmake -DCMAKE_TOOLCHAIN_FILE=../arm-gnueabi.toolchain.cmake $@ ../../..
-
diff --git a/platforms/linux/scripts/cmake_arm_gnueabi_softfp.sh b/platforms/linux/scripts/cmake_arm_gnueabi_softfp.sh
deleted file mode 100755
index f4210fa829..0000000000
--- a/platforms/linux/scripts/cmake_arm_gnueabi_softfp.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-cd `dirname $0`/..
-
-mkdir -p build_softfp
-cd build_softfp
-
-cmake -DSOFTFP=ON -DCMAKE_TOOLCHAIN_FILE=../arm-gnueabi.toolchain.cmake $@ ../../..
-
diff --git a/platforms/readme.txt b/platforms/readme.txt
index 7e1c4555c5..dfe0461422 100644
--- a/platforms/readme.txt
+++ b/platforms/readme.txt
@@ -1 +1,3 @@
-This folder contains toolchains and additional files that are needed for cross compitation.
\ No newline at end of file
+This folder contains toolchains and additional files that are needed for cross compilation.
+For more information see introduction tutorials for target platform in documentation:
+http://docs.opencv.org/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.html#table-of-content-introduction
\ No newline at end of file
diff --git a/android/scripts/ABI_compat_generator.py b/platforms/scripts/ABI_compat_generator.py
similarity index 98%
rename from android/scripts/ABI_compat_generator.py
rename to platforms/scripts/ABI_compat_generator.py
index b492a70fe4..fdabf00611 100755
--- a/android/scripts/ABI_compat_generator.py
+++ b/platforms/scripts/ABI_compat_generator.py
@@ -6,9 +6,7 @@ import os
 
 
 architecture = 'armeabi'
-excludedHeaders = set(['hdf5.h', 'cap_ios.h', 
-    'eigen.hpp', 'cxeigen.hpp' #TOREMOVE
-    ])
+excludedHeaders = set(['hdf5.h', 'cap_ios.h', 'eigen.hpp', 'cxeigen.hpp']) #TOREMOVE
 systemIncludes = ['sources/cxx-stl/gnu-libstdc++/4.6/include', \
     '/opt/android-ndk-r8c/platforms/android-8/arch-arm', # TODO: check if this one could be passed as command line arg
     'sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include']
@@ -113,7 +111,7 @@ def FindHeaders():
             if f == m:
                 moduleHeaders += GetHeaderFiles(os.path.join(cppHeadersFolder, f))
                 if m == 'flann':
-                    flann = os.path.join(cppHeadersFolder, f, 'flann.hpp') 
+                    flann = os.path.join(cppHeadersFolder, f, 'flann.hpp')
                     moduleHeaders.remove(flann)
                     moduleHeaders.insert(0, flann)
                 cppHeaders += moduleHeaders
diff --git a/android/scripts/camera_build.conf b/platforms/scripts/camera_build.conf
similarity index 100%
rename from android/scripts/camera_build.conf
rename to platforms/scripts/camera_build.conf
diff --git a/android/scripts/cmake_android_all_cameras.py b/platforms/scripts/cmake_android_all_cameras.py
similarity index 90%
rename from android/scripts/cmake_android_all_cameras.py
rename to platforms/scripts/cmake_android_all_cameras.py
index afcab63a75..c160df0fa0 100755
--- a/android/scripts/cmake_android_all_cameras.py
+++ b/platforms/scripts/cmake_android_all_cameras.py
@@ -49,7 +49,7 @@ for s in ConfFile.readlines():
 
     os.chdir(BuildDir)
     BuildLog = os.path.join(BuildDir, "build.log")
-    CmakeCmdLine = "cmake -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake -DANDROID_SOURCE_TREE=\"%s\" -DANDROID_NATIVE_API_LEVEL=\"%s\" -DANDROID_ABI=\"%s\" -DANDROID_STL=stlport_static ../../ > \"%s\" 2>&1" % (AndroidTreeRoot, NativeApiLevel, Arch, BuildLog)
+    CmakeCmdLine = "cmake -DCMAKE_TOOLCHAIN_FILE=../android/android.toolchain.cmake -DANDROID_SOURCE_TREE=\"%s\" -DANDROID_NATIVE_API_LEVEL=\"%s\" -DANDROID_ABI=\"%s\" -DANDROID_STL=stlport_static ../.. > \"%s\" 2>&1" % (AndroidTreeRoot, NativeApiLevel, Arch, BuildLog)
     MakeCmdLine = "make %s >> \"%s\" 2>&1" % (MakeTarget, BuildLog);
     #print(CmakeCmdLine)
     os.system(CmakeCmdLine)
diff --git a/android/scripts/cmake_android.sh b/platforms/scripts/cmake_android_arm.sh
similarity index 50%
rename from android/scripts/cmake_android.sh
rename to platforms/scripts/cmake_android_arm.sh
index 101ba3cee8..84c88a8159 100755
--- a/android/scripts/cmake_android.sh
+++ b/platforms/scripts/cmake_android_arm.sh
@@ -1,8 +1,7 @@
 #!/bin/sh
 cd `dirname $0`/..
 
-mkdir -p build
-cd build
-
-cmake -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_TOOLCHAIN_FILE=../android.toolchain.cmake $@ ../..
+mkdir -p build_android_arm
+cd build_android_arm
 
+cmake -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_TOOLCHAIN_FILE=../android/android.toolchain.cmake $@ ../..
diff --git a/platforms/scripts/cmake_android_mips.sh b/platforms/scripts/cmake_android_mips.sh
new file mode 100755
index 0000000000..6bc7944b6d
--- /dev/null
+++ b/platforms/scripts/cmake_android_mips.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+cd `dirname $0`/..
+
+mkdir -p build_android_mips
+cd build_android_mips
+
+cmake -DANDROID_ABI=mips -DCMAKE_TOOLCHAIN_FILE=../android/android.toolchain.cmake $@ ../..
diff --git a/platforms/scripts/cmake_android_service.sh b/platforms/scripts/cmake_android_service.sh
new file mode 100755
index 0000000000..7ba8865b2a
--- /dev/null
+++ b/platforms/scripts/cmake_android_service.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+cd `dirname $0`/..
+
+mkdir -p build_android_service
+cd build_android_service
+
+cmake -DCMAKE_TOOLCHAIN_FILE=../android/android.toolchain.cmake -DANDROID_TOOLCHAIN_NAME="arm-linux-androideabi-4.4.3" -DANDROID_STL=stlport_static -DANDROID_STL_FORCE_FEATURES=OFF -DBUILD_ANDROID_SERVICE=ON -DANDROID_SOURCE_TREE=~/Projects/AndroidSource/ServiceStub/ $@ ../..
diff --git a/platforms/scripts/cmake_android_x86.sh b/platforms/scripts/cmake_android_x86.sh
new file mode 100755
index 0000000000..8fb8abda7e
--- /dev/null
+++ b/platforms/scripts/cmake_android_x86.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+cd `dirname $0`/..
+
+mkdir -p build_android_x86
+cd build_android_x86
+
+cmake -DANDROID_ABI=x86 -DCMAKE_TOOLCHAIN_FILE=../android/android.toolchain.cmake $@ ../..
diff --git a/platforms/scripts/cmake_arm_gnueabi_hardfp.sh b/platforms/scripts/cmake_arm_gnueabi_hardfp.sh
new file mode 100755
index 0000000000..1fce4f9dc1
--- /dev/null
+++ b/platforms/scripts/cmake_arm_gnueabi_hardfp.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+cd `dirname $0`/..
+
+mkdir -p build_linux_arm_hardfp
+cd build_linux_arm_hardfp
+
+cmake -DCMAKE_TOOLCHAIN_FILE=../linux/arm-gnueabi.toolchain.cmake $@ ../..
diff --git a/platforms/scripts/cmake_arm_gnueabi_softfp.sh b/platforms/scripts/cmake_arm_gnueabi_softfp.sh
new file mode 100755
index 0000000000..734348907c
--- /dev/null
+++ b/platforms/scripts/cmake_arm_gnueabi_softfp.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+cd `dirname $0`/..
+
+mkdir -p build_linux_arm_softfp
+cd build_linux_arm_softfp
+
+cmake -DSOFTFP=ON -DCMAKE_TOOLCHAIN_FILE=../linux/arm-gnueabi.toolchain.cmake $@ ../..
diff --git a/platforms/linux/scripts/cmake_carma.sh b/platforms/scripts/cmake_carma.sh
similarity index 100%
rename from platforms/linux/scripts/cmake_carma.sh
rename to platforms/scripts/cmake_carma.sh
diff --git a/platforms/scripts/cmake_winrt.cmd b/platforms/scripts/cmake_winrt.cmd
new file mode 100644
index 0000000000..df70e856c5
--- /dev/null
+++ b/platforms/scripts/cmake_winrt.cmd
@@ -0,0 +1,6 @@
+mkdir build_winrt_arm
+cd build_winrt_arm
+
+rem call "C:\Program Files\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+
+cmake.exe -GNinja -DWITH_TBB=ON -DBUILD_TBB=ON -DCMAKE_BUILD_TYPE=Release -DWITH_FFMPEG=OFF -DBUILD_opencv_gpu=OFF -DBUILD_opencv_python=OFF -DCMAKE_TOOLCHAIN_FILE=..\winrt\arm.winrt.toolchain.cmake ..\..
diff --git a/platforms/winrt/scripts/cmake_winrt.cmd b/platforms/winrt/scripts/cmake_winrt.cmd
deleted file mode 100644
index aafed7d09d..0000000000
--- a/platforms/winrt/scripts/cmake_winrt.cmd
+++ /dev/null
@@ -1,6 +0,0 @@
-mkdir build
-cd build
-
-rem call "C:\Program Files\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
-
-cmake.exe -GNinja -DCMAKE_BUILD_TYPE=Release -DWITH_FFMPEG=OFF -DBUILD_opencv_gpu=OFF -DBUILD_opencv_python=OFF -DCMAKE_TOOLCHAIN_FILE=..\..\winrt\arm.winrt.toolchain.cmake ..\..\..
diff --git a/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java b/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java
index 23727739e4..b06b2cc1c5 100644
--- a/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java
+++ b/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java
@@ -215,9 +215,9 @@ public class FdActivity extends Activity implements CvCameraViewListener2 {
         else if (item == mItemFace20)
             setMinFaceSize(0.2f);
         else if (item == mItemType) {
-            mDetectorType = (mDetectorType + 1) % mDetectorName.length;
-            item.setTitle(mDetectorName[mDetectorType]);
-            setDetectorType(mDetectorType);
+            int tmpDetectorType = (mDetectorType + 1) % mDetectorName.length;
+            item.setTitle(mDetectorName[tmpDetectorType]);
+            setDetectorType(tmpDetectorType);
         }
         return true;
     }
diff --git a/samples/android/native-activity/.cproject b/samples/android/native-activity/.cproject
index 09687f3ac0..44aadfe9af 100644
--- a/samples/android/native-activity/.cproject
+++ b/samples/android/native-activity/.cproject
@@ -1,75 +1,61 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?>
-
-<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
-	<storageModule moduleId="org.eclipse.cdt.core.settings">
-		<cconfiguration id="0.129633445">
-			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="0.129633445" moduleId="org.eclipse.cdt.core.settings" name="Default">
-				<externalSettings/>
-				<extensions>
-					<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-				</extensions>
-			</storageModule>
-			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
-				<configuration artifactName="${ProjName}" buildProperties="" description="" id="0.129633445" name="Default" parent="org.eclipse.cdt.build.core.prefbase.cfg">
-					<folderInfo id="0.129633445." name="/" resourcePath="">
-						<toolChain id="org.eclipse.cdt.build.core.prefbase.toolchain.2006441180" name="No ToolChain" resourceTypeBasedDiscovery="false" superClass="org.eclipse.cdt.build.core.prefbase.toolchain">
-							<targetPlatform id="org.eclipse.cdt.build.core.prefbase.toolchain.2006441180.527973180" name=""/>
-							<builder autoBuildTarget="" command="${NDKROOT}/ndk-build.cmd" enableAutoBuild="true" enableCleanBuild="false" id="org.eclipse.cdt.build.core.settings.default.builder.180541221" incrementalBuildTarget="" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="org.eclipse.cdt.build.core.settings.default.builder"/>
-							<tool id="org.eclipse.cdt.build.core.settings.holder.libs.791069665" name="holder for library settings" superClass="org.eclipse.cdt.build.core.settings.holder.libs"/>
-							<tool id="org.eclipse.cdt.build.core.settings.holder.1894181736" name="Assembly" superClass="org.eclipse.cdt.build.core.settings.holder">
-								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.588929884" languageId="org.eclipse.cdt.core.assembly" languageName="Assembly" sourceContentType="org.eclipse.cdt.core.asmSource" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
-							</tool>
-							<tool id="org.eclipse.cdt.build.core.settings.holder.303359177" name="GNU C++" superClass="org.eclipse.cdt.build.core.settings.holder">
-								<option id="org.eclipse.cdt.build.core.settings.holder.incpaths.373249505" name="Include Paths" superClass="org.eclipse.cdt.build.core.settings.holder.incpaths" valueType="includePath">
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/platforms/android-9/arch-arm/usr/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${ProjDirPath}/../../sdk/native/jni/include&quot;"/>
-								</option>
-								<option id="org.eclipse.cdt.build.core.settings.holder.symbols.1424359063" name="Symbols" superClass="org.eclipse.cdt.build.core.settings.holder.symbols" valueType="definedSymbols">
-									<listOptionValue builtIn="false" value="ANDROID=1"/>
-								</option>
-								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.360067880" languageId="org.eclipse.cdt.core.g++" languageName="GNU C++" sourceContentType="org.eclipse.cdt.core.cxxSource,org.eclipse.cdt.core.cxxHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
-							</tool>
-							<tool id="org.eclipse.cdt.build.core.settings.holder.1156172258" name="GNU C" superClass="org.eclipse.cdt.build.core.settings.holder">
-								<option id="org.eclipse.cdt.build.core.settings.holder.incpaths.149918263" name="Include Paths" superClass="org.eclipse.cdt.build.core.settings.holder.incpaths" valueType="includePath">
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/platforms/android-9/arch-arm/usr/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${ProjDirPath}/../../sdk/native/jni/include&quot;"/>
-								</option>
-								<option id="org.eclipse.cdt.build.core.settings.holder.symbols.719752707" name="Symbols" superClass="org.eclipse.cdt.build.core.settings.holder.symbols" valueType="definedSymbols">
-									<listOptionValue builtIn="false" value="ANDROID=1"/>
-								</option>
-								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.232493949" languageId="org.eclipse.cdt.core.gcc" languageName="GNU C" sourceContentType="org.eclipse.cdt.core.cSource,org.eclipse.cdt.core.cHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
-							</tool>
-						</toolChain>
-					</folderInfo>
-					<sourceEntries>
-						<entry flags="VALUE_WORKSPACE_PATH" kind="sourcePath" name="jni"/>
-					</sourceEntries>
-				</configuration>
-			</storageModule>
-			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
-		</cconfiguration>
-	</storageModule>
-	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
-		<project id="OpenCV Sample - face-detection.null.1639518055" name="OpenCV Sample - face-detection"/>
-	</storageModule>
-	<storageModule moduleId="scannerConfiguration">
-		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
-		<scannerConfigBuildInfo instanceId="0.129633445">
-			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
-		</scannerConfigBuildInfo>
-	</storageModule>
-	<storageModule moduleId="refreshScope" versionNumber="1">
-		<resource resourceType="PROJECT" workspacePath="/OpenCV Sample - face-detection"/>
-	</storageModule>
-	<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
-</cproject>
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?>
+
+<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+	<storageModule moduleId="org.eclipse.cdt.core.settings">
+		<cconfiguration id="0.882924228">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="0.882924228" moduleId="org.eclipse.cdt.core.settings" name="Default">
+				<externalSettings/>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration artifactName="${ProjName}" buildProperties="" description="" id="0.882924228" name="Default" parent="org.eclipse.cdt.build.core.prefbase.cfg">
+					<folderInfo id="0.882924228." name="/" resourcePath="">
+						<toolChain id="org.eclipse.cdt.build.core.prefbase.toolchain.1667980868" name="No ToolChain" resourceTypeBasedDiscovery="false" superClass="org.eclipse.cdt.build.core.prefbase.toolchain">
+							<targetPlatform id="org.eclipse.cdt.build.core.prefbase.toolchain.1667980868.2108168132" name=""/>
+							<builder autoBuildTarget="" command="&quot;${NDKROOT}/ndk-build.cmd&quot;" enableAutoBuild="true" enableCleanBuild="false" id="org.eclipse.cdt.build.core.settings.default.builder.328915772" incrementalBuildTarget="" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="org.eclipse.cdt.build.core.settings.default.builder"/>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.libs.630148311" name="holder for library settings" superClass="org.eclipse.cdt.build.core.settings.holder.libs"/>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.525090327" name="Assembly" superClass="org.eclipse.cdt.build.core.settings.holder">
+								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.1491216279" languageId="org.eclipse.cdt.core.assembly" languageName="Assembly" sourceContentType="org.eclipse.cdt.core.asmSource" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
+							</tool>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.1242729366" name="GNU C++" superClass="org.eclipse.cdt.build.core.settings.holder">
+								<option id="org.eclipse.cdt.build.core.settings.holder.incpaths.881377735" name="Include Paths" superClass="org.eclipse.cdt.build.core.settings.holder.incpaths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/platforms/android-9/arch-arm/usr/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${ProjDirPath}/../../sdk/native/jni/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/android/native_app_glue&quot;"/>
+								</option>
+								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.273216997" languageId="org.eclipse.cdt.core.g++" languageName="GNU C++" sourceContentType="org.eclipse.cdt.core.cxxSource,org.eclipse.cdt.core.cxxHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
+							</tool>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.1779128177" name="GNU C" superClass="org.eclipse.cdt.build.core.settings.holder">
+								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.1778510041" languageId="org.eclipse.cdt.core.gcc" languageName="GNU C" sourceContentType="org.eclipse.cdt.core.cSource,org.eclipse.cdt.core.cHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
+							</tool>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+	</storageModule>
+	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+		<project id="CvNativeActivity.null.708321898" name="CvNativeActivity"/>
+	</storageModule>
+	<storageModule moduleId="scannerConfiguration">
+		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		<scannerConfigBuildInfo instanceId="0.882924228">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+	</storageModule>
+	<storageModule moduleId="refreshScope" versionNumber="1">
+		<resource resourceType="PROJECT" workspacePath="/CvNativeActivity"/>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
+</cproject>
diff --git a/samples/android/native-activity/.project b/samples/android/native-activity/.project
index cf0823c0b3..c20be83f60 100644
--- a/samples/android/native-activity/.project
+++ b/samples/android/native-activity/.project
@@ -5,6 +5,64 @@
 	<projects>
 	</projects>
 	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>auto,full,incremental,</triggers>
+			<arguments>
+				<dictionary>
+					<key>?name?</key>
+					<value></value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.append_environment</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.autoBuildTarget</key>
+					<value></value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.buildArguments</key>
+					<value></value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.buildCommand</key>
+					<value>&quot;${NDKROOT}/ndk-build.cmd&quot;</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
+					<value>clean</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.contents</key>
+					<value>org.eclipse.cdt.make.core.activeConfigSettings</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.enableAutoBuild</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.enableCleanBuild</key>
+					<value>false</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.enableFullBuild</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.fullBuildTarget</key>
+					<value></value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.stopOnError</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
+					<value>false</value>
+				</dictionary>
+			</arguments>
+		</buildCommand>
 		<buildCommand>
 			<name>com.android.ide.eclipse.adt.ResourceManagerBuilder</name>
 			<arguments>
@@ -25,9 +83,19 @@
 			<arguments>
 			</arguments>
 		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
 	</buildSpec>
 	<natures>
 		<nature>com.android.ide.eclipse.adt.AndroidNature</nature>
 		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
 	</natures>
 </projectDescription>
diff --git a/samples/android/native-activity/jni/native.cpp b/samples/android/native-activity/jni/native.cpp
index 66bc006db1..5cfb3a9611 100644
--- a/samples/android/native-activity/jni/native.cpp
+++ b/samples/android/native-activity/jni/native.cpp
@@ -9,7 +9,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
-#include <float.h>
 #include <queue>
 
 #include <opencv2/core/core.hpp>
@@ -60,7 +59,7 @@ static cv::Size calc_optimal_camera_resolution(const char* supported, int width,
             }
         }
 
-        idx++; // to skip coma symbol
+        idx++; // to skip comma symbol
 
     } while(supported[idx-1] != '\0');
 
@@ -86,9 +85,9 @@ static void engine_draw_frame(Engine* engine, const cv::Mat& frame)
 
     for (int yy = top_indent; yy < std::min(frame.rows+top_indent, buffer.height); yy++)
     {
-        unsigned char* line = (unsigned char*)pixels;
-        memcpy(line+left_indent*4*sizeof(unsigned char), frame.ptr<unsigned char>(yy),
-               std::min(frame.cols, buffer.width)*4*sizeof(unsigned char));
+        unsigned char* line = (unsigned char*)pixels + left_indent*4*sizeof(unsigned char);
+        size_t line_size = std::min(frame.cols, buffer.width)*4*sizeof(unsigned char);
+        memcpy(line, frame.ptr<unsigned char>(yy), line_size);
         // go to next line
         pixels = (int32_t*)pixels + buffer.stride;
     }
@@ -139,7 +138,7 @@ static void engine_handle_cmd(android_app* app, int32_t cmd)
                     return;
                 }
 
-                LOGI("Camera initialized at resoution %dx%d", camera_resolution.width, camera_resolution.height);
+                LOGI("Camera initialized at resolution %dx%d", camera_resolution.width, camera_resolution.height);
             }
             break;
         case APP_CMD_TERM_WINDOW:
@@ -157,7 +156,8 @@ void android_main(android_app* app)
     // Make sure glue isn't stripped.
     app_dummy();
 
-    memset(&engine, 0, sizeof(engine));
+    size_t engine_size = sizeof(engine); // for Eclipse CDT parser
+    memset((void*)&engine, 0, engine_size);
     app->userData = &engine;
     app->onAppCmd = engine_handle_cmd;
     engine.app = app;
diff --git a/samples/cpp/latentsvm_multidetect.cpp b/samples/cpp/latentsvm_multidetect.cpp
index d2105122ab..619c54b849 100644
--- a/samples/cpp/latentsvm_multidetect.cpp
+++ b/samples/cpp/latentsvm_multidetect.cpp
@@ -3,7 +3,7 @@
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/contrib/contrib.hpp"
 
-#ifdef WIN32
+#if defined(WIN32) || defined(_WIN32)
 #include <io.h>
 #else
 #include <dirent.h>
@@ -67,7 +67,7 @@ static void readDirectory( const string& directoryName, vector<string>& filename
 {
     filenames.clear();
 
-#ifdef WIN32
+#if defined(WIN32) | defined(_WIN32)
     struct _finddata_t s_file;
     string str = directoryName + "\\*.*";
 
diff --git a/samples/cpp/tutorial_code/features2D/SURF_FlannMatcher.cpp b/samples/cpp/tutorial_code/features2D/SURF_FlannMatcher.cpp
index f4cde9b2ee..ead7fd7182 100644
--- a/samples/cpp/tutorial_code/features2D/SURF_FlannMatcher.cpp
+++ b/samples/cpp/tutorial_code/features2D/SURF_FlannMatcher.cpp
@@ -70,7 +70,7 @@ int main( int argc, char** argv )
   std::vector< DMatch > good_matches;
 
   for( int i = 0; i < descriptors_1.rows; i++ )
-  { if( matches[i].distance < 2*min_dist )
+  { if( matches[i].distance <= 2*min_dist )
     { good_matches.push_back( matches[i]); }
   }
 
diff --git a/samples/gpu/bgfg_segm.cpp b/samples/gpu/bgfg_segm.cpp
index a77d336a9e..6963e75ff8 100644
--- a/samples/gpu/bgfg_segm.cpp
+++ b/samples/gpu/bgfg_segm.cpp
@@ -1,15 +1,10 @@
 #include <iostream>
 #include <string>
 
-#include "opencv2/opencv_modules.hpp"
 #include "opencv2/core/core.hpp"
 #include "opencv2/gpu/gpu.hpp"
 #include "opencv2/highgui/highgui.hpp"
 
-#ifdef HAVE_OPENCV_NONFREE
-#include "opencv2/nonfree/gpu.hpp"
-#endif
-
 using namespace std;
 using namespace cv;
 using namespace cv::gpu;
@@ -19,9 +14,6 @@ enum Method
     FGD_STAT,
     MOG,
     MOG2,
-#ifdef HAVE_OPENCV_NONFREE
-    VIBE,
-#endif
     GMG
 };
 
@@ -30,7 +22,7 @@ int main(int argc, const char** argv)
     cv::CommandLineParser cmd(argc, argv,
         "{ c | camera | false       | use camera }"
         "{ f | file   | 768x576.avi | input video file }"
-        "{ m | method | mog         | method (fgd, mog, mog2, vibe, gmg) }"
+        "{ m | method | mog         | method (fgd, mog, mog2, gmg) }"
         "{ h | help   | false       | print help message }");
 
     if (cmd.get<bool>("help"))
@@ -48,9 +40,6 @@ int main(int argc, const char** argv)
     if (method != "fgd"
         && method != "mog"
         && method != "mog2"
-    #ifdef HAVE_OPENCV_NONFREE
-        && method != "vibe"
-    #endif
         && method != "gmg")
     {
         cerr << "Incorrect method" << endl;
@@ -60,9 +49,6 @@ int main(int argc, const char** argv)
     Method m = method == "fgd" ? FGD_STAT :
                method == "mog" ? MOG :
                method == "mog2" ? MOG2 :
-            #ifdef HAVE_OPENCV_NONFREE
-               method == "vibe" ? VIBE :
-            #endif
                                   GMG;
 
     VideoCapture cap;
@@ -86,9 +72,6 @@ int main(int argc, const char** argv)
     FGDStatModel fgd_stat;
     MOG_GPU mog;
     MOG2_GPU mog2;
-#ifdef HAVE_OPENCV_NONFREE
-    VIBE_GPU vibe;
-#endif
     GMG_GPU gmg;
     gmg.numInitializationFrames = 40;
 
@@ -114,12 +97,6 @@ int main(int argc, const char** argv)
         mog2(d_frame, d_fgmask);
         break;
 
-#ifdef HAVE_OPENCV_NONFREE
-    case VIBE:
-        vibe.initialize(d_frame);
-        break;
-#endif
-
     case GMG:
         gmg.initialize(d_frame.size());
         break;
@@ -128,11 +105,7 @@ int main(int argc, const char** argv)
     namedWindow("image", WINDOW_NORMAL);
     namedWindow("foreground mask", WINDOW_NORMAL);
     namedWindow("foreground image", WINDOW_NORMAL);
-    if (m != GMG
-    #ifdef HAVE_OPENCV_NONFREE
-        && m != VIBE
-    #endif
-        )
+    if (m != GMG)
     {
         namedWindow("mean background image", WINDOW_NORMAL);
     }
@@ -165,12 +138,6 @@ int main(int argc, const char** argv)
             mog2.getBackgroundImage(d_bgimg);
             break;
 
-#ifdef HAVE_OPENCV_NONFREE
-        case VIBE:
-            vibe(d_frame, d_fgmask);
-            break;
-#endif
-
         case GMG:
             gmg(d_frame, d_fgmask);
             break;
diff --git a/samples/gpu/cascadeclassifier_nvidia_api.cpp b/samples/gpu/cascadeclassifier_nvidia_api.cpp
index 99c95ab977..98195b35c2 100644
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@@ -17,12 +17,21 @@ using namespace std;
 using namespace cv;
 
 
-#if !defined(HAVE_CUDA)
+#if !defined(HAVE_CUDA) || defined(__arm__)
+
 int main( int, const char** )
 {
-    cout << "Please compile the library with CUDA support" << endl;
-    return -1;
+#if !defined(HAVE_CUDA)
+    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true)." << std::endl;
+#endif
+
+#if defined(__arm__)
+    std::cout << "Unsupported for ARM CUDA library." << std::endl;
+#endif
+
+    return 0;
 }
+
 #else
 
 
diff --git a/samples/gpu/driver_api_multi.cpp b/samples/gpu/driver_api_multi.cpp
index 2d743f0e9c..c829830e72 100644
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
@@ -11,7 +11,7 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/gpu/gpu.hpp"
 
-#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+#if !defined(HAVE_CUDA) || !defined(HAVE_TBB) || defined(__arm__)
 
 int main()
 {
@@ -23,6 +23,10 @@ int main()
     std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
 #endif
 
+#if defined(__arm__)
+    std::cout << "Unsupported for ARM CUDA library." << std::endl;
+#endif
+
     return 0;
 }
 
diff --git a/samples/gpu/driver_api_stereo_multi.cpp b/samples/gpu/driver_api_stereo_multi.cpp
index 10c3974771..d4d0af451c 100644
--- a/samples/gpu/driver_api_stereo_multi.cpp
+++ b/samples/gpu/driver_api_stereo_multi.cpp
@@ -13,7 +13,7 @@
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/gpu/gpu.hpp"
 
-#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+#if !defined(HAVE_CUDA) || !defined(HAVE_TBB) || defined(__arm__)
 
 int main()
 {
@@ -25,6 +25,10 @@ int main()
     std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
 #endif
 
+#if defined(__arm__)
+    std::cout << "Unsupported for ARM CUDA library." << std::endl;
+#endif
+
     return 0;
 }
 
diff --git a/samples/ocl/aloe-L.png b/samples/ocl/aloe-L.png
deleted file mode 100644
index 47587668e2..0000000000
Binary files a/samples/ocl/aloe-L.png and /dev/null differ
diff --git a/samples/ocl/aloe-R.png b/samples/ocl/aloe-R.png
deleted file mode 100644
index 5d11c57a9e..0000000000
Binary files a/samples/ocl/aloe-R.png and /dev/null differ
diff --git a/samples/ocl/aloe-disp.png b/samples/ocl/aloe-disp.png
deleted file mode 100644
index dd4a499bed..0000000000
Binary files a/samples/ocl/aloe-disp.png and /dev/null differ
diff --git a/samples/ocl/facedetect.cpp b/samples/ocl/facedetect.cpp
index ec79339518..a49610aeb7 100644
--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@@ -1,5 +1,3 @@
-//This sample is inherited from facedetect.cpp in smaple/c
-
 #include "opencv2/objdetect/objdetect.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
@@ -9,78 +7,97 @@
 
 using namespace std;
 using namespace cv;
+#define LOOP_NUM 10
 
-static void help()
+const static Scalar colors[] =  { CV_RGB(0,0,255),
+                                  CV_RGB(0,128,255),
+                                  CV_RGB(0,255,255),
+                                  CV_RGB(0,255,0),
+                                  CV_RGB(255,128,0),
+                                  CV_RGB(255,255,0),
+                                  CV_RGB(255,0,0),
+                                  CV_RGB(255,0,255)
+                                } ;
+
+
+int64 work_begin = 0;
+int64 work_end = 0;
+string outputName;
+
+static void workBegin()
 {
-    cout << "\nThis program demonstrates the cascade recognizer.\n"
-        "This classifier can recognize many ~rigid objects, it's most known use is for faces.\n"
-        "Usage:\n"
-        "./facedetect [--cascade=<cascade_path> this is the primary trained classifier such as frontal face]\n"
-        "   [--scale=<image scale greater or equal to 1, try 1.3 for example>\n"
-        "   [filename|camera_index]\n\n"
-        "see facedetect.cmd for one call:\n"
-        "./facedetect --cascade=\"../../data/haarcascades/haarcascade_frontalface_alt.xml\" --scale=1.3 \n"
-        "Hit any key to quit.\n"
-        "Using OpenCV version " << CV_VERSION << "\n" << endl;
+    work_begin = getTickCount();
+}
+static void workEnd()
+{
+    work_end += (getTickCount() - work_begin);
+}
+static double getTime()
+{
+    return work_end /((double)cvGetTickFrequency() * 1000.);
 }
-struct getRect { Rect operator ()(const CvAvgComp& e) const { return e.rect; } };
-void detectAndDraw( Mat& img,
-    cv::ocl::OclCascadeClassifier& cascade, CascadeClassifier& nestedCascade,
-    double scale);
 
-String cascadeName = "../../../data/haarcascades/haarcascade_frontalface_alt.xml";
+
+void detect( Mat& img, vector<Rect>& faces,
+             ocl::OclCascadeClassifierBuf& cascade,
+             double scale, bool calTime);
+
+
+void detectCPU( Mat& img, vector<Rect>& faces,
+                CascadeClassifier& cascade,
+                double scale, bool calTime);
+
+
+void Draw(Mat& img, vector<Rect>& faces, double scale);
+
+
+// This function test if gpu_rst matches cpu_rst.
+// If the two vectors are not equal, it will return the difference in vector size
+// Else if will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
+double checkRectSimilarity(Size sz, vector<Rect>& cpu_rst, vector<Rect>& gpu_rst);
+
 
 int main( int argc, const char** argv )
 {
+    const char* keys =
+        "{ h | help       | false       | print help message }"
+        "{ i | input      |             | specify input image }"
+        "{ t | template   | haarcascade_frontalface_alt.xml |"
+        " specify template file path }"
+        "{ c | scale      |   1.0       | scale image }"
+        "{ s | use_cpu    | false       | use cpu or gpu to process the image }"
+        "{ o | output     | facedetect_output.jpg  |"
+        " specify output image save path(only works when input is images) }";
+
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.get<bool>("help"))
+    {
+        cout << "Avaible options:" << endl;
+        cmd.printParams();
+        return 0;
+    }
     CvCapture* capture = 0;
     Mat frame, frameCopy, image;
-    const String scaleOpt = "--scale=";
-    size_t scaleOptLen = scaleOpt.length();
-    const String cascadeOpt = "--cascade=";
-    size_t cascadeOptLen = cascadeOpt.length();
-    String inputName;
 
-    help();
-    cv::ocl::OclCascadeClassifier cascade;
-    CascadeClassifier  nestedCascade;
-    double scale = 1;
+    bool useCPU = cmd.get<bool>("s");
+    string inputName = cmd.get<string>("i");
+    outputName = cmd.get<string>("o");
+    string cascadeName = cmd.get<string>("t");
+    double scale = cmd.get<double>("c");
+    ocl::OclCascadeClassifierBuf cascade;
+    CascadeClassifier  cpu_cascade;
 
-    for( int i = 1; i < argc; i++ )
-    {
-        cout << "Processing " << i << " " <<  argv[i] << endl;
-        if( cascadeOpt.compare( 0, cascadeOptLen, argv[i], cascadeOptLen ) == 0 )
-        {
-            cascadeName.assign( argv[i] + cascadeOptLen );
-            cout << "  from which we have cascadeName= " << cascadeName << endl;
-        }
-        else if( scaleOpt.compare( 0, scaleOptLen, argv[i], scaleOptLen ) == 0 )
-        {
-            if( !sscanf( argv[i] + scaleOpt.length(), "%lf", &scale ) || scale < 1 )
-                scale = 1;
-            cout << " from which we read scale = " << scale << endl;
-        }
-        else if( argv[i][0] == '-' )
-        {
-            cerr << "WARNING: Unknown option %s" << argv[i] << endl;
-        }
-        else
-            inputName.assign( argv[i] );
-    }
-
-    if( !cascade.load( cascadeName ) )
+    if( !cascade.load( cascadeName ) || !cpu_cascade.load(cascadeName) )
     {
         cerr << "ERROR: Could not load classifier cascade" << endl;
-        cerr << "Usage: facedetect [--cascade=<cascade_path>]\n"
-            "   [--scale[=<image scale>\n"
-            "   [filename|camera_index]\n" << endl ;
         return -1;
     }
 
-    if( inputName.empty() || (isdigit(inputName.c_str()[0]) && inputName.c_str()[1] == '\0') )
+    if( inputName.empty() )
     {
-        capture = cvCaptureFromCAM( inputName.empty() ? 0 : inputName.c_str()[0] - '0' );
-        int c = inputName.empty() ? 0 : inputName.c_str()[0] - '0' ;
-        if(!capture) cout << "Capture from CAM " <<  c << " didn't work" << endl;
+        capture = cvCaptureFromCAM(0);
+        if(!capture)
+            cout << "Capture from CAM 0 didn't work" << endl;
     }
     else if( inputName.size() )
     {
@@ -88,26 +105,31 @@ int main( int argc, const char** argv )
         if( image.empty() )
         {
             capture = cvCaptureFromAVI( inputName.c_str() );
-            if(!capture) cout << "Capture from AVI didn't work" << endl;
+            if(!capture)
+                cout << "Capture from AVI didn't work" << endl;
+            return -1;
         }
     }
     else
     {
         image = imread( "lena.jpg", 1 );
-        if(image.empty()) cout << "Couldn't read lena.jpg" << endl;
+        if(image.empty())
+            cout << "Couldn't read lena.jpg" << endl;
+        return -1;
     }
 
+
     cvNamedWindow( "result", 1 );
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = cv::ocl::getDevice(oclinfo);
-    if(devnums<1)
+    vector<ocl::Info> oclinfo;
+    int devnums = ocl::getDevice(oclinfo);
+    if( devnums < 1 )
     {
         std::cout << "no device found\n";
         return -1;
     }
     //if you want to use undefault device, set it here
     //setDevice(oclinfo[0]);
-    //setBinpath(CLBINPATH);
+    ocl::setBinpath("./");
     if( capture )
     {
         cout << "In capture ..." << endl;
@@ -115,108 +137,113 @@ int main( int argc, const char** argv )
         {
             IplImage* iplImg = cvQueryFrame( capture );
             frame = iplImg;
+            vector<Rect> faces;
             if( frame.empty() )
                 break;
             if( iplImg->origin == IPL_ORIGIN_TL )
                 frame.copyTo( frameCopy );
             else
                 flip( frame, frameCopy, 0 );
-
-            detectAndDraw( frameCopy, cascade, nestedCascade, scale );
-
+            if(useCPU)
+            {
+                detectCPU(frameCopy, faces, cpu_cascade, scale, false);
+            }
+            else
+            {
+                detect(frameCopy, faces, cascade, scale, false);
+            }
+            Draw(frameCopy, faces, scale);
             if( waitKey( 10 ) >= 0 )
                 goto _cleanup_;
         }
 
+
         waitKey(0);
 
+
 _cleanup_:
         cvReleaseCapture( &capture );
     }
     else
     {
         cout << "In image read" << endl;
-        if( !image.empty() )
+        vector<Rect> faces;
+        vector<Rect> ref_rst;
+        double accuracy = 0.;
+        for(int i = 0; i <= LOOP_NUM; i ++)
         {
-            detectAndDraw( image, cascade, nestedCascade, scale );
-            waitKey(0);
-        }
-        else if( !inputName.empty() )
-        {
-            /* assume it is a text file containing the
-            list of the image filenames to be processed - one per line */
-            FILE* f = fopen( inputName.c_str(), "rt" );
-            if( f )
+            cout << "loop" << i << endl;
+            if(useCPU)
             {
-                char buf[1000+1];
-                while( fgets( buf, 1000, f ) )
+                detectCPU(image, faces, cpu_cascade, scale, i==0?false:true);
+            }
+            else
+            {
+                detect(image, faces, cascade, scale, i==0?false:true);
+                if(i == 0)
                 {
-                    int len = (int)strlen(buf), c;
-                    while( len > 0 && isspace(buf[len-1]) )
-                        len--;
-                    buf[len] = '\0';
-                    cout << "file " << buf << endl;
-                    image = imread( buf, 1 );
-                    if( !image.empty() )
-                    {
-                        detectAndDraw( image, cascade, nestedCascade, scale );
-                        c = waitKey(0);
-                        if( c == 27 || c == 'q' || c == 'Q' )
-                            break;
-                    }
-                    else
-                    {
-                        cerr << "Aw snap, couldn't read image " << buf << endl;
-                    }
+                    detectCPU(image, ref_rst, cpu_cascade, scale, false);
+                    accuracy = checkRectSimilarity(image.size(), ref_rst, faces);
                 }
-                fclose(f);
+            }
+            if (i == LOOP_NUM)
+            {
+                if (useCPU)
+                    cout << "average CPU time (noCamera) : ";
+                else
+                    cout << "average GPU time (noCamera) : ";
+                cout << getTime() / LOOP_NUM << " ms" << endl;
+                cout << "accuracy value: " << accuracy <<endl;
             }
         }
+        Draw(image, faces, scale);
+        waitKey(0);
     }
 
     cvDestroyWindow("result");
-
     return 0;
 }
 
-void detectAndDraw( Mat& img,
-    cv::ocl::OclCascadeClassifier& cascade, CascadeClassifier&,
-    double scale)
+void detect( Mat& img, vector<Rect>& faces,
+             ocl::OclCascadeClassifierBuf& cascade,
+             double scale, bool calTime)
+{
+    ocl::oclMat image(img);
+    ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
+    if(calTime) workBegin();
+    ocl::cvtColor( image, gray, CV_BGR2GRAY );
+    ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
+    ocl::equalizeHist( smallImg, smallImg );
+
+    cascade.detectMultiScale( smallImg, faces, 1.1,
+                              3, 0
+                              |CV_HAAR_SCALE_IMAGE
+                              , Size(30,30), Size(0, 0) );
+    if(calTime) workEnd();
+}
+
+
+void detectCPU( Mat& img, vector<Rect>& faces,
+                CascadeClassifier& cascade,
+                double scale, bool calTime)
+{
+    if(calTime) workBegin();
+    Mat cpu_gray, cpu_smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
+    cvtColor(img, cpu_gray, CV_BGR2GRAY);
+    resize(cpu_gray, cpu_smallImg, cpu_smallImg.size(), 0, 0, INTER_LINEAR);
+    equalizeHist(cpu_smallImg, cpu_smallImg);
+    cascade.detectMultiScale(cpu_smallImg, faces, 1.1,
+                             3, 0 | CV_HAAR_SCALE_IMAGE,
+                             Size(30, 30), Size(0, 0));
+    if(calTime) workEnd();
+}
+
+
+void Draw(Mat& img, vector<Rect>& faces, double scale)
 {
     int i = 0;
-    double t = 0;
-    vector<Rect> faces;
-    const static Scalar colors[] =  { CV_RGB(0,0,255),
-        CV_RGB(0,128,255),
-        CV_RGB(0,255,255),
-        CV_RGB(0,255,0),
-        CV_RGB(255,128,0),
-        CV_RGB(255,255,0),
-        CV_RGB(255,0,0),
-        CV_RGB(255,0,255)} ;
-    cv::ocl::oclMat image(img);
-    cv::ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
-
-    cv::ocl::cvtColor( image, gray, CV_BGR2GRAY );
-    cv::ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    cv::ocl::equalizeHist( smallImg, smallImg );
-
-    CvSeq* _objects;
-    MemStorage storage(cvCreateMemStorage(0));
-    t = (double)cvGetTickCount();
-    _objects = cascade.oclHaarDetectObjects( smallImg, storage, 1.1,
-        3, 0
-        |CV_HAAR_SCALE_IMAGE
-        , Size(30,30), Size(0, 0) );
-    vector<CvAvgComp> vecAvgComp;
-    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-    faces.resize(vecAvgComp.size());
-    std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
-    t = (double)cvGetTickCount() - t;
-    printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
     for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
     {
-        Mat smallImgROI;
         Point center;
         Scalar color = colors[i%8];
         int radius;
@@ -225,5 +252,53 @@ void detectAndDraw( Mat& img,
         radius = cvRound((r->width + r->height)*0.25*scale);
         circle( img, center, radius, color, 3, 8, 0 );
     }
-    cv::imshow( "result", img );
+    imshow( "result", img );
+    imwrite( outputName, img );
+}
+
+
+double checkRectSimilarity(Size sz, vector<Rect>& ob1, vector<Rect>& ob2)
+{
+    double final_test_result = 0.0;
+    size_t sz1 = ob1.size();
+    size_t sz2 = ob2.size();
+
+    if(sz1 != sz2)
+    {
+        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+    }
+    else
+    {
+        if(sz1==0 && sz2==0)
+            return 0;
+        Mat cpu_result(sz, CV_8UC1);
+        cpu_result.setTo(0);
+
+        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
+        {
+            Mat cpu_result_roi(cpu_result, *r);
+            cpu_result_roi.setTo(1);
+            cpu_result.copyTo(cpu_result);
+        }
+        int cpu_area = countNonZero(cpu_result > 0);
+
+
+        Mat gpu_result(sz, CV_8UC1);
+        gpu_result.setTo(0);
+        for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
+        {
+            cv::Mat gpu_result_roi(gpu_result, *r2);
+            gpu_result_roi.setTo(1);
+            gpu_result.copyTo(gpu_result);
+        }
+
+        Mat result_;
+        multiply(cpu_result, gpu_result, result_);
+        int result = countNonZero(result_ > 0);
+        if(cpu_area!=0 && result!=0)
+            final_test_result = 1.0 - (double)result/(double)cpu_area;
+        else if(cpu_area==0 && result!=0)
+            final_test_result = -1;
+    }
+    return final_test_result;
 }
diff --git a/samples/ocl/hog.cpp b/samples/ocl/hog.cpp
index 76b6d2830e..ff53e010cf 100644
--- a/samples/ocl/hog.cpp
+++ b/samples/ocl/hog.cpp
@@ -10,69 +10,39 @@
 using namespace std;
 using namespace cv;
 
-bool help_showed = false;
-
-class Args
-{
-public:
-    Args();
-    static Args read(int argc, char** argv);
-
-    string src;
-    bool src_is_video;
-    bool src_is_camera;
-    int camera_id;
-
-    bool write_video;
-    string dst_video;
-    double dst_video_fps;
-
-    bool make_gray;
-
-    bool resize_src;
-    int width, height;
-
-    double scale;
-    int nlevels;
-    int gr_threshold;
-
-    double hit_threshold;
-    bool hit_threshold_auto;
-
-    int win_width;
-    int win_stride_width, win_stride_height;
-
-    bool gamma_corr;
-};
-
-
 class App
 {
 public:
-    App(const Args& s);
+    App(CommandLineParser& cmd);
     void run();
-
     void handleKey(char key);
-
     void hogWorkBegin();
     void hogWorkEnd();
     string hogWorkFps() const;
-
     void workBegin();
     void workEnd();
     string workFps() const;
-
     string message() const;
 
+
+// This function test if gpu_rst matches cpu_rst.
+// If the two vectors are not equal, it will return the difference in vector size
+// Else if will return
+// (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
+    double checkRectSimilarity(Size sz,
+                               std::vector<Rect>& cpu_rst,
+                               std::vector<Rect>& gpu_rst);
 private:
     App operator=(App&);
 
-    Args args;
+    //Args args;
     bool running;
-
     bool use_gpu;
     bool make_gray;
     double scale;
+    double resize_scale;
+    int win_width;
+    int win_stride_width, win_stride_height;
     int gr_threshold;
     int nlevels;
     double hit_threshold;
@@ -80,119 +50,49 @@ private:
 
     int64 hog_work_begin;
     double hog_work_fps;
-
     int64 work_begin;
     double work_fps;
-};
 
-static void printHelp()
-{
-    cout << "Histogram of Oriented Gradients descriptor and detector sample.\n"
-         << "\nUsage: hog_gpu\n"
-         << "  (<image>|--video <vide>|--camera <camera_id>) # frames source\n"
-         << "  [--make_gray <true/false>] # convert image to gray one or not\n"
-         << "  [--resize_src <true/false>] # do resize of the source image or not\n"
-         << "  [--width <int>] # resized image width\n"
-         << "  [--height <int>] # resized image height\n"
-         << "  [--hit_threshold <double>] # classifying plane distance threshold (0.0 usually)\n"
-         << "  [--scale <double>] # HOG window scale factor\n"
-         << "  [--nlevels <int>] # max number of HOG window scales\n"
-         << "  [--win_width <int>] # width of the window (48 or 64)\n"
-         << "  [--win_stride_width <int>] # distance by OX axis between neighbour wins\n"
-         << "  [--win_stride_height <int>] # distance by OY axis between neighbour wins\n"
-         << "  [--gr_threshold <int>] # merging similar rects constant\n"
-         << "  [--gamma_correct <int>] # do gamma correction or not\n"
-         << "  [--write_video <bool>] # write video or not\n"
-         << "  [--dst_video <path>] # output video path\n"
-         << "  [--dst_video_fps <double>] # output video fps\n";
-    help_showed = true;
-}
+    string img_source;
+    string vdo_source;
+    string output;
+    int camera_id;
+};
 
 int main(int argc, char** argv)
 {
+    const char* keys =
+        "{ h |  help    | false          | print help message }"
+        "{ i |  input   |                | specify input image}"
+        "{ c | camera   | -1             | enable camera capturing }"
+        "{ v | video    |                | use video as input }"
+        "{ g |  gray    | false          | convert image to gray one or not}"
+        "{ s |  scale   | 1.0            | resize the image before detect}"
+        "{ l |larger_win| false          | use 64x128 window}"
+        "{ o |  output  |                | specify output path when input is images}";
+    CommandLineParser cmd(argc, argv, keys);
+    App app(cmd);
     try
     {
-        if (argc < 2)
-            printHelp();
-        Args args = Args::read(argc, argv);
-        if (help_showed)
-            return -1;
-        App app(args);
         app.run();
     }
-    catch (const Exception& e) { return cout << "error: "  << e.what() << endl, 1; }
-    catch (const exception& e) { return cout << "error: "  << e.what() << endl, 1; }
-    catch(...) { return cout << "unknown exception" << endl, 1; }
+    catch (const Exception& e)
+    {
+        return cout << "error: "  << e.what() << endl, 1;
+    }
+    catch (const exception& e)
+    {
+        return cout << "error: "  << e.what() << endl, 1;
+    }
+    catch(...)
+    {
+        return cout << "unknown exception" << endl, 1;
+    }
     return 0;
 }
 
-
-Args::Args()
+App::App(CommandLineParser& cmd)
 {
-    src_is_video = false;
-    src_is_camera = false;
-    camera_id = 0;
-
-    write_video = false;
-    dst_video_fps = 24.;
-
-    make_gray = false;
-
-    resize_src = false;
-    width = 640;
-    height = 480;
-
-    scale = 1.05;
-    nlevels = 13;
-    gr_threshold = 8;
-    hit_threshold = 1.4;
-    hit_threshold_auto = true;
-
-    win_width = 48;
-    win_stride_width = 8;
-    win_stride_height = 8;
-
-    gamma_corr = true;
-}
-
-
-Args Args::read(int argc, char** argv)
-{
-    Args args;
-    for (int i = 1; i < argc; i++)
-    {
-        if (string(argv[i]) == "--make_gray") args.make_gray = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--resize_src") args.resize_src = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--width") args.width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--height") args.height = atoi(argv[++i]);
-        else if (string(argv[i]) == "--hit_threshold")
-        {
-            args.hit_threshold = atof(argv[++i]);
-            args.hit_threshold_auto = false;
-        }
-        else if (string(argv[i]) == "--scale") args.scale = atof(argv[++i]);
-        else if (string(argv[i]) == "--nlevels") args.nlevels = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_width") args.win_width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_stride_width") args.win_stride_width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_stride_height") args.win_stride_height = atoi(argv[++i]);
-        else if (string(argv[i]) == "--gr_threshold") args.gr_threshold = atoi(argv[++i]);
-        else if (string(argv[i]) == "--gamma_correct") args.gamma_corr = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--write_video") args.write_video = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--dst_video") args.dst_video = argv[++i];
-        else if (string(argv[i]) == "--dst_video_fps") args.dst_video_fps = atof(argv[++i]);
-        else if (string(argv[i]) == "--help") printHelp();
-        else if (string(argv[i]) == "--video") { args.src = argv[++i]; args.src_is_video = true; }
-        else if (string(argv[i]) == "--camera") { args.camera_id = atoi(argv[++i]); args.src_is_camera = true; }
-        else if (args.src.empty()) args.src = argv[i];
-        else throw runtime_error((string("unknown key: ") + argv[i]));
-    }
-    return args;
-}
-
-
-App::App(const Args& s)
-{
-    args = s;
     cout << "\nControls:\n"
          << "\tESC - exit\n"
          << "\tm - change mode GPU <-> CPU\n"
@@ -203,56 +103,56 @@ App::App(const Args& s)
          << "\t4/r - increase/decrease hit threshold\n"
          << endl;
 
+
     use_gpu = true;
-    make_gray = args.make_gray;
-    scale = args.scale;
-    gr_threshold = args.gr_threshold;
-    nlevels = args.nlevels;
+    make_gray = cmd.get<bool>("g");
+    resize_scale = cmd.get<double>("s");
+    win_width = cmd.get<bool>("l") == true ? 64 : 48;
+    vdo_source = cmd.get<string>("v");
+    img_source = cmd.get<string>("i");
+    output = cmd.get<string>("o");
+    camera_id = cmd.get<int>("c");
 
-    if (args.hit_threshold_auto)
-        args.hit_threshold = args.win_width == 48 ? 1.4 : 0.;
-    hit_threshold = args.hit_threshold;
+    win_stride_width = 8;
+    win_stride_height = 8;
+    gr_threshold = 8;
+    nlevels = 13;
+    hit_threshold = win_width == 48 ? 1.4 : 0.;
+    scale = 1.05;
+    gamma_corr = true;
 
-    gamma_corr = args.gamma_corr;
-
-    if (args.win_width != 64 && args.win_width != 48)
-        args.win_width = 64;
-
-    cout << "Scale: " << scale << endl;
-    if (args.resize_src)
-        cout << "Resized source: (" << args.width << ", " << args.height << ")\n";
     cout << "Group threshold: " << gr_threshold << endl;
     cout << "Levels number: " << nlevels << endl;
-    cout << "Win width: " << args.win_width << endl;
-    cout << "Win stride: (" << args.win_stride_width << ", " << args.win_stride_height << ")\n";
+    cout << "Win width: " << win_width << endl;
+    cout << "Win stride: (" << win_stride_width << ", " << win_stride_height << ")\n";
     cout << "Hit threshold: " << hit_threshold << endl;
     cout << "Gamma correction: " << gamma_corr << endl;
     cout << endl;
 }
 
-
 void App::run()
 {
-    std::vector<ocl::Info> oclinfo;
+    vector<ocl::Info> oclinfo;
     ocl::getDevice(oclinfo);
     running = true;
-    cv::VideoWriter video_writer;
+    VideoWriter video_writer;
 
-    Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
-    Size win_stride(args.win_stride_width, args.win_stride_height);
+    Size win_size(win_width, win_width * 2);
+    Size win_stride(win_stride_width, win_stride_height);
 
     // Create HOG descriptors and detectors here
     vector<float> detector;
     if (win_size == Size(64, 128))
-        detector = cv::ocl::HOGDescriptor::getPeopleDetector64x128();
+        detector = ocl::HOGDescriptor::getPeopleDetector64x128();
     else
-        detector = cv::ocl::HOGDescriptor::getPeopleDetector48x96();
+        detector = ocl::HOGDescriptor::getPeopleDetector48x96();
 
-    cv::ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::ocl::HOGDescriptor::DEFAULT_NLEVELS);
-    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
-                              HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
+
+    ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
+                               ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
+                               ocl::HOGDescriptor::DEFAULT_NLEVELS);
+    HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
+                          HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
     gpu_hog.setSVMDetector(detector);
     cpu_hog.setSVMDetector(detector);
 
@@ -261,35 +161,36 @@ void App::run()
         VideoCapture vc;
         Mat frame;
 
-        if (args.src_is_video)
+        if (vdo_source!="")
         {
-            vc.open(args.src.c_str());
+            vc.open(vdo_source.c_str());
             if (!vc.isOpened())
-                throw runtime_error(string("can't open video file: " + args.src));
+                throw runtime_error(string("can't open video file: " + vdo_source));
             vc >> frame;
         }
-        else if (args.src_is_camera)
+        else if (camera_id != -1)
         {
-            vc.open(args.camera_id);
+            vc.open(camera_id);
             if (!vc.isOpened())
             {
                 stringstream msg;
-                msg << "can't open camera: " << args.camera_id;
+                msg << "can't open camera: " << camera_id;
                 throw runtime_error(msg.str());
             }
             vc >> frame;
         }
         else
         {
-            frame = imread(args.src);
+            frame = imread(img_source);
             if (frame.empty())
-                throw runtime_error(string("can't open image file: " + args.src));
+                throw runtime_error(string("can't open image file: " + img_source));
         }
 
         Mat img_aux, img, img_to_show;
         ocl::oclMat gpu_img;
 
         // Iterate over all frames
+        bool verify = false;
         while (running && !frame.empty())
         {
             workBegin();
@@ -300,13 +201,15 @@ void App::run()
             else frame.copyTo(img_aux);
 
             // Resize image
-            if (args.resize_src) resize(img_aux, img, Size(args.width, args.height));
+            if (abs(scale-1.0)>0.001)
+            {
+                Size sz((int)((double)img_aux.cols/resize_scale), (int)((double)img_aux.rows/resize_scale));
+                resize(img_aux, img, sz);
+            }
             else img = img_aux;
             img_to_show = img;
-
             gpu_hog.nlevels = nlevels;
             cpu_hog.nlevels = nlevels;
-
             vector<Rect> found;
 
             // Perform HOG classification
@@ -316,11 +219,23 @@ void App::run()
                 gpu_img.upload(img);
                 gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
                                          Size(0, 0), scale, gr_threshold);
+                if (!verify)
+                {
+                    // verify if GPU output same objects with CPU at 1st run
+                    verify = true;
+                    vector<Rect> ref_rst;
+                    cvtColor(img, img, CV_BGRA2BGR);
+                    cpu_hog.detectMultiScale(img, ref_rst, hit_threshold, win_stride,
+                                             Size(0, 0), scale, gr_threshold-2);
+                    double accuracy = checkRectSimilarity(img.size(), ref_rst, found);
+                    cout << "\naccuracy value: " << accuracy << endl;
+                }
             }
             else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
-                                          Size(0, 0), scale, gr_threshold);
+                                              Size(0, 0), scale, gr_threshold);
             hogWorkEnd();
 
+
             // Draw positive classified windows
             for (size_t i = 0; i < found.size(); i++)
             {
@@ -335,25 +250,31 @@ void App::run()
             putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
             putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
             imshow("opencv_gpu_hog", img_to_show);
-
-            if (args.src_is_video || args.src_is_camera) vc >> frame;
+            if (vdo_source!="" || camera_id!=-1) vc >> frame;
 
             workEnd();
 
-            if (args.write_video)
+            if (output!="")
             {
-                if (!video_writer.isOpened())
+                if (img_source!="")     // wirte image
                 {
-                    video_writer.open(args.dst_video, CV_FOURCC('x','v','i','d'), args.dst_video_fps,
-                                      img_to_show.size(), true);
-                    if (!video_writer.isOpened())
-                        throw std::runtime_error("can't create video writer");
+                    imwrite(output, img_to_show);
                 }
+                else                    //write video
+                {
+                    if (!video_writer.isOpened())
+                    {
+                        video_writer.open(output, CV_FOURCC('x','v','i','d'), 24,
+                                          img_to_show.size(), true);
+                        if (!video_writer.isOpened())
+                            throw std::runtime_error("can't create video writer");
+                    }
 
-                if (make_gray) cvtColor(img_to_show, img, CV_GRAY2BGR);
-                else cvtColor(img_to_show, img, CV_BGRA2BGR);
+                    if (make_gray) cvtColor(img_to_show, img, CV_GRAY2BGR);
+                    else cvtColor(img_to_show, img, CV_BGRA2BGR);
 
-                video_writer << img;
+                    video_writer << img;
+                }
             }
 
             handleKey((char)waitKey(3));
@@ -361,7 +282,6 @@ void App::run()
     }
 }
 
-
 void App::handleKey(char key)
 {
     switch (key)
@@ -424,7 +344,10 @@ void App::handleKey(char key)
 }
 
 
-inline void App::hogWorkBegin() { hog_work_begin = getTickCount(); }
+inline void App::hogWorkBegin()
+{
+    hog_work_begin = getTickCount();
+}
 
 inline void App::hogWorkEnd()
 {
@@ -440,8 +363,10 @@ inline string App::hogWorkFps() const
     return ss.str();
 }
 
-
-inline void App::workBegin() { work_begin = getTickCount(); }
+inline void App::workBegin()
+{
+    work_begin = getTickCount();
+}
 
 inline void App::workEnd()
 {
@@ -457,3 +382,53 @@ inline string App::workFps() const
     return ss.str();
 }
 
+
+double App::checkRectSimilarity(Size sz,
+                                std::vector<Rect>& ob1,
+                                std::vector<Rect>& ob2)
+{
+    double final_test_result = 0.0;
+    size_t sz1 = ob1.size();
+    size_t sz2 = ob2.size();
+
+    if(sz1 != sz2)
+    {
+        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+    }
+    else
+    {
+        if(sz1==0 && sz2==0)
+            return 0;
+        cv::Mat cpu_result(sz, CV_8UC1);
+        cpu_result.setTo(0);
+
+
+        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
+        {
+            cv::Mat cpu_result_roi(cpu_result, *r);
+            cpu_result_roi.setTo(1);
+            cpu_result.copyTo(cpu_result);
+        }
+        int cpu_area = cv::countNonZero(cpu_result > 0);
+
+
+        cv::Mat gpu_result(sz, CV_8UC1);
+        gpu_result.setTo(0);
+        for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
+        {
+            cv::Mat gpu_result_roi(gpu_result, *r2);
+            gpu_result_roi.setTo(1);
+            gpu_result.copyTo(gpu_result);
+        }
+
+        cv::Mat result_;
+        multiply(cpu_result, gpu_result, result_);
+        int result = cv::countNonZero(result_ > 0);
+        if(cpu_area!=0 && result!=0)
+            final_test_result = 1.0 - (double)result/(double)cpu_area;
+        else if(cpu_area==0 && result!=0)
+            final_test_result = -1;
+    }
+    return final_test_result;
+}
+
diff --git a/samples/ocl/pyrlk_optical_flow.cpp b/samples/ocl/pyrlk_optical_flow.cpp
new file mode 100644
index 0000000000..cefa928670
--- /dev/null
+++ b/samples/ocl/pyrlk_optical_flow.cpp
@@ -0,0 +1,275 @@
+#include <iostream>
+#include <vector>
+#include <iomanip>
+
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/ocl/ocl.hpp"
+#include "opencv2/video/video.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::ocl;
+
+typedef unsigned char uchar;
+#define LOOP_NUM 10
+int64 work_begin = 0;
+int64 work_end = 0;
+
+static void workBegin()
+{
+    work_begin = getTickCount();
+}
+static void workEnd()
+{
+    work_end += (getTickCount() - work_begin);
+}
+static double getTime()
+{
+    return work_end * 1000. / getTickFrequency();
+}
+
+static void download(const oclMat& d_mat, vector<Point2f>& vec)
+{
+    vec.clear();
+    vec.resize(d_mat.cols);
+    Mat mat(1, d_mat.cols, CV_32FC2, (void*)&vec[0]);
+    d_mat.download(mat);
+}
+
+static void download(const oclMat& d_mat, vector<uchar>& vec)
+{
+    vec.clear();
+    vec.resize(d_mat.cols);
+    Mat mat(1, d_mat.cols, CV_8UC1, (void*)&vec[0]);
+    d_mat.download(mat);
+}
+
+static void drawArrows(Mat& frame, const vector<Point2f>& prevPts, const vector<Point2f>& nextPts, const vector<uchar>& status, Scalar line_color = Scalar(0, 0, 255))
+{
+    for (size_t i = 0; i < prevPts.size(); ++i)
+    {
+        if (status[i])
+        {
+            int line_thickness = 1;
+
+            Point p = prevPts[i];
+            Point q = nextPts[i];
+
+            double angle = atan2((double) p.y - q.y, (double) p.x - q.x);
+
+            double hypotenuse = sqrt( (double)(p.y - q.y)*(p.y - q.y) + (double)(p.x - q.x)*(p.x - q.x) );
+
+            if (hypotenuse < 1.0)
+                continue;
+
+            // Here we lengthen the arrow by a factor of three.
+            q.x = (int) (p.x - 3 * hypotenuse * cos(angle));
+            q.y = (int) (p.y - 3 * hypotenuse * sin(angle));
+
+            // Now we draw the main line of the arrow.
+            line(frame, p, q, line_color, line_thickness);
+
+            // Now draw the tips of the arrow. I do some scaling so that the
+            // tips look proportional to the main line of the arrow.
+
+            p.x = (int) (q.x + 9 * cos(angle + CV_PI / 4));
+            p.y = (int) (q.y + 9 * sin(angle + CV_PI / 4));
+            line(frame, p, q, line_color, line_thickness);
+
+            p.x = (int) (q.x + 9 * cos(angle - CV_PI / 4));
+            p.y = (int) (q.y + 9 * sin(angle - CV_PI / 4));
+            line(frame, p, q, line_color, line_thickness);
+        }
+    }
+}
+
+
+int main(int argc, const char* argv[])
+{
+    static std::vector<Info> ocl_info;
+    ocl::getDevice(ocl_info);
+    //if you want to use undefault device, set it here
+    setDevice(ocl_info[0]);
+
+    //set this to save kernel compile time from second time you run
+    ocl::setBinpath("./");
+    const char* keys =
+        "{ h   | help     | false           | print help message }"
+        "{ l   | left     |                 | specify left image }"
+        "{ r   | right    |                 | specify right image }"
+        "{ c   | camera   | 0               | specify camera id }"
+        "{ s   | use_cpu  | false           | use cpu or gpu to process the image }"
+        "{ v   | video    |                 | use video as input }"
+        "{ o   | output   | pyrlk_output.jpg| specify output save path when input is images }"
+        "{ p   | points   | 1000            | specify points count [GoodFeatureToTrack] }"
+        "{ m   | min_dist | 0               | specify minimal distance between points [GoodFeatureToTrack] }";
+
+    CommandLineParser cmd(argc, argv, keys);
+
+    if (cmd.get<bool>("help"))
+    {
+        cout << "Usage: pyrlk_optical_flow [options]" << endl;
+        cout << "Avaible options:" << endl;
+        cmd.printParams();
+        return 0;
+    }
+
+    bool defaultPicturesFail = false;
+    string fname0 = cmd.get<string>("l");
+    string fname1 = cmd.get<string>("r");
+    string vdofile = cmd.get<string>("v");
+    string outfile = cmd.get<string>("o");
+    int points = cmd.get<int>("p");
+    double minDist = cmd.get<double>("m");
+    bool useCPU = cmd.get<bool>("s");
+    int inputName = cmd.get<int>("c");
+
+    oclMat d_nextPts, d_status;
+    GoodFeaturesToTrackDetector_OCL d_features(points);
+    Mat frame0 = imread(fname0, cv::IMREAD_GRAYSCALE);
+    Mat frame1 = imread(fname1, cv::IMREAD_GRAYSCALE);
+    PyrLKOpticalFlow d_pyrLK;
+    vector<cv::Point2f> pts(points);
+    vector<cv::Point2f> nextPts(points);
+    vector<unsigned char> status(points);
+    vector<float> err;
+
+    cout << "Points count : " << points << endl << endl;
+
+    if (frame0.empty() || frame1.empty())
+    {
+        CvCapture* capture = 0;
+        Mat frame, frameCopy;
+        Mat frame0Gray, frame1Gray;
+        Mat ptr0, ptr1;
+
+        if(vdofile == "")
+            capture = cvCaptureFromCAM( inputName );
+        else
+            capture = cvCreateFileCapture(vdofile.c_str());
+
+        int c = inputName ;
+        if(!capture)
+        {
+            if(vdofile == "")
+                cout << "Capture from CAM " << c << " didn't work" << endl;
+            else
+                cout << "Capture from file " << vdofile << " failed" <<endl;
+            if (defaultPicturesFail)
+            {
+                return -1;
+            }
+            goto nocamera;
+        }
+
+        cout << "In capture ..." << endl;
+        for(int i = 0;; i++)
+        {
+            frame = cvQueryFrame( capture );
+            if( frame.empty() )
+                break;
+
+            if (i == 0)
+            {
+                frame.copyTo( frame0 );
+                cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+            }
+            else
+            {
+                if (i%2 == 1)
+                {
+                    frame.copyTo(frame1);
+                    cvtColor(frame1, frame1Gray, COLOR_BGR2GRAY);
+                    ptr0 = frame0Gray;
+                    ptr1 = frame1Gray;
+                }
+                else
+                {
+                    frame.copyTo(frame0);
+                    cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+                    ptr0 = frame1Gray;
+                    ptr1 = frame0Gray;
+                }
+
+                if (useCPU)
+                {
+                    pts.clear();
+                    goodFeaturesToTrack(ptr0, pts, points, 0.01, 0.0);
+                    if(pts.size() == 0)
+                        continue;
+                    calcOpticalFlowPyrLK(ptr0, ptr1, pts, nextPts, status, err);
+                }
+                else
+                {
+                    oclMat d_img(ptr0), d_prevPts;
+                    d_features(d_img, d_prevPts);
+                    if(!d_prevPts.rows || !d_prevPts.cols)
+                        continue;
+                    d_pyrLK.sparse(d_img, oclMat(ptr1), d_prevPts, d_nextPts, d_status);
+                    d_features.downloadPoints(d_prevPts,pts);
+                    download(d_nextPts, nextPts);
+                    download(d_status, status);
+                }
+                if (i%2 == 1)
+                    frame1.copyTo(frameCopy);
+                else
+                    frame0.copyTo(frameCopy);
+                drawArrows(frameCopy, pts, nextPts, status, Scalar(255, 0, 0));
+                imshow("PyrLK [Sparse]", frameCopy);
+            }
+
+            if( waitKey( 10 ) >= 0 )
+                goto _cleanup_;
+        }
+
+        waitKey(0);
+
+_cleanup_:
+        cvReleaseCapture( &capture );
+    }
+    else
+    {
+nocamera:
+        for(int i = 0; i <= LOOP_NUM; i ++)
+        {
+            cout << "loop" << i << endl;
+            if (i > 0) workBegin();
+
+            if (useCPU)
+            {
+                goodFeaturesToTrack(frame0, pts, points, 0.01, minDist);
+                calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
+            }
+            else
+            {
+                oclMat d_img(frame0), d_prevPts;
+                d_features(d_img, d_prevPts);
+                d_pyrLK.sparse(d_img, oclMat(frame1), d_prevPts, d_nextPts, d_status);
+                d_features.downloadPoints(d_prevPts, pts);
+                download(d_nextPts, nextPts);
+                download(d_status, status);
+            }
+
+            if (i > 0 && i <= LOOP_NUM)
+                workEnd();
+
+            if (i == LOOP_NUM)
+            {
+                if (useCPU)
+                    cout << "average CPU time (noCamera) : ";
+                else
+                    cout << "average GPU time (noCamera) : ";
+
+                cout << getTime() / LOOP_NUM << " ms" << endl;
+
+                drawArrows(frame0, pts, nextPts, status, Scalar(255, 0, 0));
+                imshow("PyrLK [Sparse]", frame0);
+                imwrite(outfile, frame0);
+            }
+        }
+    }
+
+    waitKey();
+
+    return 0;
+}
diff --git a/samples/ocl/squares.cpp b/samples/ocl/squares.cpp
index 6b184161f7..48964ffb2e 100644
--- a/samples/ocl/squares.cpp
+++ b/samples/ocl/squares.cpp
@@ -6,7 +6,6 @@
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/ocl/ocl.hpp"
-
 #include <iostream>
 #include <math.h>
 #include <string.h>
@@ -14,23 +13,50 @@
 using namespace cv;
 using namespace std;
 
-static void help()
-{
-    cout <<
-        "\nA program using OCL module pyramid scaling, Canny, dilate functions, threshold, split; cpu contours, contour simpification and\n"
-        "memory storage (it's got it all folks) to find\n"
-        "squares in a list of images pic1-6.png\n"
-        "Returns sequence of squares detected on the image.\n"
-        "the sequence is stored in the specified memory storage\n"
-        "Call:\n"
-        "./squares\n"
-        "Using OpenCV version %s\n" << CV_VERSION << "\n" << endl;
-}
+#define ACCURACY_CHECK 1
 
+#if ACCURACY_CHECK
+// check if two vectors of vector of points are near or not
+// prior assumption is that they are in correct order
+static bool checkPoints(
+    vector< vector<Point> > set1,
+    vector< vector<Point> > set2,
+    int maxDiff = 5)
+{
+    if(set1.size() != set2.size())
+    {
+        return false;
+    }
+
+    for(vector< vector<Point> >::iterator it1 = set1.begin(), it2 = set2.begin();
+            it1 < set1.end() && it2 < set2.end(); it1 ++, it2 ++)
+    {
+        vector<Point> pts1 = *it1;
+        vector<Point> pts2 = *it2;
+
+
+        if(pts1.size() != pts2.size())
+        {
+            return false;
+        }
+        for(size_t i = 0; i < pts1.size(); i ++)
+        {
+            Point pt1 = pts1[i], pt2 = pts2[i];
+            if(std::abs(pt1.x - pt2.x) > maxDiff ||
+                    std::abs(pt1.y - pt2.y) > maxDiff)
+            {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+#endif
 
 int thresh = 50, N = 11;
 const char* wndname = "OpenCL Square Detection Demo";
 
+
 // helper function:
 // finds a cosine of angle between vectors
 // from pt0->pt1 and from pt0->pt2
@@ -43,9 +69,92 @@ static double angle( Point pt1, Point pt2, Point pt0 )
     return (dx1*dx2 + dy1*dy2)/sqrt((dx1*dx1 + dy1*dy1)*(dx2*dx2 + dy2*dy2) + 1e-10);
 }
 
+
 // returns sequence of squares detected on the image.
 // the sequence is stored in the specified memory storage
 static void findSquares( const Mat& image, vector<vector<Point> >& squares )
+{
+    squares.clear();
+    Mat pyr, timg, gray0(image.size(), CV_8U), gray;
+
+    // down-scale and upscale the image to filter out the noise
+    pyrDown(image, pyr, Size(image.cols/2, image.rows/2));
+    pyrUp(pyr, timg, image.size());
+    vector<vector<Point> > contours;
+
+    // find squares in every color plane of the image
+    for( int c = 0; c < 3; c++ )
+    {
+        int ch[] = {c, 0};
+        mixChannels(&timg, 1, &gray0, 1, ch, 1);
+
+        // try several threshold levels
+        for( int l = 0; l < N; l++ )
+        {
+            // hack: use Canny instead of zero threshold level.
+            // Canny helps to catch squares with gradient shading
+            if( l == 0 )
+            {
+                // apply Canny. Take the upper threshold from slider
+                // and set the lower to 0 (which forces edges merging)
+                Canny(gray0, gray, 0, thresh, 5);
+                // dilate canny output to remove potential
+                // holes between edge segments
+                dilate(gray, gray, Mat(), Point(-1,-1));
+            }
+            else
+            {
+                // apply threshold if l!=0:
+                //     tgray(x,y) = gray(x,y) < (l+1)*255/N ? 255 : 0
+                cv::threshold(gray0, gray, (l+1)*255/N, 255, THRESH_BINARY);
+            }
+
+            // find contours and store them all as a list
+            findContours(gray, contours, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE);
+
+            vector<Point> approx;
+
+            // test each contour
+            for( size_t i = 0; i < contours.size(); i++ )
+            {
+                // approximate contour with accuracy proportional
+                // to the contour perimeter
+                approxPolyDP(Mat(contours[i]), approx, arcLength(Mat(contours[i]), true)*0.02, true);
+
+                // square contours should have 4 vertices after approximation
+                // relatively large area (to filter out noisy contours)
+                // and be convex.
+                // Note: absolute value of an area is used because
+                // area may be positive or negative - in accordance with the
+                // contour orientation
+                if( approx.size() == 4 &&
+                        fabs(contourArea(Mat(approx))) > 1000 &&
+                        isContourConvex(Mat(approx)) )
+                {
+                    double maxCosine = 0;
+
+                    for( int j = 2; j < 5; j++ )
+                    {
+                        // find the maximum cosine of the angle between joint edges
+                        double cosine = fabs(angle(approx[j%4], approx[j-2], approx[j-1]));
+                        maxCosine = MAX(maxCosine, cosine);
+                    }
+
+                    // if cosines of all angles are small
+                    // (all angles are ~90 degree) then write quandrange
+                    // vertices to resultant sequence
+                    if( maxCosine < 0.3 )
+                        squares.push_back(approx);
+                }
+            }
+        }
+    }
+}
+
+
+// returns sequence of squares detected on the image.
+// the sequence is stored in the specified memory storage
+static void findSquares_ocl( const Mat& image, vector<vector<Point> >& squares )
 {
     squares.clear();
 
@@ -91,7 +200,6 @@ static void findSquares( const Mat& image, vector<vector<Point> >& squares )
             findContours(gray, contours, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE);
 
             vector<Point> approx;
-
             // test each contour
             for( size_t i = 0; i < contours.size(); i++ )
             {
@@ -106,11 +214,10 @@ static void findSquares( const Mat& image, vector<vector<Point> >& squares )
                 // area may be positive or negative - in accordance with the
                 // contour orientation
                 if( approx.size() == 4 &&
-                    fabs(contourArea(Mat(approx))) > 1000 &&
-                    isContourConvex(Mat(approx)) )
+                        fabs(contourArea(Mat(approx))) > 1000 &&
+                        isContourConvex(Mat(approx)) )
                 {
                     double maxCosine = 0;
-
                     for( int j = 2; j < 5; j++ )
                     {
                         // find the maximum cosine of the angle between joint edges
@@ -139,40 +246,93 @@ static void drawSquares( Mat& image, const vector<vector<Point> >& squares )
         int n = (int)squares[i].size();
         polylines(image, &p, &n, 1, true, Scalar(0,255,0), 3, CV_AA);
     }
-
-    imshow(wndname, image);
 }
 
 
-int main(int /*argc*/, char** /*argv*/)
+// draw both pure-C++ and ocl square results onto a single image
+static Mat drawSquaresBoth( const Mat& image,
+                            const vector<vector<Point> >& sqsCPP,
+                            const vector<vector<Point> >& sqsOCL
+)
 {
+    Mat imgToShow(Size(image.cols * 2, image.rows), image.type());
+    Mat lImg = imgToShow(Rect(Point(0, 0), image.size()));
+    Mat rImg = imgToShow(Rect(Point(image.cols, 0), image.size()));
+    image.copyTo(lImg);
+    image.copyTo(rImg);
+    drawSquares(lImg, sqsCPP);
+    drawSquares(rImg, sqsOCL);
+    float fontScale = 0.8f;
+    Scalar white = Scalar::all(255), black = Scalar::all(0);
+
+    putText(lImg, "C++", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, black, 2);
+    putText(rImg, "OCL", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, black, 2);
+    putText(lImg, "C++", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, white, 1);
+    putText(rImg, "OCL", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, white, 1);
+
+    return imgToShow;
+}
+
+
+int main(int argc, char** argv)
+{
+    const char* keys =
+        "{ i | input   |                    | specify input image }"
+        "{ o | output  | squares_output.jpg | specify output save path}";
+    CommandLineParser cmd(argc, argv, keys);
+    string inputName = cmd.get<string>("i");
+    string outfile = cmd.get<string>("o");
+    if(inputName.empty())
+    {
+        cout << "Avaible options:" << endl;
+        cmd.printParams();
+        return 0;
+    }
 
-    //ocl::setBinpath("F:/kernel_bin");
     vector<ocl::Info> info;
     CV_Assert(ocl::getDevice(info));
-
-    static const char* names[] = { "pic1.png", "pic2.png", "pic3.png",
-        "pic4.png", "pic5.png", "pic6.png", 0 };
-    help();
+    int iterations = 10;
     namedWindow( wndname, 1 );
-    vector<vector<Point> > squares;
+    vector<vector<Point> > squares_cpu, squares_ocl;
 
-    for( int i = 0; names[i] != 0; i++ )
+    Mat image = imread(inputName, 1);
+    if( image.empty() )
     {
-        Mat image = imread(names[i], 1);
-        if( image.empty() )
-        {
-            cout << "Couldn't load " << names[i] << endl;
-            continue;
-        }
-
-        findSquares(image, squares);
-        drawSquares(image, squares);
-
-        int c = waitKey();
-        if( (char)c == 27 )
-            break;
+        cout << "Couldn't load " << inputName << endl;
+        return -1;
     }
+    int j = iterations;
+    int64 t_ocl = 0, t_cpp = 0;
+    //warm-ups
+    cout << "warming up ..." << endl;
+    findSquares(image, squares_cpu);
+    findSquares_ocl(image, squares_ocl);
+
+
+#if ACCURACY_CHECK
+    cout << "Checking ocl accuracy ... " << endl;
+    cout << (checkPoints(squares_cpu, squares_ocl) ? "Pass" : "Failed") << endl;
+#endif
+    do
+    {
+        int64 t_start = cv::getTickCount();
+        findSquares(image, squares_cpu);
+        t_cpp += cv::getTickCount() - t_start;
+
+
+        t_start  = cv::getTickCount();
+        findSquares_ocl(image, squares_ocl);
+        t_ocl += cv::getTickCount() - t_start;
+        cout << "run loop: " << j << endl;
+    }
+    while(--j);
+    cout << "cpp average time: " << 1000.0f * (double)t_cpp / getTickFrequency() / iterations << "ms" << endl;
+    cout << "ocl average time: " << 1000.0f * (double)t_ocl / getTickFrequency() / iterations << "ms" << endl;
+
+    Mat result = drawSquaresBoth(image, squares_cpu, squares_ocl);
+    imshow(wndname, result);
+    imwrite(outfile, result);
+    cvWaitKey(0);
 
     return 0;
 }
diff --git a/samples/ocl/stereo_match.cpp b/samples/ocl/stereo_match.cpp
new file mode 100644
index 0000000000..abe75c70e1
--- /dev/null
+++ b/samples/ocl/stereo_match.cpp
@@ -0,0 +1,384 @@
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+#include "opencv2/ocl/ocl.hpp"
+#include "opencv2/highgui/highgui.hpp"
+
+using namespace cv;
+using namespace std;
+using namespace ocl;
+
+
+struct App
+{
+    App(CommandLineParser& cmd);
+    void run();
+    void handleKey(char key);
+    void printParams() const;
+
+    void workBegin()
+    {
+        work_begin = getTickCount();
+    }
+    void workEnd()
+    {
+        int64 d = getTickCount() - work_begin;
+        double f = getTickFrequency();
+        work_fps = f / d;
+    }
+    string method_str() const
+    {
+        switch (method)
+        {
+        case BM:
+            return "BM";
+        case BP:
+            return "BP";
+        case CSBP:
+            return "CSBP";
+        }
+        return "";
+    }
+    string text() const
+    {
+        stringstream ss;
+        ss << "(" << method_str() << ") FPS: " << setiosflags(ios::left)
+           << setprecision(4) << work_fps;
+        return ss.str();
+    }
+private:
+    bool running;
+
+    Mat left_src, right_src;
+    Mat left, right;
+    oclMat d_left, d_right;
+
+    StereoBM_OCL bm;
+    StereoBeliefPropagation bp;
+    StereoConstantSpaceBP csbp;
+
+    int64 work_begin;
+    double work_fps;
+
+    string l_img, r_img;
+    string out_img;
+    enum {BM, BP, CSBP} method;
+    int ndisp; // Max disparity + 1
+    enum {GPU, CPU} type;
+};
+
+int main(int argc, char** argv)
+{
+    const char* keys =
+        "{ h | help     | false                     | print help message }"
+        "{ l | left     |                           | specify left image }"
+        "{ r | right    |                           | specify right image }"
+        "{ m | method   | BM                        | specify match method(BM/BP/CSBP) }"
+        "{ n | ndisp    | 64                        |  specify number of disparity levels }"
+        "{ s | cpu_ocl  | false                     | use cpu or gpu as ocl device to process the image }"
+        "{ o | output   | stereo_match_output.jpg   | specify output path when input is images}";
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.get<bool>("help"))
+    {
+        cout << "Avaible options:" << endl;
+        cmd.printParams();
+        return 0;
+    }
+    try
+    {
+        App app(cmd);
+        int flag = CVCL_DEVICE_TYPE_GPU;
+        if(cmd.get<bool>("s") == true)
+            flag = CVCL_DEVICE_TYPE_CPU;
+
+        vector<Info> info;
+        if(getDevice(info, flag) == 0)
+        {
+            throw runtime_error("Error: Did not find a valid OpenCL device!");
+        }
+        cout << "Device name:" << info[0].DeviceName[0] << endl;
+
+        app.run();
+    }
+    catch (const exception& e)
+    {
+        cout << "error: " << e.what() << endl;
+    }
+    return 0;
+}
+
+App::App(CommandLineParser& cmd)
+    : running(false),method(BM)
+{
+    cout << "stereo_match_ocl sample\n";
+    cout << "\nControls:\n"
+         << "\tesc - exit\n"
+         << "\tp - print current parameters\n"
+         << "\tg - convert source images into gray\n"
+         << "\tm - change stereo match method\n"
+         << "\ts - change Sobel prefiltering flag (for BM only)\n"
+         << "\t1/q - increase/decrease maximum disparity\n"
+         << "\t2/w - increase/decrease window size (for BM only)\n"
+         << "\t3/e - increase/decrease iteration count (for BP and CSBP only)\n"
+         << "\t4/r - increase/decrease level count (for BP and CSBP only)\n";
+    l_img = cmd.get<string>("l");
+    r_img = cmd.get<string>("r");
+    string mstr = cmd.get<string>("m");
+    if(mstr == "BM") method = BM;
+    else if(mstr == "BP") method = BP;
+    else if(mstr == "CSBP") method = CSBP;
+    else cout << "unknown method!\n";
+    ndisp = cmd.get<int>("n");
+    out_img = cmd.get<string>("o");
+}
+
+
+void App::run()
+{
+    // Load images
+    left_src = imread(l_img);
+    right_src = imread(r_img);
+    if (left_src.empty()) throw runtime_error("can't open file \"" + l_img + "\"");
+    if (right_src.empty()) throw runtime_error("can't open file \"" + r_img + "\"");
+
+    cvtColor(left_src, left, CV_BGR2GRAY);
+    cvtColor(right_src, right, CV_BGR2GRAY);
+
+    d_left.upload(left);
+    d_right.upload(right);
+
+    imshow("left", left);
+    imshow("right", right);
+
+    // Set common parameters
+    bm.ndisp = ndisp;
+    bp.ndisp = ndisp;
+    csbp.ndisp = ndisp;
+
+    cout << endl;
+    printParams();
+
+    running = true;
+    bool written = false;
+    while (running)
+    {
+
+        // Prepare disparity map of specified type
+        Mat disp;
+        oclMat d_disp;
+        workBegin();
+        switch (method)
+        {
+        case BM:
+            if (d_left.channels() > 1 || d_right.channels() > 1)
+            {
+                cout << "BM doesn't support color images\n";
+                cvtColor(left_src, left, CV_BGR2GRAY);
+                cvtColor(right_src, right, CV_BGR2GRAY);
+                cout << "image_channels: " << left.channels() << endl;
+                d_left.upload(left);
+                d_right.upload(right);
+                imshow("left", left);
+                imshow("right", right);
+            }
+            bm(d_left, d_right, d_disp);
+            break;
+        case BP:
+            bp(d_left, d_right, d_disp);
+            break;
+        case CSBP:
+            csbp(d_left, d_right, d_disp);
+            break;
+        }
+        // Show results
+        d_disp.download(disp);
+        workEnd();
+        if (method != BM)
+        {
+            disp.convertTo(disp, 0);
+        }
+        putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));
+        imshow("disparity", disp);
+        if(!written)
+        {
+            imwrite(out_img, disp);
+            written = true;
+        }
+        handleKey((char)waitKey(3));
+    }
+}
+
+
+void App::printParams() const
+{
+    cout << "--- Parameters ---\n";
+    cout << "image_size: (" << left.cols << ", " << left.rows << ")\n";
+    cout << "image_channels: " << left.channels() << endl;
+    cout << "method: " << method_str() << endl
+         << "ndisp: " << ndisp << endl;
+    switch (method)
+    {
+    case BM:
+        cout << "win_size: " << bm.winSize << endl;
+        cout << "prefilter_sobel: " << bm.preset << endl;
+        break;
+    case BP:
+        cout << "iter_count: " << bp.iters << endl;
+        cout << "level_count: " << bp.levels << endl;
+        break;
+    case CSBP:
+        cout << "iter_count: " << csbp.iters << endl;
+        cout << "level_count: " << csbp.levels << endl;
+        break;
+    }
+    cout << endl;
+}
+
+
+void App::handleKey(char key)
+{
+    switch (key)
+    {
+    case 27:
+        running = false;
+        break;
+    case 'p':
+    case 'P':
+        printParams();
+        break;
+    case 'g':
+    case 'G':
+        if (left.channels() == 1 && method != BM)
+        {
+            left = left_src;
+            right = right_src;
+        }
+        else
+        {
+            cvtColor(left_src, left, CV_BGR2GRAY);
+            cvtColor(right_src, right, CV_BGR2GRAY);
+        }
+        d_left.upload(left);
+        d_right.upload(right);
+        cout << "image_channels: " << left.channels() << endl;
+        imshow("left", left);
+        imshow("right", right);
+        break;
+    case 'm':
+    case 'M':
+        switch (method)
+        {
+        case BM:
+            method = BP;
+            break;
+        case BP:
+            method = CSBP;
+            break;
+        case CSBP:
+            method = BM;
+            break;
+        }
+        cout << "method: " << method_str() << endl;
+        break;
+    case 's':
+    case 'S':
+        if (method == BM)
+        {
+            switch (bm.preset)
+            {
+            case StereoBM_OCL::BASIC_PRESET:
+                bm.preset = StereoBM_OCL::PREFILTER_XSOBEL;
+                break;
+            case StereoBM_OCL::PREFILTER_XSOBEL:
+                bm.preset = StereoBM_OCL::BASIC_PRESET;
+                break;
+            }
+            cout << "prefilter_sobel: " << bm.preset << endl;
+        }
+        break;
+    case '1':
+        ndisp == 1 ? ndisp = 8 : ndisp += 8;
+        cout << "ndisp: " << ndisp << endl;
+        bm.ndisp = ndisp;
+        bp.ndisp = ndisp;
+        csbp.ndisp = ndisp;
+        break;
+    case 'q':
+    case 'Q':
+        ndisp = max(ndisp - 8, 1);
+        cout << "ndisp: " << ndisp << endl;
+        bm.ndisp = ndisp;
+        bp.ndisp = ndisp;
+        csbp.ndisp = ndisp;
+        break;
+    case '2':
+        if (method == BM)
+        {
+            bm.winSize = min(bm.winSize + 1, 51);
+            cout << "win_size: " << bm.winSize << endl;
+        }
+        break;
+    case 'w':
+    case 'W':
+        if (method == BM)
+        {
+            bm.winSize = max(bm.winSize - 1, 2);
+            cout << "win_size: " << bm.winSize << endl;
+        }
+        break;
+    case '3':
+        if (method == BP)
+        {
+            bp.iters += 1;
+            cout << "iter_count: " << bp.iters << endl;
+        }
+        else if (method == CSBP)
+        {
+            csbp.iters += 1;
+            cout << "iter_count: " << csbp.iters << endl;
+        }
+        break;
+    case 'e':
+    case 'E':
+        if (method == BP)
+        {
+            bp.iters = max(bp.iters - 1, 1);
+            cout << "iter_count: " << bp.iters << endl;
+        }
+        else if (method == CSBP)
+        {
+            csbp.iters = max(csbp.iters - 1, 1);
+            cout << "iter_count: " << csbp.iters << endl;
+        }
+        break;
+    case '4':
+        if (method == BP)
+        {
+            bp.levels += 1;
+            cout << "level_count: " << bp.levels << endl;
+        }
+        else if (method == CSBP)
+        {
+            csbp.levels += 1;
+            cout << "level_count: " << csbp.levels << endl;
+        }
+        break;
+    case 'r':
+    case 'R':
+        if (method == BP)
+        {
+            bp.levels = max(bp.levels - 1, 1);
+            cout << "level_count: " << bp.levels << endl;
+        }
+        else if (method == CSBP)
+        {
+            csbp.levels = max(csbp.levels - 1, 1);
+            cout << "level_count: " << csbp.levels << endl;
+        }
+        break;
+    }
+}
+
+
diff --git a/samples/ocl/surf_matcher.cpp b/samples/ocl/surf_matcher.cpp
index ea6ee97cb2..bee517fbca 100644
--- a/samples/ocl/surf_matcher.cpp
+++ b/samples/ocl/surf_matcher.cpp
@@ -1,201 +1,94 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
 #include <iostream>
 #include <stdio.h>
 #include "opencv2/core/core.hpp"
-#include "opencv2/features2d/features2d.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/ocl/ocl.hpp"
-#include "opencv2/nonfree/nonfree.hpp"
 #include "opencv2/nonfree/ocl.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
+#include "opencv2/nonfree/nonfree.hpp"
 
-using namespace std;
 using namespace cv;
 using namespace cv::ocl;
 
-//#define USE_CPU_DESCRIPTOR // use cpu descriptor extractor until ocl descriptor extractor is fixed
-//#define USE_CPU_BFMATCHER
-void help();
+const int LOOP_NUM = 10;
+const int GOOD_PTS_MAX = 50;
+const float GOOD_PORTION = 0.15f;
 
-void help()
+namespace
 {
-    cout << "\nThis program demonstrates using SURF_OCL features detector and descriptor extractor" << endl;
-    cout << "\nUsage:\n\tsurf_matcher --left <image1> --right <image2>" << endl;
+
+int64 work_begin = 0;
+int64 work_end = 0;
+
+void workBegin()
+{
+    work_begin = getTickCount();
+}
+void workEnd()
+{
+    work_end = getTickCount() - work_begin;
+}
+double getTime()
+{
+    return work_end /((double)cvGetTickFrequency() * 1000.);
 }
 
-
-////////////////////////////////////////////////////
-// This program demonstrates the usage of SURF_OCL.
-// use cpu findHomography interface to calculate the transformation matrix
-int main(int argc, char* argv[])
+template<class KPDetector>
+struct SURFDetector
 {
-    if (argc != 5 && argc != 1)
+    KPDetector surf;
+    SURFDetector(double hessian = 800.0)
+        :surf(hessian)
     {
-        help();
-        return -1;
     }
-    vector<cv::ocl::Info> info;
-    if(!cv::ocl::getDevice(info))
+    template<class T>
+    void operator()(const T& in, const T& mask, vector<cv::KeyPoint>& pts, T& descriptors, bool useProvided = false)
     {
-        cout << "Error: Did not find a valid OpenCL device!" << endl;
-        return -1;
+        surf(in, mask, pts, descriptors, useProvided);
     }
-    Mat cpu_img1, cpu_img2, cpu_img1_grey, cpu_img2_grey;
-    oclMat img1, img2;
-    if(argc != 5)
+};
+
+template<class KPMatcher>
+struct SURFMatcher
+{
+    KPMatcher matcher;
+    template<class T>
+    void match(const T& in1, const T& in2, vector<cv::DMatch>& matches)
     {
-        cpu_img1 = imread("o.png");
-        cvtColor(cpu_img1, cpu_img1_grey, CV_BGR2GRAY);
-        img1     = cpu_img1_grey;
-        CV_Assert(!img1.empty());
-
-        cpu_img2 = imread("r2.png");
-        cvtColor(cpu_img2, cpu_img2_grey, CV_BGR2GRAY);
-        img2     = cpu_img2_grey;
-    }
-    else
-    {
-        for (int i = 1; i < argc; ++i)
-        {
-            if (string(argv[i]) == "--left")
-            {
-                cpu_img1 = imread(argv[++i]);
-                cvtColor(cpu_img1, cpu_img1_grey, CV_BGR2GRAY);
-                img1     = cpu_img1_grey;
-                CV_Assert(!img1.empty());
-            }
-            else if (string(argv[i]) == "--right")
-            {
-                cpu_img2 = imread(argv[++i]);
-                cvtColor(cpu_img2, cpu_img2_grey, CV_BGR2GRAY);
-                img2     = cpu_img2_grey;
-            }
-            else if (string(argv[i]) == "--help")
-            {
-                help();
-                return -1;
-            }
-        }
+        matcher.match(in1, in2, matches);
     }
+};
 
-    SURF_OCL surf;
-    //surf.hessianThreshold = 400.f;
-    //surf.extended = false;
-
-    // detecting keypoints & computing descriptors
-    oclMat keypoints1GPU, keypoints2GPU;
-    oclMat descriptors1GPU, descriptors2GPU;
-
-    // downloading results
-    vector<KeyPoint> keypoints1, keypoints2;
-    vector<DMatch> matches;
-
-
-#ifndef USE_CPU_DESCRIPTOR
-    surf(img1, oclMat(), keypoints1GPU, descriptors1GPU);
-    surf(img2, oclMat(), keypoints2GPU, descriptors2GPU);
-
-    surf.downloadKeypoints(keypoints1GPU, keypoints1);
-    surf.downloadKeypoints(keypoints2GPU, keypoints2);
-
-
-#ifdef USE_CPU_BFMATCHER
-    //BFMatcher
-    BFMatcher matcher(cv::NORM_L2);
-    matcher.match(Mat(descriptors1GPU), Mat(descriptors2GPU), matches);
-#else
-    BruteForceMatcher_OCL_base matcher(BruteForceMatcher_OCL_base::L2Dist);
-    matcher.match(descriptors1GPU, descriptors2GPU, matches);
-#endif
-
-#else
-    surf(img1, oclMat(), keypoints1GPU);
-    surf(img2, oclMat(), keypoints2GPU);
-    surf.downloadKeypoints(keypoints1GPU, keypoints1);
-    surf.downloadKeypoints(keypoints2GPU, keypoints2);
-
-    // use SURF_OCL to detect keypoints and use SURF to extract descriptors
-    SURF surf_cpu;
-    Mat descriptors1, descriptors2;
-    surf_cpu(cpu_img1, Mat(), keypoints1, descriptors1, true);
-    surf_cpu(cpu_img2, Mat(), keypoints2, descriptors2, true);
-    matcher.match(descriptors1, descriptors2, matches);
-#endif
-    cout << "OCL: FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
-    cout << "OCL: FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
-
-    double max_dist = 0; double min_dist = 100;
-    //-- Quick calculation of max and min distances between keypoints
-    for( size_t i = 0; i < keypoints1.size(); i++ )
-    {
-        double dist = matches[i].distance;
-        if( dist < min_dist ) min_dist = dist;
-        if( dist > max_dist ) max_dist = dist;
-    }
-
-    printf("-- Max dist : %f \n", max_dist );
-    printf("-- Min dist : %f \n", min_dist );
-
-    //-- Draw only "good" matches (i.e. whose distance is less than 2.5*min_dist )
+Mat drawGoodMatches(
+    const Mat& cpu_img1,
+    const Mat& cpu_img2,
+    const vector<KeyPoint>& keypoints1,
+    const vector<KeyPoint>& keypoints2,
+    vector<DMatch>& matches,
+    vector<Point2f>& scene_corners_
+)
+{
+    //-- Sort matches and preserve top 10% matches
+    std::sort(matches.begin(), matches.end());
     std::vector< DMatch > good_matches;
+    double minDist = matches.front().distance,
+           maxDist = matches.back().distance;
 
-    for( size_t i = 0; i < keypoints1.size(); i++ )
+    const int ptsPairs = std::min(GOOD_PTS_MAX, (int)(matches.size() * GOOD_PORTION));
+    for( int i = 0; i < ptsPairs; i++ )
     {
-        if( matches[i].distance < 3*min_dist )
-        {
-            good_matches.push_back( matches[i]);
-        }
+        good_matches.push_back( matches[i] );
     }
+    std::cout << "\nMax distance: " << maxDist << std::endl;
+    std::cout << "Min distance: " << minDist << std::endl;
+
+    std::cout << "Calculating homography using " << ptsPairs << " point pairs." << std::endl;
 
     // drawing the results
     Mat img_matches;
     drawMatches( cpu_img1, keypoints1, cpu_img2, keypoints2,
-        good_matches, img_matches, Scalar::all(-1), Scalar::all(-1),
-        vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS );
+                 good_matches, img_matches, Scalar::all(-1), Scalar::all(-1),
+                 vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS  );
 
     //-- Localize the object
     std::vector<Point2f> obj;
@@ -207,26 +100,243 @@ int main(int argc, char* argv[])
         obj.push_back( keypoints1[ good_matches[i].queryIdx ].pt );
         scene.push_back( keypoints2[ good_matches[i].trainIdx ].pt );
     }
-    Mat H = findHomography( obj, scene, CV_RANSAC );
-
     //-- Get the corners from the image_1 ( the object to be "detected" )
     std::vector<Point2f> obj_corners(4);
-    obj_corners[0] = cvPoint(0,0); obj_corners[1] = cvPoint( cpu_img1.cols, 0 );
-    obj_corners[2] = cvPoint( cpu_img1.cols, cpu_img1.rows ); obj_corners[3] = cvPoint( 0, cpu_img1.rows );
+    obj_corners[0] = cvPoint(0,0);
+    obj_corners[1] = cvPoint( cpu_img1.cols, 0 );
+    obj_corners[2] = cvPoint( cpu_img1.cols, cpu_img1.rows );
+    obj_corners[3] = cvPoint( 0, cpu_img1.rows );
     std::vector<Point2f> scene_corners(4);
 
+    Mat H = findHomography( obj, scene, CV_RANSAC );
     perspectiveTransform( obj_corners, scene_corners, H);
 
+    scene_corners_ = scene_corners;
+
     //-- Draw lines between the corners (the mapped object in the scene - image_2 )
-    line( img_matches, scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), Scalar( 0, 255, 0), 4 );
-    line( img_matches, scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), Scalar( 0, 255, 0), 4 );
-    line( img_matches, scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), Scalar( 0, 255, 0), 4 );
-    line( img_matches, scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), Scalar( 0, 255, 0), 4 );
+    line( img_matches,
+          scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), scene_corners[1] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches,
+          scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), scene_corners[2] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches,
+          scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), scene_corners[3] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches,
+          scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), scene_corners[0] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, CV_AA );
+    return img_matches;
+}
+
+}
+////////////////////////////////////////////////////
+// This program demonstrates the usage of SURF_OCL.
+// use cpu findHomography interface to calculate the transformation matrix
+int main(int argc, char* argv[])
+{
+    const char* keys =
+        "{ h | help     | false           | print help message  }"
+        "{ l | left     |                 | specify left image  }"
+        "{ r | right    |                 | specify right image }"
+        "{ o | output   | SURF_output.jpg | specify output save path (only works in CPU or GPU only mode) }"
+        "{ c | use_cpu  | false           | use CPU algorithms  }"
+        "{ a | use_all  | false           | use both CPU and GPU algorithms}";
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.get<bool>("help"))
+    {
+        std::cout << "Avaible options:" << std::endl;
+        cmd.printParams();
+        return 0;
+    }
+
+    vector<cv::ocl::Info> info;
+    if(cv::ocl::getDevice(info) == 0)
+    {
+        std::cout << "Error: Did not find a valid OpenCL device!" << std::endl;
+        return -1;
+    }
+    ocl::setDevice(info[0]);
+
+    Mat cpu_img1, cpu_img2, cpu_img1_grey, cpu_img2_grey;
+    oclMat img1, img2;
+    bool useCPU = cmd.get<bool>("c");
+    bool useGPU = false;
+    bool useALL = cmd.get<bool>("a");
+
+    string outpath = cmd.get<std::string>("o");
+
+    cpu_img1 = imread(cmd.get<std::string>("l"));
+    CV_Assert(!cpu_img1.empty());
+    cvtColor(cpu_img1, cpu_img1_grey, CV_BGR2GRAY);
+    img1 = cpu_img1_grey;
+
+    cpu_img2 = imread(cmd.get<std::string>("r"));
+    CV_Assert(!cpu_img2.empty());
+    cvtColor(cpu_img2, cpu_img2_grey, CV_BGR2GRAY);
+    img2 = cpu_img2_grey;
+
+    if(useALL)
+    {
+        useCPU = false;
+        useGPU = false;
+    }
+    else if(useCPU==false && useALL==false)
+    {
+        useGPU = true;
+    }
+
+    if(!useCPU)
+    {
+        std::cout
+                << "Device name:"
+                << info[0].DeviceName[0]
+                << std::endl;
+    }
+    double surf_time = 0.;
+
+    //declare input/output
+    vector<KeyPoint> keypoints1, keypoints2;
+    vector<DMatch> matches;
+
+    vector<KeyPoint> gpu_keypoints1;
+    vector<KeyPoint> gpu_keypoints2;
+    vector<DMatch> gpu_matches;
+
+    Mat descriptors1CPU, descriptors2CPU;
+
+    oclMat keypoints1GPU, keypoints2GPU;
+    oclMat descriptors1GPU, descriptors2GPU;
+
+    //instantiate detectors/matchers
+    SURFDetector<SURF>     cpp_surf;
+    SURFDetector<SURF_OCL> ocl_surf;
+
+    SURFMatcher<BFMatcher>      cpp_matcher;
+    SURFMatcher<BFMatcher_OCL>  ocl_matcher;
+
+    //-- start of timing section
+    if (useCPU)
+    {
+        for (int i = 0; i <= LOOP_NUM; i++)
+        {
+            if(i == 1) workBegin();
+            cpp_surf(cpu_img1_grey, Mat(), keypoints1, descriptors1CPU);
+            cpp_surf(cpu_img2_grey, Mat(), keypoints2, descriptors2CPU);
+            cpp_matcher.match(descriptors1CPU, descriptors2CPU, matches);
+        }
+        workEnd();
+        std::cout << "CPP: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
+        std::cout << "CPP: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
+
+        surf_time = getTime();
+        std::cout << "SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
+    }
+    else if(useGPU)
+    {
+        for (int i = 0; i <= LOOP_NUM; i++)
+        {
+            if(i == 1) workBegin();
+            ocl_surf(img1, oclMat(), keypoints1, descriptors1GPU);
+            ocl_surf(img2, oclMat(), keypoints2, descriptors2GPU);
+            ocl_matcher.match(descriptors1GPU, descriptors2GPU, matches);
+        }
+        workEnd();
+        std::cout << "OCL: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
+        std::cout << "OCL: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
+
+        surf_time = getTime();
+        std::cout << "SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
+    }
+    else
+    {
+        //cpu runs
+        for (int i = 0; i <= LOOP_NUM; i++)
+        {
+            if(i == 1) workBegin();
+            cpp_surf(cpu_img1_grey, Mat(), keypoints1, descriptors1CPU);
+            cpp_surf(cpu_img2_grey, Mat(), keypoints2, descriptors2CPU);
+            cpp_matcher.match(descriptors1CPU, descriptors2CPU, matches);
+        }
+        workEnd();
+        std::cout << "\nCPP: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
+        std::cout << "CPP: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
+
+        surf_time = getTime();
+        std::cout << "(CPP)SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl;
+
+        //gpu runs
+        for (int i = 0; i <= LOOP_NUM; i++)
+        {
+            if(i == 1) workBegin();
+            ocl_surf(img1, oclMat(), gpu_keypoints1, descriptors1GPU);
+            ocl_surf(img2, oclMat(), gpu_keypoints2, descriptors2GPU);
+            ocl_matcher.match(descriptors1GPU, descriptors2GPU, gpu_matches);
+        }
+        workEnd();
+        std::cout << "\nOCL: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
+        std::cout << "OCL: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
+
+        surf_time = getTime();
+        std::cout << "(OCL)SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
+
+    }
+
+    //--------------------------------------------------------------------------
+    std::vector<Point2f> cpu_corner;
+    Mat img_matches = drawGoodMatches(cpu_img1, cpu_img2, keypoints1, keypoints2, matches, cpu_corner);
+
+    std::vector<Point2f> gpu_corner;
+    Mat ocl_img_matches;
+    if(useALL || (!useCPU&&!useGPU))
+    {
+        ocl_img_matches = drawGoodMatches(cpu_img1, cpu_img2, gpu_keypoints1, gpu_keypoints2, gpu_matches, gpu_corner);
+
+        //check accuracy
+        std::cout<<"\nCheck accuracy:\n";
+
+        if(cpu_corner.size()!=gpu_corner.size())
+            std::cout<<"Failed\n";
+        else
+        {
+            bool result = false;
+            for(size_t i = 0; i < cpu_corner.size(); i++)
+            {
+                if((std::abs(cpu_corner[i].x - gpu_corner[i].x) > 10)
+                        ||(std::abs(cpu_corner[i].y - gpu_corner[i].y) > 10))
+                {
+                    std::cout<<"Failed\n";
+                    result = false;
+                    break;
+                }
+                result = true;
+            }
+            if(result)
+                std::cout<<"Passed\n";
+        }
+    }
 
     //-- Show detected matches
-    namedWindow("ocl surf matches", 0);
-    imshow("ocl surf matches", img_matches);
-    waitKey(0);
+    if (useCPU)
+    {
+        namedWindow("cpu surf matches", 0);
+        imshow("cpu surf matches", img_matches);
+        imwrite(outpath, img_matches);
+    }
+    else if(useGPU)
+    {
+        namedWindow("ocl surf matches", 0);
+        imshow("ocl surf matches", img_matches);
+        imwrite(outpath, img_matches);
+    }
+    else
+    {
+        namedWindow("cpu surf matches", 0);
+        imshow("cpu surf matches", img_matches);
 
+        namedWindow("ocl surf matches", 0);
+        imshow("ocl surf matches", ocl_img_matches);
+    }
+    waitKey(0);
     return 0;
 }
diff --git a/samples/ocl/tvl1_optical_flow.cpp b/samples/ocl/tvl1_optical_flow.cpp
new file mode 100644
index 0000000000..cff9692ed6
--- /dev/null
+++ b/samples/ocl/tvl1_optical_flow.cpp
@@ -0,0 +1,265 @@
+#include <iostream>
+#include <vector>
+#include <iomanip>
+
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/ocl/ocl.hpp"
+#include "opencv2/video/video.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::ocl;
+
+typedef unsigned char uchar;
+#define LOOP_NUM 10
+int64 work_begin = 0;
+int64 work_end = 0;
+
+static void workBegin()
+{
+    work_begin = getTickCount();
+}
+static void workEnd()
+{
+    work_end += (getTickCount() - work_begin);
+}
+static double getTime()
+{
+    return work_end * 1000. / getTickFrequency();
+}
+
+template <typename T> inline T clamp (T x, T a, T b)
+{
+    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
+}
+
+template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
+{
+    x = clamp(x, a, b);
+    return c + (d - c) * (x - a) / (b - a);
+}
+
+static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
+{
+    float maxDisplacement = 1.0f;
+
+    for (int i = 0; i < u.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+        for (int j = 0; j < u.cols; ++j)
+        {
+            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
+
+            if (d > maxDisplacement)
+                maxDisplacement = d;
+        }
+    }
+
+    flowField.create(u.size(), CV_8UC4);
+
+    for (int i = 0; i < flowField.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+
+        Vec4b* row = flowField.ptr<Vec4b>(i);
+
+        for (int j = 0; j < flowField.cols; ++j)
+        {
+            row[j][0] = 0;
+            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][3] = 255;
+        }
+    }
+}
+
+
+int main(int argc, const char* argv[])
+{
+    static std::vector<Info> ocl_info;
+    ocl::getDevice(ocl_info);
+    //if you want to use undefault device, set it here
+    setDevice(ocl_info[0]);
+
+    //set this to save kernel compile time from second time you run
+    ocl::setBinpath("./");
+    const char* keys =
+        "{ h   | help       | false           | print help message }"
+        "{ l   | left       |                 | specify left image }"
+        "{ r   | right      |                 | specify right image }"
+        "{ o   | output     | tvl1_output.jpg | specify output save path }"
+        "{ c   | camera     | 0               | enable camera capturing }"
+        "{ s   | use_cpu    | false           | use cpu or gpu to process the image }"
+        "{ v   | video      |                 | use video as input }";
+
+    CommandLineParser cmd(argc, argv, keys);
+
+    if (cmd.get<bool>("help"))
+    {
+        cout << "Usage: pyrlk_optical_flow [options]" << endl;
+        cout << "Avaible options:" << endl;
+        cmd.printParams();
+        return 0;
+    }
+
+    bool defaultPicturesFail = false;
+    string fname0 = cmd.get<string>("l");
+    string fname1 = cmd.get<string>("r");
+    string vdofile = cmd.get<string>("v");
+    string outpath = cmd.get<string>("o");
+    bool useCPU = cmd.get<bool>("s");
+    bool useCamera = cmd.get<bool>("c");
+    int inputName = cmd.get<int>("c");
+
+    Mat frame0 = imread(fname0, cv::IMREAD_GRAYSCALE);
+    Mat frame1 = imread(fname1, cv::IMREAD_GRAYSCALE);
+    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+    cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
+
+
+    Mat flow, show_flow;
+    Mat flow_vec[2];
+    if (frame0.empty() || frame1.empty())
+    {
+        useCamera = true;
+        defaultPicturesFail = true;
+        CvCapture* capture = 0;
+        capture = cvCaptureFromCAM( inputName );
+        if (!capture)
+        {
+            cout << "Can't load input images" << endl;
+            return -1;
+        }
+    }
+
+
+    if (useCamera)
+    {
+        CvCapture* capture = 0;
+        Mat frame, frameCopy;
+        Mat frame0Gray, frame1Gray;
+        Mat ptr0, ptr1;
+
+        if(vdofile == "")
+            capture = cvCaptureFromCAM( inputName );
+        else
+            capture = cvCreateFileCapture(vdofile.c_str());
+
+        int c = inputName ;
+        if(!capture)
+        {
+            if(vdofile == "")
+                cout << "Capture from CAM " << c << " didn't work" << endl;
+            else
+                cout << "Capture from file " << vdofile << " failed" <<endl;
+            if (defaultPicturesFail)
+            {
+                return -1;
+            }
+            goto nocamera;
+        }
+
+        cout << "In capture ..." << endl;
+        for(int i = 0;; i++)
+        {
+            frame = cvQueryFrame( capture );
+            if( frame.empty() )
+                break;
+
+            if (i == 0)
+            {
+                frame.copyTo( frame0 );
+                cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+            }
+            else
+            {
+                if (i%2 == 1)
+                {
+                    frame.copyTo(frame1);
+                    cvtColor(frame1, frame1Gray, COLOR_BGR2GRAY);
+                    ptr0 = frame0Gray;
+                    ptr1 = frame1Gray;
+                }
+                else
+                {
+                    frame.copyTo(frame0);
+                    cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+                    ptr0 = frame1Gray;
+                    ptr1 = frame0Gray;
+                }
+
+                if (useCPU)
+                {
+                    alg->calc(ptr0, ptr1, flow);
+                    split(flow, flow_vec);
+                }
+                else
+                {
+                    oclMat d_flowx, d_flowy;
+                    d_alg(oclMat(ptr0), oclMat(ptr1), d_flowx, d_flowy);
+                    d_flowx.download(flow_vec[0]);
+                    d_flowy.download(flow_vec[1]);
+                }
+                if (i%2 == 1)
+                    frame1.copyTo(frameCopy);
+                else
+                    frame0.copyTo(frameCopy);
+                getFlowField(flow_vec[0], flow_vec[1], show_flow);
+                imshow("PyrLK [Sparse]", show_flow);
+            }
+
+            if( waitKey( 10 ) >= 0 )
+                goto _cleanup_;
+        }
+
+        waitKey(0);
+
+_cleanup_:
+        cvReleaseCapture( &capture );
+    }
+    else
+    {
+nocamera:
+        oclMat d_flowx, d_flowy;
+        for(int i = 0; i <= LOOP_NUM; i ++)
+        {
+            cout << "loop" << i << endl;
+
+            if (i > 0) workBegin();
+            if (useCPU)
+            {
+                alg->calc(frame0, frame1, flow);
+                split(flow, flow_vec);
+            }
+            else
+            {
+                d_alg(oclMat(frame0), oclMat(frame1), d_flowx, d_flowy);
+                d_flowx.download(flow_vec[0]);
+                d_flowy.download(flow_vec[1]);
+            }
+            if (i > 0 && i <= LOOP_NUM)
+                workEnd();
+
+            if (i == LOOP_NUM)
+            {
+                if (useCPU)
+                    cout << "average CPU time (noCamera) : ";
+                else
+                    cout << "average GPU time (noCamera) : ";
+                cout << getTime() / LOOP_NUM << " ms" << endl;
+
+                getFlowField(flow_vec[0], flow_vec[1], show_flow);
+                imshow("PyrLK [Sparse]", show_flow);
+                imwrite(outpath, show_flow);
+            }
+        }
+    }
+
+    waitKey();
+
+    return 0;
+}
\ No newline at end of file
diff --git a/samples/python2/grabcut.py b/samples/python2/grabcut.py
new file mode 100644
index 0000000000..9fc1280acf
--- /dev/null
+++ b/samples/python2/grabcut.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+'''
+===============================================================================
+Interactive Image Segmentation using GrabCut algorithm.
+
+This sample shows interactive image segmentation using grabcut algorithm.
+
+USAGE :
+    python grabcut.py <filename>
+
+README FIRST:    
+    Two windows will show up, one for input and one for output.
+    
+    At first, in input window, draw a rectangle around the object using 
+mouse right button. Then press 'n' to segment the object (once or a few times)
+For any finer touch-ups, you can press any of the keys below and draw lines on 
+the areas you want. Then again press 'n' for updating the output.
+
+Key '0' - To select areas of sure background
+Key '1' - To select areas of sure foreground
+Key '2' - To select areas of probable background
+Key '3' - To select areas of probable foreground
+
+Key 'n' - To update the segmentation
+Key 'r' - To reset the setup
+Key 's' - To save the results
+===============================================================================
+'''
+
+import numpy as np
+import cv2
+import sys
+
+BLUE = [255,0,0]        # rectangle color
+RED = [0,0,255]         # PR BG
+GREEN = [0,255,0]       # PR FG
+BLACK = [0,0,0]         # sure BG
+WHITE = [255,255,255]   # sure FG
+
+DRAW_BG = {'color' : BLACK, 'val' : 0}
+DRAW_FG = {'color' : WHITE, 'val' : 1}
+DRAW_PR_FG = {'color' : GREEN, 'val' : 3}
+DRAW_PR_BG = {'color' : RED, 'val' : 2}
+
+# setting up flags
+rect = (0,0,1,1)
+drawing = False         # flag for drawing curves
+rectangle = False       # flag for drawing rect
+rect_over = False       # flag to check if rect drawn
+rect_or_mask = 100      # flag for selecting rect or mask mode
+value = DRAW_FG         # drawing initialized to FG
+thickness = 3           # brush thickness
+
+def onmouse(event,x,y,flags,param):
+    global img,img2,drawing,value,mask,rectangle,rect,rect_or_mask,ix,iy,rect_over
+    
+    # Draw Rectangle
+    if event == cv2.EVENT_RBUTTONDOWN:
+        rectangle = True
+        ix,iy = x,y
+
+    elif event == cv2.EVENT_MOUSEMOVE:
+        if rectangle == True:
+            img = img2.copy()
+            cv2.rectangle(img,(ix,iy),(x,y),BLUE,2)
+            rect = (ix,iy,abs(ix-x),abs(iy-y))
+            rect_or_mask = 0
+
+    elif event == cv2.EVENT_RBUTTONUP:
+        rectangle = False
+        rect_over = True
+        cv2.rectangle(img,(ix,iy),(x,y),BLUE,2)
+        rect = (ix,iy,abs(ix-x),abs(iy-y))
+        rect_or_mask = 0
+        print " Now press the key 'n' a few times until no further change \n"
+        
+    # draw touchup curves
+    
+    if event == cv2.EVENT_LBUTTONDOWN:
+        if rect_over == False:
+            print "first draw rectangle \n"
+        else:
+            drawing = True
+            cv2.circle(img,(x,y),thickness,value['color'],-1)
+            cv2.circle(mask,(x,y),thickness,value['val'],-1)
+
+    elif event == cv2.EVENT_MOUSEMOVE:
+        if drawing == True:
+            cv2.circle(img,(x,y),thickness,value['color'],-1)
+            cv2.circle(mask,(x,y),thickness,value['val'],-1)
+
+    elif event == cv2.EVENT_LBUTTONUP:
+        if drawing == True:
+            drawing = False
+            cv2.circle(img,(x,y),thickness,value['color'],-1)
+            cv2.circle(mask,(x,y),thickness,value['val'],-1)
+        
+# print documentation
+print __doc__
+
+# Loading images
+if len(sys.argv) == 2:
+    filename = sys.argv[1] # for drawing purposes
+else:
+    print "No input image given, so loading default image, lena.jpg \n"
+    print "Correct Usage : python grabcut.py <filename> \n"
+    filename = '../cpp/lena.jpg'
+
+img = cv2.imread(filename)
+img2 = img.copy()                               # a copy of original image
+mask = np.zeros(img.shape[:2],dtype = np.uint8) # mask initialized to PR_BG
+output = np.zeros(img.shape,np.uint8)           # output image to be shown
+
+# input and output windows
+cv2.namedWindow('output')
+cv2.namedWindow('input')
+cv2.setMouseCallback('input',onmouse)
+cv2.moveWindow('input',img.shape[1]+10,90)
+
+print " Instructions : \n"
+print " Draw a rectangle around the object using right mouse button \n"
+
+while(1):
+
+    cv2.imshow('output',output)
+    cv2.imshow('input',img)
+    k = 0xFF & cv2.waitKey(1)
+    
+    # key bindings
+    if k == 27:         # esc to exit
+        break
+    elif k == ord('0'): # BG drawing
+        print " mark background regions with left mouse button \n"
+        value = DRAW_BG
+    elif k == ord('1'): # FG drawing
+        print " mark foreground regions with left mouse button \n"
+        value = DRAW_FG
+    elif k == ord('2'): # PR_BG drawing
+        value = DRAW_PR_BG
+    elif k == ord('3'): # PR_FG drawing
+        value = DRAW_PR_FG
+    elif k == ord('s'): # save image
+        bar = np.zeros((img.shape[0],5,3),np.uint8)
+        res = np.hstack((img2,bar,img,bar,output))
+        cv2.imwrite('grabcut_output.png',res)
+        print " Result saved as image \n"
+    elif k == ord('r'): # reset everything
+        print "resetting \n"
+        rect = (0,0,1,1)
+        drawing = False         
+        rectangle = False       
+        rect_or_mask = 100 
+        rect_over = False     
+        value = DRAW_FG         
+        img = img2.copy()
+        mask = np.zeros(img.shape[:2],dtype = np.uint8) # mask initialized to PR_BG
+        output = np.zeros(img.shape,np.uint8)           # output image to be shown
+    elif k == ord('n'): # segment the image
+        print """ For finer touchups, mark foreground and background after pressing keys 0-3
+        and again press 'n' \n"""
+        if (rect_or_mask == 0):         # grabcut with rect
+            bgdmodel = np.zeros((1,65),np.float64)
+            fgdmodel = np.zeros((1,65),np.float64)    
+            cv2.grabCut(img2,mask,rect,bgdmodel,fgdmodel,1,cv2.GC_INIT_WITH_RECT)
+            rect_or_mask = 1
+        elif rect_or_mask == 1:         # grabcut with mask
+            bgdmodel = np.zeros((1,65),np.float64)
+            fgdmodel = np.zeros((1,65),np.float64) 
+            cv2.grabCut(img2,mask,rect,bgdmodel,fgdmodel,1,cv2.GC_INIT_WITH_MASK)
+
+    mask2 = np.where((mask==1) + (mask==3),255,0).astype('uint8')
+    output = cv2.bitwise_and(img2,img2,mask=mask2)   
+
+cv2.destroyAllWindows()
diff --git a/samples/winrt/ImageManipulations/AdvancedCapture.xaml b/samples/winrt/ImageManipulations/AdvancedCapture.xaml
new file mode 100644
index 0000000000..07db96f275
--- /dev/null
+++ b/samples/winrt/ImageManipulations/AdvancedCapture.xaml
@@ -0,0 +1,75 @@
+﻿<!--
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+-->
+
+<common:LayoutAwarePage
+    x:Class="SDKSample.MediaCapture.AdvancedCapture"
+    xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
+    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
+    xmlns:local="using:$rootsnamespace$"
+    xmlns:common="using:SDKSample.Common"
+    xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
+    xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
+    mc:Ignorable="d">
+
+    <Grid x:Name="LayoutRoot" Background="White" HorizontalAlignment="Left" VerticalAlignment="Top">
+        <Grid.RowDefinitions>
+            <RowDefinition Height="Auto"/>
+            <RowDefinition Height="*"/>
+        </Grid.RowDefinitions>
+        <Grid x:Name="Input" Grid.Row="0">
+            <Grid.RowDefinitions>
+                <RowDefinition Height="Auto"/>
+                <RowDefinition Height="Auto"/>
+                <RowDefinition Height="*"/>
+            </Grid.RowDefinitions>
+            <TextBlock TextWrapping="Wrap" Grid.Row="0"  Text="This scenario shows how to enumerate cameras in the system. Choose a camera from the list to preview, record or take a photo from the chosen camera.  You can add the gray scale effect using the checkbox provided." Style="{StaticResource BasicTextStyle}" HorizontalAlignment="Left"/>
+            <StackPanel Orientation="Horizontal" Grid.Row="1" Margin="0,10,0,0">
+                <ListBox x:Name="EnumedDeviceList2" SelectionChanged="lstEnumedDevices_SelectionChanged" />
+                <Button x:Name="btnStartDevice2" Click="btnStartDevice_Click" IsEnabled="true"  Margin="0,0,10,0" Content="StartDevice"/>
+                <Button x:Name="btnStartPreview2" Click="btnStartPreview_Click" IsEnabled="true"  Margin="0,0,10,0" Content="StartPreview"/>
+                <ComboBox x:Name="EffectTypeCombo" Width="120" SelectedIndex="0">
+                    <ComboBoxItem Content="Preview"/>
+                    <ComboBoxItem Content="Grayscale"/>
+                    <ComboBoxItem Content="Canny"/>
+                    <ComboBoxItem Content="Sobel"/>
+                    <ComboBoxItem Content="Histogram"/>
+                </ComboBox>
+                <Button Content="Apply" HorizontalAlignment="Stretch" VerticalAlignment="Top" Click="Button_Click"/>
+            </StackPanel>
+            <StackPanel x:Name="EffectTypeCombo1" Orientation="Horizontal" Grid.Row="1" Margin="324,5,-324,7"/>
+        </Grid>
+
+        <Grid x:Name="Output" HorizontalAlignment="Left" VerticalAlignment="Top" Grid.Row="1">
+            <StackPanel Orientation="Horizontal" Margin="0,10,0,0">
+                <StackPanel>
+                    <TextBlock Style="{StaticResource BasicTextStyle}"  HorizontalAlignment='Center'  VerticalAlignment='Center'  TextAlignment='Center'	Text='Preview' />
+                    <Canvas x:Name="previewCanvas2" Background="Gray">
+                        <CaptureElement x:Name="previewElement2" />
+                    </Canvas>
+                </StackPanel>
+                <StackPanel/>
+                <StackPanel/>
+            </StackPanel>
+        </Grid>
+
+        <!-- Add Storyboards to the visual states below as necessary for supporting the various layouts -->
+        <VisualStateManager.VisualStateGroups>
+            <VisualStateGroup>
+                <VisualState x:Name="FullScreenLandscape"/>
+                <VisualState x:Name="Filled"/>
+                <VisualState x:Name="FullScreenPortrait"/>
+                <VisualState x:Name="Snapped"/>
+            </VisualStateGroup>
+        </VisualStateManager.VisualStateGroups>
+    </Grid>
+
+</common:LayoutAwarePage>
diff --git a/samples/winrt/ImageManipulations/AdvancedCapture.xaml.cpp b/samples/winrt/ImageManipulations/AdvancedCapture.xaml.cpp
new file mode 100644
index 0000000000..cff0a5a799
--- /dev/null
+++ b/samples/winrt/ImageManipulations/AdvancedCapture.xaml.cpp
@@ -0,0 +1,613 @@
+﻿//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// AdvancedCapture.xaml.cpp
+// Implementation of the AdvancedCapture class
+//
+
+#include "pch.h"
+#include "AdvancedCapture.xaml.h"
+
+using namespace SDKSample::MediaCapture;
+
+using namespace Windows::UI::Xaml;
+using namespace Windows::UI::Xaml::Navigation;
+using namespace Windows::UI::Xaml::Data;
+using namespace Windows::System;
+using namespace Windows::Foundation;
+using namespace Windows::Foundation::Collections;
+using namespace Platform;
+using namespace Windows::UI;
+using namespace Windows::UI::Core;
+using namespace Windows::UI::Xaml;
+using namespace Windows::UI::Xaml::Controls;
+using namespace Windows::UI::Xaml::Data;
+using namespace Windows::UI::Xaml::Media;
+using namespace Windows::Storage;
+using namespace Windows::Media::MediaProperties;
+using namespace Windows::Storage::Streams;
+using namespace Windows::System;
+using namespace Windows::UI::Xaml::Media::Imaging;
+using namespace Windows::Devices::Enumeration;
+
+ref class ReencodeState sealed
+{
+public:
+    ReencodeState()
+    {
+    }
+
+    virtual ~ReencodeState()
+    {
+        if (InputStream != nullptr)
+        {
+            delete InputStream;
+        }
+        if (OutputStream != nullptr)
+        {
+            delete OutputStream;
+        }
+    }
+
+internal:
+    Windows::Storage::Streams::IRandomAccessStream ^InputStream;
+    Windows::Storage::Streams::IRandomAccessStream ^OutputStream;
+    Windows::Storage::StorageFile ^PhotoStorage;
+    Windows::Graphics::Imaging::BitmapDecoder ^Decoder;
+    Windows::Graphics::Imaging::BitmapEncoder ^Encoder;
+};
+
+AdvancedCapture::AdvancedCapture()
+{
+    InitializeComponent();
+    ScenarioInit();
+}
+
+/// <summary>
+/// Invoked when this page is about to be displayed in a Frame.
+/// </summary>
+/// <param name="e">Event data that describes how this page was reached.  The Parameter
+/// property is typically used to configure the page.</param>
+void AdvancedCapture::OnNavigatedTo(NavigationEventArgs^ e)
+{
+    // A pointer back to the main page.  This is needed if you want to call methods in MainPage such
+    // as NotifyUser()
+    rootPage = MainPage::Current;
+
+    m_orientationChangedEventToken = Windows::Graphics::Display::DisplayProperties::OrientationChanged += ref new Windows::Graphics::Display::DisplayPropertiesEventHandler(this, &AdvancedCapture::DisplayProperties_OrientationChanged);
+}
+
+void AdvancedCapture::OnNavigatedFrom(NavigationEventArgs^ e)
+{
+    Windows::Media::MediaControl::SoundLevelChanged -= m_eventRegistrationToken;
+    Windows::Graphics::Display::DisplayProperties::OrientationChanged  -= m_orientationChangedEventToken;
+}
+
+void  AdvancedCapture::ScenarioInit()
+{
+    rootPage = MainPage::Current;
+    btnStartDevice2->IsEnabled = true;
+    btnStartPreview2->IsEnabled = false;
+    m_bRecording = false;
+    m_bPreviewing = false;
+    m_bEffectAdded = false;
+    previewElement2->Source = nullptr;
+    ShowStatusMessage("");
+    EffectTypeCombo->IsEnabled = false;
+    previewCanvas2->Visibility = Windows::UI::Xaml::Visibility::Collapsed;
+    EnumerateWebcamsAsync();
+    m_bSuspended = false;
+}
+
+void AdvancedCapture::ScenarioReset()
+{
+    previewCanvas2->Visibility = Windows::UI::Xaml::Visibility::Collapsed;
+    ScenarioInit();
+}
+
+void AdvancedCapture::Failed(Windows::Media::Capture::MediaCapture ^currentCaptureObject, Windows::Media::Capture::MediaCaptureFailedEventArgs^ currentFailure)
+{
+    String ^message = "Fatal error" + currentFailure->Message;
+    create_task(Dispatcher->RunAsync(Windows::UI::Core::CoreDispatcherPriority::High,
+        ref new Windows::UI::Core::DispatchedHandler([this, message]()
+    {
+        ShowStatusMessage(message);
+    })));
+}
+
+void AdvancedCapture::btnStartDevice_Click(Platform::Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e)
+{
+    try
+    {
+        EnableButton(false, "StartDevice");
+        ShowStatusMessage("Starting device");
+        auto mediaCapture = ref new Windows::Media::Capture::MediaCapture();
+        m_mediaCaptureMgr = mediaCapture;
+        auto settings = ref new Windows::Media::Capture::MediaCaptureInitializationSettings();
+        auto chosenDevInfo = m_devInfoCollection->GetAt(EnumedDeviceList2->SelectedIndex);
+        settings->VideoDeviceId = chosenDevInfo->Id;
+        if (chosenDevInfo->EnclosureLocation != nullptr && chosenDevInfo->EnclosureLocation->Panel == Windows::Devices::Enumeration::Panel::Back)
+        {
+            m_bRotateVideoOnOrientationChange = true;
+            m_bReversePreviewRotation = false;
+        }
+        else if (chosenDevInfo->EnclosureLocation != nullptr && chosenDevInfo->EnclosureLocation->Panel == Windows::Devices::Enumeration::Panel::Front)
+        {
+            m_bRotateVideoOnOrientationChange = true;
+            m_bReversePreviewRotation = true;
+        }
+        else
+        {
+            m_bRotateVideoOnOrientationChange = false;
+        }
+
+        create_task(mediaCapture->InitializeAsync(settings)).then([this](task<void> initTask)
+        {
+            try
+            {
+                initTask.get();
+
+                auto mediaCapture =  m_mediaCaptureMgr.Get();
+
+                DisplayProperties_OrientationChanged(nullptr);
+
+                EnableButton(true, "StartPreview");
+                EnableButton(true, "StartStopRecord");
+                EnableButton(true, "TakePhoto");
+                ShowStatusMessage("Device initialized successful");
+                EffectTypeCombo->IsEnabled = true;
+                mediaCapture->Failed += ref new Windows::Media::Capture::MediaCaptureFailedEventHandler(this, &AdvancedCapture::Failed);
+            }
+            catch (Exception ^ e)
+            {
+                ShowExceptionMessage(e);
+            }
+        });
+    }
+    catch (Platform::Exception^ e)
+    {
+        ShowExceptionMessage(e);
+    }
+}
+
+void AdvancedCapture::btnStartPreview_Click(Platform::Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e)
+{
+    m_bPreviewing = false;
+    try
+    {
+        ShowStatusMessage("Starting preview");
+        EnableButton(false, "StartPreview");
+
+        auto mediaCapture = m_mediaCaptureMgr.Get();
+        previewCanvas2->Visibility = Windows::UI::Xaml::Visibility::Visible;
+        previewElement2->Source = mediaCapture;
+        create_task(mediaCapture->StartPreviewAsync()).then([this](task<void> previewTask)
+        {
+            try
+            {
+                previewTask.get();
+                m_bPreviewing = true;
+                ShowStatusMessage("Start preview successful");
+            }
+            catch (Exception ^e)
+            {
+                ShowExceptionMessage(e);
+            }
+        });
+    }
+    catch (Platform::Exception^ e)
+    {
+        m_bPreviewing = false;
+        previewElement2->Source = nullptr;
+        EnableButton(true, "StartPreview");
+        ShowExceptionMessage(e);
+    }
+}
+
+void AdvancedCapture::lstEnumedDevices_SelectionChanged(Platform::Object^ sender, Windows::UI::Xaml::Controls::SelectionChangedEventArgs^ e)
+{
+     if ( m_bPreviewing )
+     {
+         create_task(m_mediaCaptureMgr->StopPreviewAsync()).then([this](task<void> previewTask)
+         {
+             try
+             {
+                 previewTask.get();
+                 m_bPreviewing = false;
+             }
+             catch (Exception ^e)
+             {
+                ShowExceptionMessage(e);
+             }
+         });
+    }
+
+    btnStartDevice2->IsEnabled = true;
+    btnStartPreview2->IsEnabled = false;
+    m_bRecording = false;
+    previewElement2->Source = nullptr;
+    EffectTypeCombo->IsEnabled = false;
+    m_bEffectAdded = false;
+    m_bEffectAddedToRecord = false;
+    m_bEffectAddedToPhoto = false;
+    ShowStatusMessage("");
+}
+
+void AdvancedCapture::EnumerateWebcamsAsync()
+{
+    try
+    {
+        ShowStatusMessage("Enumerating Webcams...");
+        m_devInfoCollection = nullptr;
+
+        EnumedDeviceList2->Items->Clear();
+
+        task<DeviceInformationCollection^>(DeviceInformation::FindAllAsync(DeviceClass::VideoCapture)).then([this](task<DeviceInformationCollection^> findTask)
+        {
+            try
+            {
+                m_devInfoCollection = findTask.get();
+                if (m_devInfoCollection == nullptr || m_devInfoCollection->Size == 0)
+                {
+                    ShowStatusMessage("No WebCams found.");
+                }
+                else
+                {
+                    for(unsigned int i = 0; i < m_devInfoCollection->Size; i++)
+                    {
+                        auto devInfo = m_devInfoCollection->GetAt(i);
+                        EnumedDeviceList2->Items->Append(devInfo->Name);
+                    }
+                    EnumedDeviceList2->SelectedIndex = 0;
+                    ShowStatusMessage("Enumerating Webcams completed successfully.");
+                    btnStartDevice2->IsEnabled = true;
+                }
+            }
+            catch (Exception ^e)
+            {
+                ShowExceptionMessage(e);
+            }
+        });
+    }
+    catch (Platform::Exception^ e)
+    {
+        ShowExceptionMessage(e);
+    }
+}
+
+void AdvancedCapture::AddEffectToImageStream()
+{
+    auto mediaCapture = m_mediaCaptureMgr.Get();
+    Windows::Media::Capture::VideoDeviceCharacteristic charecteristic = mediaCapture->MediaCaptureSettings->VideoDeviceCharacteristic;
+
+    if((charecteristic != Windows::Media::Capture::VideoDeviceCharacteristic::AllStreamsIdentical) &&
+        (charecteristic != Windows::Media::Capture::VideoDeviceCharacteristic::PreviewPhotoStreamsIdentical) &&
+        (charecteristic != Windows::Media::Capture::VideoDeviceCharacteristic::RecordPhotoStreamsIdentical))
+    {
+        Windows::Media::MediaProperties::IMediaEncodingProperties ^props = mediaCapture->VideoDeviceController->GetMediaStreamProperties(Windows::Media::Capture::MediaStreamType::Photo);
+        if(props->Type->Equals("Image"))
+        {
+            //Switch to a video media type instead since we cant add an effect to a image media type
+            Windows::Foundation::Collections::IVectorView<Windows::Media::MediaProperties::IMediaEncodingProperties^>^ supportedPropsList = mediaCapture->VideoDeviceController->GetAvailableMediaStreamProperties(Windows::Media::Capture::MediaStreamType::Photo);
+            {
+                unsigned int i = 0;
+                while (i < supportedPropsList->Size)
+                {
+                    Windows::Media::MediaProperties::IMediaEncodingProperties^ props = supportedPropsList->GetAt(i);
+
+                    String^ s = props->Type;
+                    if(props->Type->Equals("Video"))
+                    {
+                        task<void>(mediaCapture->VideoDeviceController->SetMediaStreamPropertiesAsync(Windows::Media::Capture::MediaStreamType::Photo,props)).then([this](task<void> changeTypeTask)
+                        {
+                            try
+                            {
+                                changeTypeTask.get();
+                                ShowStatusMessage("Change type on photo stream successful");
+                                //Now add the effect on the image pin
+                                task<void>(m_mediaCaptureMgr->AddEffectAsync(Windows::Media::Capture::MediaStreamType::Photo,"OcvTransform.OcvImageManipulations", nullptr)).then([this](task<void> effectTask3)
+                                {
+                                    try
+                                    {
+                                        effectTask3.get();
+                                        m_bEffectAddedToPhoto = true;
+                                        ShowStatusMessage("Adding effect to photo stream successful");
+                                        EffectTypeCombo->IsEnabled = true;
+
+                                    }
+                                    catch(Exception ^e)
+                                    {
+                                        ShowExceptionMessage(e);
+                                        EffectTypeCombo->IsEnabled = true;
+                                    }
+                                });
+
+                            }
+                            catch(Exception ^e)
+                            {
+                                ShowExceptionMessage(e);
+                                EffectTypeCombo->IsEnabled = true;
+                            }
+
+                        });
+                        break;
+
+                    }
+                    i++;
+                }
+            }
+        }
+        else
+        {
+            //Add the effect to the image pin if the type is already "Video"
+            task<void>(mediaCapture->AddEffectAsync(Windows::Media::Capture::MediaStreamType::Photo,"OcvTransform.OcvImageManipulations", nullptr)).then([this](task<void> effectTask3)
+            {
+                try
+                {
+                    effectTask3.get();
+                    m_bEffectAddedToPhoto = true;
+                    ShowStatusMessage("Adding effect to photo stream successful");
+                    EffectTypeCombo->IsEnabled = true;
+
+                }
+                catch(Exception ^e)
+                {
+                    ShowExceptionMessage(e);
+                    EffectTypeCombo->IsEnabled = true;
+                }
+            });
+        }
+    }
+}
+
+void AdvancedCapture::ShowStatusMessage(Platform::String^ text)
+{
+    rootPage->NotifyUser(text, NotifyType::StatusMessage);
+}
+
+void AdvancedCapture::ShowExceptionMessage(Platform::Exception^ ex)
+{
+    rootPage->NotifyUser(ex->Message, NotifyType::ErrorMessage);
+}
+
+void AdvancedCapture::EnableButton(bool enabled, String^ name)
+{
+    if (name->Equals("StartDevice"))
+    {
+        btnStartDevice2->IsEnabled = enabled;
+    }
+    else if (name->Equals("StartPreview"))
+    {
+        btnStartPreview2->IsEnabled = enabled;
+    }
+}
+
+task<Windows::Storage::StorageFile^> AdvancedCapture::ReencodePhotoAsync(
+    Windows::Storage::StorageFile ^tempStorageFile,
+    Windows::Storage::FileProperties::PhotoOrientation photoRotation)
+{
+    ReencodeState ^state = ref new ReencodeState();
+
+    return create_task(tempStorageFile->OpenAsync(Windows::Storage::FileAccessMode::Read)).then([state](Windows::Storage::Streams::IRandomAccessStream ^stream)
+    {
+        state->InputStream = stream;
+        return Windows::Graphics::Imaging::BitmapDecoder::CreateAsync(state->InputStream);
+    }).then([state](Windows::Graphics::Imaging::BitmapDecoder ^decoder)
+    {
+        state->Decoder = decoder;
+        return Windows::Storage::KnownFolders::PicturesLibrary->CreateFileAsync(PHOTO_FILE_NAME, Windows::Storage::CreationCollisionOption::GenerateUniqueName);
+    }).then([state](Windows::Storage::StorageFile ^storageFile)
+    {
+        state->PhotoStorage = storageFile;
+        return state->PhotoStorage->OpenAsync(Windows::Storage::FileAccessMode::ReadWrite);
+    }).then([state](Windows::Storage::Streams::IRandomAccessStream ^stream)
+    {
+        state->OutputStream = stream;
+        state->OutputStream->Size = 0;
+        return Windows::Graphics::Imaging::BitmapEncoder::CreateForTranscodingAsync(state->OutputStream, state->Decoder);
+    }).then([state, photoRotation](Windows::Graphics::Imaging::BitmapEncoder ^encoder)
+    {
+        state->Encoder = encoder;
+        auto properties = ref new Windows::Graphics::Imaging::BitmapPropertySet();
+        properties->Insert("System.Photo.Orientation",
+            ref new Windows::Graphics::Imaging::BitmapTypedValue((unsigned short)photoRotation, Windows::Foundation::PropertyType::UInt16));
+        return create_task(state->Encoder->BitmapProperties->SetPropertiesAsync(properties));
+    }).then([state]()
+    {
+        return state->Encoder->FlushAsync();
+    }).then([tempStorageFile, state](task<void> previousTask)
+    {
+        auto result = state->PhotoStorage;
+        delete state;
+
+        tempStorageFile->DeleteAsync(Windows::Storage::StorageDeleteOption::PermanentDelete);
+
+        previousTask.get();
+
+        return result;
+    });
+}
+
+Windows::Storage::FileProperties::PhotoOrientation AdvancedCapture::GetCurrentPhotoRotation()
+{
+    bool counterclockwiseRotation = m_bReversePreviewRotation;
+
+    if (m_bRotateVideoOnOrientationChange)
+    {
+        return PhotoRotationLookup(Windows::Graphics::Display::DisplayProperties::CurrentOrientation, counterclockwiseRotation);
+    }
+    else
+    {
+        return Windows::Storage::FileProperties::PhotoOrientation::Normal;
+    }
+}
+
+void AdvancedCapture::PrepareForVideoRecording()
+{
+    Windows::Media::Capture::MediaCapture ^mediaCapture = m_mediaCaptureMgr.Get();
+    if (mediaCapture == nullptr)
+    {
+        return;
+    }
+
+    bool counterclockwiseRotation = m_bReversePreviewRotation;
+
+    if (m_bRotateVideoOnOrientationChange)
+    {
+        mediaCapture->SetRecordRotation(VideoRotationLookup(Windows::Graphics::Display::DisplayProperties::CurrentOrientation, counterclockwiseRotation));
+    }
+    else
+    {
+        mediaCapture->SetRecordRotation(Windows::Media::Capture::VideoRotation::None);
+    }
+}
+
+void AdvancedCapture::DisplayProperties_OrientationChanged(Platform::Object^ sender)
+{
+    Windows::Media::Capture::MediaCapture ^mediaCapture = m_mediaCaptureMgr.Get();
+    if (mediaCapture == nullptr)
+    {
+        return;
+    }
+
+    bool previewMirroring = mediaCapture->GetPreviewMirroring();
+    bool counterclockwiseRotation = (previewMirroring && !m_bReversePreviewRotation) ||
+        (!previewMirroring && m_bReversePreviewRotation);
+
+    if (m_bRotateVideoOnOrientationChange)
+    {
+        mediaCapture->SetPreviewRotation(VideoRotationLookup(Windows::Graphics::Display::DisplayProperties::CurrentOrientation, counterclockwiseRotation));
+    }
+    else
+    {
+        mediaCapture->SetPreviewRotation(Windows::Media::Capture::VideoRotation::None);
+    }
+}
+
+Windows::Storage::FileProperties::PhotoOrientation AdvancedCapture::PhotoRotationLookup(
+    Windows::Graphics::Display::DisplayOrientations displayOrientation, bool counterclockwise)
+{
+    switch (displayOrientation)
+    {
+    case Windows::Graphics::Display::DisplayOrientations::Landscape:
+        return Windows::Storage::FileProperties::PhotoOrientation::Normal;
+
+    case Windows::Graphics::Display::DisplayOrientations::Portrait:
+        return (counterclockwise) ? Windows::Storage::FileProperties::PhotoOrientation::Rotate270:
+            Windows::Storage::FileProperties::PhotoOrientation::Rotate90;
+
+    case Windows::Graphics::Display::DisplayOrientations::LandscapeFlipped:
+        return Windows::Storage::FileProperties::PhotoOrientation::Rotate180;
+
+    case Windows::Graphics::Display::DisplayOrientations::PortraitFlipped:
+        return (counterclockwise) ? Windows::Storage::FileProperties::PhotoOrientation::Rotate90 :
+            Windows::Storage::FileProperties::PhotoOrientation::Rotate270;
+
+    default:
+        return Windows::Storage::FileProperties::PhotoOrientation::Unspecified;
+    }
+}
+
+Windows::Media::Capture::VideoRotation AdvancedCapture::VideoRotationLookup(
+    Windows::Graphics::Display::DisplayOrientations displayOrientation, bool counterclockwise)
+{
+    switch (displayOrientation)
+    {
+    case Windows::Graphics::Display::DisplayOrientations::Landscape:
+        return Windows::Media::Capture::VideoRotation::None;
+
+    case Windows::Graphics::Display::DisplayOrientations::Portrait:
+        return (counterclockwise) ? Windows::Media::Capture::VideoRotation::Clockwise270Degrees :
+            Windows::Media::Capture::VideoRotation::Clockwise90Degrees;
+
+    case Windows::Graphics::Display::DisplayOrientations::LandscapeFlipped:
+        return Windows::Media::Capture::VideoRotation::Clockwise180Degrees;
+
+    case Windows::Graphics::Display::DisplayOrientations::PortraitFlipped:
+        return (counterclockwise) ? Windows::Media::Capture::VideoRotation::Clockwise90Degrees:
+            Windows::Media::Capture::VideoRotation::Clockwise270Degrees ;
+
+    default:
+        return Windows::Media::Capture::VideoRotation::None;
+    }
+}
+
+void SDKSample::MediaCapture::AdvancedCapture::Button_Click(Platform::Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e)
+{
+    try
+    {
+        create_task(m_mediaCaptureMgr->ClearEffectsAsync(Windows::Media::Capture::MediaStreamType::VideoPreview)).then([this](task<void> cleanTask)
+        {
+            m_bEffectAdded = true;
+            int index = EffectTypeCombo->SelectedIndex;
+            PropertySet^ props = ref new PropertySet();
+            props->Insert(L"{698649BE-8EAE-4551-A4CB-3EC98FBD3D86}", index);
+            create_task(m_mediaCaptureMgr->AddEffectAsync(Windows::Media::Capture::MediaStreamType::VideoPreview,"OcvTransform.OcvImageManipulations", props)).then([this](task<void> effectTask)
+            {
+                try
+                {
+                    effectTask.get();
+
+                    auto mediaCapture = m_mediaCaptureMgr.Get();
+                    Windows::Media::Capture::VideoDeviceCharacteristic charecteristic = mediaCapture->MediaCaptureSettings->VideoDeviceCharacteristic;
+
+                    ShowStatusMessage("Add effect successful to preview stream successful");
+                    if((charecteristic != Windows::Media::Capture::VideoDeviceCharacteristic::AllStreamsIdentical) &&
+                        (charecteristic != Windows::Media::Capture::VideoDeviceCharacteristic::PreviewRecordStreamsIdentical))
+                    {
+                        Windows::Media::MediaProperties::IMediaEncodingProperties ^props = mediaCapture->VideoDeviceController->GetMediaStreamProperties(Windows::Media::Capture::MediaStreamType::VideoRecord);
+                        Windows::Media::MediaProperties::VideoEncodingProperties ^videoEncodingProperties  = static_cast<Windows::Media::MediaProperties::VideoEncodingProperties ^>(props);
+                        if(!videoEncodingProperties->Subtype->Equals("H264")) //Cant add an effect to an H264 stream
+                        {
+                            task<void>(mediaCapture->AddEffectAsync(Windows::Media::Capture::MediaStreamType::VideoRecord,"OcvTransform.OcvImageManipulations", nullptr)).then([this](task<void> effectTask2)
+                            {
+                                try
+                                {
+                                    effectTask2.get();
+                                    ShowStatusMessage("Add effect successful to record stream successful");
+                                    m_bEffectAddedToRecord = true;
+                                    AddEffectToImageStream();
+                                    EffectTypeCombo->IsEnabled = true;
+                                }
+                                catch(Exception ^e)
+                                {
+                                    ShowExceptionMessage(e);
+                                    EffectTypeCombo->IsEnabled = true;
+                                }
+                            });
+                        }
+                        else
+                        {
+                            AddEffectToImageStream();
+                            EffectTypeCombo->IsEnabled = true;
+                        }
+
+                    }
+                    else
+                    {
+                        AddEffectToImageStream();
+                        EffectTypeCombo->IsEnabled = true;
+                    }
+                }
+                catch (Exception ^e)
+                {
+                    ShowExceptionMessage(e);
+                    EffectTypeCombo->IsEnabled = true;
+                }
+            });
+        });
+    }
+    catch (Platform::Exception ^e)
+    {
+        ShowExceptionMessage(e);
+        EffectTypeCombo->IsEnabled = true;
+    }
+}
diff --git a/samples/winrt/ImageManipulations/AdvancedCapture.xaml.h b/samples/winrt/ImageManipulations/AdvancedCapture.xaml.h
new file mode 100644
index 0000000000..94fa87c615
--- /dev/null
+++ b/samples/winrt/ImageManipulations/AdvancedCapture.xaml.h
@@ -0,0 +1,95 @@
+﻿//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// AdvancedCapture.xaml.h
+// Declaration of the AdvancedCapture class
+//
+
+#pragma once
+
+#include "pch.h"
+#include "AdvancedCapture.g.h"
+#include "MainPage.xaml.h"
+#include <ppl.h>
+
+#define VIDEO_FILE_NAME "video.mp4"
+#define PHOTO_FILE_NAME "photo.jpg"
+#define TEMP_PHOTO_FILE_NAME "photoTmp.jpg"
+
+using namespace concurrency;
+using namespace Windows::Devices::Enumeration;
+
+namespace SDKSample
+{
+    namespace MediaCapture
+    {
+        /// <summary>
+        /// An empty page that can be used on its own or navigated to within a Frame.
+        /// </summary>
+        [Windows::Foundation::Metadata::WebHostHidden]
+        public ref class AdvancedCapture sealed
+        {
+        public:
+            AdvancedCapture();
+
+        protected:
+            virtual void OnNavigatedTo(Windows::UI::Xaml::Navigation::NavigationEventArgs^ e) override;
+            virtual void OnNavigatedFrom(Windows::UI::Xaml::Navigation::NavigationEventArgs^ e) override;
+
+        private:
+            MainPage^ rootPage;
+            void ScenarioInit();
+            void ScenarioReset();
+
+            void Failed(Windows::Media::Capture::MediaCapture ^ mediaCapture, Windows::Media::Capture::MediaCaptureFailedEventArgs ^ args);
+
+            void btnStartDevice_Click(Platform::Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+
+            void btnStartPreview_Click(Platform::Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+
+            void lstEnumedDevices_SelectionChanged(Platform::Object^ sender, Windows::UI::Xaml::Controls::SelectionChangedEventArgs^ e);
+            void EnumerateWebcamsAsync();
+
+            void AddEffectToImageStream();
+
+            void ShowStatusMessage(Platform::String^ text);
+            void ShowExceptionMessage(Platform::Exception^ ex);
+
+            void EnableButton(bool enabled, Platform::String ^name);
+
+            task<Windows::Storage::StorageFile^> ReencodePhotoAsync(
+                Windows::Storage::StorageFile ^tempStorageFile,
+                Windows::Storage::FileProperties::PhotoOrientation photoRotation);
+            Windows::Storage::FileProperties::PhotoOrientation GetCurrentPhotoRotation();
+            void PrepareForVideoRecording();
+            void DisplayProperties_OrientationChanged(Platform::Object^ sender);
+            Windows::Storage::FileProperties::PhotoOrientation PhotoRotationLookup(
+                Windows::Graphics::Display::DisplayOrientations displayOrientation, bool counterclockwise);
+            Windows::Media::Capture::VideoRotation VideoRotationLookup(
+                Windows::Graphics::Display::DisplayOrientations displayOrientation, bool counterclockwise);
+
+            Platform::Agile<Windows::Media::Capture::MediaCapture> m_mediaCaptureMgr;
+            Windows::Storage::StorageFile^ m_recordStorageFile;
+            bool m_bRecording;
+            bool m_bEffectAdded;
+            bool m_bEffectAddedToRecord;
+            bool m_bEffectAddedToPhoto;
+            bool m_bSuspended;
+            bool m_bPreviewing;
+            DeviceInformationCollection^ m_devInfoCollection;
+            Windows::Foundation::EventRegistrationToken m_eventRegistrationToken;
+            bool m_bRotateVideoOnOrientationChange;
+            bool m_bReversePreviewRotation;
+            Windows::Foundation::EventRegistrationToken m_orientationChangedEventToken;
+            void Button_Click(Platform::Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+        };
+    }
+}
diff --git a/samples/winrt/ImageManipulations/App.xaml b/samples/winrt/ImageManipulations/App.xaml
new file mode 100644
index 0000000000..2edfd7790e
--- /dev/null
+++ b/samples/winrt/ImageManipulations/App.xaml
@@ -0,0 +1,30 @@
+﻿<!--
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+-->
+
+<Application xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
+    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" 
+    x:Class="SDKSample.App"
+    RequestedTheme="Light">
+    <Application.Resources>
+
+        <ResourceDictionary>
+            <ResourceDictionary.MergedDictionaries>
+                <!-- 
+                    Styles that define common aspects of the platform look and feel
+                    Required by Visual Studio project and item templates
+                 -->
+                <ResourceDictionary Source="Common/StandardStyles.xaml"/>
+                <ResourceDictionary Source="Sample-Utils/SampleTemplateStyles.xaml"/>
+            </ResourceDictionary.MergedDictionaries>
+        </ResourceDictionary>
+    </Application.Resources>
+</Application>
diff --git a/samples/winrt/ImageManipulations/App.xaml.cpp b/samples/winrt/ImageManipulations/App.xaml.cpp
new file mode 100644
index 0000000000..a24a4f9c0a
--- /dev/null
+++ b/samples/winrt/ImageManipulations/App.xaml.cpp
@@ -0,0 +1,116 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// App.xaml.cpp
+// Implementation of the App.xaml class.
+//
+
+#include "pch.h"
+#include "MainPage.xaml.h"
+#include "AdvancedCapture.xaml.h"
+#include "Common\SuspensionManager.h"
+
+using namespace SDKSample;
+using namespace SDKSample::Common;
+using namespace SDKSample::MediaCapture;
+
+using namespace Concurrency;
+using namespace Platform;
+using namespace Windows::ApplicationModel;
+using namespace Windows::ApplicationModel::Activation;
+using namespace Windows::Foundation;
+using namespace Windows::Foundation::Collections;
+using namespace Windows::UI::Core;
+using namespace Windows::UI::Xaml;
+using namespace Windows::UI::Xaml::Controls;
+using namespace Windows::UI::Xaml::Controls::Primitives;
+using namespace Windows::UI::Xaml::Data;
+using namespace Windows::UI::Xaml::Input;
+using namespace Windows::UI::Xaml::Interop;
+using namespace Windows::UI::Xaml::Media;
+using namespace Windows::UI::Xaml::Navigation;
+
+/// <summary>
+/// Initializes the singleton application object.  This is the first line of authored code
+/// executed, and as such is the logical equivalent of main() or WinMain().
+/// </summary>
+App::App()
+{
+    InitializeComponent();
+    this->Suspending += ref new SuspendingEventHandler(this, &SDKSample::App::OnSuspending);
+}
+
+/// <summary>
+/// Invoked when the application is launched normally by the end user.  Other entry points will
+/// be used when the application is launched to open a specific file, to display search results,
+/// and so forth.
+/// </summary>
+/// <param name="pArgs">Details about the launch request and process.</param>
+void App::OnLaunched(LaunchActivatedEventArgs^ pArgs)
+{
+    this->LaunchArgs = pArgs;
+
+    // Do not repeat app initialization when already running, just ensure that
+    // the window is active
+    if (pArgs->PreviousExecutionState == ApplicationExecutionState::Running)
+    {
+        Window::Current->Activate();
+        return;
+    }
+
+    // Create a Frame to act as the navigation context and associate it with
+    // a SuspensionManager key
+    auto rootFrame = ref new Frame();
+    SuspensionManager::RegisterFrame(rootFrame, "AppFrame");
+
+    auto prerequisite = task<void>([](){});
+    if (pArgs->PreviousExecutionState == ApplicationExecutionState::Terminated)
+    {
+        // Restore the saved session state only when appropriate, scheduling the
+        // final launch steps after the restore is complete
+        prerequisite = SuspensionManager::RestoreAsync();
+    }
+    prerequisite.then([=]()
+    {
+        // When the navigation stack isn't restored navigate to the first page,
+        // configuring the new page by passing required information as a navigation
+        // parameter
+        if (rootFrame->Content == nullptr)
+        {
+            if (!rootFrame->Navigate(TypeName(MainPage::typeid)))
+            {
+                throw ref new FailureException("Failed to create initial page");
+            }
+        }
+
+        // Place the frame in the current Window and ensure that it is active
+        Window::Current->Content = rootFrame;
+        Window::Current->Activate();
+    }, task_continuation_context::use_current());
+}
+
+/// <summary>
+/// Invoked when application execution is being suspended.  Application state is saved
+/// without knowing whether the application will be terminated or resumed with the contents
+/// of memory still intact.
+/// </summary>
+/// <param name="sender">The source of the suspend request.</param>
+/// <param name="e">Details about the suspend request.</param>
+void App::OnSuspending(Object^ sender, SuspendingEventArgs^ e)
+{
+    (void) sender;	// Unused parameter
+
+    auto deferral = e->SuspendingOperation->GetDeferral();
+    SuspensionManager::SaveAsync().then([=]()
+    {
+        deferral->Complete();
+    });
+}
diff --git a/samples/winrt/ImageManipulations/App.xaml.h b/samples/winrt/ImageManipulations/App.xaml.h
new file mode 100644
index 0000000000..a8b6064248
--- /dev/null
+++ b/samples/winrt/ImageManipulations/App.xaml.h
@@ -0,0 +1,35 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// App.xaml.h
+// Declaration of the App.xaml class.
+//
+
+#pragma once
+
+#include "pch.h"
+#include "App.g.h"
+#include "MainPage.g.h"
+
+namespace SDKSample
+{
+    ref class App
+    {
+    internal:
+        App();
+        virtual void OnSuspending(Platform::Object^ sender, Windows::ApplicationModel::SuspendingEventArgs^ pArgs);
+        Windows::ApplicationModel::Activation::LaunchActivatedEventArgs^ LaunchArgs;
+    protected:
+        virtual void OnLaunched(Windows::ApplicationModel::Activation::LaunchActivatedEventArgs^ pArgs) override;
+    private:
+        Windows::UI::Xaml::Controls::Frame^ rootFrame;
+    };
+}
diff --git a/samples/winrt/ImageManipulations/Constants.cpp b/samples/winrt/ImageManipulations/Constants.cpp
new file mode 100644
index 0000000000..a26634272b
--- /dev/null
+++ b/samples/winrt/ImageManipulations/Constants.cpp
@@ -0,0 +1,22 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+#include "pch.h"
+#include "MainPage.xaml.h"
+#include "Constants.h"
+
+using namespace SDKSample;
+
+Platform::Array<Scenario>^ MainPage::scenariosInner = ref new Platform::Array<Scenario>
+{
+    // The format here is the following:
+    //     { "Description for the sample", "Fully quaified name for the class that implements the scenario" }
+    { "Enumerate cameras and add a video effect", "SDKSample.MediaCapture.AdvancedCapture" },
+};
diff --git a/samples/winrt/ImageManipulations/Constants.h b/samples/winrt/ImageManipulations/Constants.h
new file mode 100644
index 0000000000..143f06960f
--- /dev/null
+++ b/samples/winrt/ImageManipulations/Constants.h
@@ -0,0 +1,45 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+#pragma once
+
+#include <collection.h>
+namespace SDKSample
+{
+    public value struct Scenario
+    {
+        Platform::String^ Title;
+        Platform::String^ ClassName;
+    };
+
+    partial ref class MainPage
+    {
+    public:
+        static property Platform::String^ FEATURE_NAME
+        {
+            Platform::String^ get()
+            {
+                return ref new Platform::String(L"OpenCV Image Manipulations sample");
+            }
+        }
+
+        static property Platform::Array<Scenario>^ scenarios
+        {
+            Platform::Array<Scenario>^ get()
+            {
+                return scenariosInner;
+            }
+        }
+    private:
+        static Platform::Array<Scenario>^ scenariosInner;
+    };
+
+
+}
diff --git a/samples/winrt/ImageManipulations/MainPage.xaml b/samples/winrt/ImageManipulations/MainPage.xaml
new file mode 100644
index 0000000000..e0ed0d79c1
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MainPage.xaml
@@ -0,0 +1,156 @@
+﻿<!--
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+-->
+
+<common:LayoutAwarePage 
+    x:Class="SDKSample.MainPage"
+    xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
+    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
+    xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
+    xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
+    xmlns:common="using:SDKSample.Common"
+    mc:Ignorable="d"
+    x:Name="RootPage">
+
+    <common:LayoutAwarePage.Resources>
+        <Style x:Key="BaseStatusStyle" TargetType="TextBlock">
+            <Setter Property="FontFamily" Value="Segoe UI Semilight"/>
+            <Setter Property="FontSize" Value="14.667"/>
+            <Setter Property="Margin" Value="0,0,0,5"/>
+        </Style>
+        <Style x:Key="StatusStyle" BasedOn="{StaticResource BaseStatusStyle}" TargetType="TextBlock">
+            <Setter Property="Foreground" Value="Green"/>
+        </Style>
+        <Style x:Key="ErrorStyle" BasedOn="{StaticResource BaseStatusStyle}" TargetType="TextBlock">
+            <Setter Property="Foreground" Value="Blue"/>
+        </Style>
+    </common:LayoutAwarePage.Resources>
+
+
+    <Grid x:Name="LayoutRoot" Background="{StaticResource ApplicationPageBackgroundThemeBrush}">
+
+        <Grid x:Name="ContentRoot" Background="{StaticResource ApplicationPageBackgroundThemeBrush}" Margin="100,20,100,20">
+            <Grid.RowDefinitions>
+                <RowDefinition Height="Auto"/>
+                <RowDefinition Height="*"/>
+                <RowDefinition Height="Auto"/>
+            </Grid.RowDefinitions>
+
+            <!-- Header -->
+            <StackPanel Orientation="Horizontal" Grid.Row="0">
+                <Image x:Name="WindowsLogo" Stretch="None" Source="Assets/windows-sdk.png" AutomationProperties.Name="Windows Logo" HorizontalAlignment="Left" Grid.Column="0"/>
+                <TextBlock VerticalAlignment="Bottom" Style="{StaticResource TitleTextStyle}" TextWrapping="Wrap" Grid.Column="1" Text="OpenCV for Windows RT"/>
+            </StackPanel>
+            <ScrollViewer x:Name="MainScrollViewer" Grid.Row="1" ZoomMode="Disabled" IsTabStop="False" VerticalScrollBarVisibility="Auto" HorizontalScrollBarVisibility="Auto" Padding="0,0,0,20" >
+                <Grid>
+                    <Grid.RowDefinitions>
+                        <RowDefinition Height="Auto"/>
+                        <RowDefinition Height="*"/>
+                    </Grid.RowDefinitions>
+                    <TextBlock x:Name="FeatureName" Grid.Row="0"  Text="Add Sample Title Here" Style="{StaticResource HeaderTextStyle}" TextWrapping="Wrap"/>
+
+                    <!-- Content -->
+                    <Grid Grid.Row="1">
+
+                        <!-- All XAML in this section is purely for design time so you can see sample content in the designer. -->
+                        <!-- This will be repaced at runtime by live content.                                                  -->
+                        <Grid>
+                            <Grid.RowDefinitions>
+                                <RowDefinition Height="Auto"/>
+                                <RowDefinition Height="Auto"/>
+                                <RowDefinition Height="Auto"/>
+                                <RowDefinition Height="Auto"/>
+                                <RowDefinition Height="Auto"/>
+                                <RowDefinition Height="Auto"/>
+                                <RowDefinition Height="Auto"/>
+                                <RowDefinition Height="*"/>
+                            </Grid.RowDefinitions>
+                            <Grid.ColumnDefinitions>
+                                <ColumnDefinition Width="Auto"/>
+                                <ColumnDefinition Width="*"/>
+                            </Grid.ColumnDefinitions>
+                            <TextBlock Grid.Row="0" Text="Input" Style="{StaticResource H2Style}"/>
+
+                            <TextBlock x:Name="ScenarioListLabel" Text="Select Scenario:" Grid.Row="1"  Style="{StaticResource SubheaderTextStyle}" Margin="0,5,0,0" />
+                            <ListBox x:Name="Scenarios" Margin="0,0,20,0" Grid.Row="2" AutomationProperties.Name="Scenarios" HorizontalAlignment="Left" 
+                                         VerticalAlignment="Top" ScrollViewer.HorizontalScrollBarVisibility="Auto"
+                                         AutomationProperties.LabeledBy="{Binding ElementName=ScenarioListLabel}" MaxHeight="125">
+                                <ListBox.ItemTemplate>
+                                    <DataTemplate>
+                                        <TextBlock Text="{Binding Name}"/>
+                                    </DataTemplate>
+                                </ListBox.ItemTemplate>
+                            </ListBox>
+                            <TextBlock x:Name="DescriptionText" Margin="0,5,0,0" Text="Description:" Style="{StaticResource SubheaderTextStyle}" Grid.Row="1" Grid.Column="1"/>
+                            <!-- Input Scenarios -->
+                            <UserControl x:Name="InputSection" Margin="0,5,0,0" IsTabStop="False" Grid.Row="2" Grid.Column="1" HorizontalAlignment="Left" VerticalAlignment="Top"/>
+
+                            <!-- Output section -->
+                            <TextBlock Text="Output" Grid.Row="5"  Margin="0,25,0,20" Style="{StaticResource H2Style}" Grid.ColumnSpan="2"/>
+                            <TextBlock x:Name="StatusBlock" Grid.Row="6" Margin="0,0,0,5" Grid.ColumnSpan="2"/>
+
+                            <!-- Output Scenarios -->
+                            <UserControl x:Name="OutputSection" Grid.Row="7" Grid.ColumnSpan="2" BorderThickness="0"/>
+                        </Grid>
+                    </Grid>
+                </Grid>
+            </ScrollViewer>
+
+            <!-- Footer -->
+            <Grid x:Name="Footer"  Grid.Row="3" Margin="0,10,0,10" VerticalAlignment="Bottom" >
+                <Grid.RowDefinitions>
+                    <RowDefinition Height="Auto"/>
+                    <RowDefinition Height="Auto"/>
+                    <RowDefinition Height="Auto"/>
+                </Grid.RowDefinitions>
+
+                <Grid.ColumnDefinitions>
+                    <ColumnDefinition Width="Auto"/>
+                    <ColumnDefinition Width="*"/>
+                </Grid.ColumnDefinitions>
+                <StackPanel x:Name="FooterPanel" Orientation="Horizontal" Grid.Row="1" Grid.Column="1" HorizontalAlignment="Right"/>
+            </Grid>
+
+
+        </Grid>
+
+        <VisualStateManager.VisualStateGroups>
+            <!-- Visual states reflect the application's view state -->
+            <VisualStateGroup>
+                <VisualState x:Name="FullScreenLandscape">
+                    <Storyboard>
+                    </Storyboard>
+                </VisualState>
+                <VisualState x:Name="Filled">
+                    <Storyboard>
+                    </Storyboard>
+                </VisualState>
+
+                <VisualState x:Name="FullScreenPortrait">
+                    <Storyboard>
+                    </Storyboard>
+                </VisualState>
+
+                <VisualState x:Name="Snapped">
+                    <Storyboard>
+                        <ObjectAnimationUsingKeyFrames Storyboard.TargetProperty="(FrameworkElement.Margin)" Storyboard.TargetName="ContentRoot">
+                            <DiscreteObjectKeyFrame KeyTime="0">
+                                <DiscreteObjectKeyFrame.Value>
+                                    <Thickness>20,20,20,20</Thickness>
+                                </DiscreteObjectKeyFrame.Value>
+                            </DiscreteObjectKeyFrame>
+                        </ObjectAnimationUsingKeyFrames>
+                    </Storyboard>
+                </VisualState>
+            </VisualStateGroup>
+        </VisualStateManager.VisualStateGroups>
+    </Grid>
+</common:LayoutAwarePage>
diff --git a/samples/winrt/ImageManipulations/MainPage.xaml.cpp b/samples/winrt/ImageManipulations/MainPage.xaml.cpp
new file mode 100644
index 0000000000..bd897fcc0b
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MainPage.xaml.cpp
@@ -0,0 +1,315 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// MainPage.xaml.cpp
+// Implementation of the MainPage.xaml class.
+//
+
+#include "pch.h"
+#include "MainPage.xaml.h"
+#include "App.xaml.h"
+
+#include <collection.h>
+
+using namespace Windows::UI::Xaml;
+using namespace Windows::UI::Xaml::Controls;
+using namespace Windows::Foundation;
+using namespace Windows::Foundation::Collections;
+using namespace Platform;
+using namespace SDKSample;
+using namespace Windows::UI::Xaml::Navigation;
+using namespace Windows::UI::Xaml::Interop;
+using namespace Windows::Graphics::Display;
+using namespace Windows::UI::ViewManagement;
+
+MainPage^ MainPage::Current = nullptr;
+
+MainPage::MainPage()
+{
+    InitializeComponent();
+
+    // This frame is hidden, meaning it is never shown.  It is simply used to load
+    // each scenario page and then pluck out the input and output sections and
+    // place them into the UserControls on the main page.
+    HiddenFrame = ref new Windows::UI::Xaml::Controls::Frame();
+    HiddenFrame->Visibility = Windows::UI::Xaml::Visibility::Collapsed;
+    ContentRoot->Children->Append(HiddenFrame);
+
+    FeatureName->Text = FEATURE_NAME;
+
+    this->SizeChanged += ref new SizeChangedEventHandler(this, &MainPage::MainPage_SizeChanged);
+    Scenarios->SelectionChanged += ref new SelectionChangedEventHandler(this, &MainPage::Scenarios_SelectionChanged);
+
+    MainPage::Current = this;
+    autoSizeInputSectionWhenSnapped = true;
+}
+
+/// <summary>
+/// We need to handle SizeChanged so that we can make the sample layout property
+/// in the various layouts.
+/// </summary>
+/// <param name="sender"></param>
+/// <param name="e"></param>
+void MainPage::MainPage_SizeChanged(Object^ sender, SizeChangedEventArgs^ e)
+{
+    InvalidateSize();
+    MainPageSizeChangedEventArgs^ args = ref new MainPageSizeChangedEventArgs();
+    args->ViewState = ApplicationView::Value;
+    MainPageResized(this, args);
+
+}
+
+void MainPage::InvalidateSize()
+{
+    // Get the window width
+    double windowWidth = this->ActualWidth;
+
+    if (windowWidth != 0.0)
+    {
+        // Get the width of the ListBox.
+        double listBoxWidth = Scenarios->ActualWidth;
+
+        // Is the ListBox using any margins that we need to consider?
+        double listBoxMarginLeft = Scenarios->Margin.Left;
+        double listBoxMarginRight = Scenarios->Margin.Right;
+
+        // Figure out how much room is left after considering the list box width
+        double availableWidth = windowWidth - listBoxWidth;
+
+        // Is the top most child using margins?
+        double layoutRootMarginLeft = ContentRoot->Margin.Left;
+        double layoutRootMarginRight = ContentRoot->Margin.Right;
+
+        // We have different widths to use depending on the view state
+        if (ApplicationView::Value != ApplicationViewState::Snapped)
+        {
+            // Make us as big as the the left over space, factoring in the ListBox width, the ListBox margins.
+            // and the LayoutRoot's margins
+            InputSection->Width = ((availableWidth) -
+                (layoutRootMarginLeft + layoutRootMarginRight + listBoxMarginLeft + listBoxMarginRight));
+        }
+        else
+        {
+            // Make us as big as the left over space, factoring in just the LayoutRoot's margins.
+            if (autoSizeInputSectionWhenSnapped)
+            {
+                InputSection->Width = (windowWidth - (layoutRootMarginLeft + layoutRootMarginRight));
+            }
+        }
+    }
+    InvalidateViewState();
+}
+
+void MainPage::InvalidateViewState()
+{
+    // Are we going to snapped mode?
+    if (ApplicationView::Value == ApplicationViewState::Snapped)
+    {
+        Grid::SetRow(DescriptionText, 3);
+        Grid::SetColumn(DescriptionText, 0);
+
+        Grid::SetRow(InputSection, 4);
+        Grid::SetColumn(InputSection, 0);
+
+        Grid::SetRow(FooterPanel, 2);
+        Grid::SetColumn(FooterPanel, 0);
+    }
+    else
+    {
+        Grid::SetRow(DescriptionText, 1);
+        Grid::SetColumn(DescriptionText, 1);
+
+        Grid::SetRow(InputSection, 2);
+        Grid::SetColumn(InputSection, 1);
+
+        Grid::SetRow(FooterPanel, 1);
+        Grid::SetColumn(FooterPanel, 1);
+    }
+
+    //  Since we don't load the scenario page in the traditional manner (we just pluck out the
+    // input and output sections from the page) we need to ensure that any VSM code used
+    // by the scenario's input and output sections is fired.
+    VisualStateManager::GoToState(InputSection, "Input" + LayoutAwarePage::DetermineVisualState(ApplicationView::Value), false);
+    VisualStateManager::GoToState(OutputSection, "Output" + LayoutAwarePage::DetermineVisualState(ApplicationView::Value), false);
+}
+
+void MainPage::PopulateScenarios()
+{
+    ScenarioList = ref new Platform::Collections::Vector<Object^>();
+
+    // Populate the ListBox with the list of scenarios as defined in Constants.cpp.
+    for (unsigned int i = 0; i < scenarios->Length; ++i)
+    {
+        Scenario s = scenarios[i];
+        ListBoxItem^ item = ref new ListBoxItem();
+        item->Name = s.ClassName;
+        item->Content = (i + 1).ToString() + ") " + s.Title;
+        ScenarioList->Append(item);
+    }
+
+    // Bind the ListBox to the scenario list.
+    Scenarios->ItemsSource = ScenarioList;
+    Scenarios->ScrollIntoView(Scenarios->SelectedItem);
+}
+
+/// <summary>
+/// This method is responsible for loading the individual input and output sections for each scenario.  This
+/// is based on navigating a hidden Frame to the ScenarioX.xaml page and then extracting out the input
+/// and output sections into the respective UserControl on the main page.
+/// </summary>
+/// <param name="scenarioName"></param>
+void MainPage::LoadScenario(String^ scenarioName)
+{
+    autoSizeInputSectionWhenSnapped = true;
+
+    // Load the ScenarioX.xaml file into the Frame.
+    TypeName scenarioType = {scenarioName, TypeKind::Custom};
+    HiddenFrame->Navigate(scenarioType, this);
+
+    // Get the top element, the Page, so we can look up the elements
+    // that represent the input and output sections of the ScenarioX file.
+    Page^ hiddenPage = safe_cast<Page^>(HiddenFrame->Content);
+
+    // Get each element.
+    UIElement^ input = safe_cast<UIElement^>(hiddenPage->FindName("Input"));
+    UIElement^ output = safe_cast<UIElement^>(hiddenPage->FindName("Output"));
+
+    if (input == nullptr)
+    {
+        // Malformed input section.
+        NotifyUser("Cannot load scenario input section for " + scenarioName +
+            "  Make sure root of input section markup has x:Name of 'Input'", NotifyType::ErrorMessage);
+        return;
+    }
+
+    if (output == nullptr)
+    {
+        // Malformed output section.
+        NotifyUser("Cannot load scenario output section for " + scenarioName +
+            "  Make sure root of output section markup has x:Name of 'Output'", NotifyType::ErrorMessage);
+        return;
+    }
+
+    // Find the LayoutRoot which parents the input and output sections in the main page.
+    Panel^ panel = safe_cast<Panel^>(hiddenPage->FindName("LayoutRoot"));
+
+    if (panel != nullptr)
+    {
+        unsigned int index = 0;
+        UIElementCollection^ collection = panel->Children;
+
+        // Get rid of the content that is currently in the intput and output sections.
+        collection->IndexOf(input, &index);
+        collection->RemoveAt(index);
+
+        collection->IndexOf(output, &index);
+        collection->RemoveAt(index);
+
+        // Populate the input and output sections with the newly loaded content.
+        InputSection->Content = input;
+        OutputSection->Content = output;
+
+        ScenarioLoaded(this, nullptr);
+    }
+    else
+    {
+        // Malformed Scenario file.
+        NotifyUser("Cannot load scenario: " + scenarioName + ".  Make sure root tag in the '" +
+            scenarioName + "' file has an x:Name of 'LayoutRoot'", NotifyType::ErrorMessage);
+    }
+}
+
+void MainPage::Scenarios_SelectionChanged(Object^ sender, SelectionChangedEventArgs^ e)
+{
+    if (Scenarios->SelectedItem != nullptr)
+    {
+        NotifyUser("", NotifyType::StatusMessage);
+
+        LoadScenario((safe_cast<ListBoxItem^>(Scenarios->SelectedItem))->Name);
+        InvalidateSize();
+    }
+}
+
+void MainPage::NotifyUser(String^ strMessage, NotifyType type)
+{
+    switch (type)
+    {
+    case NotifyType::StatusMessage:
+        // Use the status message style.
+        StatusBlock->Style = safe_cast<Windows::UI::Xaml::Style^>(this->Resources->Lookup("StatusStyle"));
+        break;
+    case NotifyType::ErrorMessage:
+        // Use the error message style.
+        StatusBlock->Style = safe_cast<Windows::UI::Xaml::Style^>(this->Resources->Lookup("ErrorStyle"));
+        break;
+    default:
+        break;
+    }
+    StatusBlock->Text = strMessage;
+
+    // Collapsed the StatusBlock if it has no text to conserve real estate.
+    if (StatusBlock->Text != "")
+    {
+        StatusBlock->Visibility = Windows::UI::Xaml::Visibility::Visible;
+    }
+    else
+    {
+        StatusBlock->Visibility = Windows::UI::Xaml::Visibility::Collapsed;
+    }
+}
+
+void MainPage::Footer_Click(Object^ sender, RoutedEventArgs^ e)
+{
+    auto uri = ref new Uri((String^)((HyperlinkButton^)sender)->Tag);
+    Windows::System::Launcher::LaunchUriAsync(uri);
+}
+
+
+/// <summary>
+/// Populates the page with content passed during navigation.  Any saved state is also
+/// provided when recreating a page from a prior session.
+/// </summary>
+/// <param name="navigationParameter">The parameter value passed to
+/// <see cref="Frame::Navigate(Type, Object)"/> when this page was initially requested.
+/// </param>
+/// <param name="pageState">A map of state preserved by this page during an earlier
+/// session.  This will be null the first time a page is visited.</param>
+void MainPage::LoadState(Object^ navigationParameter, IMap<String^, Object^>^ pageState)
+{
+    (void) navigationParameter;    // Unused parameter
+
+    PopulateScenarios();
+
+    // Starting scenario is the first or based upon a previous state.
+    ListBoxItem^ startingScenario = nullptr;
+    int startingScenarioIndex = -1;
+
+    if (pageState != nullptr && pageState->HasKey("SelectedScenarioIndex"))
+    {
+        startingScenarioIndex = safe_cast<int>(pageState->Lookup("SelectedScenarioIndex"));
+    }
+
+    Scenarios->SelectedIndex = startingScenarioIndex != -1 ? startingScenarioIndex : 0;
+
+    InvalidateViewState();
+}
+
+/// <summary>
+/// Preserves state associated with this page in case the application is suspended or the
+/// page is discarded from the navigation cache.  Values must conform to the serialization
+/// requirements of <see cref="SuspensionManager::SessionState"/>.
+/// </summary>
+/// <param name="pageState">An empty map to be populated with serializable state.</param>
+void MainPage::SaveState(IMap<String^, Object^>^ pageState)
+{
+    int selectedListBoxItemIndex = Scenarios->SelectedIndex;
+    pageState->Insert("SelectedScenarioIndex", selectedListBoxItemIndex);
+}
diff --git a/samples/winrt/ImageManipulations/MainPage.xaml.h b/samples/winrt/ImageManipulations/MainPage.xaml.h
new file mode 100644
index 0000000000..36fb7796a3
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MainPage.xaml.h
@@ -0,0 +1,105 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// MainPage.xaml.h
+// Declaration of the MainPage.xaml class.
+//
+
+#pragma once
+
+#include "pch.h"
+#include "MainPage.g.h"
+#include "Common\LayoutAwarePage.h" // Required by generated header
+#include "Constants.h"
+
+namespace SDKSample
+{
+    public enum class NotifyType
+    {
+        StatusMessage,
+        ErrorMessage
+    };
+
+    public ref class MainPageSizeChangedEventArgs sealed
+    {
+    public:
+        property Windows::UI::ViewManagement::ApplicationViewState ViewState
+        {
+            Windows::UI::ViewManagement::ApplicationViewState get()
+            {
+                return viewState;
+            }
+
+            void set(Windows::UI::ViewManagement::ApplicationViewState value)
+            {
+                viewState = value;
+            }
+        }
+
+    private:
+        Windows::UI::ViewManagement::ApplicationViewState viewState;
+    };
+
+    public ref class MainPage sealed
+    {
+    public:
+        MainPage();
+
+    protected:
+        virtual void LoadState(Platform::Object^ navigationParameter,
+            Windows::Foundation::Collections::IMap<Platform::String^, Platform::Object^>^ pageState) override;
+        virtual void SaveState(Windows::Foundation::Collections::IMap<Platform::String^, Platform::Object^>^ pageState) override;
+
+    internal:
+        property bool AutoSizeInputSectionWhenSnapped
+        {
+            bool get()
+            {
+                return autoSizeInputSectionWhenSnapped;
+            }
+
+            void set(bool value)
+            {
+                autoSizeInputSectionWhenSnapped = value;
+            }
+        }
+
+        property Windows::ApplicationModel::Activation::LaunchActivatedEventArgs^ LaunchArgs
+       {
+            Windows::ApplicationModel::Activation::LaunchActivatedEventArgs^ get()
+            {
+                return safe_cast<App^>(App::Current)->LaunchArgs;
+            }
+        }
+
+        void NotifyUser(Platform::String^ strMessage, NotifyType type);
+        void LoadScenario(Platform::String^ scenarioName);
+        event Windows::Foundation::EventHandler<Platform::Object^>^ ScenarioLoaded;
+        event Windows::Foundation::EventHandler<MainPageSizeChangedEventArgs^>^ MainPageResized;
+
+    private:
+        void PopulateScenarios();
+        void InvalidateSize();
+        void InvalidateViewState();
+
+        Platform::Collections::Vector<Object^>^ ScenarioList;
+        Windows::UI::Xaml::Controls::Frame^ HiddenFrame;
+        void Footer_Click(Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+        bool autoSizeInputSectionWhenSnapped;
+
+        void MainPage_SizeChanged(Object^ sender, Windows::UI::Xaml::SizeChangedEventArgs^ e);
+        void Scenarios_SelectionChanged(Object^ sender, Windows::UI::Xaml::Controls::SelectionChangedEventArgs^ e);
+
+    internal:
+        static MainPage^ Current;
+
+    };
+}
diff --git a/samples/winrt/ImageManipulations/MediaCapture.sln b/samples/winrt/ImageManipulations/MediaCapture.sln
new file mode 100644
index 0000000000..e1f9aa3859
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaCapture.sln
@@ -0,0 +1,52 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 11 Express for Windows 8
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MediaCapture", "MediaCapture.vcxproj", "{C5B886A7-8300-46FF-B533-9613DE2AF637}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GrayscaleTransform", "MediaExtensions\OcvTransform\OcvTransform.vcxproj", "{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|ARM = Debug|ARM
+		Debug|Win32 = Debug|Win32
+		Debug|x64 = Debug|x64
+		Release|ARM = Release|ARM
+		Release|Win32 = Release|Win32
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Debug|ARM.ActiveCfg = Debug|ARM
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Debug|ARM.Build.0 = Debug|ARM
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Debug|Win32.ActiveCfg = Debug|Win32
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Debug|Win32.Build.0 = Debug|Win32
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Debug|x64.ActiveCfg = Debug|x64
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Debug|x64.Build.0 = Debug|x64
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Release|ARM.ActiveCfg = Release|ARM
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Release|ARM.Build.0 = Release|ARM
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Release|Win32.ActiveCfg = Release|Win32
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Release|Win32.Build.0 = Release|Win32
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Release|x64.ActiveCfg = Release|x64
+		{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}.Release|x64.Build.0 = Release|x64
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Debug|ARM.ActiveCfg = Debug|ARM
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Debug|ARM.Build.0 = Debug|ARM
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Debug|ARM.Deploy.0 = Debug|ARM
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Debug|Win32.ActiveCfg = Debug|Win32
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Debug|Win32.Build.0 = Debug|Win32
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Debug|Win32.Deploy.0 = Debug|Win32
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Debug|x64.ActiveCfg = Debug|x64
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Debug|x64.Build.0 = Debug|x64
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Debug|x64.Deploy.0 = Debug|x64
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Release|ARM.ActiveCfg = Release|ARM
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Release|ARM.Build.0 = Release|ARM
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Release|ARM.Deploy.0 = Release|ARM
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Release|Win32.ActiveCfg = Release|Win32
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Release|Win32.Build.0 = Release|Win32
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Release|Win32.Deploy.0 = Release|Win32
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Release|x64.ActiveCfg = Release|x64
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Release|x64.Build.0 = Release|x64
+		{C5B886A7-8300-46FF-B533-9613DE2AF637}.Release|x64.Deploy.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/samples/winrt/ImageManipulations/MediaCapture.vcxproj b/samples/winrt/ImageManipulations/MediaCapture.vcxproj
new file mode 100644
index 0000000000..f4eceef61c
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaCapture.vcxproj
@@ -0,0 +1,310 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|ARM">
+      <Configuration>Debug</Configuration>
+      <Platform>ARM</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|ARM">
+      <Configuration>Release</Configuration>
+      <Platform>ARM</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{C5B886A7-8300-46FF-B533-9613DE2AF637}</ProjectGuid>
+    <RootNamespace>SDKSample</RootNamespace>
+    <DefaultLanguage>en-US</DefaultLanguage>
+    <VCTargetsPath Condition="'$(VCTargetsPath11)' != '' and '$(VSVersion)' == '' and '$(VisualStudioVersion)' == ''">$(VCTargetsPath11)</VCTargetsPath>
+    <MinimumVisualStudioVersion>11.0</MinimumVisualStudioVersion>
+    <AppContainerApplication>true</AppContainerApplication>
+    <ProjectName>MediaCapture</ProjectName>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClInclude Include="AdvancedCapture.xaml.h">
+      <DependentUpon>AdvancedCapture.xaml</DependentUpon>
+      <SubType>Code</SubType>
+    </ClInclude>
+    <ClInclude Include="Constants.h" />
+    <ClInclude Include="MainPage.xaml.h">
+      <DependentUpon>MainPage.xaml</DependentUpon>
+    </ClInclude>
+    <ClInclude Include="pch.h" />
+    <ClInclude Include="Common\LayoutAwarePage.h" />
+    <ClInclude Include="Common\SuspensionManager.h" />
+    <ClInclude Include="App.xaml.h">
+      <DependentUpon>App.xaml</DependentUpon>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <ApplicationDefinition Include="App.xaml">
+      <SubType>Designer</SubType>
+    </ApplicationDefinition>
+    <Page Include="AdvancedCapture.xaml">
+      <SubType>Designer</SubType>
+    </Page>
+    <Page Include="Common\StandardStyles.xaml">
+      <SubType>Designer</SubType>
+    </Page>
+    <Page Include="MainPage.xaml" />
+    <Page Include="Sample-Utils\SampleTemplateStyles.xaml">
+      <SubType>Designer</SubType>
+    </Page>
+  </ItemGroup>
+  <ItemGroup>
+    <AppxManifest Include="Package.appxmanifest">
+      <SubType>Designer</SubType>
+    </AppxManifest>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="AdvancedCapture.xaml.cpp">
+      <DependentUpon>AdvancedCapture.xaml</DependentUpon>
+      <SubType>Code</SubType>
+    </ClCompile>
+    <ClCompile Include="App.xaml.cpp">
+      <DependentUpon>App.xaml</DependentUpon>
+    </ClCompile>
+    <ClCompile Include="Common\LayoutAwarePage.cpp" />
+    <ClCompile Include="Constants.cpp" />
+    <ClCompile Include="Common\SuspensionManager.cpp" />
+    <ClCompile Include="MainPage.xaml.cpp">
+      <DependentUpon>MainPage.xaml</DependentUpon>
+    </ClCompile>
+    <ClCompile Include="pch.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <Image Include="assets\opencv-logo-150.png" />
+    <Image Include="assets\opencv-logo-30.png" />
+    <Image Include="Assets\splash-sdk.png" />
+    <Image Include="Assets\windows-sdk.png" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="MediaExtensions\OcvTransform\OcvTransform.vcxproj">
+      <Project>{ba69218f-da5c-4d14-a78d-21a9e4dec669}</Project>
+    </ProjectReference>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\..\..\build\install\bin\opencv_calib3d245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_contrib245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_core245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_features2d245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_flann245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_highgui245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_imgproc245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_legacy245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_ml245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_nonfree245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_objdetect245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_photo245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_stitching245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_superres245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_ts245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_video245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+    <None Include="..\..\..\..\build\install\bin\opencv_videostab245.dll">
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</DeploymentContent>
+      <DeploymentContent Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</DeploymentContent>
+    </None>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/Common/AsyncCB.h b/samples/winrt/ImageManipulations/MediaExtensions/Common/AsyncCB.h
new file mode 100644
index 0000000000..04ff69ed8a
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/Common/AsyncCB.h
@@ -0,0 +1,81 @@
+#pragma once
+
+//////////////////////////////////////////////////////////////////////////
+//  AsyncCallback [template]
+//
+//  Description: 
+//  Helper class that routes IMFAsyncCallback::Invoke calls to a class
+//  method on the parent class.
+//
+//  Usage:
+//  Add this class as a member variable. In the parent class constructor,
+//  initialize the AsyncCallback class like this:
+//  	m_cb(this, &CYourClass::OnInvoke)
+//  where
+//      m_cb       = AsyncCallback object
+//      CYourClass = parent class
+//      OnInvoke   = Method in the parent class to receive Invoke calls.
+//
+//  The parent's OnInvoke method (you can name it anything you like) must
+//  have a signature that matches the InvokeFn typedef below.
+//////////////////////////////////////////////////////////////////////////
+
+// T: Type of the parent object
+template<class T>
+class AsyncCallback : public IMFAsyncCallback
+{
+public: 
+    typedef HRESULT (T::*InvokeFn)(IMFAsyncResult *pAsyncResult);
+
+    AsyncCallback(T *pParent, InvokeFn fn) : m_pParent(pParent), m_pInvokeFn(fn)
+    {
+    }
+
+    // IUnknown
+    STDMETHODIMP_(ULONG) AddRef() { 
+        // Delegate to parent class.
+        return m_pParent->AddRef(); 
+    }
+    STDMETHODIMP_(ULONG) Release() { 
+        // Delegate to parent class.
+        return m_pParent->Release(); 
+    }
+    STDMETHODIMP QueryInterface(REFIID iid, void** ppv)
+    {
+        if (!ppv)
+        {
+            return E_POINTER;
+        }
+        if (iid == __uuidof(IUnknown))
+        {
+            *ppv = static_cast<IUnknown*>(static_cast<IMFAsyncCallback*>(this));
+        }
+        else if (iid == __uuidof(IMFAsyncCallback))
+        {
+            *ppv = static_cast<IMFAsyncCallback*>(this);
+        }
+        else
+        {
+            *ppv = NULL;
+            return E_NOINTERFACE;
+        }
+        AddRef();
+        return S_OK;
+    }
+
+
+    // IMFAsyncCallback methods
+    STDMETHODIMP GetParameters(DWORD*, DWORD*)
+    {
+        // Implementation of this method is optional.
+        return E_NOTIMPL;
+    }
+
+    STDMETHODIMP Invoke(IMFAsyncResult* pAsyncResult)
+    {
+        return (m_pParent->*m_pInvokeFn)(pAsyncResult);
+    }
+
+    T *m_pParent;
+    InvokeFn m_pInvokeFn;
+};
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/Common/BufferLock.h b/samples/winrt/ImageManipulations/MediaExtensions/Common/BufferLock.h
new file mode 100644
index 0000000000..92de15eacc
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/Common/BufferLock.h
@@ -0,0 +1,102 @@
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved
+
+
+#pragma once
+
+
+//////////////////////////////////////////////////////////////////////////
+//  VideoBufferLock
+//
+//  Description:
+//  Locks a video buffer that might or might not support IMF2DBuffer.
+//
+//////////////////////////////////////////////////////////////////////////
+
+class VideoBufferLock
+{
+public:
+    VideoBufferLock(IMFMediaBuffer *pBuffer) : m_p2DBuffer(NULL)
+    {
+        m_pBuffer = pBuffer;
+        m_pBuffer->AddRef();
+
+        // Query for the 2-D buffer interface. OK if this fails.
+        m_pBuffer->QueryInterface(IID_PPV_ARGS(&m_p2DBuffer));
+    }
+
+    ~VideoBufferLock()
+    {
+        UnlockBuffer();
+        SafeRelease(&m_pBuffer);
+        SafeRelease(&m_p2DBuffer);
+    }
+
+    // LockBuffer:
+    // Locks the buffer. Returns a pointer to scan line 0 and returns the stride.
+
+    // The caller must provide the default stride as an input parameter, in case
+    // the buffer does not expose IMF2DBuffer. You can calculate the default stride
+    // from the media type.
+
+    HRESULT LockBuffer(
+        LONG  lDefaultStride,    // Minimum stride (with no padding).
+        DWORD dwHeightInPixels,  // Height of the image, in pixels.
+        BYTE  **ppbScanLine0,    // Receives a pointer to the start of scan line 0.
+        LONG  *plStride          // Receives the actual stride.
+        )
+    {
+        HRESULT hr = S_OK;
+
+        // Use the 2-D version if available.
+        if (m_p2DBuffer)
+        {
+            hr = m_p2DBuffer->Lock2D(ppbScanLine0, plStride);
+        }
+        else
+        {
+            // Use non-2D version.
+            BYTE *pData = NULL;
+
+            hr = m_pBuffer->Lock(&pData, NULL, NULL);
+            if (SUCCEEDED(hr))
+            {
+                *plStride = lDefaultStride;
+                if (lDefaultStride < 0)
+                {
+                    // Bottom-up orientation. Return a pointer to the start of the
+                    // last row *in memory* which is the top row of the image.
+                    *ppbScanLine0 = pData + abs(lDefaultStride) * (dwHeightInPixels - 1);
+                }
+                else
+                {
+                    // Top-down orientation. Return a pointer to the start of the
+                    // buffer.
+                    *ppbScanLine0 = pData;
+                }
+            }
+        }
+        return hr;
+    }
+
+    HRESULT UnlockBuffer()
+    {
+        if (m_p2DBuffer)
+        {
+            return m_p2DBuffer->Unlock2D();
+        }
+        else
+        {
+            return m_pBuffer->Unlock();
+        }
+    }
+
+private:
+    IMFMediaBuffer  *m_pBuffer;
+    IMF2DBuffer     *m_p2DBuffer;
+};
+
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/Common/CritSec.h b/samples/winrt/ImageManipulations/MediaExtensions/Common/CritSec.h
new file mode 100644
index 0000000000..d5ea05bfd9
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/Common/CritSec.h
@@ -0,0 +1,62 @@
+#pragma once
+
+//////////////////////////////////////////////////////////////////////////
+//  CritSec
+//  Description: Wraps a critical section.
+//////////////////////////////////////////////////////////////////////////
+
+class CritSec
+{
+public:
+    CRITICAL_SECTION m_criticalSection;
+public:
+    CritSec()
+    {
+        InitializeCriticalSectionEx(&m_criticalSection, 100, 0);
+    }
+
+    ~CritSec()
+    {
+        DeleteCriticalSection(&m_criticalSection);
+    }
+
+	_Acquires_lock_(m_criticalSection)
+    void Lock()
+    {
+        EnterCriticalSection(&m_criticalSection);
+    }
+
+	_Releases_lock_(m_criticalSection)
+    void Unlock()
+    {
+        LeaveCriticalSection(&m_criticalSection);
+    }
+};
+
+
+//////////////////////////////////////////////////////////////////////////
+//  AutoLock
+//  Description: Provides automatic locking and unlocking of a 
+//               of a critical section.
+//
+//  Note: The AutoLock object must go out of scope before the CritSec.
+//////////////////////////////////////////////////////////////////////////
+
+class AutoLock
+{
+private:
+    CritSec *m_pCriticalSection;
+public:
+	_Acquires_lock_(m_pCriticalSection)
+    AutoLock(CritSec& crit)
+    {
+        m_pCriticalSection = &crit;
+        m_pCriticalSection->Lock();
+    }
+
+	_Releases_lock_(m_pCriticalSection)
+    ~AutoLock()
+    {
+	    m_pCriticalSection->Unlock();
+    }
+};
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/Common/LinkList.h b/samples/winrt/ImageManipulations/MediaExtensions/Common/LinkList.h
new file mode 100644
index 0000000000..c67c0f2ca9
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/Common/LinkList.h
@@ -0,0 +1,516 @@
+//-----------------------------------------------------------------------------
+// File: Linklist.h
+// Desc: Linked list class.
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+//  Copyright (C) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+
+#pragma once
+
+// Notes:
+// 
+// The List class template implements a simple double-linked list. 
+// It uses STL's copy semantics. 
+
+// There are two versions of the Clear() method:
+//  Clear(void) clears the list w/out cleaning up the object.
+//  Clear(FN fn) takes a functor object that releases the objects, if they need cleanup.
+
+// The List class supports enumeration. Example of usage:
+//
+// List<T>::POSIITON pos = list.GetFrontPosition();
+// while (pos != list.GetEndPosition())
+// {
+//     T item;
+//     hr = list.GetItemPos(&item);
+//     pos = list.Next(pos);
+// }
+
+// The ComPtrList class template derives from List<> and implements a list of COM pointers.
+
+template <class T>
+struct NoOp
+{
+    void operator()(T& t)
+    {
+    }
+};
+
+template <class T>
+class List
+{
+protected:
+
+    // Nodes in the linked list
+    struct Node
+    {
+        Node *prev;
+        Node *next;
+        T    item;
+
+        Node() : prev(nullptr), next(nullptr)
+        {
+        }
+
+        Node(T item) : prev(nullptr), next(nullptr)
+        {
+            this->item = item;
+        }
+
+        T Item() const { return item; }
+    };
+
+public:
+
+    // Object for enumerating the list.
+    class POSITION
+    {
+        friend class List<T>;
+
+    public:
+        POSITION() : pNode(nullptr)
+        {
+        }
+
+        bool operator==(const POSITION &p) const
+        {
+            return pNode == p.pNode;
+        }
+
+        bool operator!=(const POSITION &p) const
+        {
+            return pNode != p.pNode;
+        }
+
+    private:
+        const Node *pNode;
+
+        POSITION(Node *p) : pNode(p) 
+        {
+        }
+    };
+
+protected:
+    Node    m_anchor;  // Anchor node for the linked list.
+    DWORD   m_count;   // Number of items in the list.
+
+    Node* Front() const
+    {
+        return m_anchor.next;
+    }
+
+    Node* Back() const
+    {
+        return m_anchor.prev;
+    }
+
+    virtual HRESULT InsertAfter(T item, Node *pBefore)
+    {
+        if (pBefore == nullptr)
+        {
+            return E_POINTER;
+        }
+
+        Node *pNode = new Node(item);
+        if (pNode == nullptr)
+        {
+            return E_OUTOFMEMORY;
+        }
+
+        Node *pAfter = pBefore->next;
+            
+        pBefore->next = pNode;
+        pAfter->prev = pNode;
+
+        pNode->prev = pBefore;
+        pNode->next = pAfter;
+
+        m_count++;
+
+        return S_OK;
+    }
+
+    virtual HRESULT GetItem(const Node *pNode, T* ppItem)
+    {
+        if (pNode == nullptr || ppItem == nullptr)
+        {
+            return E_POINTER;
+        }
+
+        *ppItem = pNode->item;
+        return S_OK;
+    }
+
+    // RemoveItem:
+    // Removes a node and optionally returns the item.
+    // ppItem can be nullptr.
+    virtual HRESULT RemoveItem(Node *pNode, T *ppItem)
+    {
+        if (pNode == nullptr)
+        {
+            return E_POINTER;
+        }
+
+        assert(pNode != &m_anchor); // We should never try to remove the anchor node.
+        if (pNode == &m_anchor)
+        {
+            return E_INVALIDARG;
+        }
+
+
+        T item;
+
+        // The next node's previous is this node's previous.
+        pNode->next->prev = pNode->prev;
+
+        // The previous node's next is this node's next.
+        pNode->prev->next = pNode->next;
+
+        item = pNode->item;
+        delete pNode;
+
+        m_count--;
+
+        if (ppItem)
+        {
+            *ppItem = item;
+        }
+
+        return S_OK;
+    }
+
+public:
+
+    List()
+    {
+        m_anchor.next = &m_anchor;
+        m_anchor.prev = &m_anchor;
+
+        m_count = 0;
+    }
+
+    virtual ~List()
+    {
+        Clear();
+    }
+
+    // Insertion functions
+    HRESULT InsertBack(T item)
+    {
+        return InsertAfter(item, m_anchor.prev);
+    }
+
+
+    HRESULT InsertFront(T item)
+    {
+        return InsertAfter(item, &m_anchor);
+    }
+
+    HRESULT InsertPos(POSITION pos, T item)
+    {
+        if (pos.pNode == nullptr)
+        {
+            return InsertBack(item);
+        }
+
+        return InsertAfter(item, pos.pNode->prev);
+    }
+
+    // RemoveBack: Removes the tail of the list and returns the value.
+    // ppItem can be nullptr if you don't want the item back. (But the method does not release the item.)
+    HRESULT RemoveBack(T *ppItem)
+    {
+        if (IsEmpty())
+        {
+            return E_FAIL;
+        }
+        else
+        {
+            return RemoveItem(Back(), ppItem);
+        }
+    }
+
+    // RemoveFront: Removes the head of the list and returns the value.
+    // ppItem can be nullptr if you don't want the item back. (But the method does not release the item.)
+    HRESULT RemoveFront(T *ppItem)
+    {
+        if (IsEmpty())
+        {
+            return E_FAIL;
+        }
+        else
+        {
+            return RemoveItem(Front(), ppItem);
+        }
+    }
+
+    // GetBack: Gets the tail item.
+    HRESULT GetBack(T *ppItem)
+    {
+        if (IsEmpty())
+        {
+            return E_FAIL;
+        }
+        else
+        {
+            return GetItem(Back(), ppItem);
+        }
+    }
+
+    // GetFront: Gets the front item.
+    HRESULT GetFront(T *ppItem)
+    {
+        if (IsEmpty())
+        {
+            return E_FAIL;
+        }
+        else
+        {
+            return GetItem(Front(), ppItem);
+        }
+    }
+
+
+    // GetCount: Returns the number of items in the list.
+    DWORD GetCount() const { return m_count; }
+
+    bool IsEmpty() const
+    {
+        return (GetCount() == 0);
+    }
+
+    // Clear: Takes a functor object whose operator()
+    // frees the object on the list.
+    template <class FN>
+    void Clear(FN& clear_fn)
+    {
+        Node *n = m_anchor.next;
+
+        // Delete the nodes
+        while (n != &m_anchor)
+        {
+            clear_fn(n->item);
+
+            Node *tmp = n->next;
+            delete n;
+            n = tmp;
+        }
+
+        // Reset the anchor to point at itself
+        m_anchor.next = &m_anchor;
+        m_anchor.prev = &m_anchor;
+
+        m_count = 0;
+    }
+
+    // Clear: Clears the list. (Does not delete or release the list items.)
+    virtual void Clear()
+    {
+        NoOp<T> clearOp;
+        Clear<>(clearOp);
+    }
+
+
+    // Enumerator functions
+
+    POSITION FrontPosition()
+    {
+        if (IsEmpty())
+        {
+            return POSITION(nullptr);
+        }
+        else
+        {
+            return POSITION(Front());
+        }
+    }
+
+    POSITION EndPosition() const
+    {
+        return POSITION();
+    }
+
+    HRESULT GetItemPos(POSITION pos, T *ppItem)
+    {   
+        if (pos.pNode)
+        {
+            return GetItem(pos.pNode, ppItem);
+        }
+        else 
+        {
+            return E_FAIL;
+        }
+    }
+
+    POSITION Next(const POSITION pos)
+    {
+        if (pos.pNode && (pos.pNode->next != &m_anchor))
+        {
+            return POSITION(pos.pNode->next);
+        }
+        else
+        {
+            return POSITION(nullptr);
+        }
+    }
+
+    // Remove an item at a position. 
+    // The item is returns in ppItem, unless ppItem is nullptr.
+    // NOTE: This method invalidates the POSITION object.
+    HRESULT Remove(POSITION& pos, T *ppItem)
+    {
+        if (pos.pNode)
+        {
+            // Remove const-ness temporarily...
+            Node *pNode = const_cast<Node*>(pos.pNode);
+
+            pos = POSITION();
+
+            return RemoveItem(pNode, ppItem);
+        }
+        else
+        {
+            return E_INVALIDARG;
+        }
+    }
+
+};
+
+
+
+// Typical functors for Clear method.
+
+// ComAutoRelease: Releases COM pointers.
+// MemDelete: Deletes pointers to new'd memory.
+
+class ComAutoRelease
+{
+public: 
+    void operator()(IUnknown *p)
+    {
+        if (p)
+        {
+            p->Release();
+        }
+    }
+};
+        
+class MemDelete
+{
+public: 
+    void operator()(void *p)
+    {
+        if (p)
+        {
+            delete p;
+        }
+    }
+};
+
+
+// ComPtrList class
+// Derived class that makes it safer to store COM pointers in the List<> class.
+// It automatically AddRef's the pointers that are inserted onto the list
+// (unless the insertion method fails). 
+//
+// T must be a COM interface type. 
+// example: ComPtrList<IUnknown>
+//
+// NULLABLE: If true, client can insert nullptr pointers. This means GetItem can
+// succeed but return a nullptr pointer. By default, the list does not allow nullptr
+// pointers.
+
+template <class T, bool NULLABLE = FALSE>
+class ComPtrList : public List<T*>
+{
+public:
+
+    typedef T* Ptr;
+
+    void Clear()
+    {
+        ComAutoRelease car;
+        List<Ptr>::Clear(car);
+    }
+
+    ~ComPtrList()
+    {
+        Clear();
+    }
+
+protected:
+    HRESULT InsertAfter(Ptr item, Node *pBefore)
+    {
+        // Do not allow nullptr item pointers unless NULLABLE is true.
+        if (item == nullptr && !NULLABLE)
+        {
+            return E_POINTER;
+        }
+
+        if (item)
+        {
+            item->AddRef();
+        }
+
+        HRESULT hr = List<Ptr>::InsertAfter(item, pBefore);
+        if (FAILED(hr) && item != nullptr)
+        {
+            item->Release();
+        }
+        return hr;
+    }
+
+    HRESULT GetItem(const Node *pNode, Ptr* ppItem)
+    {
+        Ptr pItem = nullptr;
+
+        // The base class gives us the pointer without AddRef'ing it.
+        // If we return the pointer to the caller, we must AddRef().
+        HRESULT hr = List<Ptr>::GetItem(pNode, &pItem);
+        if (SUCCEEDED(hr))
+        {
+            assert(pItem || NULLABLE);
+            if (pItem)
+            {
+                *ppItem = pItem;
+                (*ppItem)->AddRef();
+            }
+        }
+        return hr;
+    }
+
+    HRESULT RemoveItem(Node *pNode, Ptr *ppItem)
+    {
+        // ppItem can be nullptr, but we need to get the
+        // item so that we can release it. 
+
+        // If ppItem is not nullptr, we will AddRef it on the way out.
+
+        Ptr pItem = nullptr;
+
+        HRESULT hr = List<Ptr>::RemoveItem(pNode, &pItem);
+
+        if (SUCCEEDED(hr))
+        {
+            assert(pItem || NULLABLE);
+            if (ppItem && pItem)
+            {
+                *ppItem = pItem;
+                (*ppItem)->AddRef();
+            }
+
+            if (pItem)
+            {
+                pItem->Release();
+                pItem = nullptr;
+            }
+        }
+
+        return hr;
+    }
+};
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/Common/OpQueue.h b/samples/winrt/ImageManipulations/MediaExtensions/Common/OpQueue.h
new file mode 100644
index 0000000000..dd0813be30
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/Common/OpQueue.h
@@ -0,0 +1,222 @@
+//////////////////////////////////////////////////////////////////////////
+//
+// OpQueue.h
+// Async operation queue.
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#pragma warning( push )
+#pragma warning( disable : 4355 )  // 'this' used in base member initializer list
+
+/*
+    This header file defines an object to help queue and serialize
+    asynchronous operations.
+
+    Background:
+
+    To perform an operation asynchronously in Media Foundation, an object
+    does one of the following:
+
+        1. Calls MFPutWorkItem(Ex), using either a standard work queue
+           identifier or a caller-allocated work queue. The work-queue
+           thread invokes the object's callback.
+
+        2. Creates an async result object (IMFAsyncResult) and calls
+           MFInvokeCallback to invoke the object's callback.
+
+    Ultimately, either of these cause the object's callback to be invoked
+    from a work-queue thread. The object can then complete the operation
+    inside the callback.
+
+    However, the Media Foundation platform may dispatch async callbacks in
+    parallel on several threads. Putting an item on a work queue does NOT
+    guarantee that one operation will complete before the next one starts,
+    or even that work items will be dispatched in the same order they were
+    called.
+
+    To serialize async operations that should not overlap, an object should
+    use a queue. While one operation is pending, subsequent operations are
+    put on the queue, and only dispatched after the previous operation is
+    complete.
+
+    The granularity of a single "operation" depends on the requirements of
+    that particular object. A single operation might involve several
+    asynchronous calls before the object dispatches the next operation on
+    the queue.
+
+
+*/
+
+
+
+//-------------------------------------------------------------------
+// OpQueue class template
+//
+// Base class for an async operation queue.
+//
+// TOperation: The class used to describe operations. This class must
+//          implement IUnknown.
+//
+// The OpQueue class is an abstract class. The derived class must
+// implement the following pure-virtual methods:
+//
+// - IUnknown methods (AddRef, Release, QI)
+//
+// - DispatchOperation:
+//
+//      Performs the asynchronous operation specified by pOp.
+//
+//      At the end of each operation, the derived class must call
+//      ProcessQueue to process the next operation in the queue.
+//
+//      NOTE: An operation is not required to complete inside the
+//      DispatchOperation method. A single operation might consist
+//      of several asynchronous method calls.
+//
+// - ValidateOperation:
+//
+//      Checks whether the object can perform the operation specified
+//      by pOp at this time.
+//
+//      If the object cannot perform the operation now (e.g., because
+//      another operation is still in progress) the method should
+//      return MF_E_NOTACCEPTING.
+//
+//-------------------------------------------------------------------
+#include "linklist.h"
+#include "AsyncCB.h"
+
+template <class T, class TOperation>
+class OpQueue //: public IUnknown
+{
+public:
+
+    typedef ComPtrList<TOperation>   OpList;
+
+    HRESULT QueueOperation(TOperation *pOp);
+
+protected:
+
+    HRESULT ProcessQueue();
+    HRESULT ProcessQueueAsync(IMFAsyncResult *pResult);
+
+    virtual HRESULT DispatchOperation(TOperation *pOp) = 0;
+    virtual HRESULT ValidateOperation(TOperation *pOp) = 0;
+
+    OpQueue(CRITICAL_SECTION& critsec)
+        : m_OnProcessQueue(static_cast<T *>(this), &OpQueue::ProcessQueueAsync),
+          m_critsec(critsec)
+    {
+    }
+
+    virtual ~OpQueue()
+    {
+    }
+
+protected:
+    OpList                  m_OpQueue;         // Queue of operations.
+    CRITICAL_SECTION&       m_critsec;         // Protects the queue state.
+    AsyncCallback<T>  m_OnProcessQueue;  // ProcessQueueAsync callback.
+};
+
+
+
+//-------------------------------------------------------------------
+// Place an operation on the queue.
+// Public method.
+//-------------------------------------------------------------------
+
+template <class T, class TOperation>
+HRESULT OpQueue<T, TOperation>::QueueOperation(TOperation *pOp)
+{
+    HRESULT hr = S_OK;
+
+    EnterCriticalSection(&m_critsec);
+
+    hr = m_OpQueue.InsertBack(pOp);
+    if (SUCCEEDED(hr))
+    {
+        hr = ProcessQueue();
+    }
+
+    LeaveCriticalSection(&m_critsec);
+    return hr;
+}
+
+
+//-------------------------------------------------------------------
+// Process the next operation on the queue.
+// Protected method.
+//
+// Note: This method dispatches the operation to a work queue.
+//-------------------------------------------------------------------
+
+template <class T, class TOperation>
+HRESULT OpQueue<T, TOperation>::ProcessQueue()
+{
+    HRESULT hr = S_OK;
+    if (m_OpQueue.GetCount() > 0)
+    {
+        hr = MFPutWorkItem2(
+            MFASYNC_CALLBACK_QUEUE_STANDARD,    // Use the standard work queue.
+            0,                                  // Default priority
+            &m_OnProcessQueue,                  // Callback method.
+            nullptr                             // State object.
+            );
+    }
+    return hr;
+}
+
+
+//-------------------------------------------------------------------
+// Process the next operation on the queue.
+// Protected method.
+//
+// Note: This method is called from a work-queue thread.
+//-------------------------------------------------------------------
+
+template <class T, class TOperation>
+HRESULT OpQueue<T, TOperation>::ProcessQueueAsync(IMFAsyncResult *pResult)
+{
+    HRESULT hr = S_OK;
+    TOperation *pOp = nullptr;
+
+    EnterCriticalSection(&m_critsec);
+
+    if (m_OpQueue.GetCount() > 0)
+    {
+        hr = m_OpQueue.GetFront(&pOp);
+
+        if (SUCCEEDED(hr))
+        {
+            hr = ValidateOperation(pOp);
+        }
+        if (SUCCEEDED(hr))
+        {
+            hr = m_OpQueue.RemoveFront(nullptr);
+        }
+        if (SUCCEEDED(hr))
+        {
+            (void)DispatchOperation(pOp);
+        }
+    }
+
+    if (pOp != nullptr)
+    {
+        pOp->Release();
+    }
+
+    LeaveCriticalSection(&m_critsec);
+    return hr;
+}
+
+#pragma warning( pop )
\ No newline at end of file
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvImageManipulations.idl b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvImageManipulations.idl
new file mode 100644
index 0000000000..120ef7dbb4
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvImageManipulations.idl
@@ -0,0 +1,11 @@
+import "Windows.Media.idl";
+
+#include <sdkddkver.h>
+
+namespace OcvTransform
+{
+	[version(NTDDI_WIN8)]
+	runtimeclass OcvImageManipulations 
+    {
+	}
+}
\ No newline at end of file
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.cpp b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.cpp
new file mode 100644
index 0000000000..bf98128158
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.cpp
@@ -0,0 +1,1486 @@
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+
+#include "OcvTransform.h"
+#include "bufferlock.h"
+
+#include "opencv2\core\core.hpp"
+#include "opencv2\imgproc\imgproc.hpp"
+
+using namespace Microsoft::WRL;
+
+/*
+
+This sample implements a video effect as a Media Foundation transform (MFT).
+
+NOTES ON THE MFT IMPLEMENTATION
+
+1. The MFT has fixed streams: One input stream and one output stream.
+
+2. The MFT supports NV12 format only.
+
+3. If the MFT is holding an input sample, SetInputType and SetOutputType both fail.
+
+4. The input and output types must be identical.
+
+5. If both types are set, no type can be set until the current type is cleared.
+
+6. Preferred input types:
+
+     (a) If the output type is set, that's the preferred type.
+     (b) Otherwise, the preferred types are partial types, constructed from the
+         list of supported subtypes.
+
+7. Preferred output types: As above.
+
+8. Streaming:
+
+    The private BeingStreaming() method is called in response to the
+    MFT_MESSAGE_NOTIFY_BEGIN_STREAMING message.
+
+    If the client does not send MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, the MFT calls
+    BeginStreaming inside the first call to ProcessInput or ProcessOutput.
+
+    This is a good approach for allocating resources that your MFT requires for
+    streaming.
+
+9. The configuration attributes are applied in the BeginStreaming method. If the
+   client changes the attributes during streaming, the change is ignored until
+   streaming is stopped (either by changing the media types or by sending the
+   MFT_MESSAGE_NOTIFY_END_STREAMING message) and then restarted.
+
+*/
+
+
+// Static array of media types (preferred and accepted).
+const GUID g_MediaSubtypes[] =
+{
+    MFVideoFormat_NV12
+};
+
+HRESULT GetDefaultStride(IMFMediaType *pType, LONG *plStride);
+
+template <typename T>
+inline T clamp(const T& val, const T& minVal, const T& maxVal)
+{
+    return (val < minVal ? minVal : (val > maxVal ? maxVal : val));
+}
+
+OcvImageManipulations::OcvImageManipulations() :
+    m_pSample(NULL), m_pInputType(NULL), m_pOutputType(NULL),
+    m_imageWidthInPixels(0), m_imageHeightInPixels(0), m_cbImageSize(0),
+    m_TransformType(Preview), m_bStreamingInitialized(false),
+    m_pAttributes(NULL)
+{
+    InitializeCriticalSectionEx(&m_critSec, 3000, 0);
+}
+
+OcvImageManipulations::~OcvImageManipulations()
+{
+    SafeRelease(&m_pInputType);
+    SafeRelease(&m_pOutputType);
+    SafeRelease(&m_pSample);
+    SafeRelease(&m_pAttributes);
+    DeleteCriticalSection(&m_critSec);
+}
+
+// Initialize the instance.
+STDMETHODIMP OcvImageManipulations::RuntimeClassInitialize()
+{
+    // Create the attribute store.
+    return MFCreateAttributes(&m_pAttributes, 3);
+}
+
+// IMediaExtension methods
+
+//-------------------------------------------------------------------
+// SetProperties
+// Sets the configuration of the effect
+//-------------------------------------------------------------------
+HRESULT OcvImageManipulations::SetProperties(ABI::Windows::Foundation::Collections::IPropertySet *pConfiguration)
+{
+    HRESULT hr = S_OK;
+
+    if (!pConfiguration)
+        return hr;
+
+    HSTRING key;
+    WindowsCreateString(L"{698649BE-8EAE-4551-A4CB-3EC98FBD3D86}", 38, &key);
+    Microsoft::WRL::ComPtr<ABI::Windows::Foundation::Collections::IMap<HSTRING, IInspectable *>> spSetting;
+    pConfiguration->QueryInterface(IID_PPV_ARGS(&spSetting));
+    boolean found;
+    spSetting->HasKey(key, &found);
+
+    if (found)
+    {
+        IInspectable* value;
+        spSetting->Lookup(key, &value);
+
+        Microsoft::WRL::ComPtr<ABI::Windows::Foundation::IReference<int>> ref;
+        hr = value->QueryInterface(IID_PPV_ARGS(&ref));
+        int effect = InvalidEffect;
+        hr = ref->get_Value(&effect);
+        if ((effect >= 0) && (effect < InvalidEffect))
+        {
+            m_TransformType = (ProcessingType)effect;
+        }
+    }
+
+	return hr;
+}
+
+// IMFTransform methods. Refer to the Media Foundation SDK documentation for details.
+
+//-------------------------------------------------------------------
+// GetStreamLimits
+// Returns the minimum and maximum number of streams.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetStreamLimits(
+    DWORD   *pdwInputMinimum,
+    DWORD   *pdwInputMaximum,
+    DWORD   *pdwOutputMinimum,
+    DWORD   *pdwOutputMaximum
+)
+{
+    if ((pdwInputMinimum == NULL) ||
+        (pdwInputMaximum == NULL) ||
+        (pdwOutputMinimum == NULL) ||
+        (pdwOutputMaximum == NULL))
+    {
+        return E_POINTER;
+    }
+
+    // This MFT has a fixed number of streams.
+    *pdwInputMinimum = 1;
+    *pdwInputMaximum = 1;
+    *pdwOutputMinimum = 1;
+    *pdwOutputMaximum = 1;
+    return S_OK;
+}
+
+
+//-------------------------------------------------------------------
+// GetStreamCount
+// Returns the actual number of streams.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetStreamCount(
+    DWORD   *pcInputStreams,
+    DWORD   *pcOutputStreams
+)
+{
+    if ((pcInputStreams == NULL) || (pcOutputStreams == NULL))
+
+    {
+        return E_POINTER;
+    }
+
+    // This MFT has a fixed number of streams.
+    *pcInputStreams = 1;
+    *pcOutputStreams = 1;
+    return S_OK;
+}
+
+
+
+//-------------------------------------------------------------------
+// GetStreamIDs
+// Returns stream IDs for the input and output streams.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetStreamIDs(
+    DWORD   dwInputIDArraySize,
+    DWORD   *pdwInputIDs,
+    DWORD   dwOutputIDArraySize,
+    DWORD   *pdwOutputIDs
+)
+{
+    // It is not required to implement this method if the MFT has a fixed number of
+    // streams AND the stream IDs are numbered sequentially from zero (that is, the
+    // stream IDs match the stream indexes).
+
+    // In that case, it is OK to return E_NOTIMPL.
+    return E_NOTIMPL;
+}
+
+
+//-------------------------------------------------------------------
+// GetInputStreamInfo
+// Returns information about an input stream.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetInputStreamInfo(
+    DWORD                     dwInputStreamID,
+    MFT_INPUT_STREAM_INFO *   pStreamInfo
+)
+{
+    if (pStreamInfo == NULL)
+    {
+        return E_POINTER;
+    }
+
+    EnterCriticalSection(&m_critSec);
+
+    if (!IsValidInputStream(dwInputStreamID))
+    {
+        LeaveCriticalSection(&m_critSec);
+        return MF_E_INVALIDSTREAMNUMBER;
+    }
+
+    // NOTE: This method should succeed even when there is no media type on the
+    //       stream. If there is no media type, we only need to fill in the dwFlags
+    //       member of MFT_INPUT_STREAM_INFO. The other members depend on having a
+    //       a valid media type.
+
+    pStreamInfo->hnsMaxLatency = 0;
+    pStreamInfo->dwFlags = MFT_INPUT_STREAM_WHOLE_SAMPLES | MFT_INPUT_STREAM_SINGLE_SAMPLE_PER_BUFFER;
+
+    if (m_pInputType == NULL)
+    {
+        pStreamInfo->cbSize = 0;
+    }
+    else
+    {
+        pStreamInfo->cbSize = m_cbImageSize;
+    }
+
+    pStreamInfo->cbMaxLookahead = 0;
+    pStreamInfo->cbAlignment = 0;
+
+    LeaveCriticalSection(&m_critSec);
+    return S_OK;
+}
+
+//-------------------------------------------------------------------
+// GetOutputStreamInfo
+// Returns information about an output stream.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetOutputStreamInfo(
+    DWORD                     dwOutputStreamID,
+    MFT_OUTPUT_STREAM_INFO *  pStreamInfo
+)
+{
+    if (pStreamInfo == NULL)
+    {
+        return E_POINTER;
+    }
+
+    EnterCriticalSection(&m_critSec);
+
+    if (!IsValidOutputStream(dwOutputStreamID))
+    {
+        LeaveCriticalSection(&m_critSec);
+        return MF_E_INVALIDSTREAMNUMBER;
+    }
+
+    // NOTE: This method should succeed even when there is no media type on the
+    //       stream. If there is no media type, we only need to fill in the dwFlags
+    //       member of MFT_OUTPUT_STREAM_INFO. The other members depend on having a
+    //       a valid media type.
+
+    pStreamInfo->dwFlags =
+        MFT_OUTPUT_STREAM_WHOLE_SAMPLES |
+        MFT_OUTPUT_STREAM_SINGLE_SAMPLE_PER_BUFFER |
+        MFT_OUTPUT_STREAM_FIXED_SAMPLE_SIZE ;
+
+    if (m_pOutputType == NULL)
+    {
+        pStreamInfo->cbSize = 0;
+    }
+    else
+    {
+        pStreamInfo->cbSize = m_cbImageSize;
+    }
+
+    pStreamInfo->cbAlignment = 0;
+
+    LeaveCriticalSection(&m_critSec);
+    return S_OK;
+}
+
+
+//-------------------------------------------------------------------
+// GetAttributes
+// Returns the attributes for the MFT.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetAttributes(IMFAttributes** ppAttributes)
+{
+    if (ppAttributes == NULL)
+    {
+        return E_POINTER;
+    }
+
+    EnterCriticalSection(&m_critSec);
+
+    *ppAttributes = m_pAttributes;
+    (*ppAttributes)->AddRef();
+
+    LeaveCriticalSection(&m_critSec);
+    return S_OK;
+}
+
+
+//-------------------------------------------------------------------
+// GetInputStreamAttributes
+// Returns stream-level attributes for an input stream.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetInputStreamAttributes(
+    DWORD           dwInputStreamID,
+    IMFAttributes   **ppAttributes
+)
+{
+    // This MFT does not support any stream-level attributes, so the method is not implemented.
+    return E_NOTIMPL;
+}
+
+
+//-------------------------------------------------------------------
+// GetOutputStreamAttributes
+// Returns stream-level attributes for an output stream.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetOutputStreamAttributes(
+    DWORD           dwOutputStreamID,
+    IMFAttributes   **ppAttributes
+)
+{
+    // This MFT does not support any stream-level attributes, so the method is not implemented.
+    return E_NOTIMPL;
+}
+
+
+//-------------------------------------------------------------------
+// DeleteInputStream
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::DeleteInputStream(DWORD dwStreamID)
+{
+    // This MFT has a fixed number of input streams, so the method is not supported.
+    return E_NOTIMPL;
+}
+
+
+//-------------------------------------------------------------------
+// AddInputStreams
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::AddInputStreams(
+    DWORD   cStreams,
+    DWORD   *adwStreamIDs
+)
+{
+    // This MFT has a fixed number of output streams, so the method is not supported.
+    return E_NOTIMPL;
+}
+
+
+//-------------------------------------------------------------------
+// GetInputAvailableType
+// Returns a preferred input type.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetInputAvailableType(
+    DWORD           dwInputStreamID,
+    DWORD           dwTypeIndex, // 0-based
+    IMFMediaType    **ppType
+)
+{
+    if (ppType == NULL)
+    {
+        return E_INVALIDARG;
+    }
+
+    EnterCriticalSection(&m_critSec);
+
+    if (!IsValidInputStream(dwInputStreamID))
+    {
+        LeaveCriticalSection(&m_critSec);
+        return MF_E_INVALIDSTREAMNUMBER;
+    }
+
+    HRESULT hr = S_OK;
+
+    // If the output type is set, return that type as our preferred input type.
+    if (m_pOutputType == NULL)
+    {
+        // The output type is not set. Create a partial media type.
+        hr = OnGetPartialType(dwTypeIndex, ppType);
+    }
+    else if (dwTypeIndex > 0)
+    {
+        hr = MF_E_NO_MORE_TYPES;
+    }
+    else
+    {
+        *ppType = m_pOutputType;
+        (*ppType)->AddRef();
+    }
+
+    LeaveCriticalSection(&m_critSec);
+    return hr;
+}
+
+
+
+//-------------------------------------------------------------------
+// GetOutputAvailableType
+// Returns a preferred output type.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetOutputAvailableType(
+    DWORD           dwOutputStreamID,
+    DWORD           dwTypeIndex, // 0-based
+    IMFMediaType    **ppType
+)
+{
+    if (ppType == NULL)
+    {
+        return E_INVALIDARG;
+    }
+
+    EnterCriticalSection(&m_critSec);
+
+    if (!IsValidOutputStream(dwOutputStreamID))
+    {
+        LeaveCriticalSection(&m_critSec);
+        return MF_E_INVALIDSTREAMNUMBER;
+    }
+
+    HRESULT hr = S_OK;
+
+    if (m_pInputType == NULL)
+    {
+        // The input type is not set. Create a partial media type.
+        hr = OnGetPartialType(dwTypeIndex, ppType);
+    }
+    else if (dwTypeIndex > 0)
+    {
+        hr = MF_E_NO_MORE_TYPES;
+    }
+    else
+    {
+        *ppType = m_pInputType;
+        (*ppType)->AddRef();
+    }
+
+    LeaveCriticalSection(&m_critSec);
+    return hr;
+}
+
+
+//-------------------------------------------------------------------
+// SetInputType
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::SetInputType(
+    DWORD           dwInputStreamID,
+    IMFMediaType    *pType, // Can be NULL to clear the input type.
+    DWORD           dwFlags
+)
+{
+    // Validate flags.
+    if (dwFlags & ~MFT_SET_TYPE_TEST_ONLY)
+    {
+        return E_INVALIDARG;
+    }
+
+    EnterCriticalSection(&m_critSec);
+
+    if (!IsValidInputStream(dwInputStreamID))
+    {
+        LeaveCriticalSection(&m_critSec);
+        return MF_E_INVALIDSTREAMNUMBER;
+    }
+
+    HRESULT hr = S_OK;
+
+    // Does the caller want us to set the type, or just test it?
+    BOOL bReallySet = ((dwFlags & MFT_SET_TYPE_TEST_ONLY) == 0);
+
+    // If we have an input sample, the client cannot change the type now.
+    if (HasPendingOutput())
+    {
+        hr = MF_E_TRANSFORM_CANNOT_CHANGE_MEDIATYPE_WHILE_PROCESSING;
+        goto done;
+    }
+
+    // Validate the type, if non-NULL.
+    if (pType)
+    {
+        hr = OnCheckInputType(pType);
+        if (FAILED(hr))
+        {
+            goto done;
+        }
+    }
+
+    // The type is OK. Set the type, unless the caller was just testing.
+    if (bReallySet)
+    {
+        OnSetInputType(pType);
+
+        // When the type changes, end streaming.
+        hr = EndStreaming();
+    }
+
+done:
+    LeaveCriticalSection(&m_critSec);
+    return hr;
+}
+
+
+
+//-------------------------------------------------------------------
+// SetOutputType
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::SetOutputType(
+    DWORD           dwOutputStreamID,
+    IMFMediaType    *pType, // Can be NULL to clear the output type.
+    DWORD           dwFlags
+)
+{
+    // Validate flags.
+    if (dwFlags & ~MFT_SET_TYPE_TEST_ONLY)
+    {
+        return E_INVALIDARG;
+    }
+
+    EnterCriticalSection(&m_critSec);
+
+    if (!IsValidOutputStream(dwOutputStreamID))
+    {
+        LeaveCriticalSection(&m_critSec);
+        return MF_E_INVALIDSTREAMNUMBER;
+    }
+
+    HRESULT hr = S_OK;
+
+    // Does the caller want us to set the type, or just test it?
+    BOOL bReallySet = ((dwFlags & MFT_SET_TYPE_TEST_ONLY) == 0);
+
+    // If we have an input sample, the client cannot change the type now.
+    if (HasPendingOutput())
+    {
+        hr = MF_E_TRANSFORM_CANNOT_CHANGE_MEDIATYPE_WHILE_PROCESSING;
+        goto done;
+    }
+
+    // Validate the type, if non-NULL.
+    if (pType)
+    {
+        hr = OnCheckOutputType(pType);
+        if (FAILED(hr))
+        {
+            goto done;
+        }
+    }
+
+    // The type is OK. Set the type, unless the caller was just testing.
+    if (bReallySet)
+    {
+        OnSetOutputType(pType);
+
+        // When the type changes, end streaming.
+        hr = EndStreaming();
+    }
+
+done:
+    LeaveCriticalSection(&m_critSec);
+    return hr;
+}
+
+
+//-------------------------------------------------------------------
+// GetInputCurrentType
+// Returns the current input type.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetInputCurrentType(
+    DWORD           dwInputStreamID,
+    IMFMediaType    **ppType
+)
+{
+    if (ppType == NULL)
+    {
+        return E_POINTER;
+    }
+
+    HRESULT hr = S_OK;
+
+    EnterCriticalSection(&m_critSec);
+
+    if (!IsValidInputStream(dwInputStreamID))
+    {
+        hr = MF_E_INVALIDSTREAMNUMBER;
+    }
+    else if (!m_pInputType)
+    {
+        hr = MF_E_TRANSFORM_TYPE_NOT_SET;
+    }
+    else
+    {
+        *ppType = m_pInputType;
+        (*ppType)->AddRef();
+    }
+    LeaveCriticalSection(&m_critSec);
+    return hr;
+}
+
+
+//-------------------------------------------------------------------
+// GetOutputCurrentType
+// Returns the current output type.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetOutputCurrentType(
+    DWORD           dwOutputStreamID,
+    IMFMediaType    **ppType
+)
+{
+    if (ppType == NULL)
+    {
+        return E_POINTER;
+    }
+
+    HRESULT hr = S_OK;
+
+    EnterCriticalSection(&m_critSec);
+
+    if (!IsValidOutputStream(dwOutputStreamID))
+    {
+        hr = MF_E_INVALIDSTREAMNUMBER;
+    }
+    else if (!m_pOutputType)
+    {
+        hr = MF_E_TRANSFORM_TYPE_NOT_SET;
+    }
+    else
+    {
+        *ppType = m_pOutputType;
+        (*ppType)->AddRef();
+    }
+
+    LeaveCriticalSection(&m_critSec);
+    return hr;
+}
+
+
+//-------------------------------------------------------------------
+// GetInputStatus
+// Query if the MFT is accepting more input.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetInputStatus(
+    DWORD           dwInputStreamID,
+    DWORD           *pdwFlags
+)
+{
+    if (pdwFlags == NULL)
+    {
+        return E_POINTER;
+    }
+
+    EnterCriticalSection(&m_critSec);
+
+    if (!IsValidInputStream(dwInputStreamID))
+    {
+        LeaveCriticalSection(&m_critSec);
+        return MF_E_INVALIDSTREAMNUMBER;
+    }
+
+    // If an input sample is already queued, do not accept another sample until the
+    // client calls ProcessOutput or Flush.
+
+    // NOTE: It is possible for an MFT to accept more than one input sample. For
+    // example, this might be required in a video decoder if the frames do not
+    // arrive in temporal order. In the case, the decoder must hold a queue of
+    // samples. For the video effect, each sample is transformed independently, so
+    // there is no reason to queue multiple input samples.
+
+    if (m_pSample == NULL)
+    {
+        *pdwFlags = MFT_INPUT_STATUS_ACCEPT_DATA;
+    }
+    else
+    {
+        *pdwFlags = 0;
+    }
+
+    LeaveCriticalSection(&m_critSec);
+    return S_OK;
+}
+
+
+
+//-------------------------------------------------------------------
+// GetOutputStatus
+// Query if the MFT can produce output.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::GetOutputStatus(DWORD *pdwFlags)
+{
+    if (pdwFlags == NULL)
+    {
+        return E_POINTER;
+    }
+
+    EnterCriticalSection(&m_critSec);
+
+    // The MFT can produce an output sample if (and only if) there an input sample.
+    if (m_pSample != NULL)
+    {
+        *pdwFlags = MFT_OUTPUT_STATUS_SAMPLE_READY;
+    }
+    else
+    {
+        *pdwFlags = 0;
+    }
+
+    LeaveCriticalSection(&m_critSec);
+    return S_OK;
+}
+
+
+//-------------------------------------------------------------------
+// SetOutputBounds
+// Sets the range of time stamps that the MFT will output.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::SetOutputBounds(
+    LONGLONG        hnsLowerBound,
+    LONGLONG        hnsUpperBound
+)
+{
+    // Implementation of this method is optional.
+    return E_NOTIMPL;
+}
+
+
+//-------------------------------------------------------------------
+// ProcessEvent
+// Sends an event to an input stream.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::ProcessEvent(
+    DWORD              dwInputStreamID,
+    IMFMediaEvent      *pEvent
+)
+{
+    // This MFT does not handle any stream events, so the method can
+    // return E_NOTIMPL. This tells the pipeline that it can stop
+    // sending any more events to this MFT.
+    return E_NOTIMPL;
+}
+
+
+//-------------------------------------------------------------------
+// ProcessMessage
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::ProcessMessage(
+    MFT_MESSAGE_TYPE    eMessage,
+    ULONG_PTR           ulParam
+)
+{
+    EnterCriticalSection(&m_critSec);
+
+    HRESULT hr = S_OK;
+
+    switch (eMessage)
+    {
+    case MFT_MESSAGE_COMMAND_FLUSH:
+        // Flush the MFT.
+        hr = OnFlush();
+        break;
+
+    case MFT_MESSAGE_COMMAND_DRAIN:
+        // Drain: Tells the MFT to reject further input until all pending samples are
+        // processed. That is our default behavior already, so there is nothing to do.
+        //
+        // For a decoder that accepts a queue of samples, the MFT might need to drain
+        // the queue in response to this command.
+    break;
+
+    case MFT_MESSAGE_SET_D3D_MANAGER:
+        // Sets a pointer to the IDirect3DDeviceManager9 interface.
+
+        // The pipeline should never send this message unless the MFT sets the MF_SA_D3D_AWARE
+        // attribute set to TRUE. Because this MFT does not set MF_SA_D3D_AWARE, it is an error
+        // to send the MFT_MESSAGE_SET_D3D_MANAGER message to the MFT. Return an error code in
+        // this case.
+
+        // NOTE: If this MFT were D3D-enabled, it would cache the IDirect3DDeviceManager9
+        // pointer for use during streaming.
+
+        hr = E_NOTIMPL;
+        break;
+
+    case MFT_MESSAGE_NOTIFY_BEGIN_STREAMING:
+        hr = BeginStreaming();
+        break;
+
+    case MFT_MESSAGE_NOTIFY_END_STREAMING:
+        hr = EndStreaming();
+        break;
+
+    // The next two messages do not require any action from this MFT.
+
+    case MFT_MESSAGE_NOTIFY_END_OF_STREAM:
+        break;
+
+    case MFT_MESSAGE_NOTIFY_START_OF_STREAM:
+        break;
+    }
+
+    LeaveCriticalSection(&m_critSec);
+    return hr;
+}
+
+
+//-------------------------------------------------------------------
+// ProcessInput
+// Process an input sample.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::ProcessInput(
+    DWORD               dwInputStreamID,
+    IMFSample           *pSample,
+    DWORD               dwFlags
+)
+{
+    // Check input parameters.
+    if (pSample == NULL)
+    {
+        return E_POINTER;
+    }
+
+    if (dwFlags != 0)
+    {
+        return E_INVALIDARG; // dwFlags is reserved and must be zero.
+    }
+
+    HRESULT hr = S_OK;
+
+    EnterCriticalSection(&m_critSec);
+
+    // Validate the input stream number.
+    if (!IsValidInputStream(dwInputStreamID))
+    {
+        hr = MF_E_INVALIDSTREAMNUMBER;
+        goto done;
+    }
+
+    // Check for valid media types.
+    // The client must set input and output types before calling ProcessInput.
+    if (!m_pInputType || !m_pOutputType)
+    {
+        hr = MF_E_NOTACCEPTING;
+        goto done;
+    }
+
+    // Check if an input sample is already queued.
+    if (m_pSample != NULL)
+    {
+        hr = MF_E_NOTACCEPTING;   // We already have an input sample.
+        goto done;
+    }
+
+    // Initialize streaming.
+    hr = BeginStreaming();
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    // Cache the sample. We do the actual work in ProcessOutput.
+    m_pSample = pSample;
+    pSample->AddRef();  // Hold a reference count on the sample.
+
+done:
+    LeaveCriticalSection(&m_critSec);
+    return hr;
+}
+
+
+//-------------------------------------------------------------------
+// ProcessOutput
+// Process an output sample.
+//-------------------------------------------------------------------
+
+HRESULT OcvImageManipulations::ProcessOutput(
+    DWORD                   dwFlags,
+    DWORD                   cOutputBufferCount,
+    MFT_OUTPUT_DATA_BUFFER  *pOutputSamples, // one per stream
+    DWORD                   *pdwStatus
+)
+{
+    // Check input parameters...
+
+    // This MFT does not accept any flags for the dwFlags parameter.
+
+    // The only defined flag is MFT_PROCESS_OUTPUT_DISCARD_WHEN_NO_BUFFER. This flag
+    // applies only when the MFT marks an output stream as lazy or optional. But this
+    // MFT has no lazy or optional streams, so the flag is not valid.
+
+    if (dwFlags != 0)
+    {
+        return E_INVALIDARG;
+    }
+
+    if (pOutputSamples == NULL || pdwStatus == NULL)
+    {
+        return E_POINTER;
+    }
+
+    // There must be exactly one output buffer.
+    if (cOutputBufferCount != 1)
+    {
+        return E_INVALIDARG;
+    }
+
+    // It must contain a sample.
+    if (pOutputSamples[0].pSample == NULL)
+    {
+        return E_INVALIDARG;
+    }
+
+    HRESULT hr = S_OK;
+
+    IMFMediaBuffer *pInput = NULL;
+    IMFMediaBuffer *pOutput = NULL;
+
+    EnterCriticalSection(&m_critSec);
+
+    // There must be an input sample available for processing.
+    if (m_pSample == NULL)
+    {
+        hr = MF_E_TRANSFORM_NEED_MORE_INPUT;
+        goto done;
+    }
+
+    // Initialize streaming.
+
+    hr = BeginStreaming();
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    // Get the input buffer.
+    hr = m_pSample->ConvertToContiguousBuffer(&pInput);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    // Get the output buffer.
+    hr = pOutputSamples[0].pSample->ConvertToContiguousBuffer(&pOutput);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    hr = OnProcessOutput(pInput, pOutput);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    // Set status flags.
+    pOutputSamples[0].dwStatus = 0;
+    *pdwStatus = 0;
+
+
+    // Copy the duration and time stamp from the input sample, if present.
+
+    LONGLONG hnsDuration = 0;
+    LONGLONG hnsTime = 0;
+
+    if (SUCCEEDED(m_pSample->GetSampleDuration(&hnsDuration)))
+    {
+        hr = pOutputSamples[0].pSample->SetSampleDuration(hnsDuration);
+        if (FAILED(hr))
+        {
+            goto done;
+        }
+    }
+
+    if (SUCCEEDED(m_pSample->GetSampleTime(&hnsTime)))
+    {
+        hr = pOutputSamples[0].pSample->SetSampleTime(hnsTime);
+    }
+
+done:
+    SafeRelease(&m_pSample);   // Release our input sample.
+    SafeRelease(&pInput);
+    SafeRelease(&pOutput);
+    LeaveCriticalSection(&m_critSec);
+    return hr;
+}
+
+// PRIVATE METHODS
+
+// All methods that follow are private to this MFT and are not part of the IMFTransform interface.
+
+// Create a partial media type from our list.
+//
+// dwTypeIndex: Index into the list of peferred media types.
+// ppmt:        Receives a pointer to the media type.
+
+HRESULT OcvImageManipulations::OnGetPartialType(DWORD dwTypeIndex, IMFMediaType **ppmt)
+{
+    if (dwTypeIndex >= ARRAYSIZE(g_MediaSubtypes))
+    {
+        return MF_E_NO_MORE_TYPES;
+    }
+
+    IMFMediaType *pmt = NULL;
+
+    HRESULT hr = MFCreateMediaType(&pmt);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    hr = pmt->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    hr = pmt->SetGUID(MF_MT_SUBTYPE, g_MediaSubtypes[dwTypeIndex]);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    *ppmt = pmt;
+    (*ppmt)->AddRef();
+
+done:
+    SafeRelease(&pmt);
+    return hr;
+}
+
+
+// Validate an input media type.
+
+HRESULT OcvImageManipulations::OnCheckInputType(IMFMediaType *pmt)
+{
+    assert(pmt != NULL);
+
+    HRESULT hr = S_OK;
+
+    // If the output type is set, see if they match.
+    if (m_pOutputType != NULL)
+    {
+        DWORD flags = 0;
+        hr = pmt->IsEqual(m_pOutputType, &flags);
+
+        // IsEqual can return S_FALSE. Treat this as failure.
+        if (hr != S_OK)
+        {
+            hr = MF_E_INVALIDMEDIATYPE;
+        }
+    }
+    else
+    {
+        // Output type is not set. Just check this type.
+        hr = OnCheckMediaType(pmt);
+    }
+    return hr;
+}
+
+
+// Validate an output media type.
+
+HRESULT OcvImageManipulations::OnCheckOutputType(IMFMediaType *pmt)
+{
+    assert(pmt != NULL);
+
+    HRESULT hr = S_OK;
+
+    // If the input type is set, see if they match.
+    if (m_pInputType != NULL)
+    {
+        DWORD flags = 0;
+        hr = pmt->IsEqual(m_pInputType, &flags);
+
+        // IsEqual can return S_FALSE. Treat this as failure.
+        if (hr != S_OK)
+        {
+            hr = MF_E_INVALIDMEDIATYPE;
+        }
+
+    }
+    else
+    {
+        // Input type is not set. Just check this type.
+        hr = OnCheckMediaType(pmt);
+    }
+    return hr;
+}
+
+
+// Validate a media type (input or output)
+
+HRESULT OcvImageManipulations::OnCheckMediaType(IMFMediaType *pmt)
+{
+    BOOL bFoundMatchingSubtype = FALSE;
+
+    // Major type must be video.
+    GUID major_type;
+    HRESULT hr = pmt->GetGUID(MF_MT_MAJOR_TYPE, &major_type);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    if (major_type != MFMediaType_Video)
+    {
+        hr = MF_E_INVALIDMEDIATYPE;
+        goto done;
+    }
+
+    // Subtype must be one of the subtypes in our global list.
+
+    // Get the subtype GUID.
+    GUID subtype;
+    hr = pmt->GetGUID(MF_MT_SUBTYPE, &subtype);
+    if (FAILED(hr))
+    {
+        goto done;
+    }
+
+    // Look for the subtype in our list of accepted types.
+    for (DWORD i = 0; i < ARRAYSIZE(g_MediaSubtypes); i++)
+    {
+        if (subtype == g_MediaSubtypes[i])
+        {
+            bFoundMatchingSubtype = TRUE;
+            break;
+        }
+    }
+
+    if (!bFoundMatchingSubtype)
+    {
+        hr = MF_E_INVALIDMEDIATYPE; // The MFT does not support this subtype.
+        goto done;
+    }
+
+    // Reject single-field media types.
+    UINT32 interlace = MFGetAttributeUINT32(pmt, MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive);
+    if (interlace == MFVideoInterlace_FieldSingleUpper  || interlace == MFVideoInterlace_FieldSingleLower)
+    {
+        hr = MF_E_INVALIDMEDIATYPE;
+    }
+
+done:
+    return hr;
+}
+
+
+// Set or clear the input media type.
+//
+// Prerequisite: The input type was already validated.
+
+void OcvImageManipulations::OnSetInputType(IMFMediaType *pmt)
+{
+    // if pmt is NULL, clear the type.
+    // if pmt is non-NULL, set the type.
+
+    SafeRelease(&m_pInputType);
+    m_pInputType = pmt;
+    if (m_pInputType)
+    {
+        m_pInputType->AddRef();
+    }
+
+    // Update the format information.
+    UpdateFormatInfo();
+}
+
+
+// Set or clears the output media type.
+//
+// Prerequisite: The output type was already validated.
+
+void OcvImageManipulations::OnSetOutputType(IMFMediaType *pmt)
+{
+    // If pmt is NULL, clear the type. Otherwise, set the type.
+
+    SafeRelease(&m_pOutputType);
+    m_pOutputType = pmt;
+    if (m_pOutputType)
+    {
+        m_pOutputType->AddRef();
+    }
+}
+
+
+// Initialize streaming parameters.
+//
+// This method is called if the client sends the MFT_MESSAGE_NOTIFY_BEGIN_STREAMING
+// message, or when the client processes a sample, whichever happens first.
+
+HRESULT OcvImageManipulations::BeginStreaming()
+{
+    HRESULT hr = S_OK;
+
+    if (!m_bStreamingInitialized)
+    {
+        m_bStreamingInitialized = true;
+        hr = S_OK;
+    }
+
+    return hr;
+}
+
+
+// End streaming.
+
+// This method is called if the client sends an MFT_MESSAGE_NOTIFY_END_STREAMING
+// message, or when the media type changes. In general, it should be called whenever
+// the streaming parameters need to be reset.
+
+HRESULT OcvImageManipulations::EndStreaming()
+{
+    m_bStreamingInitialized = false;
+    return S_OK;
+}
+
+
+
+// Generate output data.
+
+HRESULT OcvImageManipulations::OnProcessOutput(IMFMediaBuffer *pIn, IMFMediaBuffer *pOut)
+{
+    BYTE *pDest = NULL;         // Destination buffer.
+    LONG lDestStride = 0;       // Destination stride.
+
+    BYTE *pSrc = NULL;          // Source buffer.
+    LONG lSrcStride = 0;        // Source stride.
+
+    // Helper objects to lock the buffers.
+    VideoBufferLock inputLock(pIn);
+    VideoBufferLock outputLock(pOut);
+
+    // Stride if the buffer does not support IMF2DBuffer
+    LONG lDefaultStride = 0;
+
+    HRESULT hr = GetDefaultStride(m_pInputType, &lDefaultStride);
+    if (FAILED(hr))
+    {
+        return hr;
+    }
+
+    // Lock the input buffer.
+    hr = inputLock.LockBuffer(lDefaultStride, m_imageHeightInPixels, &pSrc, &lSrcStride);
+    if (FAILED(hr))
+    {
+        return hr;
+    }
+
+    // Lock the output buffer.
+    hr = outputLock.LockBuffer(lDefaultStride, m_imageHeightInPixels, &pDest, &lDestStride);
+    if (FAILED(hr))
+    {
+        return hr;
+    }
+
+    cv::Mat InputFrame(m_imageHeightInPixels + m_imageHeightInPixels/2, m_imageWidthInPixels, CV_8UC1, pSrc, lSrcStride);
+    cv::Mat InputGreyScale(InputFrame, cv::Range(0, m_imageHeightInPixels), cv::Range(0, m_imageWidthInPixels));
+    cv::Mat OutputFrame(m_imageHeightInPixels + m_imageHeightInPixels/2, m_imageWidthInPixels, CV_8UC1, pDest, lDestStride);
+
+    switch (m_TransformType)
+    {
+    case Preview:
+        {
+            InputFrame.copyTo(OutputFrame);
+        } break;
+    case GrayScale:
+        {
+            OutputFrame.setTo(cv::Scalar(128));
+            cv::Mat OutputGreyScale(OutputFrame, cv::Range(0, m_imageHeightInPixels), cv::Range(0, m_imageWidthInPixels));
+            InputGreyScale.copyTo(OutputGreyScale);
+        } break;
+    case Canny:
+        {
+            OutputFrame.setTo(cv::Scalar(128));
+            cv::Mat OutputGreyScale(OutputFrame, cv::Range(0, m_imageHeightInPixels), cv::Range(0, m_imageWidthInPixels));
+            cv::Canny(InputGreyScale, OutputGreyScale, 80, 90);
+
+        } break;
+    case Sobel:
+        {
+            OutputFrame.setTo(cv::Scalar(128));
+            cv::Mat OutputGreyScale(OutputFrame, cv::Range(0, m_imageHeightInPixels), cv::Range(0, m_imageWidthInPixels));
+            cv::Sobel(InputGreyScale, OutputGreyScale, CV_8U, 1, 1);
+        } break;
+    case Histogram:
+        {
+            const int mHistSizeNum = 25;
+            const int channels[3][1] = {{0}, {1}, {2}};
+            const int mHistSize[] = {25};
+            const float baseRabge[] = {0.f,256.f};
+            const float* ranges[] = {baseRabge};
+            
+			const cv::Scalar mColorsY[] = { cv::Scalar(76), cv::Scalar(149), cv::Scalar(29) };
+			const cv::Scalar mColorsUV[] = { cv::Scalar(84, 255), cv::Scalar(43, 21), cv::Scalar(255, 107) };
+
+			cv::Mat OutputY(m_imageHeightInPixels, m_imageWidthInPixels, CV_8UC1, pDest, lDestStride);
+			cv::Mat OutputUV(m_imageHeightInPixels/2, m_imageWidthInPixels/2,
+							 CV_8UC2, pDest+m_imageHeightInPixels*lDestStride, lDestStride);
+            cv::Mat BgrFrame;
+
+			InputFrame.copyTo(OutputFrame);
+
+            cv::cvtColor(InputFrame, BgrFrame, cv::COLOR_YUV420sp2BGR);
+            int thikness = (int) (BgrFrame.cols / (mHistSizeNum + 10) / 5);
+            if(thikness > 5) thikness = 5;
+            int offset = (int) ((BgrFrame.cols - (5*mHistSizeNum + 4*10)*thikness)/2);
+
+            // RGB
+            for (int c=0; c<3; c++)
+            {
+                cv::Mat hist;
+                cv::calcHist(&BgrFrame, 1, channels[c], cv::Mat(), hist, 1, mHistSize, ranges);
+                cv::normalize(hist, hist, BgrFrame.rows/2, 0, cv::NORM_INF);
+                for(int h=0; h<mHistSizeNum; h++) {
+                    cv::Point mP1, mP2;
+					// Draw on Y plane
+					mP1.x = mP2.x = offset + (c * (mHistSizeNum + 10) + h) * thikness;
+                    mP1.y = BgrFrame.rows-1;
+                    mP2.y = mP1.y - 2 - (int)hist.at<float>(h);
+					cv::line(OutputY, mP1, mP2, mColorsY[c], thikness);
+
+					// Draw on UV planes
+                    mP1.x /= 2;
+                    mP1.y /= 2;
+					mP2.x /= 2;
+                    mP2.y /= 2;
+					cv::line(OutputUV, mP1, mP2, mColorsUV[c], thikness/2);
+                }
+            }            
+        } break;
+    default:
+        break;
+    }
+
+    // Set the data size on the output buffer.
+    hr = pOut->SetCurrentLength(m_cbImageSize);
+
+    return hr;
+}
+
+
+// Flush the MFT.
+
+HRESULT OcvImageManipulations::OnFlush()
+{
+    // For this MFT, flushing just means releasing the input sample.
+    SafeRelease(&m_pSample);
+    return S_OK;
+}
+
+
+// Update the format information. This method is called whenever the
+// input type is set.
+
+HRESULT OcvImageManipulations::UpdateFormatInfo()
+{
+    HRESULT hr = S_OK;
+
+    GUID subtype = GUID_NULL;
+
+    m_imageWidthInPixels = 0;
+    m_imageHeightInPixels = 0;
+    m_cbImageSize = 0;
+
+    if (m_pInputType != NULL)
+    {
+        hr = m_pInputType->GetGUID(MF_MT_SUBTYPE, &subtype);
+        if (FAILED(hr))
+        {
+            goto done;
+        }
+        if (subtype != MFVideoFormat_NV12)
+        {
+            hr = E_UNEXPECTED;
+            goto done;
+        }
+
+        hr = MFGetAttributeSize(m_pInputType, MF_MT_FRAME_SIZE, &m_imageWidthInPixels, &m_imageHeightInPixels);
+        if (FAILED(hr))
+        {
+            goto done;
+        }
+
+        // Calculate the image size for YUV NV12 image(not including padding)
+		m_cbImageSize = (m_imageHeightInPixels + m_imageHeightInPixels/2)*m_imageWidthInPixels;
+    }
+
+done:
+    return hr;
+}
+
+
+// Get the default stride for a video format.
+HRESULT GetDefaultStride(IMFMediaType *pType, LONG *plStride)
+{
+    LONG lStride = 0;
+
+    // Try to get the default stride from the media type.
+    HRESULT hr = pType->GetUINT32(MF_MT_DEFAULT_STRIDE, (UINT32*)&lStride);
+    if (FAILED(hr))
+    {
+        // Attribute not set. Try to calculate the default stride.
+        GUID subtype = GUID_NULL;
+
+        UINT32 width = 0;
+        UINT32 height = 0;
+
+        // Get the subtype and the image size.
+        hr = pType->GetGUID(MF_MT_SUBTYPE, &subtype);
+        if (SUCCEEDED(hr))
+        {
+            hr = MFGetAttributeSize(pType, MF_MT_FRAME_SIZE, &width, &height);
+        }
+        if (SUCCEEDED(hr))
+        {
+            if (subtype == MFVideoFormat_NV12)
+            {
+                lStride = width;
+            }
+            else if (subtype == MFVideoFormat_YUY2 || subtype == MFVideoFormat_UYVY)
+            {
+                lStride = ((width * 2) + 3) & ~3;
+            }
+            else
+            {
+                hr = E_INVALIDARG;
+            }
+        }
+
+        // Set the attribute for later reference.
+        if (SUCCEEDED(hr))
+        {
+            (void)pType->SetUINT32(MF_MT_DEFAULT_STRIDE, UINT32(lStride));
+        }
+    }
+    if (SUCCEEDED(hr))
+    {
+        *plStride = lStride;
+    }
+    return hr;
+}
+
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.def b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.def
new file mode 100644
index 0000000000..0b801908c5
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.def
@@ -0,0 +1,4 @@
+EXPORTS
+    DllCanUnloadNow                     PRIVATE
+    DllGetActivationFactory             PRIVATE
+    DllGetClassObject                   PRIVATE
\ No newline at end of file
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.h b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.h
new file mode 100644
index 0000000000..118f6e432b
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.h
@@ -0,0 +1,247 @@
+// Defines the transform class.
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+
+#ifndef GRAYSCALE_H
+#define GRAYSCALE_H
+
+#include <new>
+#include <mfapi.h>
+#include <mftransform.h>
+#include <mfidl.h>
+#include <mferror.h>
+#include <strsafe.h>
+#include <assert.h>
+
+#include <wrl\implements.h>
+#include <wrl\module.h>
+#include <windows.media.h>
+
+#include "OcvImageManipulations.h"
+
+// CLSID of the MFT.
+DEFINE_GUID(CLSID_GrayscaleMFT,
+0x2f3dbc05, 0xc011, 0x4a8f, 0xb2, 0x64, 0xe4, 0x2e, 0x35, 0xc6, 0x7b, 0xf4);
+
+//
+// * IMPORTANT: If you implement your own MFT, create a new GUID for the CLSID. *
+//
+
+
+// Configuration attributes
+// {698649BE-8EAE-4551-A4CB-3EC98FBD3D86}
+DEFINE_GUID(OCV_IMAGE_EFFECT,
+0x698649be, 0x8eae, 0x4551, 0xa4, 0xcb, 0x3e, 0xc9, 0x8f, 0xbd, 0x3d, 0x86);
+
+
+enum ProcessingType
+{
+    Preview,
+    GrayScale,
+    Canny,
+    Sobel,
+    Histogram,
+    InvalidEffect
+};
+
+template <class T> void SafeRelease(T **ppT)
+{
+    if (*ppT)
+    {
+        (*ppT)->Release();
+        *ppT = NULL;
+    }
+}
+
+// OcvImageManipulations class:
+// Implements a grayscale video effect.
+
+class OcvImageManipulations
+    : public Microsoft::WRL::RuntimeClass<
+           Microsoft::WRL::RuntimeClassFlags< Microsoft::WRL::RuntimeClassType::WinRtClassicComMix >,
+           ABI::Windows::Media::IMediaExtension,
+           IMFTransform >
+{
+    InspectableClass(RuntimeClass_OcvTransform_OcvImageManipulations, BaseTrust)
+
+public:
+    OcvImageManipulations();
+
+    ~OcvImageManipulations();
+
+    STDMETHOD(RuntimeClassInitialize)();
+
+    // IMediaExtension
+    STDMETHODIMP SetProperties(ABI::Windows::Foundation::Collections::IPropertySet *pConfiguration);
+
+    // IMFTransform
+    STDMETHODIMP GetStreamLimits(
+        DWORD   *pdwInputMinimum,
+        DWORD   *pdwInputMaximum,
+        DWORD   *pdwOutputMinimum,
+        DWORD   *pdwOutputMaximum
+    );
+
+    STDMETHODIMP GetStreamCount(
+        DWORD   *pcInputStreams,
+        DWORD   *pcOutputStreams
+    );
+
+    STDMETHODIMP GetStreamIDs(
+        DWORD   dwInputIDArraySize,
+        DWORD   *pdwInputIDs,
+        DWORD   dwOutputIDArraySize,
+        DWORD   *pdwOutputIDs
+    );
+
+    STDMETHODIMP GetInputStreamInfo(
+        DWORD                     dwInputStreamID,
+        MFT_INPUT_STREAM_INFO *   pStreamInfo
+    );
+
+    STDMETHODIMP GetOutputStreamInfo(
+        DWORD                     dwOutputStreamID,
+        MFT_OUTPUT_STREAM_INFO *  pStreamInfo
+    );
+
+    STDMETHODIMP GetAttributes(IMFAttributes** pAttributes);
+
+    STDMETHODIMP GetInputStreamAttributes(
+        DWORD           dwInputStreamID,
+        IMFAttributes   **ppAttributes
+    );
+
+    STDMETHODIMP GetOutputStreamAttributes(
+        DWORD           dwOutputStreamID,
+        IMFAttributes   **ppAttributes
+    );
+
+    STDMETHODIMP DeleteInputStream(DWORD dwStreamID);
+
+    STDMETHODIMP AddInputStreams(
+        DWORD   cStreams,
+        DWORD   *adwStreamIDs
+    );
+
+    STDMETHODIMP GetInputAvailableType(
+        DWORD           dwInputStreamID,
+        DWORD           dwTypeIndex, // 0-based
+        IMFMediaType    **ppType
+    );
+
+    STDMETHODIMP GetOutputAvailableType(
+        DWORD           dwOutputStreamID,
+        DWORD           dwTypeIndex, // 0-based
+        IMFMediaType    **ppType
+    );
+
+    STDMETHODIMP SetInputType(
+        DWORD           dwInputStreamID,
+        IMFMediaType    *pType,
+        DWORD           dwFlags
+    );
+
+    STDMETHODIMP SetOutputType(
+        DWORD           dwOutputStreamID,
+        IMFMediaType    *pType,
+        DWORD           dwFlags
+    );
+
+    STDMETHODIMP GetInputCurrentType(
+        DWORD           dwInputStreamID,
+        IMFMediaType    **ppType
+    );
+
+    STDMETHODIMP GetOutputCurrentType(
+        DWORD           dwOutputStreamID,
+        IMFMediaType    **ppType
+    );
+
+    STDMETHODIMP GetInputStatus(
+        DWORD           dwInputStreamID,
+        DWORD           *pdwFlags
+    );
+
+    STDMETHODIMP GetOutputStatus(DWORD *pdwFlags);
+
+    STDMETHODIMP SetOutputBounds(
+        LONGLONG        hnsLowerBound,
+        LONGLONG        hnsUpperBound
+    );
+
+    STDMETHODIMP ProcessEvent(
+        DWORD              dwInputStreamID,
+        IMFMediaEvent      *pEvent
+    );
+
+    STDMETHODIMP ProcessMessage(
+        MFT_MESSAGE_TYPE    eMessage,
+        ULONG_PTR           ulParam
+    );
+
+    STDMETHODIMP ProcessInput(
+        DWORD               dwInputStreamID,
+        IMFSample           *pSample,
+        DWORD               dwFlags
+    );
+
+    STDMETHODIMP ProcessOutput(
+        DWORD                   dwFlags,
+        DWORD                   cOutputBufferCount,
+        MFT_OUTPUT_DATA_BUFFER  *pOutputSamples, // one per stream
+        DWORD                   *pdwStatus
+    );
+
+
+private:
+    // HasPendingOutput: Returns TRUE if the MFT is holding an input sample.
+    BOOL HasPendingOutput() const { return m_pSample != NULL; }
+
+    // IsValidInputStream: Returns TRUE if dwInputStreamID is a valid input stream identifier.
+    BOOL IsValidInputStream(DWORD dwInputStreamID) const
+    {
+        return dwInputStreamID == 0;
+    }
+
+    // IsValidOutputStream: Returns TRUE if dwOutputStreamID is a valid output stream identifier.
+    BOOL IsValidOutputStream(DWORD dwOutputStreamID) const
+    {
+        return dwOutputStreamID == 0;
+    }
+
+    HRESULT OnGetPartialType(DWORD dwTypeIndex, IMFMediaType **ppmt);
+    HRESULT OnCheckInputType(IMFMediaType *pmt);
+    HRESULT OnCheckOutputType(IMFMediaType *pmt);
+    HRESULT OnCheckMediaType(IMFMediaType *pmt);
+    void    OnSetInputType(IMFMediaType *pmt);
+    void    OnSetOutputType(IMFMediaType *pmt);
+    HRESULT BeginStreaming();
+    HRESULT EndStreaming();
+    HRESULT OnProcessOutput(IMFMediaBuffer *pIn, IMFMediaBuffer *pOut);
+    HRESULT OnFlush();
+    HRESULT UpdateFormatInfo();
+
+    CRITICAL_SECTION            m_critSec;
+
+    // Transformation parameters
+    ProcessingType              m_TransformType;
+
+    // Streaming
+    bool                        m_bStreamingInitialized;
+    IMFSample                   *m_pSample;                 // Input sample.
+    IMFMediaType                *m_pInputType;              // Input media type.
+    IMFMediaType                *m_pOutputType;             // Output media type.
+
+    // Fomat information
+    UINT32                      m_imageWidthInPixels;
+    UINT32                      m_imageHeightInPixels;
+    DWORD                       m_cbImageSize;              // Image size, in bytes.
+
+    IMFAttributes               *m_pAttributes;
+};
+#endif
\ No newline at end of file
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.vcxproj b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.vcxproj
new file mode 100644
index 0000000000..fbe768cc93
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/OcvTransform.vcxproj
@@ -0,0 +1,319 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|ARM">
+      <Configuration>Debug</Configuration>
+      <Platform>ARM</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|ARM">
+      <Configuration>Release</Configuration>
+      <Platform>ARM</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCTargetsPath Condition="'$(VCTargetsPath11)' != '' and '$(VSVersion)' == '' and '$(VisualStudioVersion)' == ''">$(VCTargetsPath11)</VCTargetsPath>
+    <ProjectGuid>{BA69218F-DA5C-4D14-A78D-21A9E4DEC669}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>OcvTransform</RootNamespace>
+    <ProjectName>OcvTransform</ProjectName>
+    <MinimumVisualStudioVersion>11.0</MinimumVisualStudioVersion>
+    <AppContainerApplication>true</AppContainerApplication>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <PropertyGroup>
+    <DefaultLanguage>en-US</DefaultLanguage>
+  </PropertyGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <GenerateManifest>false</GenerateManifest>
+    <OutDir>$(Configuration)\$(MSBuildProjectName)\</OutDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">
+    <GenerateManifest>false</GenerateManifest>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <GenerateManifest>false</GenerateManifest>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <GenerateManifest>false</GenerateManifest>
+    <OutDir>$(Configuration)\$(MSBuildProjectName)\</OutDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">
+    <GenerateManifest>false</GenerateManifest>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <GenerateManifest>false</GenerateManifest>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <PreprocessorDefinitions>_WINRT_DLL;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <PrecompiledHeaderOutputFile>
+      </PrecompiledHeaderOutputFile>
+      <AdditionalUsingDirectories>$(WindowsSDK_WindowsMetadata);$(AdditionalUsingDirectories)</AdditionalUsingDirectories>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <AdditionalIncludeDirectories>$(OPENCV_DIR)\include;$(ProjectDir);$(IntermediateOutputPath);%(AdditionalIncludeDirectories);$(ProjectDir)\..\Common</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>runtimeobject.lib;%(AdditionalDependencies);mf.lib;mfuuid.lib;mfplat.lib;opencv_core245.lib;opencv_imgproc245.lib</AdditionalDependencies>
+      <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+      <ModuleDefinitionFile>OcvTransform.def</ModuleDefinitionFile>
+      <AdditionalLibraryDirectories>$(OPENCV_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+    <CustomBuildStep>
+      <Command>mdmerge -metadata_dir "$(WindowsSDK_MetadataPath)" -o "$(ProjectDir)$(Configuration)\$(MSBuildProjectName)" -i "$(MSBuildProjectDirectory)" -v -partial</Command>
+      <Outputs>$(ProjectDir)$(Configuration)\$(MSBuildProjectName)\$(ProjectName).winmd</Outputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <PreprocessorDefinitions>_WINRT_DLL;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <PrecompiledHeaderOutputFile>
+      </PrecompiledHeaderOutputFile>
+      <AdditionalUsingDirectories>$(WindowsSDK_WindowsMetadata);$(AdditionalUsingDirectories)</AdditionalUsingDirectories>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <AdditionalIncludeDirectories>$(OPENCV_DIR)\include;$(ProjectDir);$(IntermediateOutputPath);%(AdditionalIncludeDirectories);$(ProjectDir)\..\Common</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>runtimeobject.lib;%(AdditionalDependencies);mf.lib;mfuuid.lib;mfplat.lib</AdditionalDependencies>
+      <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+      <ModuleDefinitionFile>OcvTransform.def</ModuleDefinitionFile>
+      <AdditionalLibraryDirectories>$(OPENCV_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+    <CustomBuildStep>
+      <Command>mdmerge -metadata_dir "$(WindowsSDK_MetadataPath)" -o "$(SolutionDir)$(Platform)\$(Configuration)\$(MSBuildProjectName)" -i "$(MSBuildProjectDirectory)" -v -partial</Command>
+      <Outputs>$(SolutionDir)$(Platform)\$(Configuration)\$(MSBuildProjectName)\$(ProjectName).winmd</Outputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <PreprocessorDefinitions>_WINRT_DLL;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <PrecompiledHeaderOutputFile>
+      </PrecompiledHeaderOutputFile>
+      <AdditionalUsingDirectories>$(WindowsSDK_WindowsMetadata);$(AdditionalUsingDirectories)</AdditionalUsingDirectories>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <AdditionalIncludeDirectories>$(OPENCV_DIR)\include;$(ProjectDir);$(IntermediateOutputPath);%(AdditionalIncludeDirectories);$(ProjectDir)\..\Common</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>runtimeobject.lib;%(AdditionalDependencies);mf.lib;mfuuid.lib;mfplat.lib</AdditionalDependencies>
+      <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+      <ModuleDefinitionFile>OcvTransform.def</ModuleDefinitionFile>
+      <AdditionalLibraryDirectories>$(OPENCV_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+    <CustomBuildStep>
+      <Command>mdmerge -metadata_dir "$(WindowsSDK_MetadataPath)" -o "$(SolutionDir)$(Platform)\$(Configuration)\$(MSBuildProjectName)" -i "$(MSBuildProjectDirectory)" -v -partial</Command>
+      <Outputs>$(SolutionDir)$(Platform)\$(Configuration)\$(MSBuildProjectName)\$(ProjectName).winmd</Outputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <PreprocessorDefinitions>_WINRT_DLL;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <PrecompiledHeaderOutputFile>
+      </PrecompiledHeaderOutputFile>
+      <AdditionalUsingDirectories>$(WindowsSDK_WindowsMetadata);$(AdditionalUsingDirectories)</AdditionalUsingDirectories>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <AdditionalIncludeDirectories>$(OPENCV_DIR)\include;$(ProjectDir);$(IntermediateOutputPath);%(AdditionalIncludeDirectories);$(ProjectDir)\..\Common</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>runtimeobject.lib;%(AdditionalDependencies);mf.lib;mfuuid.lib;mfplat.lib</AdditionalDependencies>
+      <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+      <ModuleDefinitionFile>OcvTransform.def</ModuleDefinitionFile>
+      <AdditionalLibraryDirectories>$(OPENCV_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+    <CustomBuildStep>
+      <Command>mdmerge -metadata_dir "$(WindowsSDK_MetadataPath)" -o "$(ProjectDir)$(Configuration)\$(MSBuildProjectName)" -i "$(MSBuildProjectDirectory)" -v -partial</Command>
+      <Outputs>$(ProjectDir)$(Configuration)\$(MSBuildProjectName)\$(ProjectName).winmd</Outputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <PreprocessorDefinitions>_WINRT_DLL;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <PrecompiledHeaderOutputFile>
+      </PrecompiledHeaderOutputFile>
+      <AdditionalUsingDirectories>$(WindowsSDK_WindowsMetadata);$(AdditionalUsingDirectories)</AdditionalUsingDirectories>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <AdditionalIncludeDirectories>$(OPENCV_DIR)\include;$(ProjectDir);$(IntermediateOutputPath);%(AdditionalIncludeDirectories);$(ProjectDir)\..\Common</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>runtimeobject.lib;%(AdditionalDependencies);mf.lib;mfuuid.lib;mfplat.lib</AdditionalDependencies>
+      <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+      <ModuleDefinitionFile>OcvTransform.def</ModuleDefinitionFile>
+      <AdditionalLibraryDirectories>$(OPENCV_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+    <CustomBuildStep>
+      <Command>mdmerge -metadata_dir "$(WindowsSDK_MetadataPath)" -o "$(SolutionDir)$(Platform)\$(Configuration)\$(MSBuildProjectName)" -i "$(MSBuildProjectDirectory)" -v -partial</Command>
+      <Outputs>$(SolutionDir)$(Platform)\$(Configuration)\$(MSBuildProjectName)\$(ProjectName).winmd</Outputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <PreprocessorDefinitions>_WINRT_DLL;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PrecompiledHeaderFile>
+      </PrecompiledHeaderFile>
+      <PrecompiledHeaderOutputFile>
+      </PrecompiledHeaderOutputFile>
+      <AdditionalUsingDirectories>$(WindowsSDK_WindowsMetadata);$(AdditionalUsingDirectories)</AdditionalUsingDirectories>
+      <CompileAsWinRT>false</CompileAsWinRT>
+      <AdditionalIncludeDirectories>$(OPENCV_DIR)\include;$(ProjectDir);$(IntermediateOutputPath);%(AdditionalIncludeDirectories);$(ProjectDir)\..\Common</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>runtimeobject.lib;%(AdditionalDependencies);mf.lib;mfuuid.lib;mfplat.lib</AdditionalDependencies>
+      <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+      <ModuleDefinitionFile>OcvTransform.def</ModuleDefinitionFile>
+      <AdditionalLibraryDirectories>$(OPENCV_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+    <CustomBuildStep>
+      <Command>mdmerge -metadata_dir "$(WindowsSDK_MetadataPath)" -o "$(SolutionDir)$(Platform)\$(Configuration)\$(MSBuildProjectName)" -i "$(MSBuildProjectDirectory)" -v -partial</Command>
+      <Outputs>$(SolutionDir)$(Platform)\$(Configuration)\$(MSBuildProjectName)\$(ProjectName).winmd</Outputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClInclude Include="OcvTransform.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="dllmain.cpp" />
+    <ClCompile Include="OcvTransform.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="OcvTransform.def" />
+  </ItemGroup>
+  <ItemGroup>
+    <Midl Include="OcvImageManipulations.idl">
+      <MetadataFileName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </MetadataFileName>
+      <MetadataFileName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </MetadataFileName>
+      <MetadataFileName Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">
+      </MetadataFileName>
+      <MetadataFileName Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">
+      </MetadataFileName>
+      <MetadataFileName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </MetadataFileName>
+      <MetadataFileName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </MetadataFileName>
+      <AdditionalMetadataDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(WindowsSDK_MetadataPath)</AdditionalMetadataDirectories>
+      <AdditionalMetadataDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(WindowsSDK_MetadataPath)</AdditionalMetadataDirectories>
+      <AdditionalMetadataDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">$(WindowsSDK_MetadataPath)</AdditionalMetadataDirectories>
+      <AdditionalMetadataDirectories Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">$(WindowsSDK_MetadataPath)</AdditionalMetadataDirectories>
+      <AdditionalMetadataDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(WindowsSDK_MetadataPath)</AdditionalMetadataDirectories>
+      <AdditionalMetadataDirectories Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(WindowsSDK_MetadataPath)</AdditionalMetadataDirectories>
+      <EnableWindowsRuntime Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</EnableWindowsRuntime>
+      <EnableWindowsRuntime Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</EnableWindowsRuntime>
+      <EnableWindowsRuntime Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</EnableWindowsRuntime>
+      <EnableWindowsRuntime Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</EnableWindowsRuntime>
+      <EnableWindowsRuntime Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</EnableWindowsRuntime>
+      <EnableWindowsRuntime Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</EnableWindowsRuntime>
+      <HeaderFileName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).h</HeaderFileName>
+      <HeaderFileName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).h</HeaderFileName>
+      <HeaderFileName Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">%(Filename).h</HeaderFileName>
+      <HeaderFileName Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">%(Filename).h</HeaderFileName>
+      <HeaderFileName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).h</HeaderFileName>
+      <HeaderFileName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).h</HeaderFileName>
+    </Midl>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ItemGroup>
+    <_MdMergeOutput Condition="'$(Platform)' == 'Win32'" Include="$(ProjectDir)$(Configuration)\$(MSBuildProjectName)\$(ProjectName).winmd" />
+    <_MdMergeOutput Condition="'$(Platform)' != 'Win32'" Include="$(SolutionDir)$(Platform)\$(Configuration)\$(MSBuildProjectName)\$(ProjectName).winmd" />
+  </ItemGroup>
+  <Target Name="CopyWinmdArtifactsOutputGroup" Returns="@(CopyWinmdArtifactsOutputGroupOutputs)">
+    <ItemGroup>
+      <CopyWinmdArtifactsOutputGroupOutputs Include="@(_MdMergeOutput)">
+        <TargetPath>$(ProjectName).winmd</TargetPath>
+        <Implementation>$(TargetName)$(TargetExt)</Implementation>
+      </CopyWinmdArtifactsOutputGroupOutputs>
+    </ItemGroup>
+  </Target>
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/dllmain.cpp b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/dllmain.cpp
new file mode 100644
index 0000000000..d11bceaf07
--- /dev/null
+++ b/samples/winrt/ImageManipulations/MediaExtensions/OcvTransform/dllmain.cpp
@@ -0,0 +1,58 @@
+//////////////////////////////////////////////////////////////////////////
+//
+// dllmain.cpp
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//////////////////////////////////////////////////////////////////////////
+
+#include <initguid.h>
+#include "OcvTransform.h"
+
+using namespace Microsoft::WRL;
+
+namespace Microsoft { namespace Samples {
+	ActivatableClass(OcvImageManipulations);
+}}
+
+BOOL WINAPI DllMain( _In_ HINSTANCE hInstance, _In_ DWORD dwReason, _In_opt_ LPVOID lpReserved )
+{
+    if( DLL_PROCESS_ATTACH == dwReason )
+    {
+        //
+        //  Don't need per-thread callbacks
+        //
+        DisableThreadLibraryCalls( hInstance );
+
+        Module<InProc>::GetModule().Create();
+    }
+    else if( DLL_PROCESS_DETACH == dwReason )
+    {
+        Module<InProc>::GetModule().Terminate();
+    }
+
+    return TRUE;
+}
+
+HRESULT WINAPI DllGetActivationFactory( _In_ HSTRING activatibleClassId, _Outptr_ IActivationFactory** factory )
+{
+    auto &module = Microsoft::WRL::Module< Microsoft::WRL::InProc >::GetModule();
+    return module.GetActivationFactory( activatibleClassId, factory );
+}
+
+HRESULT WINAPI DllCanUnloadNow()
+{
+    auto &module = Microsoft::WRL::Module<Microsoft::WRL::InProc>::GetModule();    
+    return (module.Terminate()) ? S_OK : S_FALSE;
+}
+
+STDAPI DllGetClassObject( _In_ REFCLSID rclsid, _In_ REFIID riid, _Outptr_ LPVOID FAR* ppv )
+{
+    auto &module = Microsoft::WRL::Module<Microsoft::WRL::InProc>::GetModule();
+    return module.GetClassObject( rclsid, riid, ppv );
+}
diff --git a/samples/winrt/ImageManipulations/Package.appxmanifest b/samples/winrt/ImageManipulations/Package.appxmanifest
new file mode 100644
index 0000000000..9d63dea7cc
--- /dev/null
+++ b/samples/winrt/ImageManipulations/Package.appxmanifest
@@ -0,0 +1,36 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Package xmlns="http://schemas.microsoft.com/appx/2010/manifest">
+  <Identity Name="Microsoft.SDKSamples.MediaCapture.CPP" Publisher="CN=Microsoft Corporation, O=Microsoft Corporation, L=Redmond, S=Washington, C=US" Version="1.0.0.0" />
+  <Properties>
+    <DisplayName>MediaCapture CPP sample</DisplayName>
+    <PublisherDisplayName>Microsoft Corporation</PublisherDisplayName>
+    <Logo>Assets\storeLogo-sdk.png</Logo>
+  </Properties>
+  <Prerequisites>
+    <OSMinVersion>6.2.1</OSMinVersion>
+    <OSMaxVersionTested>6.2.1</OSMaxVersionTested>
+  </Prerequisites>
+  <Resources>
+    <Resource Language="x-generate" />
+  </Resources>
+  <Applications>
+    <Application Id="MediaCapture.App" Executable="$targetnametoken$.exe" EntryPoint="MediaCapture.App">
+      <VisualElements DisplayName="OCV Image Manipulations" Logo="assets\opencv-logo-150.png" SmallLogo="assets\opencv-logo-30.png" Description="OpenCV Image Manipulations sample" ForegroundText="light" BackgroundColor="#00b2f0">
+        <DefaultTile ShortName="Ocv ImageManipulations" ShowName="allLogos" />
+        <SplashScreen Image="Assets\splash-sdk.png" BackgroundColor="#00b2f0" />
+      </VisualElements>
+    </Application>
+  </Applications>
+  <Capabilities>
+    <DeviceCapability Name="webcam" />
+    <DeviceCapability Name="microphone" />
+  </Capabilities>
+  <Extensions>
+    <Extension Category="windows.activatableClass.inProcessServer">
+      <InProcessServer>
+        <Path>OcvTransform.dll</Path>
+        <ActivatableClass ActivatableClassId="OcvTransform.OcvImageManipulations" ThreadingModel="both" />
+      </InProcessServer>
+    </Extension>
+  </Extensions>
+</Package>
\ No newline at end of file
diff --git a/samples/winrt/ImageManipulations/assets/opencv-logo-150.png b/samples/winrt/ImageManipulations/assets/opencv-logo-150.png
new file mode 100644
index 0000000000..ea685d651a
Binary files /dev/null and b/samples/winrt/ImageManipulations/assets/opencv-logo-150.png differ
diff --git a/samples/winrt/ImageManipulations/assets/opencv-logo-30.png b/samples/winrt/ImageManipulations/assets/opencv-logo-30.png
new file mode 100644
index 0000000000..efaf5468a1
Binary files /dev/null and b/samples/winrt/ImageManipulations/assets/opencv-logo-30.png differ
diff --git a/samples/winrt/ImageManipulations/assets/splash-sdk.png b/samples/winrt/ImageManipulations/assets/splash-sdk.png
new file mode 100644
index 0000000000..901c3b085a
Binary files /dev/null and b/samples/winrt/ImageManipulations/assets/splash-sdk.png differ
diff --git a/samples/winrt/ImageManipulations/assets/windows-sdk.png b/samples/winrt/ImageManipulations/assets/windows-sdk.png
new file mode 100644
index 0000000000..67268021df
Binary files /dev/null and b/samples/winrt/ImageManipulations/assets/windows-sdk.png differ
diff --git a/samples/winrt/ImageManipulations/common/LayoutAwarePage.cpp b/samples/winrt/ImageManipulations/common/LayoutAwarePage.cpp
new file mode 100644
index 0000000000..9449fbead3
--- /dev/null
+++ b/samples/winrt/ImageManipulations/common/LayoutAwarePage.cpp
@@ -0,0 +1,452 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+#include "pch.h"
+#include "LayoutAwarePage.h"
+#include "SuspensionManager.h"
+
+using namespace SDKSample::Common;
+
+using namespace Platform;
+using namespace Platform::Collections;
+using namespace Windows::Foundation;
+using namespace Windows::Foundation::Collections;
+using namespace Windows::System;
+using namespace Windows::UI::Core;
+using namespace Windows::UI::ViewManagement;
+using namespace Windows::UI::Xaml;
+using namespace Windows::UI::Xaml::Controls;
+using namespace Windows::UI::Xaml::Interop;
+using namespace Windows::UI::Xaml::Navigation;
+
+/// <summary>
+/// Initializes a new instance of the <see cref="LayoutAwarePage"/> class.
+/// </summary>
+LayoutAwarePage::LayoutAwarePage()
+{
+    if (Windows::ApplicationModel::DesignMode::DesignModeEnabled)
+    {
+        return;
+    }
+
+    // Create an empty default view model
+    DefaultViewModel = ref new Map<String^, Object^>(std::less<String^>());
+
+    // When this page is part of the visual tree make two changes:
+    // 1) Map application view state to visual state for the page
+    // 2) Handle keyboard and mouse navigation requests
+    Loaded += ref new RoutedEventHandler(this, &LayoutAwarePage::OnLoaded);
+
+    // Undo the same changes when the page is no longer visible
+    Unloaded += ref new RoutedEventHandler(this, &LayoutAwarePage::OnUnloaded);
+}
+
+static DependencyProperty^ _defaultViewModelProperty =
+    DependencyProperty::Register("DefaultViewModel",
+    TypeName(IObservableMap<String^, Object^>::typeid), TypeName(LayoutAwarePage::typeid), nullptr);
+
+/// <summary>
+/// Identifies the <see cref="DefaultViewModel"/> dependency property.
+/// </summary>
+DependencyProperty^ LayoutAwarePage::DefaultViewModelProperty::get()
+{
+    return _defaultViewModelProperty;
+}
+
+/// <summary>
+/// Gets an implementation of <see cref="IObservableMap&lt;String, Object&gt;"/> designed to be
+/// used as a trivial view model.
+/// </summary>
+IObservableMap<String^, Object^>^ LayoutAwarePage::DefaultViewModel::get()
+{
+    return safe_cast<IObservableMap<String^, Object^>^>(GetValue(DefaultViewModelProperty));
+}
+
+/// <summary>
+/// Sets an implementation of <see cref="IObservableMap&lt;String, Object&gt;"/> designed to be
+/// used as a trivial view model.
+/// </summary>
+void LayoutAwarePage::DefaultViewModel::set(IObservableMap<String^, Object^>^ value)
+{
+    SetValue(DefaultViewModelProperty, value);
+}
+
+/// <summary>
+/// Invoked when the page is part of the visual tree
+/// </summary>
+/// <param name="sender">Instance that triggered the event.</param>
+/// <param name="e">Event data describing the conditions that led to the event.</param>
+void LayoutAwarePage::OnLoaded(Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e)
+{
+	this->StartLayoutUpdates(sender, e);
+
+	// Keyboard and mouse navigation only apply when occupying the entire window
+	if (this->ActualHeight == Window::Current->Bounds.Height &&
+		this->ActualWidth == Window::Current->Bounds.Width)
+	{
+		// Listen to the window directly so focus isn't required
+		_acceleratorKeyEventToken = Window::Current->CoreWindow->Dispatcher->AcceleratorKeyActivated +=
+			ref new TypedEventHandler<CoreDispatcher^, AcceleratorKeyEventArgs^>(this,
+			&LayoutAwarePage::CoreDispatcher_AcceleratorKeyActivated);
+		_pointerPressedEventToken = Window::Current->CoreWindow->PointerPressed +=
+			ref new TypedEventHandler<CoreWindow^, PointerEventArgs^>(this,
+			&LayoutAwarePage::CoreWindow_PointerPressed);
+		_navigationShortcutsRegistered = true;
+	}
+}
+
+/// <summary>
+/// Invoked when the page is removed from visual tree
+/// </summary>
+/// <param name="sender">Instance that triggered the event.</param>
+/// <param name="e">Event data describing the conditions that led to the event.</param>
+void LayoutAwarePage::OnUnloaded(Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e)
+{
+	if (_navigationShortcutsRegistered)
+	{
+		Window::Current->CoreWindow->Dispatcher->AcceleratorKeyActivated -= _acceleratorKeyEventToken;
+		Window::Current->CoreWindow->PointerPressed -= _pointerPressedEventToken;
+		_navigationShortcutsRegistered = false;
+	}
+	StopLayoutUpdates(sender, e);
+}
+
+#pragma region Navigation support
+
+/// <summary>
+/// Invoked as an event handler to navigate backward in the page's associated <see cref="Frame"/>
+/// until it reaches the top of the navigation stack.
+/// </summary>
+/// <param name="sender">Instance that triggered the event.</param>
+/// <param name="e">Event data describing the conditions that led to the event.</param>
+void LayoutAwarePage::GoHome(Object^ sender, RoutedEventArgs^ e)
+{
+    (void) sender;	// Unused parameter
+    (void) e;	// Unused parameter
+
+    // Use the navigation frame to return to the topmost page
+    if (Frame != nullptr)
+    {
+        while (Frame->CanGoBack)
+        {
+            Frame->GoBack();
+        }
+    }
+}
+
+/// <summary>
+/// Invoked as an event handler to navigate backward in the navigation stack
+/// associated with this page's <see cref="Frame"/>.
+/// </summary>
+/// <param name="sender">Instance that triggered the event.</param>
+/// <param name="e">Event data describing the conditions that led to the event.</param>
+void LayoutAwarePage::GoBack(Object^ sender, RoutedEventArgs^ e)
+{
+    (void) sender;	// Unused parameter
+    (void) e;	// Unused parameter
+
+    // Use the navigation frame to return to the previous page
+    if (Frame != nullptr && Frame->CanGoBack)
+    {
+        Frame->GoBack();
+    }
+}
+
+/// <summary>
+/// Invoked as an event handler to navigate forward in the navigation stack
+/// associated with this page's <see cref="Frame"/>.
+/// </summary>
+/// <param name="sender">Instance that triggered the event.</param>
+/// <param name="e">Event data describing the conditions that led to the event.</param>
+void LayoutAwarePage::GoForward(Object^ sender, RoutedEventArgs^ e)
+{
+    (void) sender;	// Unused parameter
+    (void) e;	// Unused parameter
+
+    // Use the navigation frame to advance to the next page
+    if (Frame != nullptr && Frame->CanGoForward)
+    {
+        Frame->GoForward();
+    }
+}
+
+/// <summary>
+/// Invoked on every keystroke, including system keys such as Alt key combinations, when
+/// this page is active and occupies the entire window.  Used to detect keyboard navigation
+/// between pages even when the page itself doesn't have focus.
+/// </summary>
+/// <param name="sender">Instance that triggered the event.</param>
+/// <param name="args">Event data describing the conditions that led to the event.</param>
+void LayoutAwarePage::CoreDispatcher_AcceleratorKeyActivated(CoreDispatcher^ sender, AcceleratorKeyEventArgs^ args)
+{
+    auto virtualKey = args->VirtualKey;
+
+    // Only investigate further when Left, Right, or the dedicated Previous or Next keys
+    // are pressed
+    if ((args->EventType == CoreAcceleratorKeyEventType::SystemKeyDown ||
+        args->EventType == CoreAcceleratorKeyEventType::KeyDown) &&
+        (virtualKey == VirtualKey::Left || virtualKey == VirtualKey::Right ||
+        (int)virtualKey == 166 || (int)virtualKey == 167))
+    {
+        auto coreWindow = Window::Current->CoreWindow;
+        auto downState = Windows::UI::Core::CoreVirtualKeyStates::Down;
+        bool menuKey = (coreWindow->GetKeyState(VirtualKey::Menu) & downState) == downState;
+        bool controlKey = (coreWindow->GetKeyState(VirtualKey::Control) & downState) == downState;
+        bool shiftKey = (coreWindow->GetKeyState(VirtualKey::Shift) & downState) == downState;
+        bool noModifiers = !menuKey && !controlKey && !shiftKey;
+        bool onlyAlt = menuKey && !controlKey && !shiftKey;
+
+        if (((int)virtualKey == 166 && noModifiers) ||
+            (virtualKey == VirtualKey::Left && onlyAlt))
+        {
+            // When the previous key or Alt+Left are pressed navigate back
+            args->Handled = true;
+            GoBack(this, ref new RoutedEventArgs());
+        }
+        else if (((int)virtualKey == 167 && noModifiers) ||
+            (virtualKey == VirtualKey::Right && onlyAlt))
+        {
+            // When the next key or Alt+Right are pressed navigate forward
+            args->Handled = true;
+            GoForward(this, ref new RoutedEventArgs());
+        }
+    }
+}
+
+/// <summary>
+/// Invoked on every mouse click, touch screen tap, or equivalent interaction when this
+/// page is active and occupies the entire window.  Used to detect browser-style next and
+/// previous mouse button clicks to navigate between pages.
+/// </summary>
+/// <param name="sender">Instance that triggered the event.</param>
+/// <param name="args">Event data describing the conditions that led to the event.</param>
+void LayoutAwarePage::CoreWindow_PointerPressed(CoreWindow^ sender, PointerEventArgs^ args)
+{
+    auto properties = args->CurrentPoint->Properties;
+
+    // Ignore button chords with the left, right, and middle buttons
+    if (properties->IsLeftButtonPressed || properties->IsRightButtonPressed ||
+        properties->IsMiddleButtonPressed) return;
+
+    // If back or foward are pressed (but not both) navigate appropriately
+    bool backPressed = properties->IsXButton1Pressed;
+    bool forwardPressed = properties->IsXButton2Pressed;
+    if (backPressed ^ forwardPressed)
+    {
+        args->Handled = true;
+        if (backPressed) GoBack(this, ref new RoutedEventArgs());
+        if (forwardPressed) GoForward(this, ref new RoutedEventArgs());
+    }
+}
+
+#pragma endregion
+
+#pragma region Visual state switching
+
+/// <summary>
+/// Invoked as an event handler, typically on the <see cref="Loaded"/> event of a
+/// <see cref="Control"/> within the page, to indicate that the sender should start receiving
+/// visual state management changes that correspond to application view state changes.
+/// </summary>
+/// <param name="sender">Instance of <see cref="Control"/> that supports visual state management
+/// corresponding to view states.</param>
+/// <param name="e">Event data that describes how the request was made.</param>
+/// <remarks>The current view state will immediately be used to set the corresponding visual state
+/// when layout updates are requested.  A corresponding <see cref="Unloaded"/> event handler
+/// connected to <see cref="StopLayoutUpdates"/> is strongly encouraged.  Instances of
+/// <see cref="LayoutAwarePage"/> automatically invoke these handlers in their Loaded and Unloaded
+/// events.</remarks>
+/// <seealso cref="DetermineVisualState"/>
+/// <seealso cref="InvalidateVisualState"/>
+void LayoutAwarePage::StartLayoutUpdates(Object^ sender, RoutedEventArgs^ e)
+{
+    (void) e;	// Unused parameter
+
+    auto control = safe_cast<Control^>(sender);
+    if (_layoutAwareControls == nullptr)
+    {
+        // Start listening to view state changes when there are controls interested in updates
+        _layoutAwareControls = ref new Vector<Control^>();
+        _windowSizeEventToken = Window::Current->SizeChanged += ref new WindowSizeChangedEventHandler(this, &LayoutAwarePage::WindowSizeChanged);
+
+        // Page receives notifications for children. Protect the page until we stopped layout updates for all controls.
+        _this = this;
+    }
+    _layoutAwareControls->Append(control);
+
+    // Set the initial visual state of the control
+    VisualStateManager::GoToState(control, DetermineVisualState(ApplicationView::Value), false);
+}
+
+void LayoutAwarePage::WindowSizeChanged(Object^ sender, Windows::UI::Core::WindowSizeChangedEventArgs^ e)
+{
+    (void) sender;	// Unused parameter
+    (void) e;	// Unused parameter
+
+    InvalidateVisualState();
+}
+
+/// <summary>
+/// Invoked as an event handler, typically on the <see cref="Unloaded"/> event of a
+/// <see cref="Control"/>, to indicate that the sender should start receiving visual state
+/// management changes that correspond to application view state changes.
+/// </summary>
+/// <param name="sender">Instance of <see cref="Control"/> that supports visual state management
+/// corresponding to view states.</param>
+/// <param name="e">Event data that describes how the request was made.</param>
+/// <remarks>The current view state will immediately be used to set the corresponding visual state
+/// when layout updates are requested.</remarks>
+/// <seealso cref="StartLayoutUpdates"/>
+void LayoutAwarePage::StopLayoutUpdates(Object^ sender, RoutedEventArgs^ e)
+{
+    (void) e;	// Unused parameter
+
+    auto control = safe_cast<Control^>(sender);
+    unsigned int index;
+    if (_layoutAwareControls != nullptr && _layoutAwareControls->IndexOf(control, &index))
+    {
+        _layoutAwareControls->RemoveAt(index);
+        if (_layoutAwareControls->Size == 0)
+        {
+            // Stop listening to view state changes when no controls are interested in updates
+            Window::Current->SizeChanged -= _windowSizeEventToken;
+            _layoutAwareControls = nullptr;
+            // Last control has received the Unload notification.
+            _this = nullptr;
+        }
+    }
+}
+
+/// <summary>
+/// Translates <see cref="ApplicationViewState"/> values into strings for visual state management
+/// within the page.  The default implementation uses the names of enum values.  Subclasses may
+/// override this method to control the mapping scheme used.
+/// </summary>
+/// <param name="viewState">View state for which a visual state is desired.</param>
+/// <returns>Visual state name used to drive the <see cref="VisualStateManager"/></returns>
+/// <seealso cref="InvalidateVisualState"/>
+String^ LayoutAwarePage::DetermineVisualState(ApplicationViewState viewState)
+{
+    switch (viewState)
+    {
+    case ApplicationViewState::Filled:
+        return "Filled";
+    case ApplicationViewState::Snapped:
+        return "Snapped";
+    case ApplicationViewState::FullScreenPortrait:
+        return "FullScreenPortrait";
+    case ApplicationViewState::FullScreenLandscape:
+    default:
+        return "FullScreenLandscape";
+    }
+}
+
+/// <summary>
+/// Updates all controls that are listening for visual state changes with the correct visual
+/// state.
+/// </summary>
+/// <remarks>
+/// Typically used in conjunction with overriding <see cref="DetermineVisualState"/> to
+/// signal that a different value may be returned even though the view state has not changed.
+/// </remarks>
+void LayoutAwarePage::InvalidateVisualState()
+{
+    if (_layoutAwareControls != nullptr)
+    {
+        String^ visualState = DetermineVisualState(ApplicationView::Value);
+        auto controlIterator = _layoutAwareControls->First();
+        while (controlIterator->HasCurrent)
+        {
+            auto control = controlIterator->Current;
+            VisualStateManager::GoToState(control, visualState, false);
+            controlIterator->MoveNext();
+        }
+    }
+}
+
+#pragma endregion
+
+#pragma region Process lifetime management
+
+/// <summary>
+/// Invoked when this page is about to be displayed in a Frame.
+/// </summary>
+/// <param name="e">Event data that describes how this page was reached.  The Parameter
+/// property provides the group to be displayed.</param>
+void LayoutAwarePage::OnNavigatedTo(NavigationEventArgs^ e)
+{
+    // Returning to a cached page through navigation shouldn't trigger state loading
+    if (_pageKey != nullptr) return;
+
+    auto frameState = SuspensionManager::SessionStateForFrame(Frame);
+    _pageKey = "Page-" + Frame->BackStackDepth;
+
+    if (e->NavigationMode == NavigationMode::New)
+    {
+        // Clear existing state for forward navigation when adding a new page to the
+        // navigation stack
+        auto nextPageKey = _pageKey;
+        int nextPageIndex = Frame->BackStackDepth;
+        while (frameState->HasKey(nextPageKey))
+        {
+            frameState->Remove(nextPageKey);
+            nextPageIndex++;
+            nextPageKey = "Page-" + nextPageIndex;
+        }
+
+        // Pass the navigation parameter to the new page
+        LoadState(e->Parameter, nullptr);
+    }
+    else
+    {
+        // Pass the navigation parameter and preserved page state to the page, using
+        // the same strategy for loading suspended state and recreating pages discarded
+        // from cache
+        LoadState(e->Parameter, safe_cast<IMap<String^, Object^>^>(frameState->Lookup(_pageKey)));
+    }
+}
+
+/// <summary>
+/// Invoked when this page will no longer be displayed in a Frame.
+/// </summary>
+/// <param name="e">Event data that describes how this page was reached.  The Parameter
+/// property provides the group to be displayed.</param>
+void LayoutAwarePage::OnNavigatedFrom(NavigationEventArgs^ e)
+{
+    auto frameState = SuspensionManager::SessionStateForFrame(Frame);
+    auto pageState = ref new Map<String^, Object^>();
+    SaveState(pageState);
+    frameState->Insert(_pageKey, pageState);
+}
+
+/// <summary>
+/// Populates the page with content passed during navigation.  Any saved state is also
+/// provided when recreating a page from a prior session.
+/// </summary>
+/// <param name="navigationParameter">The parameter value passed to
+/// <see cref="Frame.Navigate(Type, Object)"/> when this page was initially requested.
+/// </param>
+/// <param name="pageState">A map of state preserved by this page during an earlier
+/// session.  This will be null the first time a page is visited.</param>
+void LayoutAwarePage::LoadState(Object^ navigationParameter, IMap<String^, Object^>^ pageState)
+{
+}
+
+/// <summary>
+/// Preserves state associated with this page in case the application is suspended or the
+/// page is discarded from the navigation cache.  Values must conform to the serialization
+/// requirements of <see cref="SuspensionManager.SessionState"/>.
+/// </summary>
+/// <param name="pageState">An empty map to be populated with serializable state.</param>
+void LayoutAwarePage::SaveState(IMap<String^, Object^>^ pageState)
+{
+}
+
+#pragma endregion
diff --git a/samples/winrt/ImageManipulations/common/LayoutAwarePage.h b/samples/winrt/ImageManipulations/common/LayoutAwarePage.h
new file mode 100644
index 0000000000..bd71062fe9
--- /dev/null
+++ b/samples/winrt/ImageManipulations/common/LayoutAwarePage.h
@@ -0,0 +1,88 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+#pragma once
+
+#include <collection.h>
+
+namespace SDKSample
+{
+    namespace Common
+    {
+        /// <summary>
+        /// Typical implementation of Page that provides several important conveniences:
+        /// <list type="bullet">
+        /// <item>
+        /// <description>Application view state to visual state mapping</description>
+        /// </item>
+        /// <item>
+        /// <description>GoBack, GoForward, and GoHome event handlers</description>
+        /// </item>
+        /// <item>
+        /// <description>Mouse and keyboard shortcuts for navigation</description>
+        /// </item>
+        /// <item>
+        /// <description>State management for navigation and process lifetime management</description>
+        /// </item>
+        /// <item>
+        /// <description>A default view model</description>
+        /// </item>
+        /// </list>
+        /// </summary>
+        [Windows::Foundation::Metadata::WebHostHidden]
+        public ref class LayoutAwarePage : Windows::UI::Xaml::Controls::Page
+        {
+        internal:
+            LayoutAwarePage();
+
+        public:
+            void StartLayoutUpdates(Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+            void StopLayoutUpdates(Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+            void InvalidateVisualState();
+            static property Windows::UI::Xaml::DependencyProperty^ DefaultViewModelProperty
+            {
+                Windows::UI::Xaml::DependencyProperty^ get();
+            };
+            property Windows::Foundation::Collections::IObservableMap<Platform::String^, Platform::Object^>^ DefaultViewModel
+            {
+                Windows::Foundation::Collections::IObservableMap<Platform::String^, Platform::Object^>^ get();
+                void set(Windows::Foundation::Collections::IObservableMap<Platform::String^, Platform::Object^>^ value);
+            }
+
+        protected:
+            virtual void GoHome(Platform::Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+            virtual void GoBack(Platform::Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+            virtual void GoForward(Platform::Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+            virtual Platform::String^ DetermineVisualState(Windows::UI::ViewManagement::ApplicationViewState viewState);
+            virtual void OnNavigatedTo(Windows::UI::Xaml::Navigation::NavigationEventArgs^ e) override;
+            virtual void OnNavigatedFrom(Windows::UI::Xaml::Navigation::NavigationEventArgs^ e) override;
+            virtual void LoadState(Platform::Object^ navigationParameter,
+                Windows::Foundation::Collections::IMap<Platform::String^, Platform::Object^>^ pageState);
+            virtual void SaveState(Windows::Foundation::Collections::IMap<Platform::String^, Platform::Object^>^ pageState);
+
+        private:
+            Platform::String^ _pageKey;
+            bool _navigationShortcutsRegistered;
+            Platform::Collections::Map<Platform::String^, Platform::Object^>^ _defaultViewModel;
+            Windows::Foundation::EventRegistrationToken _windowSizeEventToken,
+                _acceleratorKeyEventToken, _pointerPressedEventToken;
+            Platform::Collections::Vector<Windows::UI::Xaml::Controls::Control^>^ _layoutAwareControls;
+            void WindowSizeChanged(Platform::Object^ sender, Windows::UI::Core::WindowSizeChangedEventArgs^ e);
+            void OnLoaded(Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+            void OnUnloaded(Object^ sender, Windows::UI::Xaml::RoutedEventArgs^ e);
+
+            void CoreDispatcher_AcceleratorKeyActivated(Windows::UI::Core::CoreDispatcher^ sender,
+                Windows::UI::Core::AcceleratorKeyEventArgs^ args);
+            void CoreWindow_PointerPressed(Windows::UI::Core::CoreWindow^ sender,
+                Windows::UI::Core::PointerEventArgs^ args);
+            LayoutAwarePage^ _this; // Strong reference to self, cleaned up in OnUnload
+        };
+    }
+}
diff --git a/samples/winrt/ImageManipulations/common/StandardStyles.xaml b/samples/winrt/ImageManipulations/common/StandardStyles.xaml
new file mode 100644
index 0000000000..7c3d238776
--- /dev/null
+++ b/samples/winrt/ImageManipulations/common/StandardStyles.xaml
@@ -0,0 +1,978 @@
+﻿<!--
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+-->
+
+<ResourceDictionary
+    xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
+    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml">
+
+    <!-- Non-brush values that vary across themes -->
+    
+    <ResourceDictionary.ThemeDictionaries>
+        <ResourceDictionary x:Key="Default">
+            <x:String x:Key="BackButtonGlyph">&#xE071;</x:String>
+            <x:String x:Key="BackButtonSnappedGlyph">&#xE0BA;</x:String>
+        </ResourceDictionary>
+
+        <ResourceDictionary x:Key="HighContrast">
+            <x:String x:Key="BackButtonGlyph">&#xE0A6;</x:String>
+            <x:String x:Key="BackButtonSnappedGlyph">&#xE0C4;</x:String>
+        </ResourceDictionary>
+    </ResourceDictionary.ThemeDictionaries>
+
+    <!-- RichTextBlock styles -->
+
+    <Style x:Key="BasicRichTextStyle" TargetType="RichTextBlock">
+        <Setter Property="Foreground" Value="{StaticResource ApplicationForegroundThemeBrush}"/>
+        <Setter Property="FontSize" Value="{StaticResource ControlContentThemeFontSize}"/>
+        <Setter Property="FontFamily" Value="{StaticResource ContentControlThemeFontFamily}"/>
+        <Setter Property="TextTrimming" Value="WordEllipsis"/>
+        <Setter Property="TextWrapping" Value="Wrap"/>
+        <Setter Property="Typography.StylisticSet20" Value="True"/>
+        <Setter Property="Typography.DiscretionaryLigatures" Value="True"/>
+    </Style>
+
+    <Style x:Key="BaselineRichTextStyle" TargetType="RichTextBlock" BasedOn="{StaticResource BasicRichTextStyle}">
+        <Setter Property="LineHeight" Value="20"/>
+        <Setter Property="LineStackingStrategy" Value="BlockLineHeight"/>
+        <!-- Properly align text along its baseline -->
+        <Setter Property="RenderTransform">
+            <Setter.Value>
+                <TranslateTransform X="-1" Y="4"/>
+            </Setter.Value>
+        </Setter>
+    </Style>
+
+    <Style x:Key="ItemRichTextStyle" TargetType="RichTextBlock" BasedOn="{StaticResource BaselineRichTextStyle}"/>
+
+    <Style x:Key="BodyRichTextStyle" TargetType="RichTextBlock" BasedOn="{StaticResource BaselineRichTextStyle}">
+        <Setter Property="FontWeight" Value="SemiLight"/>
+    </Style>
+
+    <!-- TextBlock styles -->
+
+    <Style x:Key="BasicTextStyle" TargetType="TextBlock">
+        <Setter Property="Foreground" Value="{StaticResource ApplicationForegroundThemeBrush}"/>
+        <Setter Property="FontSize" Value="{StaticResource ControlContentThemeFontSize}"/>
+        <Setter Property="FontFamily" Value="{StaticResource ContentControlThemeFontFamily}"/>
+        <Setter Property="TextTrimming" Value="WordEllipsis"/>
+        <Setter Property="TextWrapping" Value="Wrap"/>
+        <Setter Property="Typography.StylisticSet20" Value="True"/>
+        <Setter Property="Typography.DiscretionaryLigatures" Value="True"/>
+    </Style>
+
+    <Style x:Key="BaselineTextStyle" TargetType="TextBlock" BasedOn="{StaticResource BasicTextStyle}">
+        <Setter Property="LineHeight" Value="20"/>
+        <Setter Property="LineStackingStrategy" Value="BlockLineHeight"/>
+        <!-- Properly align text along its baseline -->
+        <Setter Property="RenderTransform">
+            <Setter.Value>
+                <TranslateTransform X="-1" Y="4"/>
+            </Setter.Value>
+        </Setter>
+    </Style>
+
+    <Style x:Key="HeaderTextStyle" TargetType="TextBlock" BasedOn="{StaticResource BaselineTextStyle}">
+        <Setter Property="FontSize" Value="56"/>
+        <Setter Property="FontWeight" Value="Light"/>
+        <Setter Property="LineHeight" Value="40"/>
+        <Setter Property="RenderTransform">
+            <Setter.Value>
+                <TranslateTransform X="-2" Y="8"/>
+            </Setter.Value>
+        </Setter>
+    </Style>
+
+    <Style x:Key="SubheaderTextStyle" TargetType="TextBlock" BasedOn="{StaticResource BaselineTextStyle}">
+        <Setter Property="FontSize" Value="26.667"/>
+        <Setter Property="FontWeight" Value="Light"/>
+        <Setter Property="LineHeight" Value="30"/>
+        <Setter Property="RenderTransform">
+            <Setter.Value>
+                <TranslateTransform X="-1" Y="6"/>
+            </Setter.Value>
+        </Setter>
+    </Style>
+
+    <Style x:Key="TitleTextStyle" TargetType="TextBlock" BasedOn="{StaticResource BaselineTextStyle}">
+        <Setter Property="FontWeight" Value="SemiBold"/>
+    </Style>
+
+    <Style x:Key="ItemTextStyle" TargetType="TextBlock" BasedOn="{StaticResource BaselineTextStyle}"/>
+
+    <Style x:Key="BodyTextStyle" TargetType="TextBlock" BasedOn="{StaticResource BaselineTextStyle}">
+        <Setter Property="FontWeight" Value="SemiLight"/>
+    </Style>
+
+    <Style x:Key="CaptionTextStyle" TargetType="TextBlock" BasedOn="{StaticResource BaselineTextStyle}">
+        <Setter Property="FontSize" Value="12"/>
+        <Setter Property="Foreground" Value="{StaticResource ApplicationSecondaryForegroundThemeBrush}"/>
+    </Style>
+
+    <!-- Button styles -->
+
+    <!--
+        TextButtonStyle is used to style a Button using subheader-styled text with no other adornment.  This
+        style is used in the GroupedItemsPage as a group header and in the FileOpenPickerPage for triggering
+        commands.
+    -->
+    <Style x:Key="TextButtonStyle" TargetType="Button">
+        <Setter Property="MinWidth" Value="0"/>
+        <Setter Property="MinHeight" Value="0"/>
+        <Setter Property="Template">
+            <Setter.Value>
+                <ControlTemplate TargetType="Button">
+                    <Grid Background="Transparent">
+                        <TextBlock
+                            x:Name="Text"
+                            Text="{TemplateBinding Content}"
+                            Margin="3,-7,3,10"
+                            TextWrapping="NoWrap"
+                            Style="{StaticResource SubheaderTextStyle}"/>
+                        <Rectangle
+                            x:Name="FocusVisualWhite"
+                            IsHitTestVisible="False"
+                            Stroke="{StaticResource FocusVisualWhiteStrokeThemeBrush}"
+                            StrokeEndLineCap="Square"
+                            StrokeDashArray="1,1"
+                            Opacity="0"
+                            StrokeDashOffset="1.5"/>
+                        <Rectangle
+                            x:Name="FocusVisualBlack"
+                            IsHitTestVisible="False"
+                            Stroke="{StaticResource FocusVisualBlackStrokeThemeBrush}"
+                            StrokeEndLineCap="Square"
+                            StrokeDashArray="1,1"
+                            Opacity="0"
+                            StrokeDashOffset="0.5"/>
+
+                        <VisualStateManager.VisualStateGroups>
+                            <VisualStateGroup x:Name="CommonStates">
+                                <VisualState x:Name="Normal"/>
+                                <VisualState x:Name="PointerOver">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Text" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource ApplicationPointerOverForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Pressed">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Text" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource ApplicationPressedForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Disabled">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Text" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource ButtonDisabledForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                            </VisualStateGroup>
+                            <VisualStateGroup x:Name="FocusStates">
+                                <VisualState x:Name="Focused">
+                                    <Storyboard>
+                                        <DoubleAnimation Duration="0" To="1" Storyboard.TargetName="FocusVisualWhite" Storyboard.TargetProperty="Opacity"/>
+                                        <DoubleAnimation Duration="0" To="1" Storyboard.TargetName="FocusVisualBlack" Storyboard.TargetProperty="Opacity"/>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Unfocused"/>
+                            </VisualStateGroup>
+                        </VisualStateManager.VisualStateGroups>
+                    </Grid>
+                </ControlTemplate>
+            </Setter.Value>
+        </Setter>
+    </Style>
+
+    <!--
+        TextRadioButtonStyle is used to style a RadioButton using subheader-styled text with no other adornment.
+        This style is used in the SearchResultsPage to allow selection among filters.
+    -->
+    <Style x:Key="TextRadioButtonStyle" TargetType="RadioButton">
+        <Setter Property="MinWidth" Value="0"/>
+        <Setter Property="MinHeight" Value="0"/>
+        <Setter Property="Template">
+            <Setter.Value>
+                <ControlTemplate TargetType="RadioButton">
+                    <Grid Background="Transparent">
+                        <TextBlock
+                            x:Name="Text"
+                            Text="{TemplateBinding Content}"
+                            Margin="3,-7,3,10"
+                            TextWrapping="NoWrap"
+                            Style="{StaticResource SubheaderTextStyle}"/>
+                        <Rectangle
+                            x:Name="FocusVisualWhite"
+                            IsHitTestVisible="False"
+                            Stroke="{StaticResource FocusVisualWhiteStrokeThemeBrush}"
+                            StrokeEndLineCap="Square"
+                            StrokeDashArray="1,1"
+                            Opacity="0"
+                            StrokeDashOffset="1.5"/>
+                        <Rectangle
+                            x:Name="FocusVisualBlack"
+                            IsHitTestVisible="False"
+                            Stroke="{StaticResource FocusVisualBlackStrokeThemeBrush}"
+                            StrokeEndLineCap="Square"
+                            StrokeDashArray="1,1"
+                            Opacity="0"
+                            StrokeDashOffset="0.5"/>
+
+                        <VisualStateManager.VisualStateGroups>
+                            <VisualStateGroup x:Name="CommonStates">
+                                <VisualState x:Name="Normal"/>
+                                <VisualState x:Name="PointerOver">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Text" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource ApplicationPointerOverForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Pressed">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Text" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource ApplicationPressedForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Disabled">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Text" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource ButtonDisabledForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                            </VisualStateGroup>
+                            <VisualStateGroup x:Name="FocusStates">
+                                <VisualState x:Name="Focused">
+                                    <Storyboard>
+                                        <DoubleAnimation Duration="0" To="1" Storyboard.TargetName="FocusVisualWhite" Storyboard.TargetProperty="Opacity"/>
+                                        <DoubleAnimation Duration="0" To="1" Storyboard.TargetName="FocusVisualBlack" Storyboard.TargetProperty="Opacity"/>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Unfocused"/>
+                            </VisualStateGroup>
+                            <VisualStateGroup x:Name="CheckStates">
+                                <VisualState x:Name="Checked"/>
+                                <VisualState x:Name="Unchecked">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Text" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource ApplicationSecondaryForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Indeterminate"/>
+                            </VisualStateGroup>
+                        </VisualStateManager.VisualStateGroups>
+                    </Grid>
+                </ControlTemplate>
+            </Setter.Value>
+        </Setter>
+    </Style>
+
+    <!--
+        AppBarButtonStyle is used to style a Button for use in an App Bar.  Content will be centered and should fit within
+        the 40-pixel radius glyph provided.  16-point Segoe UI Symbol is used for content text to simplify the use of glyphs
+        from that font.  AutomationProperties.Name is used for the text below the glyph.
+    -->
+    <Style x:Key="AppBarButtonStyle" TargetType="Button">
+        <Setter Property="Foreground" Value="{StaticResource AppBarItemForegroundThemeBrush}"/>
+        <Setter Property="VerticalAlignment" Value="Stretch"/>
+        <Setter Property="FontFamily" Value="Segoe UI Symbol"/>
+        <Setter Property="FontWeight" Value="Normal"/>
+        <Setter Property="FontSize" Value="20"/>
+        <Setter Property="AutomationProperties.ItemType" Value="App Bar Button"/>
+        <Setter Property="Template">
+            <Setter.Value>
+                <ControlTemplate TargetType="Button">
+                    <Grid Width="100" Background="Transparent">
+                        <StackPanel VerticalAlignment="Top" Margin="0,14,0,13">
+                            <Grid Width="40" Height="40" Margin="0,0,0,5" HorizontalAlignment="Center">
+                                <TextBlock x:Name="BackgroundGlyph" Text="&#xE0A8;" FontFamily="Segoe UI Symbol" FontSize="53.333" Margin="-4,-19,0,0" Foreground="{StaticResource AppBarItemBackgroundThemeBrush}"/>
+                                <TextBlock x:Name="OutlineGlyph" Text="&#xE0A7;" FontFamily="Segoe UI Symbol" FontSize="53.333" Margin="-4,-19,0,0"/>
+                                <ContentPresenter x:Name="Content" HorizontalAlignment="Center" Margin="-1,-1,0,0" VerticalAlignment="Center"/>
+                            </Grid>
+                            <TextBlock
+                                x:Name="TextLabel"
+                                Text="{TemplateBinding AutomationProperties.Name}"
+                                Margin="0,0,2,0"
+                                FontSize="12"
+                                TextAlignment="Center"
+                                Width="88"
+                                MaxHeight="32"
+                                TextTrimming="WordEllipsis"
+                                Style="{StaticResource BasicTextStyle}"/>
+                        </StackPanel>
+                        <Rectangle
+                                x:Name="FocusVisualWhite"
+                                IsHitTestVisible="False"
+                                Stroke="{StaticResource FocusVisualWhiteStrokeThemeBrush}"
+                                StrokeEndLineCap="Square"
+                                StrokeDashArray="1,1"
+                                Opacity="0"
+                                StrokeDashOffset="1.5"/>
+                        <Rectangle
+                                x:Name="FocusVisualBlack"
+                                IsHitTestVisible="False"
+                                Stroke="{StaticResource FocusVisualBlackStrokeThemeBrush}"
+                                StrokeEndLineCap="Square"
+                                StrokeDashArray="1,1"
+                                Opacity="0"
+                                StrokeDashOffset="0.5"/>
+
+                        <VisualStateManager.VisualStateGroups>
+                            <VisualStateGroup x:Name="CommonStates">
+                                <VisualState x:Name="Normal"/>
+                                <VisualState x:Name="PointerOver">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="BackgroundGlyph" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource AppBarItemPointerOverBackgroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Content" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource AppBarItemPointerOverForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Pressed">
+                                    <Storyboard>
+                                        <DoubleAnimation
+                                            Storyboard.TargetName="OutlineGlyph"
+                                            Storyboard.TargetProperty="Opacity"
+                                            To="0"
+                                            Duration="0"/>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="BackgroundGlyph" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource AppBarItemForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Content" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource AppBarItemPressedForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Disabled">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="OutlineGlyph" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource AppBarItemDisabledForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="Content" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource AppBarItemDisabledForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="TextLabel" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource AppBarItemDisabledForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                            </VisualStateGroup>
+                            <VisualStateGroup x:Name="FocusStates">
+                                <VisualState x:Name="Focused">
+                                    <Storyboard>
+                                        <DoubleAnimation
+                                                Storyboard.TargetName="FocusVisualWhite"
+                                                Storyboard.TargetProperty="Opacity"
+                                                To="1"
+                                                Duration="0"/>
+                                        <DoubleAnimation
+                                                Storyboard.TargetName="FocusVisualBlack"
+                                                Storyboard.TargetProperty="Opacity"
+                                                To="1"
+                                                Duration="0"/>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Unfocused" />
+                                <VisualState x:Name="PointerFocused" />
+                            </VisualStateGroup>
+                        </VisualStateManager.VisualStateGroups>
+                    </Grid>
+                </ControlTemplate>
+            </Setter.Value>
+        </Setter>
+    </Style>
+
+    <!-- Standard App Bar buttons -->
+  
+    <Style x:Key="SkipBackAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="SkipBackAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Skip Back"/>
+        <Setter Property="Content" Value="&#xE100;"/>
+    </Style>
+    <Style x:Key="SkipAheadAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="SkipAheadAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Skip Ahead"/>
+        <Setter Property="Content" Value="&#xE101;"/>
+    </Style>
+    <Style x:Key="PlayAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="PlayAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Play"/>
+        <Setter Property="Content" Value="&#xE102;"/>
+    </Style>
+    <Style x:Key="PauseAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="PauseAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Pause"/>
+        <Setter Property="Content" Value="&#xE103;"/>
+    </Style>
+    <Style x:Key="EditAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="EditAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Edit"/>
+        <Setter Property="Content" Value="&#xE104;"/>
+    </Style>
+    <Style x:Key="SaveAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="SaveAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Save"/>
+        <Setter Property="Content" Value="&#xE105;"/>
+    </Style>
+    <Style x:Key="DeleteAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="DeleteAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Delete"/>
+        <Setter Property="Content" Value="&#xE106;"/>
+    </Style>
+    <Style x:Key="DiscardAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="DiscardAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Discard"/>
+        <Setter Property="Content" Value="&#xE107;"/>
+    </Style>
+    <Style x:Key="RemoveAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="RemoveAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Remove"/>
+        <Setter Property="Content" Value="&#xE108;"/>
+    </Style>
+    <Style x:Key="AddAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="AddAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Add"/>
+        <Setter Property="Content" Value="&#xE109;"/>
+    </Style>
+    <Style x:Key="NoAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="NoAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="No"/>
+        <Setter Property="Content" Value="&#xE10A;"/>
+    </Style>
+    <Style x:Key="YesAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="YesAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Yes"/>
+        <Setter Property="Content" Value="&#xE10B;"/>
+    </Style>
+    <Style x:Key="MoreAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="MoreAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="More"/>
+        <Setter Property="Content" Value="&#xE10C;"/>
+    </Style>
+    <Style x:Key="RedoAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="RedoAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Redo"/>
+        <Setter Property="Content" Value="&#xE10D;"/>
+    </Style>
+    <Style x:Key="UndoAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="UndoAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Undo"/>
+        <Setter Property="Content" Value="&#xE10E;"/>
+    </Style>
+    <Style x:Key="HomeAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="HomeAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Home"/>
+        <Setter Property="Content" Value="&#xE10F;"/>
+    </Style>
+    <Style x:Key="OutAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="OutAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Out"/>
+        <Setter Property="Content" Value="&#xE110;"/>
+    </Style>
+    <Style x:Key="NextAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="NextAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Next"/>
+        <Setter Property="Content" Value="&#xE111;"/>
+    </Style>
+    <Style x:Key="PreviousAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="PreviousAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Previous"/>
+        <Setter Property="Content" Value="&#xE112;"/>
+    </Style>
+    <Style x:Key="FavoriteAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="FavoriteAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Favorite"/>
+        <Setter Property="Content" Value="&#xE113;"/>
+    </Style>
+    <Style x:Key="PhotoAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="PhotoAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Photo"/>
+        <Setter Property="Content" Value="&#xE114;"/>
+    </Style>
+    <Style x:Key="SettingsAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="SettingsAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Settings"/>
+        <Setter Property="Content" Value="&#xE115;"/>
+    </Style>
+    <Style x:Key="VideoAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="VideoAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Video"/>
+        <Setter Property="Content" Value="&#xE116;"/>
+    </Style>
+    <Style x:Key="RefreshAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="RefreshAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Refresh"/>
+        <Setter Property="Content" Value="&#xE117;"/>
+    </Style>
+    <Style x:Key="DownloadAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="DownloadAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Download"/>
+        <Setter Property="Content" Value="&#xE118;"/>
+    </Style>
+    <Style x:Key="MailAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="MailAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Mail"/>
+        <Setter Property="Content" Value="&#xE119;"/>
+    </Style>
+    <Style x:Key="SearchAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="SearchAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Search"/>
+        <Setter Property="Content" Value="&#xE11A;"/>
+    </Style>
+    <Style x:Key="HelpAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="HelpAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Help"/>
+        <Setter Property="Content" Value="&#xE11B;"/>
+    </Style>
+    <Style x:Key="UploadAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="UploadAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Upload"/>
+        <Setter Property="Content" Value="&#xE11C;"/>
+    </Style>
+    <Style x:Key="PinAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="PinAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Pin"/>
+        <Setter Property="Content" Value="&#xE141;"/>
+    </Style>
+    <Style x:Key="UnpinAppBarButtonStyle" TargetType="Button" BasedOn="{StaticResource AppBarButtonStyle}">
+        <Setter Property="AutomationProperties.AutomationId" Value="UnpinAppBarButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Unpin"/>
+        <Setter Property="Content" Value="&#xE196;"/>
+    </Style>
+
+    <!-- Title area styles -->
+
+    <Style x:Key="PageHeaderTextStyle" TargetType="TextBlock" BasedOn="{StaticResource HeaderTextStyle}">
+        <Setter Property="TextWrapping" Value="NoWrap"/>
+        <Setter Property="VerticalAlignment" Value="Bottom"/>
+        <Setter Property="Margin" Value="0,0,40,40"/>
+    </Style>
+
+    <Style x:Key="PageSubheaderTextStyle" TargetType="TextBlock" BasedOn="{StaticResource SubheaderTextStyle}">
+        <Setter Property="TextWrapping" Value="NoWrap"/>
+        <Setter Property="VerticalAlignment" Value="Bottom"/>
+        <Setter Property="Margin" Value="0,0,0,40"/>
+    </Style>
+
+    <Style x:Key="SnappedPageHeaderTextStyle" TargetType="TextBlock" BasedOn="{StaticResource PageSubheaderTextStyle}">
+        <Setter Property="Margin" Value="0,0,18,40"/>
+    </Style>
+
+    <!--
+        BackButtonStyle is used to style a Button for use in the title area of a page.  Margins appropriate for
+        the conventional page layout are included as part of the style.
+    -->
+    <Style x:Key="BackButtonStyle" TargetType="Button">
+        <Setter Property="MinWidth" Value="0"/>
+        <Setter Property="Width" Value="48"/>
+        <Setter Property="Height" Value="48"/>
+        <Setter Property="Margin" Value="36,0,36,36"/>
+        <Setter Property="VerticalAlignment" Value="Bottom"/>
+        <Setter Property="FontFamily" Value="Segoe UI Symbol"/>
+        <Setter Property="FontWeight" Value="Normal"/>
+        <Setter Property="FontSize" Value="56"/>
+        <Setter Property="AutomationProperties.AutomationId" Value="BackButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Back"/>
+        <Setter Property="AutomationProperties.ItemType" Value="Navigation Button"/>
+        <Setter Property="Template">
+            <Setter.Value>
+                <ControlTemplate TargetType="Button">
+                    <Grid x:Name="RootGrid">
+                        <Grid Margin="-1,-16,0,0">
+                            <TextBlock x:Name="BackgroundGlyph" Text="&#xE0A8;" Foreground="{StaticResource BackButtonBackgroundThemeBrush}"/>
+                            <TextBlock x:Name="NormalGlyph" Text="{StaticResource BackButtonGlyph}" Foreground="{StaticResource BackButtonForegroundThemeBrush}"/>
+                            <TextBlock x:Name="ArrowGlyph" Text="&#xE0A6;" Foreground="{StaticResource BackButtonPressedForegroundThemeBrush}" Opacity="0"/>
+                        </Grid>
+                        <Rectangle
+                            x:Name="FocusVisualWhite"
+                            IsHitTestVisible="False"
+                            Stroke="{StaticResource FocusVisualWhiteStrokeThemeBrush}"
+                            StrokeEndLineCap="Square"
+                            StrokeDashArray="1,1"
+                            Opacity="0"
+                            StrokeDashOffset="1.5"/>
+                        <Rectangle
+                            x:Name="FocusVisualBlack"
+                            IsHitTestVisible="False"
+                            Stroke="{StaticResource FocusVisualBlackStrokeThemeBrush}"
+                            StrokeEndLineCap="Square"
+                            StrokeDashArray="1,1"
+                            Opacity="0"
+                            StrokeDashOffset="0.5"/>
+
+                        <VisualStateManager.VisualStateGroups>
+                            <VisualStateGroup x:Name="CommonStates">
+                                <VisualState x:Name="Normal" />
+                                <VisualState x:Name="PointerOver">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="BackgroundGlyph" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource BackButtonPointerOverBackgroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="NormalGlyph" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource BackButtonPointerOverForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Pressed">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="BackgroundGlyph" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource BackButtonForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                        <DoubleAnimation
+                                            Storyboard.TargetName="ArrowGlyph"
+                                            Storyboard.TargetProperty="Opacity"
+                                            To="1"
+                                            Duration="0"/>
+                                        <DoubleAnimation
+                                            Storyboard.TargetName="NormalGlyph"
+                                            Storyboard.TargetProperty="Opacity"
+                                            To="0"
+                                            Duration="0"/>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Disabled">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="RootGrid" Storyboard.TargetProperty="Visibility">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="Collapsed"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                            </VisualStateGroup>
+                            <VisualStateGroup x:Name="FocusStates">
+                                <VisualState x:Name="Focused">
+                                    <Storyboard>
+                                        <DoubleAnimation
+                                            Storyboard.TargetName="FocusVisualWhite"
+                                            Storyboard.TargetProperty="Opacity"
+                                            To="1"
+                                            Duration="0"/>
+                                        <DoubleAnimation
+                                            Storyboard.TargetName="FocusVisualBlack"
+                                            Storyboard.TargetProperty="Opacity"
+                                            To="1"
+                                            Duration="0"/>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Unfocused" />
+                                <VisualState x:Name="PointerFocused" />
+                            </VisualStateGroup>
+                        </VisualStateManager.VisualStateGroups>
+                    </Grid>
+                </ControlTemplate>
+            </Setter.Value>
+        </Setter>
+    </Style>
+
+    <!--
+        PortraitBackButtonStyle is used to style a Button for use in the title area of a portrait page.  Margins appropriate
+        for the conventional page layout are included as part of the style.
+    -->
+    <Style x:Key="PortraitBackButtonStyle" TargetType="Button" BasedOn="{StaticResource BackButtonStyle}">
+        <Setter Property="Margin" Value="26,0,26,36"/>
+    </Style>
+
+    <!--
+        SnappedBackButtonStyle is used to style a Button for use in the title area of a snapped page.  Margins appropriate
+        for the conventional page layout are included as part of the style.
+        
+        The obvious duplication here is necessary as the glyphs used in snapped are not merely smaller versions of the same
+        glyph but are actually distinct.
+    -->
+    <Style x:Key="SnappedBackButtonStyle" TargetType="Button">
+        <Setter Property="MinWidth" Value="0"/>
+        <Setter Property="Margin" Value="20,0,0,0"/>
+        <Setter Property="VerticalAlignment" Value="Bottom"/>
+        <Setter Property="FontFamily" Value="Segoe UI Symbol"/>
+        <Setter Property="FontWeight" Value="Normal"/>
+        <Setter Property="FontSize" Value="26.66667"/>
+        <Setter Property="AutomationProperties.AutomationId" Value="BackButton"/>
+        <Setter Property="AutomationProperties.Name" Value="Back"/>
+        <Setter Property="AutomationProperties.ItemType" Value="Navigation Button"/>
+        <Setter Property="Template">
+            <Setter.Value>
+                <ControlTemplate TargetType="Button">
+                    <Grid x:Name="RootGrid" Width="36" Height="36" Margin="-3,0,7,33">
+                        <Grid Margin="-1,-1,0,0">
+                            <TextBlock x:Name="BackgroundGlyph" Text="&#xE0D4;" Foreground="{StaticResource BackButtonBackgroundThemeBrush}"/>
+                            <TextBlock x:Name="NormalGlyph" Text="{StaticResource BackButtonSnappedGlyph}" Foreground="{StaticResource BackButtonForegroundThemeBrush}"/>
+                            <TextBlock x:Name="ArrowGlyph" Text="&#xE0C4;" Foreground="{StaticResource BackButtonPressedForegroundThemeBrush}" Opacity="0"/>
+                        </Grid>
+                        <Rectangle
+                            x:Name="FocusVisualWhite"
+                            IsHitTestVisible="False"
+                            Stroke="{StaticResource FocusVisualWhiteStrokeThemeBrush}"
+                            StrokeEndLineCap="Square"
+                            StrokeDashArray="1,1"
+                            Opacity="0"
+                            StrokeDashOffset="1.5"/>
+                        <Rectangle
+                            x:Name="FocusVisualBlack"
+                            IsHitTestVisible="False"
+                            Stroke="{StaticResource FocusVisualBlackStrokeThemeBrush}"
+                            StrokeEndLineCap="Square"
+                            StrokeDashArray="1,1"
+                            Opacity="0"
+                            StrokeDashOffset="0.5"/>
+
+                        <VisualStateManager.VisualStateGroups>
+                            <VisualStateGroup x:Name="CommonStates">
+                                <VisualState x:Name="Normal" />
+                                <VisualState x:Name="PointerOver">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="BackgroundGlyph" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource BackButtonPointerOverBackgroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="NormalGlyph" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource BackButtonPointerOverForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Pressed">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="BackgroundGlyph" Storyboard.TargetProperty="Foreground">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="{StaticResource BackButtonForegroundThemeBrush}"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                        <DoubleAnimation
+                                            Storyboard.TargetName="ArrowGlyph"
+                                            Storyboard.TargetProperty="Opacity"
+                                            To="1"
+                                            Duration="0"/>
+                                        <DoubleAnimation
+                                            Storyboard.TargetName="NormalGlyph"
+                                            Storyboard.TargetProperty="Opacity"
+                                            To="0"
+                                            Duration="0"/>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Disabled">
+                                    <Storyboard>
+                                        <ObjectAnimationUsingKeyFrames Storyboard.TargetName="RootGrid" Storyboard.TargetProperty="Visibility">
+                                            <DiscreteObjectKeyFrame KeyTime="0" Value="Collapsed"/>
+                                        </ObjectAnimationUsingKeyFrames>
+                                    </Storyboard>
+                                </VisualState>
+                            </VisualStateGroup>
+                            <VisualStateGroup x:Name="FocusStates">
+                                <VisualState x:Name="Focused">
+                                    <Storyboard>
+                                        <DoubleAnimation
+                                            Storyboard.TargetName="FocusVisualWhite"
+                                            Storyboard.TargetProperty="Opacity"
+                                            To="1"
+                                            Duration="0"/>
+                                        <DoubleAnimation
+                                            Storyboard.TargetName="FocusVisualBlack"
+                                            Storyboard.TargetProperty="Opacity"
+                                            To="1"
+                                            Duration="0"/>
+                                    </Storyboard>
+                                </VisualState>
+                                <VisualState x:Name="Unfocused" />
+                                <VisualState x:Name="PointerFocused" />
+                            </VisualStateGroup>
+                        </VisualStateManager.VisualStateGroups>
+                    </Grid>
+                </ControlTemplate>
+            </Setter.Value>
+        </Setter>
+    </Style>
+
+    <!-- Item templates -->
+
+    <!-- Grid-appropriate 250 pixel square item template as seen in the GroupedItemsPage and ItemsPage -->
+    <DataTemplate x:Key="Standard250x250ItemTemplate">
+        <Grid HorizontalAlignment="Left" Width="250" Height="250">
+            <Border Background="{StaticResource ListViewItemPlaceholderBackgroundThemeBrush}">
+                <Image Source="{Binding Image}" Stretch="UniformToFill"/>
+            </Border>
+            <StackPanel VerticalAlignment="Bottom" Background="{StaticResource ListViewItemOverlayBackgroundThemeBrush}">
+                <TextBlock Text="{Binding Title}" Foreground="{StaticResource ListViewItemOverlayForegroundThemeBrush}" Style="{StaticResource TitleTextStyle}" Height="60" Margin="15,0,15,0"/>
+                <TextBlock Text="{Binding Subtitle}" Foreground="{StaticResource ListViewItemOverlaySecondaryForegroundThemeBrush}" Style="{StaticResource CaptionTextStyle}" TextWrapping="NoWrap" Margin="15,0,15,10"/>
+            </StackPanel>
+        </Grid>
+    </DataTemplate>
+
+    <!-- Grid-appropriate 500 by 130 pixel item template as seen in the GroupDetailPage -->
+    <DataTemplate x:Key="Standard500x130ItemTemplate">
+        <Grid Height="110" Width="480" Margin="10">
+            <Grid.ColumnDefinitions>
+                <ColumnDefinition Width="Auto"/>
+                <ColumnDefinition Width="*"/>
+            </Grid.ColumnDefinitions>
+            <Border Background="{StaticResource ListViewItemPlaceholderBackgroundThemeBrush}" Width="110" Height="110">
+                <Image Source="{Binding Image}" Stretch="UniformToFill"/>
+            </Border>
+            <StackPanel Grid.Column="1" VerticalAlignment="Top" Margin="10,0,0,0">
+                <TextBlock Text="{Binding Title}" Style="{StaticResource TitleTextStyle}" TextWrapping="NoWrap"/>
+                <TextBlock Text="{Binding Subtitle}" Style="{StaticResource CaptionTextStyle}" TextWrapping="NoWrap"/>
+                <TextBlock Text="{Binding Description}" Style="{StaticResource BodyTextStyle}" MaxHeight="60"/>
+            </StackPanel>
+        </Grid>
+    </DataTemplate>
+
+    <!-- List-appropriate 130 pixel high item template as seen in the SplitPage -->
+    <DataTemplate x:Key="Standard130ItemTemplate">
+        <Grid Height="110" Margin="6">
+            <Grid.ColumnDefinitions>
+                <ColumnDefinition Width="Auto"/>
+                <ColumnDefinition Width="*"/>
+            </Grid.ColumnDefinitions>
+            <Border Background="{StaticResource ListViewItemPlaceholderBackgroundThemeBrush}" Width="110" Height="110">
+                <Image Source="{Binding Image}" Stretch="UniformToFill"/>
+            </Border>
+            <StackPanel Grid.Column="1" VerticalAlignment="Top" Margin="10,0,0,0">
+                <TextBlock Text="{Binding Title}" Style="{StaticResource TitleTextStyle}" TextWrapping="NoWrap"/>
+                <TextBlock Text="{Binding Subtitle}" Style="{StaticResource CaptionTextStyle}" TextWrapping="NoWrap"/>
+                <TextBlock Text="{Binding Description}" Style="{StaticResource BodyTextStyle}" MaxHeight="60"/>
+            </StackPanel>
+        </Grid>
+    </DataTemplate>
+
+    <!--
+        List-appropriate 80 pixel high item template as seen in the SplitPage when Filled, and
+        the following pages when snapped: GroupedItemsPage, GroupDetailPage, and ItemsPage
+    -->
+    <DataTemplate x:Key="Standard80ItemTemplate">
+        <Grid Margin="6">
+            <Grid.ColumnDefinitions>
+                <ColumnDefinition Width="Auto"/>
+                <ColumnDefinition Width="*"/>
+            </Grid.ColumnDefinitions>
+            <Border Background="{StaticResource ListViewItemPlaceholderBackgroundThemeBrush}" Width="60" Height="60">
+                <Image Source="{Binding Image}" Stretch="UniformToFill"/>
+            </Border>
+            <StackPanel Grid.Column="1" Margin="10,0,0,0">
+                <TextBlock Text="{Binding Title}" Style="{StaticResource ItemTextStyle}" MaxHeight="40"/>
+                <TextBlock Text="{Binding Subtitle}" Style="{StaticResource CaptionTextStyle}" TextWrapping="NoWrap"/>
+            </StackPanel>
+        </Grid>
+    </DataTemplate>
+
+    <!-- Grid-appropriate 300 by 70 pixel item template as seen in the SearchResultsPage -->
+    <DataTemplate x:Key="StandardSmallIcon300x70ItemTemplate">
+        <Grid Width="300">
+            <Grid.ColumnDefinitions>
+                <ColumnDefinition Width="Auto"/>
+                <ColumnDefinition Width="*"/>
+            </Grid.ColumnDefinitions>
+            <Border Background="{StaticResource ListViewItemPlaceholderBackgroundThemeBrush}" Margin="10,10,0,20" Width="40" Height="40">
+                <Image Source="{Binding Image}" Stretch="UniformToFill"/>
+            </Border>
+            <StackPanel Grid.Column="1" Margin="10,0,10,10">
+                <TextBlock Text="{Binding Title}" Style="{StaticResource BodyTextStyle}" TextWrapping="NoWrap"/>
+                <TextBlock Text="{Binding Subtitle}" Style="{StaticResource BodyTextStyle}" Foreground="{StaticResource ApplicationSecondaryForegroundThemeBrush}" Height="40"/>
+            </StackPanel>
+        </Grid>
+    </DataTemplate>
+
+    <!-- List-appropriate 70 pixel high item template as seen in the SearchResultsPage when Snapped -->
+    <DataTemplate x:Key="StandardSmallIcon70ItemTemplate">
+        <Grid Margin="6">
+            <Grid.ColumnDefinitions>
+                <ColumnDefinition Width="Auto"/>
+                <ColumnDefinition Width="*"/>
+            </Grid.ColumnDefinitions>
+            <Border Background="{StaticResource ListViewItemPlaceholderBackgroundThemeBrush}" Margin="0,0,0,10" Width="40" Height="40">
+                <Image Source="{Binding Image}" Stretch="UniformToFill"/>
+            </Border>
+            <StackPanel Grid.Column="1" Margin="10,-10,0,0">
+                <TextBlock Text="{Binding Title}" Style="{StaticResource BodyTextStyle}" TextWrapping="NoWrap"/>
+                <TextBlock Text="{Binding Subtitle}" Style="{StaticResource BodyTextStyle}" Foreground="{StaticResource ApplicationSecondaryForegroundThemeBrush}" Height="40"/>
+            </StackPanel>
+        </Grid>
+    </DataTemplate>
+
+  <!--
+      190x130 pixel item template for displaying file previews as seen in the FileOpenPickerPage
+      Includes an elaborate tooltip to display title and description text
+  -->
+  <DataTemplate x:Key="StandardFileWithTooltip190x130ItemTemplate">
+        <Grid>
+            <Grid Background="{StaticResource ListViewItemPlaceholderBackgroundThemeBrush}">
+                <Image
+                    Source="{Binding Image}"
+                    Width="190"
+                    Height="130"
+                    HorizontalAlignment="Center"
+                    VerticalAlignment="Center"
+                    Stretch="Uniform"/>
+            </Grid>
+            <ToolTipService.Placement>Mouse</ToolTipService.Placement>
+            <ToolTipService.ToolTip>
+                <Grid Background="{StaticResource ApplicationPageBackgroundThemeBrush}">
+                    <Grid.ColumnDefinitions>
+                        <ColumnDefinition Width="Auto"/>
+                        <ColumnDefinition Width="*"/>
+                    </Grid.ColumnDefinitions>
+
+                    <Grid Background="{StaticResource ListViewItemPlaceholderBackgroundThemeBrush}" Margin="20">
+                        <Image
+                            Source="{Binding Image}"
+                            Width="160"
+                            Height="160"
+                            HorizontalAlignment="Center"
+                            VerticalAlignment="Center"
+                            Stretch="Uniform"/>
+                    </Grid>
+                    <StackPanel Width="200" Grid.Column="1" Margin="0,20,20,20">
+                        <TextBlock Text="{Binding Title}" TextWrapping="NoWrap" Style="{StaticResource BodyTextStyle}"/>
+                        <TextBlock Text="{Binding Description}" MaxHeight="140" Foreground="{StaticResource ApplicationSecondaryForegroundThemeBrush}" Style="{StaticResource BodyTextStyle}"/>
+                    </StackPanel>
+                </Grid>                    
+            </ToolTipService.ToolTip>
+        </Grid>
+    </DataTemplate>
+
+    <!-- Default to 10-pixel spacing between grid items (after accounting for 4-pixel insets for focus) -->
+
+    <Style TargetType="GridViewItem">
+        <Setter Property="Margin" Value="0,0,2,2" />
+    </Style>
+
+    <!-- ScrollViewer styles -->
+
+    <Style x:Key="HorizontalScrollViewerStyle" TargetType="ScrollViewer">
+        <Setter Property="HorizontalScrollBarVisibility" Value="Auto"/>
+        <Setter Property="VerticalScrollBarVisibility" Value="Disabled"/>
+        <Setter Property="ScrollViewer.HorizontalScrollMode" Value="Enabled" />
+        <Setter Property="ScrollViewer.VerticalScrollMode" Value="Disabled" />
+        <Setter Property="ScrollViewer.ZoomMode" Value="Disabled" />
+    </Style>
+
+    <Style x:Key="VerticalScrollViewerStyle" TargetType="ScrollViewer">
+        <Setter Property="HorizontalScrollBarVisibility" Value="Disabled"/>
+        <Setter Property="VerticalScrollBarVisibility" Value="Auto"/>
+        <Setter Property="ScrollViewer.HorizontalScrollMode" Value="Disabled" />
+        <Setter Property="ScrollViewer.VerticalScrollMode" Value="Enabled" />
+        <Setter Property="ScrollViewer.ZoomMode" Value="Disabled" />
+    </Style>
+
+    <!-- Page layout roots typically use entrance animations and a theme-appropriate background color -->
+
+    <Style x:Key="LayoutRootStyle" TargetType="Panel">
+        <Setter Property="Background" Value="{StaticResource ApplicationPageBackgroundThemeBrush}"/>
+        <Setter Property="ChildrenTransitions">
+            <Setter.Value>
+                <TransitionCollection>
+                    <EntranceThemeTransition/>
+                </TransitionCollection>
+            </Setter.Value>
+        </Setter>
+    </Style>
+</ResourceDictionary>
diff --git a/samples/winrt/ImageManipulations/common/suspensionmanager.cpp b/samples/winrt/ImageManipulations/common/suspensionmanager.cpp
new file mode 100644
index 0000000000..c1ecf11cfd
--- /dev/null
+++ b/samples/winrt/ImageManipulations/common/suspensionmanager.cpp
@@ -0,0 +1,481 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// SuspensionManager.cpp
+// Implementation of the SuspensionManager class
+//
+
+#include "pch.h"
+#include "SuspensionManager.h"
+
+#include <collection.h>
+#include <algorithm>
+
+using namespace SDKSample::Common;
+
+using namespace Concurrency;
+using namespace Platform;
+using namespace Platform::Collections;
+using namespace Windows::Foundation;
+using namespace Windows::Foundation::Collections;
+using namespace Windows::Storage;
+using namespace Windows::Storage::FileProperties;
+using namespace Windows::Storage::Streams;
+using namespace Windows::UI::Xaml;
+using namespace Windows::UI::Xaml::Controls;
+using namespace Windows::UI::Xaml::Interop;
+
+namespace
+{
+    Map<String^, Object^>^ _sessionState = ref new Map<String^, Object^>();
+    String^ sessionStateFilename = "_sessionState.dat";
+
+    // Forward declarations for object object read / write support
+    void WriteObject(Windows::Storage::Streams::DataWriter^ writer, Platform::Object^ object);
+    Platform::Object^ ReadObject(Windows::Storage::Streams::DataReader^ reader);
+}
+
+/// <summary>
+/// Provides access to global session state for the current session.  This state is serialized by
+/// <see cref="SaveAsync"/> and restored by <see cref="RestoreAsync"/> which require values to be
+/// one of the following: boxed values including integers, floating-point singles and doubles,
+/// wide characters, boolean, Strings and Guids, or Map<String^, Object^> where map values are
+/// subject to the same constraints.  Session state should be as compact as possible.
+/// </summary>
+IMap<String^, Object^>^ SuspensionManager::SessionState::get(void)
+{
+    return _sessionState;
+}
+
+/// <summary>
+/// Wrap a WeakReference as a reference object for use in a collection.
+/// </summary>
+private ref class WeakFrame sealed
+{
+private:
+    WeakReference _frameReference;
+
+internal:
+    WeakFrame(Frame^ frame) { _frameReference = frame; }
+    property Frame^ ResolvedFrame
+    {
+        Frame^ get(void) { return _frameReference.Resolve<Frame>(); }
+    };
+};
+
+namespace
+{
+    std::vector<WeakFrame^> _registeredFrames;
+    DependencyProperty^ FrameSessionStateKeyProperty =
+        DependencyProperty::RegisterAttached("_FrameSessionStateKeyProperty",
+        TypeName(String::typeid), TypeName(SuspensionManager::typeid), nullptr);
+    DependencyProperty^ FrameSessionStateProperty =
+        DependencyProperty::RegisterAttached("_FrameSessionStateProperty",
+        TypeName(IMap<String^, Object^>::typeid), TypeName(SuspensionManager::typeid), nullptr);
+}
+
+/// <summary>
+/// Registers a <see cref="Frame"/> instance to allow its navigation history to be saved to
+/// and restored from <see cref="SessionState"/>.  Frames should be registered once
+/// immediately after creation if they will participate in session state management.  Upon
+/// registration if state has already been restored for the specified key
+/// the navigation history will immediately be restored.  Subsequent invocations of
+/// <see cref="RestoreAsync(String)"/> will also restore navigation history.
+/// </summary>
+/// <param name="frame">An instance whose navigation history should be managed by
+/// <see cref="SuspensionManager"/></param>
+/// <param name="sessionStateKey">A unique key into <see cref="SessionState"/> used to
+/// store navigation-related information.</param>
+void SuspensionManager::RegisterFrame(Frame^ frame, String^ sessionStateKey)
+{
+    if (frame->GetValue(FrameSessionStateKeyProperty) != nullptr)
+    {
+        throw ref new FailureException("Frames can only be registered to one session state key");
+    }
+
+    if (frame->GetValue(FrameSessionStateProperty) != nullptr)
+    {
+        throw ref new FailureException("Frames must be either be registered before accessing frame session state, or not registered at all");
+    }
+
+    // Use a dependency property to associate the session key with a frame, and keep a list of frames whose
+    // navigation state should be managed
+    frame->SetValue(FrameSessionStateKeyProperty, sessionStateKey);
+    _registeredFrames.insert(_registeredFrames.begin(), ref new WeakFrame(frame));
+
+    // Check to see if navigation state can be restored
+    RestoreFrameNavigationState(frame);
+}
+
+/// <summary>
+/// Disassociates a <see cref="Frame"/> previously registered by <see cref="RegisterFrame"/>
+/// from <see cref="SessionState"/>.  Any navigation state previously captured will be
+/// removed.
+/// </summary>
+/// <param name="frame">An instance whose navigation history should no longer be
+/// managed.</param>
+void SuspensionManager::UnregisterFrame(Frame^ frame)
+{
+    // Remove session state and remove the frame from the list of frames whose navigation
+    // state will be saved (along with any weak references that are no longer reachable)
+    auto key = safe_cast<String^>(frame->GetValue(FrameSessionStateKeyProperty));
+    if (SessionState->HasKey(key)) SessionState->Remove(key);
+    _registeredFrames.erase(
+        std::remove_if(_registeredFrames.begin(), _registeredFrames.end(), [=](WeakFrame^& e)
+        {
+            auto testFrame = e->ResolvedFrame;
+            return testFrame == nullptr || testFrame == frame;
+        }),
+        _registeredFrames.end()
+    );
+}
+
+/// <summary>
+/// Provides storage for session state associated with the specified <see cref="Frame"/>.
+/// Frames that have been previously registered with <see cref="RegisterFrame"/> have
+/// their session state saved and restored automatically as a part of the global
+/// <see cref="SessionState"/>.  Frames that are not registered have transient state
+/// that can still be useful when restoring pages that have been discarded from the
+/// navigation cache.
+/// </summary>
+/// <remarks>Apps may choose to rely on <see cref="LayoutAwarePage"/> to manage
+/// page-specific state instead of working with frame session state directly.</remarks>
+/// <param name="frame">The instance for which session state is desired.</param>
+/// <returns>A collection of state subject to the same serialization mechanism as
+/// <see cref="SessionState"/>.</returns>
+IMap<String^, Object^>^ SuspensionManager::SessionStateForFrame(Frame^ frame)
+{
+    auto frameState = safe_cast<IMap<String^, Object^>^>(frame->GetValue(FrameSessionStateProperty));
+
+    if (frameState == nullptr)
+    {
+        auto frameSessionKey = safe_cast<String^>(frame->GetValue(FrameSessionStateKeyProperty));
+        if (frameSessionKey != nullptr)
+        {
+            // Registered frames reflect the corresponding session state
+            if (!_sessionState->HasKey(frameSessionKey))
+            {
+                _sessionState->Insert(frameSessionKey, ref new Map<String^, Object^>());
+            }
+            frameState = safe_cast<IMap<String^, Object^>^>(_sessionState->Lookup(frameSessionKey));
+        }
+        else
+        {
+            // Frames that aren't registered have transient state
+            frameState = ref new Map<String^, Object^>();
+        }
+        frame->SetValue(FrameSessionStateProperty, frameState);
+    }
+    return frameState;
+}
+
+void SuspensionManager::RestoreFrameNavigationState(Frame^ frame)
+{
+    auto frameState = SessionStateForFrame(frame);
+    if (frameState->HasKey("Navigation"))
+    {
+        frame->SetNavigationState(safe_cast<String^>(frameState->Lookup("Navigation")));
+    }
+}
+
+void SuspensionManager::SaveFrameNavigationState(Frame^ frame)
+{
+    auto frameState = SessionStateForFrame(frame);
+    frameState->Insert("Navigation", frame->GetNavigationState());
+}
+
+/// <summary>
+/// Save the current <see cref="SessionState"/>.  Any <see cref="Frame"/> instances
+/// registered with <see cref="RegisterFrame"/> will also preserve their current
+/// navigation stack, which in turn gives their active <see cref="Page"/> an opportunity
+/// to save its state.
+/// </summary>
+/// <returns>An asynchronous task that reflects when session state has been saved.</returns>
+task<void> SuspensionManager::SaveAsync(void)
+{
+    // Save the navigation state for all registered frames
+    for (auto&& weakFrame : _registeredFrames)
+    {
+        auto frame = weakFrame->ResolvedFrame;
+        if (frame != nullptr) SaveFrameNavigationState(frame);
+    }
+
+    // Serialize the session state synchronously to avoid asynchronous access to shared
+    // state
+    auto sessionData = ref new InMemoryRandomAccessStream();
+    auto sessionDataWriter = ref new DataWriter(sessionData->GetOutputStreamAt(0));
+    WriteObject(sessionDataWriter, _sessionState);
+
+    // Once session state has been captured synchronously, begin the asynchronous process
+    // of writing the result to disk
+    return task<unsigned int>(sessionDataWriter->StoreAsync()).then([=](unsigned int)
+    {
+        return sessionDataWriter->FlushAsync();
+    }).then([=](bool flushSucceeded)
+    {
+        (void)flushSucceeded; // Unused parameter
+        return ApplicationData::Current->LocalFolder->CreateFileAsync(sessionStateFilename,
+            CreationCollisionOption::ReplaceExisting);
+    }).then([=](StorageFile^ createdFile)
+    {
+        return createdFile->OpenAsync(FileAccessMode::ReadWrite);
+    }).then([=](IRandomAccessStream^ newStream)
+    {
+        return RandomAccessStream::CopyAndCloseAsync(
+            sessionData->GetInputStreamAt(0), newStream->GetOutputStreamAt(0));
+    }).then([=](UINT64 copiedBytes)
+    {
+        (void)copiedBytes; // Unused parameter
+        return;
+    });
+}
+
+/// <summary>
+/// Restores previously saved <see cref="SessionState"/>.  Any <see cref="Frame"/> instances
+/// registered with <see cref="RegisterFrame"/> will also restore their prior navigation
+/// state, which in turn gives their active <see cref="Page"/> an opportunity restore its
+/// state.
+/// </summary>
+/// <param name="version">A version identifer compared to the session state to prevent
+/// incompatible versions of session state from reaching app code.  Saved state with a
+/// different version will be ignored, resulting in an empty <see cref="SessionState"/>
+/// dictionary.</param>
+/// <returns>An asynchronous task that reflects when session state has been read.  The
+/// content of <see cref="SessionState"/> should not be relied upon until this task
+/// completes.</returns>
+task<void> SuspensionManager::RestoreAsync(void)
+{
+    _sessionState->Clear();
+
+    task<StorageFile^> getFileTask(ApplicationData::Current->LocalFolder->GetFileAsync(sessionStateFilename));
+    return getFileTask.then([=](StorageFile^ stateFile)
+    {
+        task<BasicProperties^> getBasicPropertiesTask(stateFile->GetBasicPropertiesAsync());
+        return getBasicPropertiesTask.then([=](BasicProperties^ stateFileProperties)
+        {
+            auto size = unsigned int(stateFileProperties->Size);
+            if (size != stateFileProperties->Size) throw ref new FailureException("Session state larger than 4GB");
+            task<IRandomAccessStreamWithContentType^> openReadTask(stateFile->OpenReadAsync());
+            return openReadTask.then([=](IRandomAccessStreamWithContentType^ stateFileStream)
+            {
+                auto stateReader = ref new DataReader(stateFileStream);
+                return task<unsigned int>(stateReader->LoadAsync(size)).then([=](unsigned int bytesRead)
+                {
+                    (void)bytesRead; // Unused parameter
+                    // Deserialize the Session State
+                    Object^ content = ReadObject(stateReader);
+                    _sessionState = (Map<String^, Object^>^)content;
+
+                    // Restore any registered frames to their saved state
+                    for (auto&& weakFrame : _registeredFrames)
+                    {
+                        auto frame = weakFrame->ResolvedFrame;
+                        if (frame != nullptr)
+                        {
+                            frame->ClearValue(FrameSessionStateProperty);
+                            RestoreFrameNavigationState(frame);
+                        }
+                    }
+                }, task_continuation_context::use_current());
+            });
+        });
+    });
+}
+
+#pragma region Object serialization for a known set of types
+
+namespace
+{
+    // Codes used for identifying serialized types
+    enum StreamTypes {
+        NullPtrType = 0,
+
+        // Supported IPropertyValue types
+        UInt8Type, UInt16Type, UInt32Type, UInt64Type, Int16Type, Int32Type, Int64Type,
+        SingleType, DoubleType, BooleanType, Char16Type, GuidType, StringType,
+
+        // Additional supported types
+        StringToObjectMapType,
+
+        // Marker values used to ensure stream integrity
+        MapEndMarker
+    };
+
+    void WriteString(DataWriter^ writer, String^ string)
+    {
+        writer->WriteByte(StringType);
+        writer->WriteUInt32(writer->MeasureString(string));
+        writer->WriteString(string);
+    }
+
+    void WriteProperty(DataWriter^ writer, IPropertyValue^ propertyValue)
+    {
+        switch (propertyValue->Type)
+        {
+        case PropertyType::UInt8:
+            writer->WriteByte(UInt8Type);
+            writer->WriteByte(propertyValue->GetUInt8());
+            return;
+        case PropertyType::UInt16:
+            writer->WriteByte(UInt16Type);
+            writer->WriteUInt16(propertyValue->GetUInt16());
+            return;
+        case PropertyType::UInt32:
+            writer->WriteByte(UInt32Type);
+            writer->WriteUInt32(propertyValue->GetUInt32());
+            return;
+        case PropertyType::UInt64:
+            writer->WriteByte(UInt64Type);
+            writer->WriteUInt64(propertyValue->GetUInt64());
+            return;
+        case PropertyType::Int16:
+            writer->WriteByte(Int16Type);
+            writer->WriteUInt16(propertyValue->GetInt16());
+            return;
+        case PropertyType::Int32:
+            writer->WriteByte(Int32Type);
+            writer->WriteUInt32(propertyValue->GetInt32());
+            return;
+        case PropertyType::Int64:
+            writer->WriteByte(Int64Type);
+            writer->WriteUInt64(propertyValue->GetInt64());
+            return;
+        case PropertyType::Single:
+            writer->WriteByte(SingleType);
+            writer->WriteSingle(propertyValue->GetSingle());
+            return;
+        case PropertyType::Double:
+            writer->WriteByte(DoubleType);
+            writer->WriteDouble(propertyValue->GetDouble());
+            return;
+        case PropertyType::Boolean:
+            writer->WriteByte(BooleanType);
+            writer->WriteBoolean(propertyValue->GetBoolean());
+            return;
+        case PropertyType::Char16:
+            writer->WriteByte(Char16Type);
+            writer->WriteUInt16(propertyValue->GetChar16());
+            return;
+        case PropertyType::Guid:
+            writer->WriteByte(GuidType);
+            writer->WriteGuid(propertyValue->GetGuid());
+            return;
+        case PropertyType::String:
+            WriteString(writer, propertyValue->GetString());
+            return;
+        default:
+            throw ref new InvalidArgumentException("Unsupported property type");
+        }
+    }
+
+    void WriteStringToObjectMap(DataWriter^ writer, IMap<String^, Object^>^ map)
+    {
+        writer->WriteByte(StringToObjectMapType);
+        writer->WriteUInt32(map->Size);
+        for (auto&& pair : map)
+        {
+            WriteObject(writer, pair->Key);
+            WriteObject(writer, pair->Value);
+        }
+        writer->WriteByte(MapEndMarker);
+    }
+
+    void WriteObject(DataWriter^ writer, Object^ object)
+    {
+        if (object == nullptr)
+        {
+            writer->WriteByte(NullPtrType);
+            return;
+        }
+
+        auto propertyObject = dynamic_cast<IPropertyValue^>(object);
+        if (propertyObject != nullptr)
+        {
+            WriteProperty(writer, propertyObject);
+            return;
+        }
+
+        auto mapObject = dynamic_cast<IMap<String^, Object^>^>(object);
+        if (mapObject != nullptr)
+        {
+            WriteStringToObjectMap(writer, mapObject);
+            return;
+        }
+
+        throw ref new InvalidArgumentException("Unsupported data type");
+    }
+
+    String^ ReadString(DataReader^ reader)
+    {
+        int length = reader->ReadUInt32();
+        String^ string = reader->ReadString(length);
+        return string;
+    }
+
+    IMap<String^, Object^>^ ReadStringToObjectMap(DataReader^ reader)
+    {
+        auto map = ref new Map<String^, Object^>();
+        auto size = reader->ReadUInt32();
+        for (unsigned int index = 0; index < size; index++)
+        {
+            auto key = safe_cast<String^>(ReadObject(reader));
+            auto value = ReadObject(reader);
+            map->Insert(key, value);
+        }
+        if (reader->ReadByte() != MapEndMarker)
+        {
+            throw ref new InvalidArgumentException("Invalid stream");
+        }
+        return map;
+    }
+
+    Object^ ReadObject(DataReader^ reader)
+    {
+        auto type = reader->ReadByte();
+        switch (type)
+        {
+        case NullPtrType:
+            return nullptr;
+        case UInt8Type:
+            return reader->ReadByte();
+        case UInt16Type:
+            return reader->ReadUInt16();
+        case UInt32Type:
+            return reader->ReadUInt32();
+        case UInt64Type:
+            return reader->ReadUInt64();
+        case Int16Type:
+            return reader->ReadInt16();
+        case Int32Type:
+            return reader->ReadInt32();
+        case Int64Type:
+            return reader->ReadInt64();
+        case SingleType:
+            return reader->ReadSingle();
+        case DoubleType:
+            return reader->ReadDouble();
+        case BooleanType:
+            return reader->ReadBoolean();
+        case Char16Type:
+            return (char16_t)reader->ReadUInt16();
+        case GuidType:
+            return reader->ReadGuid();
+        case StringType:
+            return ReadString(reader);
+        case StringToObjectMapType:
+            return ReadStringToObjectMap(reader);
+        default:
+            throw ref new InvalidArgumentException("Unsupported property type");
+        }
+    }
+}
+
+#pragma endregion
diff --git a/samples/winrt/ImageManipulations/common/suspensionmanager.h b/samples/winrt/ImageManipulations/common/suspensionmanager.h
new file mode 100644
index 0000000000..65e1180a06
--- /dev/null
+++ b/samples/winrt/ImageManipulations/common/suspensionmanager.h
@@ -0,0 +1,50 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// SuspensionManager.h
+// Declaration of the SuspensionManager class
+//
+
+#pragma once
+
+#include <ppltasks.h>
+
+namespace SDKSample
+{
+    namespace Common
+    {
+        /// <summary>
+        /// SuspensionManager captures global session state to simplify process lifetime management
+        /// for an application.  Note that session state will be automatically cleared under a variety
+        /// of conditions and should only be used to store information that would be convenient to
+        /// carry across sessions, but that should be disacarded when an application crashes or is
+        /// upgraded.
+        /// </summary>
+        ref class SuspensionManager sealed
+        {
+        internal:
+            static void RegisterFrame(Windows::UI::Xaml::Controls::Frame^ frame, Platform::String^ sessionStateKey);
+            static void UnregisterFrame(Windows::UI::Xaml::Controls::Frame^ frame);
+            static Concurrency::task<void> SaveAsync(void);
+            static Concurrency::task<void> RestoreAsync(void);
+            static property Windows::Foundation::Collections::IMap<Platform::String^, Platform::Object^>^ SessionState
+            {
+                Windows::Foundation::Collections::IMap<Platform::String^, Platform::Object^>^ get(void);
+            };
+            static Windows::Foundation::Collections::IMap<Platform::String^, Platform::Object^>^ SessionStateForFrame(
+                Windows::UI::Xaml::Controls::Frame^ frame);
+
+        private:
+            static void RestoreFrameNavigationState(Windows::UI::Xaml::Controls::Frame^ frame);
+            static void SaveFrameNavigationState(Windows::UI::Xaml::Controls::Frame^ frame);
+        };
+    }
+}
diff --git a/samples/winrt/ImageManipulations/pch.cpp b/samples/winrt/ImageManipulations/pch.cpp
new file mode 100644
index 0000000000..97389d94cc
--- /dev/null
+++ b/samples/winrt/ImageManipulations/pch.cpp
@@ -0,0 +1,16 @@
+﻿//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// pch.cpp
+// Include the standard header and generate the precompiled header.
+//
+
+#include "pch.h"
diff --git a/samples/winrt/ImageManipulations/pch.h b/samples/winrt/ImageManipulations/pch.h
new file mode 100644
index 0000000000..13f9bc34c7
--- /dev/null
+++ b/samples/winrt/ImageManipulations/pch.h
@@ -0,0 +1,23 @@
+﻿//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+//
+// pch.h
+// Header for standard system include files.
+//
+
+#pragma once
+
+#include <collection.h>
+#include <ppltasks.h>
+#include <agile.h>
+#include "Common\LayoutAwarePage.h"
+#include "Common\SuspensionManager.h"
+#include "App.xaml.h"
diff --git a/samples/winrt/ImageManipulations/sample-utils/SampleTemplateStyles.xaml b/samples/winrt/ImageManipulations/sample-utils/SampleTemplateStyles.xaml
new file mode 100644
index 0000000000..ec2c1a7132
--- /dev/null
+++ b/samples/winrt/ImageManipulations/sample-utils/SampleTemplateStyles.xaml
@@ -0,0 +1,51 @@
+﻿<!--
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+-->
+<ResourceDictionary
+    xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation" 
+    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml">
+
+    <Style x:Key="TitleTextStyle" TargetType="TextBlock">
+        <Setter Property="FontFamily" Value="Segoe UI Light" />
+        <Setter Property="FontSize" Value="16" />
+    </Style>
+    <Style x:Key="HeaderTextStyle" TargetType="TextBlock">
+        <Setter Property="FontFamily" Value="Segoe UI Semilight" />
+        <Setter Property="FontSize" Value="26.667" />
+        <Setter Property="Margin" Value="0,0,0,25" />
+    </Style>
+    <Style x:Key="H2Style" TargetType="TextBlock">
+        <Setter Property="FontFamily" Value="Segoe UI" />
+        <Setter Property="FontSize" Value="14.667" />
+        <Setter Property="Margin" Value="0,0,0,0" />
+    </Style>
+    <Style x:Key="SubheaderTextStyle" TargetType="TextBlock">
+        <Setter Property="FontFamily" Value="Segoe UI Semilight" />
+        <Setter Property="FontSize" Value="14.667" />
+        <Setter Property="Margin" Value="0,0,0,5" />
+    </Style>
+    <Style x:Key="BasicTextStyle" TargetType="TextBlock">
+        <Setter Property="FontFamily" Value="Segoe UI Light" />
+        <Setter Property="FontSize" Value="16" />
+    </Style>
+    <Style x:Key="SeparatorStyle" TargetType="TextBlock">
+        <Setter Property="FontFamily" Value="Segoe UI" />
+        <Setter Property="FontSize" Value="9" />
+    </Style>
+    <Style x:Key="FooterStyle" TargetType="TextBlock">
+        <Setter Property="FontFamily" Value="Segoe UI" />
+        <Setter Property="FontSize" Value="12" />
+        <Setter Property="Margin" Value="0,8,0,0" />
+    </Style>
+    <Style x:Key="HyperlinkStyle" TargetType="HyperlinkButton">
+        <Setter Property="Padding" Value="5"/>
+    </Style>
+</ResourceDictionary>