diff --git a/3rdparty/ffmpeg/opencv_ffmpeg.dll b/3rdparty/ffmpeg/opencv_ffmpeg.dll index 565502c340..1641c8a516 100644 Binary files a/3rdparty/ffmpeg/opencv_ffmpeg.dll and b/3rdparty/ffmpeg/opencv_ffmpeg.dll differ diff --git a/3rdparty/ffmpeg/opencv_ffmpeg_64.dll b/3rdparty/ffmpeg/opencv_ffmpeg_64.dll index 05ab90be3b..ad02f4e684 100644 Binary files a/3rdparty/ffmpeg/opencv_ffmpeg_64.dll and b/3rdparty/ffmpeg/opencv_ffmpeg_64.dll differ diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f682c4cd2..99215d5900 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,7 +110,7 @@ endif() # Optional 3rd party components # =================================================== -OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS AND NOT CARMA) ) +OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) @@ -141,9 +141,9 @@ OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) ) OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_CLP "Include Clp support (EPL)" OFF) -OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) ) -OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) ) -OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) ) +OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) ) +OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) ) # OpenCV build components @@ -163,12 +163,12 @@ OCV_OPTION(BUILD_ANDROID_SERVICE "Build OpenCV Manager for Google Play" OFF I OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID ) # 3rd party libs -OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE OR CARMA ) -OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE OR CARMA ) -OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE OR CARMA ) -OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE OR CARMA ) -OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE OR CARMA ) -OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE OR CARMA ) +OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE ) +OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE ) +OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE ) +OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE ) +OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE ) +OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE ) OCV_OPTION(BUILD_TBB "Download and build TBB from source" ANDROID IF CMAKE_COMPILER_IS_GNUCXX ) # OpenCV installation options @@ -301,21 +301,19 @@ find_host_program(GIT_EXECUTABLE 
NAMES ${git_names} PATH_SUFFIXES Git/cmd Git/bi mark_as_advanced(GIT_EXECUTABLE) if(GIT_EXECUTABLE) - execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD + execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --always --dirty --match "2.[0-9].[0-9]*" WORKING_DIRECTORY "${OpenCV_SOURCE_DIR}" - OUTPUT_VARIABLE OPENCV_GIT_HASH_SORT + OUTPUT_VARIABLE OPENCV_VCSVERSION RESULT_VARIABLE GIT_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE ) - if(GIT_RESULT EQUAL 0) - set(OPENCV_VCSVERSION "commit:${OPENCV_GIT_HASH_SORT}") - else() - set(OPENCV_VCSVERSION "exported") + if(NOT GIT_RESULT EQUAL 0) + set(OPENCV_VCSVERSION "unknown") endif() else() # We don't have git: - set(OPENCV_VCSVERSION "") + set(OPENCV_VCSVERSION "unknown") endif() @@ -523,10 +521,21 @@ if(NOT CMAKE_GENERATOR MATCHES "Xcode|Visual Studio") endif() # ========================== C/C++ options ========================== +if(CMAKE_CXX_COMPILER_VERSION) + set(OPENCV_COMPILER_STR "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} (ver ${CMAKE_CXX_COMPILER_VERSION})") +elseif(CMAKE_COMPILER_IS_CLANGCXX) + set(OPENCV_COMPILER_STR "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} (ver ${CMAKE_CLANG_REGEX_VERSION})") +elseif(CMAKE_COMPILER_IS_GNUCXX) + set(OPENCV_COMPILER_STR "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} (ver ${CMAKE_GCC_REGEX_VERSION})") +else() + set(OPENCV_COMPILER_STR "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}") +endif() +string(STRIP "${OPENCV_COMPILER_STR}" OPENCV_COMPILER_STR) + status("") status(" C/C++:") status(" Built as dynamic libs?:" BUILD_SHARED_LIBS THEN YES ELSE NO) -status(" C++ Compiler:" CMAKE_COMPILER_IS_GNUCXX THEN "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} (ver ${CMAKE_GCC_REGEX_VERSION})" ELSE "${CMAKE_CXX_COMPILER}" ) +status(" C++ Compiler:" ${OPENCV_COMPILER_STR}) status(" C++ flags (Release):" ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}) status(" C++ flags (Debug):" ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}) status(" C Compiler:" ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}) @@ -575,15 +584,16 @@ if(ANDROID) status(" Android ABI:" ${ANDROID_ABI}) status(" STL type:" ${ANDROID_STL}) status(" Native API level:" android-${ANDROID_NATIVE_API_LEVEL}) - status(" SDK target:" "${ANDROID_SDK_TARGET}") + android_get_compatible_target(android_sdk_target_status ${ANDROID_NATIVE_API_LEVEL} ${ANDROID_SDK_TARGET} 11) + status(" SDK target:" "${android_sdk_target_status}") if(BUILD_WITH_ANDROID_NDK) status(" Android NDK:" "${ANDROID_NDK} (toolchain: ${ANDROID_TOOLCHAIN_NAME})") elseif(BUILD_WITH_STANDALONE_TOOLCHAIN) status(" Android toolchain:" "${ANDROID_STANDALONE_TOOLCHAIN}") endif() status(" android tool:" ANDROID_EXECUTABLE THEN "${ANDROID_EXECUTABLE} (${ANDROID_TOOLS_Pkg_Desc})" ELSE NO) - status(" ant:" ANT_EXECUTABLE THEN "${ANT_EXECUTABLE} (ver ${ANT_VERSION})" ELSE NO) - status(" Google Play package:" BUILD_ANDROID_PACKAGE THEN YES ELSE NO) + status(" Google Play package:" BUILD_ANDROID_PACKAGE THEN YES ELSE NO) + status(" Android examples:" BUILD_ANDROID_EXAMPLES AND CAN_BUILD_ANDROID_PROJECTS THEN YES ELSE NO) endif() # ========================== GUI ========================== @@ -707,7 +717,7 @@ if(DEFINED WITH_PVAPI) endif(DEFINED WITH_PVAPI) if(DEFINED WITH_GIGEAPI) - status(" GigEVisionSDK:" HAVE_GIGE_API THEN YES ELSE NO) + status(" GigEVisionSDK:" HAVE_GIGE_API THEN YES ELSE NO) endif(DEFINED WITH_GIGEAPI) if(DEFINED WITH_QUICKTIME) @@ -753,31 +763,21 @@ endif(DEFINED WITH_XINE) status("") status(" Other third-party libraries:") -if(DEFINED 
WITH_IPP) - if(WITH_IPP AND IPP_FOUND) - status(" Use IPP:" "${IPP_LATEST_VERSION_STR} [${IPP_LATEST_VERSION_MAJOR}.${IPP_LATEST_VERSION_MINOR}.${IPP_LATEST_VERSION_BUILD}]") - status(" at:" "${IPP_ROOT_DIR}") - else() - status(" Use IPP:" WITH_IPP AND NOT IPP_FOUND THEN "IPP not found" ELSE NO) - endif() -endif(DEFINED WITH_IPP) +if(WITH_IPP AND IPP_FOUND) + status(" Use IPP:" "${IPP_LATEST_VERSION_STR} [${IPP_LATEST_VERSION_MAJOR}.${IPP_LATEST_VERSION_MINOR}.${IPP_LATEST_VERSION_BUILD}]") + status(" at:" "${IPP_ROOT_DIR}") +else() + status(" Use IPP:" WITH_IPP AND NOT IPP_FOUND THEN "IPP not found" ELSE NO) +endif() -if(DEFINED WITH_TBB) - status(" Use TBB:" HAVE_TBB THEN "YES (ver ${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR} interface ${TBB_INTERFACE_VERSION})" ELSE NO) -endif(DEFINED WITH_TBB) - -if(DEFINED WITH_CSTRIPES) - status(" Use C=:" HAVE_CSTRIPES THEN YES ELSE NO) -endif(DEFINED WITH_CSTRIPES) - -if(DEFINED WITH_CUDA) - status(" Use Cuda:" HAVE_CUDA THEN "YES (ver ${CUDA_VERSION_STRING})" ELSE NO) -endif(DEFINED WITH_CUDA) - -status(" Use OpenCL:" HAVE_OPENCL THEN YES ELSE NO) - -status(" Use Eigen:" HAVE_EIGEN THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO) -status(" Use Clp:" HAVE_CLP THEN YES ELSE NO) +status(" Use Eigen:" HAVE_EIGEN THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO) +status(" Use TBB:" HAVE_TBB THEN "YES (ver ${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR} interface ${TBB_INTERFACE_VERSION})" ELSE NO) +status(" Use OpenMP:" HAVE_OPENMP THEN YES ELSE NO) +status(" Use GCD" HAVE_GCD THEN YES ELSE NO) +status(" Use Concurrency" HAVE_CONCURRENCY THEN YES ELSE NO) +status(" Use C=:" HAVE_CSTRIPES THEN YES ELSE NO) +status(" Use Cuda:" HAVE_CUDA THEN "YES (ver ${CUDA_VERSION_STRING})" ELSE NO) +status(" Use OpenCL:" HAVE_OPENCL THEN YES ELSE NO) if(HAVE_CUDA) status("") @@ -791,10 +791,23 @@ if(HAVE_CUDA) status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO) endif() +if(HAVE_OPENCL AND BUILD_opencv_ocl) + status("") + status(" OpenCL") + if(OPENCL_INCLUDE_DIR) + status(" Include:" ${OPENCL_INCLUDE_DIR}) + endif() + if(OPENCL_LIBRARIES) + status(" libraries:" ${OPENCL_LIBRARIES}) + endif() + status(" Use AMDFFT:" HAVE_CLAMDFFT THEN YES ELSE NO) + status(" Use AMDBLAS:" HAVE_CLAMDBLAS THEN YES ELSE NO) +endif() + # ========================== python ========================== status("") status(" Python:") -status(" Interpreter:" PYTHON_EXECUTABLE THEN "${PYTHON_EXECUTABLE} (ver ${PYTHON_VERSION_FULL})" ELSE NO) +status(" Interpreter:" PYTHON_EXECUTABLE THEN "${PYTHON_EXECUTABLE} (ver ${PYTHON_VERSION_FULL})" ELSE NO) if(BUILD_opencv_python) if(PYTHONLIBS_VERSION_STRING) status(" Libraries:" HAVE_opencv_python THEN "${PYTHON_LIBRARIES} (ver ${PYTHONLIBS_VERSION_STRING})" ELSE NO) @@ -805,6 +818,15 @@ if(BUILD_opencv_python) status(" packages path:" PYTHON_EXECUTABLE THEN "${PYTHON_PACKAGES_PATH}" ELSE "-") endif() +# ========================== java ========================== +status("") +status(" Java:") +status(" ant:" ANT_EXECUTABLE THEN "${ANT_EXECUTABLE} (ver ${ANT_VERSION})" ELSE NO) +if(NOT ANDROID) + status(" JNI:" JNI_INCLUDE_DIRS THEN "${JNI_INCLUDE_DIRS}" ELSE NO) +endif() +status(" Java tests:" BUILD_TESTS AND (NOT ANDROID OR CAN_BUILD_ANDROID_PROJECTS) THEN YES ELSE NO) + # ========================== documentation ========================== if(BUILD_DOCS) status("") @@ -823,12 +845,7 @@ status("") status(" Tests and samples:") status(" Tests:" BUILD_TESTS AND 
HAVE_opencv_ts THEN YES ELSE NO) status(" Performance tests:" BUILD_PERF_TESTS AND HAVE_opencv_ts THEN YES ELSE NO) -status(" Examples:" BUILD_EXAMPLES THEN YES ELSE NO) - -if(ANDROID) - status(" Android tests:" BUILD_TESTS AND CAN_BUILD_ANDROID_PROJECTS THEN YES ELSE NO) - status(" Android examples:" BUILD_ANDROID_EXAMPLES AND CAN_BUILD_ANDROID_PROJECTS THEN YES ELSE NO) -endif() +status(" C/C++ Examples:" BUILD_EXAMPLES THEN YES ELSE NO) # ========================== auxiliary ========================== status("") diff --git a/android/scripts/ABI_compat_generator.py b/android/scripts/ABI_compat_generator.py index 05f43829d2..39253bbdec 100755 --- a/android/scripts/ABI_compat_generator.py +++ b/android/scripts/ABI_compat_generator.py @@ -1,130 +1,228 @@ #!/usr/bin/python +from optparse import OptionParser +from shutil import rmtree import os -import sys -ANDROID_SDK_PATH = "/opt/android-sdk-linux" -ANDROID_NDK_PATH = None -INSTALL_DIRECTORY = None -CLASS_PATH = None -TMP_HEADER_PATH="tmp_include" -HEADER_EXTS = set(['h', 'hpp']) -SYS_INCLUDES = ["platforms/android-8/arch-arm/usr/include", "sources/cxx-stl/gnu-libstdc++/include", "sources/cxx-stl/gnu-libstdc++/libs/armeabi/include"] -PROJECT_NAME = "OpenCV-branch" -TARGET_LIBS = ["libopencv_java.so"] -ARCH = "armeabi" -GCC_OPTIONS = "-fpermissive" -EXCLUDE_HEADERS = set(["hdf5.h", "eigen.hpp", "cxeigen.hpp"]); +architecture = 'armeabi' +excludedHeaders = set(['hdf5.h', 'cap_ios.h', + 'eigen.hpp', 'cxeigen.hpp' #TOREMOVE + ]) +systemIncludes = ['sources/cxx-stl/gnu-libstdc++/4.6/include', \ + '/opt/android-ndk-r8c/platforms/android-8/arch-arm', # TODO: check if this one could be passed as command line arg + 'sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include'] +targetLibs = ['libopencv_java.so'] +preamble = ['Eigen/Core'] +# TODO: get gcc_options automatically +gcc_options = ['-fexceptions', '-frtti', '-Wno-psabi', '--sysroot=/opt/android-ndk-r8c/platforms/android-8/arch-arm', '-fpic', '-D__ARM_ARCH_5__', '-D__ARM_ARCH_5T__', '-D__ARM_ARCH_5E__', '-D__ARM_ARCH_5TE__', '-fsigned-char', '-march=armv5te', '-mtune=xscale', '-msoft-float', '-fdata-sections', '-ffunction-sections', '-Wa,--noexecstack ', '-W', '-Wall', '-Werror=return-type', '-Werror=address', '-Werror=sequence-point', '-Wformat', '-Werror=format-security', '-Wmissing-declarations', '-Wundef', '-Winit-self', '-Wpointer-arith', '-Wshadow', '-Wsign-promo', '-Wno-narrowing', '-fdiagnostics-show-option', '-fomit-frame-pointer', '-mthumb', '-fomit-frame-pointer', '-O3', '-DNDEBUG ', '-DNDEBUG'] +excludedOptionsPrefix = '-W' -def FindClasses(root, prefix): - classes = [] - if ("" != prefix): - prefix = prefix + "." 
- for path in os.listdir(root): - currentPath = os.path.join(root, path) - if (os.path.isdir(currentPath)): - classes += FindClasses(currentPath, prefix + path) - else: - name = str.split(path, ".")[0] - ext = str.split(path, ".")[1] - if (ext == "class"): - #print("class: %s" % (prefix + name)) - classes.append(prefix+name) - return classes -def FindHeaders(root): + +def GetHeaderFiles(root): headers = [] for path in os.listdir(root): - currentPath = os.path.join(root, path) - if (os.path.isdir(currentPath)): - headers += FindHeaders(currentPath) - else: - ext = str.split(path, ".")[-1] - #print("%s: \"%s\"" % (currentPath, ext)) - if (ext in HEADER_EXTS): - #print("Added as header file") - if (path not in EXCLUDE_HEADERS): - headers.append(currentPath) + if not os.path.isdir(os.path.join(root, path)) \ + and os.path.splitext(path)[1] in ['.h', '.hpp'] \ + and not path in excludedHeaders: + headers.append(os.path.join(root, path)) + return sorted(headers) + + + +def GetClasses(root, prefix): + classes = [] + if ('' != prefix): + prefix = prefix + '.' + for path in os.listdir(root): + currentPath = os.path.join(root, path) + if (os.path.isdir(currentPath)): + classes += GetClasses(currentPath, prefix + path) + else: + name = str.split(path, '.')[0] + ext = str.split(path, '.')[1] + if (ext == 'class'): + classes.append(prefix + name) + return classes + + + +def GetJavaHHeaders(): + print('\nGenerating JNI headers for Java API ...') + + javahHeaders = os.path.join(managerDir, 'javah_generated_headers') + if os.path.exists(javahHeaders): + rmtree(javahHeaders) + os.makedirs(os.path.join(os.getcwd(), javahHeaders)) + + AndroidJavaDeps = os.path.join(SDK_path, 'platforms/android-11/android.jar') + + classPath = os.path.join(managerDir, 'sdk/java/bin/classes') + if not os.path.exists(classPath): + print('Error: no Java classes found in \'%s\'' % classPath) + quit() + + allJavaClasses = GetClasses(classPath, '') + if not allJavaClasses: + print('Error: no Java classes found') + quit() + + for currentClass in allJavaClasses: + os.system('javah -d %s -classpath %s:%s %s' % (javahHeaders, classPath, \ + AndroidJavaDeps, currentClass)) + + print('\nBuilding JNI headers list ...') + jniHeaders = GetHeaderFiles(javahHeaders) + + return jniHeaders + + + +def GetImmediateSubdirs(dir): + return [name for name in os.listdir(dir) + if os.path.isdir(os.path.join(dir, name))] + + + +def GetOpenCVModules(): + makefile = open(os.path.join(managerDir, 'sdk/native/jni/OpenCV.mk'), 'r') + makefileStr = makefile.read() + left = makefileStr.find('OPENCV_MODULES:=') + len('OPENCV_MODULES:=') + right = makefileStr[left:].find('\n') + modules = makefileStr[left:left+right].split() + modules = filter(lambda x: x != 'ts' and x != 'androidcamera', modules) + return modules + + + +def FindHeaders(): + headers = [] + + print('\nBuilding Native OpenCV header list ...') + + cppHeadersFolder = os.path.join(managerDir, 'sdk/native/jni/include/opencv2') + + modulesFolders = GetImmediateSubdirs(cppHeadersFolder) + modules = GetOpenCVModules() + + cppHeaders = [] + for m in modules: + for f in modulesFolders: + moduleHeaders = [] + if f == m: + moduleHeaders += GetHeaderFiles(os.path.join(cppHeadersFolder, f)) + if m == 'flann': + flann = os.path.join(cppHeadersFolder, f, 'flann.hpp') + moduleHeaders.remove(flann) + moduleHeaders.insert(0, flann) + cppHeaders += moduleHeaders + + + cppHeaders += GetHeaderFiles(cppHeadersFolder) + headers += cppHeaders + + cHeaders = GetHeaderFiles(os.path.join(managerDir, \ + 
'sdk/native/jni/include/opencv')) + headers += cHeaders + + headers += GetJavaHHeaders() + return headers -if (len(sys.argv) < 3): - print("Error: Invalid command line arguments") - exit(-1) -INSTALL_DIRECTORY = sys.argv[1] -PROJECT_NAME = sys.argv[2] -CLASS_PATH = os.path.join(INSTALL_DIRECTORY, "sdk/java/bin/classes") -if (not os.path.exists(CLASS_PATH)): - print("Error: no java classes found in \"%s\"" % CLASS_PATH) - exit(-2) +def FindLibraries(): + libraries = [] + for lib in targetLibs: + libraries.append(os.path.join(managerDir, 'sdk/native/libs', architecture, lib)) + return libraries -if (os.environ.has_key("NDK_ROOT")): - ANDROID_NDK_PATH = os.environ["NDK_ROOT"]; - print("Using Android NDK from NDK_ROOT (\"%s\")" % ANDROID_NDK_PATH) -if (not ANDROID_NDK_PATH): - pipe = os.popen("which ndk-build") - tmp = str.strip(pipe.readline(), "\n") - while(not tmp): - tmp = str.strip(pipe.readline(), "\n") - pipe.close() - ANDROID_NDK_PATH = os.path.split(tmp)[0] - print("Using Android NDK from PATH (\"%s\")" % ANDROID_NDK_PATH) -print("Using Android SDK from \"%s\"" % ANDROID_SDK_PATH) +def FindIncludes(): + includes = [os.path.join(managerDir, 'sdk', 'native', 'jni', 'include'), + os.path.join(managerDir, 'sdk', 'native', 'jni', 'include', 'opencv'), + os.path.join(managerDir, 'sdk', 'native', 'jni', 'include', 'opencv2')] -outputFileName = PROJECT_NAME + ".xml" -try: - outputFile = open(outputFileName, "w") -except: - print("Error: Cannot open output file \"%s\" for writing" % outputFileName) + for inc in systemIncludes: + includes.append(os.path.join(NDK_path, inc)) -allJavaClasses = FindClasses(CLASS_PATH, "") -if (not allJavaClasses): - print("Error: No Java classes found :(") - exit(-1) + return includes -if (not os.path.exists(TMP_HEADER_PATH)): - os.makedirs(os.path.join(os.getcwd(), TMP_HEADER_PATH)) -print("Generating JNI headers for Java API ...") -AndroidJavaDeps = os.path.join(ANDROID_SDK_PATH, "platforms/android-11/android.jar") -for currentClass in allJavaClasses: - os.system("javah -d %s -classpath %s:%s %s" % (TMP_HEADER_PATH, CLASS_PATH, AndroidJavaDeps, currentClass)) -print("Building JNI headers list ...") -jniHeaders = FindHeaders(TMP_HEADER_PATH) -#print(jniHeaders) +def FilterGCCOptions(): + gcc = filter(lambda x: not x.startswith(excludedOptionsPrefix), gcc_options) + return sorted(gcc) -print("Building Native OpenCV header list ...") -cHeaders = FindHeaders(os.path.join(INSTALL_DIRECTORY, "sdk/native/jni/include/opencv")) -cppHeaders = FindHeaders(os.path.join(INSTALL_DIRECTORY, "sdk/native/jni/include/opencv2")) -#print(cHeaders) -#print(cppHeaders) -print("Writing config file ...") -outputFile.write("\n\n\n\t%s\n\n\n\n" % PROJECT_NAME) -outputFile.write("\t" + "\n\t".join(cHeaders)) -outputFile.write("\n\t" + "\n\t".join(cppHeaders)) -outputFile.write("\n\t" + "\n\t".join(jniHeaders)) -outputFile.write("\n\n\n") -includes = [os.path.join(INSTALL_DIRECTORY, "sdk", "native", "jni", "include"), - os.path.join(INSTALL_DIRECTORY, "sdk", "native", "jni", "include", "opencv"), - os.path.join(INSTALL_DIRECTORY, "sdk", "native", "jni", "include", "opencv2")] +def WriteXml(version, headers, includes, libraries): + xmlName = version + '.xml' -for inc in SYS_INCLUDES: - includes.append(os.path.join(ANDROID_NDK_PATH, inc)) + print '\noutput file: ' + xmlName + try: + xml = open(xmlName, 'w') + except: + print 'Error: Cannot open output file "%s" for writing' % xmlName + quit() -outputFile.write("\n\t%s\n\n\n" % "\n\t".join(includes)) + xml.write('') -libraries = [] -for lib 
in TARGET_LIBS: - libraries.append(os.path.join(INSTALL_DIRECTORY, "sdk/native/libs", ARCH, lib)) + xml.write('\n\n') + xml.write('\n\t%s' % version) + xml.write('\n') -outputFile.write("\n\t%s\n\n\n" % "\n\t".join(libraries)) -outputFile.write("\n\t%s\n\n\n" % GCC_OPTIONS) + xml.write('\n\n') + xml.write('\n\t%s' % '\n\t'.join(headers)) + xml.write('\n') -print("done!") + xml.write('\n\n') + xml.write('\n\t%s' % '\n\t'.join(includes)) + xml.write('\n') + + # TODO: uncomment when Eigen problem is solved + # xml.write('\n\n') + # xml.write('\n\t%s' % '\n\t'.join(preamble)) + # xml.write('\n') + + xml.write('\n\n') + xml.write('\n\t%s' % '\n\t'.join(libraries)) + xml.write('\n') + + xml.write('\n\n') + xml.write('\n\t%s' % '\n\t'.join(gcc_options)) + xml.write('\n') + + xml.write('\n\n') + + + +if __name__ == '__main__': + usage = '%prog ' + parser = OptionParser(usage = usage) + + args = parser.parse_args() + if 2 != len(args): + parser.print_help() + quit() + + managerDir = args[1][0] + version = args[1][1] + + NDK_path = '/opt/android-ndk-r8c' + print '\nUsing Android NDK from "%s"' % NDK_path + + SDK_path = '~/NVPACK/android-sdk-linux' + print '\nUsing Android SDK from "%s"' % SDK_path + + headers = FindHeaders() + + includes = FindIncludes() + + libraries = FindLibraries() + + gcc_options = FilterGCCOptions() + + WriteXml(version, headers, includes, libraries) diff --git a/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.cpp b/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.cpp index b2d8fd7085..d202d654d4 100644 --- a/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.cpp +++ b/android/service/engine/jni/JNIWrapper/JavaBasedPackageManager.cpp @@ -40,6 +40,7 @@ bool JavaBasedPackageManager::InstallPackage(const PackageInfo& package) if (!jmethod) { LOGE("MarketConnector::GetAppFormMarket method was not found!"); + jenv->DeleteLocalRef(jclazz); return false; } @@ -74,7 +75,6 @@ vector JavaBasedPackageManager::GetInstalledPackages() JavaContext->AttachCurrentThread(&jenv, NULL); } - LOGD("GetObjectClass call"); jclass jclazz = jenv->GetObjectClass(JavaPackageManager); if (!jclazz) { @@ -82,15 +82,14 @@ vector JavaBasedPackageManager::GetInstalledPackages() return result; } - LOGD("GetMethodID call"); jmethodID jmethod = jenv->GetMethodID(jclazz, "GetInstalledOpenCVPackages", "()[Landroid/content/pm/PackageInfo;"); if (!jmethod) { LOGE("MarketConnector::GetInstalledOpenCVPackages method was not found!"); + jenv->DeleteLocalRef(jclazz); return result; } - LOGD("Java package manager call"); jobjectArray jpkgs = static_cast(jenv->CallNonvirtualObjectMethod(JavaPackageManager, jclazz, jmethod)); jsize size = jenv->GetArrayLength(jpkgs); @@ -102,7 +101,6 @@ vector JavaBasedPackageManager::GetInstalledPackages() { jobject jtmp = jenv->GetObjectArrayElement(jpkgs, i); PackageInfo tmp = ConvertPackageFromJava(jtmp, jenv); - jenv->DeleteLocalRef(jtmp); if (tmp.IsValid()) result.push_back(tmp); @@ -137,6 +135,7 @@ static jint GetAndroidVersion(JNIEnv* jenv) PackageInfo JavaBasedPackageManager::ConvertPackageFromJava(jobject package, JNIEnv* jenv) { jclass jclazz = jenv->GetObjectClass(package); + jfieldID jfield = jenv->GetFieldID(jclazz, "packageName", "Ljava/lang/String;"); jstring jnameobj = static_cast(jenv->GetObjectField(package, jfield)); const char* jnamestr = jenv->GetStringUTFChars(jnameobj, NULL); @@ -148,6 +147,7 @@ PackageInfo JavaBasedPackageManager::ConvertPackageFromJava(jobject package, JNI const char* jversionstr = 
jenv->GetStringUTFChars(jversionobj, NULL); string verison(jversionstr); jenv->DeleteLocalRef(jversionobj); + jenv->DeleteLocalRef(jclazz); static const jint api_level = GetAndroidVersion(jenv); diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index fb08384f8e..22b5371a1e 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -33,13 +33,8 @@ if(CUDA_FOUND) message(STATUS "CUDA detected: " ${CUDA_VERSION}) - if (CARMA) - set(CUDA_ARCH_BIN "2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") - else() - set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") - endif() + set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}") string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}") @@ -83,13 +78,8 @@ if(CUDA_FOUND) set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}") endforeach() - if(CARMA) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM" ) - - if (CMAKE_VERSION VERSION_LESS 2.8.10) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -ccbin=${CMAKE_CXX_COMPILER}" ) - endif() - + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM") endif() # These vars will be processed in other scripts diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake index d8c00cfdd3..7b6ff5e5bc 100644 --- a/cmake/OpenCVDetectCXXCompiler.cmake +++ b/cmake/OpenCVDetectCXXCompiler.cmake @@ -8,10 +8,12 @@ endif() if(NOT APPLE) if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") set(CMAKE_COMPILER_IS_GNUCXX 1) + set(CMAKE_COMPILER_IS_CLANGCXX 1) set(ENABLE_PRECOMPILED_HEADERS OFF CACHE BOOL "" FORCE) endif() if(CMAKE_C_COMPILER_ID STREQUAL "Clang") set(CMAKE_COMPILER_IS_GNUCC 1) + set(CMAKE_COMPILER_IS_CLANGCC 1) set(ENABLE_PRECOMPILED_HEADERS OFF CACHE BOOL "" FORCE) endif() endif() @@ -44,16 +46,24 @@ if(MSVC AND CMAKE_C_COMPILER MATCHES "icc") set(CV_ICC __INTEL_COMPILER_FOR_WINDOWS) endif() -if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR (UNIX AND CV_ICC)) - set(CV_COMPILER_IS_GNU TRUE) -else() - set(CV_COMPILER_IS_GNU FALSE) -endif() - # ---------------------------------------------------------------------------- # Detect GNU version: # ---------------------------------------------------------------------------- -if(CMAKE_COMPILER_IS_GNUCXX) +if(CMAKE_COMPILER_IS_CLANGCXX) + set(CMAKE_GCC_REGEX_VERSION "4.2.1") + set(CMAKE_OPENCV_GCC_VERSION_MAJOR 4) + set(CMAKE_OPENCV_GCC_VERSION_MINOR 2) + set(CMAKE_OPENCV_GCC_VERSION 42) + set(CMAKE_OPENCV_GCC_VERSION_NUM 402) + + execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -v + ERROR_VARIABLE CMAKE_OPENCV_CLANG_VERSION_FULL + ERROR_STRIP_TRAILING_WHITESPACE) + + string(REGEX MATCH "version.*$" CMAKE_OPENCV_CLANG_VERSION_FULL "${CMAKE_OPENCV_CLANG_VERSION_FULL}") + string(REGEX MATCH 
"[0-9]+\\.[0-9]+" CMAKE_CLANG_REGEX_VERSION "${CMAKE_OPENCV_CLANG_VERSION_FULL}") + +elseif(CMAKE_COMPILER_IS_GNUCXX) execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -dumpversion OUTPUT_VARIABLE CMAKE_OPENCV_GCC_VERSION_FULL OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/cmake/OpenCVFindLibsPerf.cmake b/cmake/OpenCVFindLibsPerf.cmake index 6772c79b87..6e497accd3 100644 --- a/cmake/OpenCVFindLibsPerf.cmake +++ b/cmake/OpenCVFindLibsPerf.cmake @@ -7,11 +7,6 @@ if(WITH_TBB) include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectTBB.cmake") endif(WITH_TBB) -# --- C= --- -if(WITH_CSTRIPES) - include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectCStripes.cmake") -endif(WITH_CSTRIPES) - # --- IPP --- ocv_clear_vars(IPP_FOUND) if(WITH_IPP) @@ -81,4 +76,36 @@ if(WITH_CLP) set(HAVE_CLP TRUE) endif() endif() -endif(WITH_CLP) \ No newline at end of file +endif(WITH_CLP) + +# --- C= --- +if(WITH_CSTRIPES AND NOT HAVE_TBB) + include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectCStripes.cmake") +else() + set(HAVE_CSTRIPES 0) +endif() + +# --- OpenMP --- +if(NOT HAVE_TBB AND NOT HAVE_CSTRIPES) + set(_fname "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/omptest.cpp") + FILE(WRITE "${_fname}" "#ifndef _OPENMP\n#error\n#endif\nint main() { return 0; }\n") + TRY_COMPILE(HAVE_OPENMP "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp" "${_fname}") +else() + set(HAVE_OPENMP 0) +endif() + +# --- GCD --- +if(APPLE AND NOT HAVE_TBB AND NOT HAVE_CSTRIPES AND NOT HAVE_OPENMP) + set(HAVE_GCD 1) +else() + set(HAVE_GCD 0) +endif() + +# --- Concurrency --- +if(MSVC AND NOT HAVE_TBB AND NOT HAVE_CSTRIPES AND NOT HAVE_OPENMP) + set(_fname "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/concurrencytest.cpp") + FILE(WRITE "${_fname}" "#if _MSC_VER < 1600\n#error\n#endif\nint main() { return 0; }\n") + TRY_COMPILE(HAVE_CONCURRENCY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp" "${_fname}") +else() + set(HAVE_CONCURRENCY 0) +endif() diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake index f40cc6d19c..db24c99708 100644 --- a/cmake/OpenCVUtils.cmake +++ b/cmake/OpenCVUtils.cmake @@ -141,7 +141,7 @@ macro(ocv_warnings_disable) set(${var} "${${var}} ${warning}") endforeach() endforeach() - elseif(CV_COMPILER_IS_GNU AND _gxx_warnings AND _flag_vars) + elseif((CMAKE_COMPILER_IS_GNUCXX OR (UNIX AND CV_ICC)) AND _gxx_warnings AND _flag_vars) foreach(var ${_flag_vars}) foreach(warning ${_gxx_warnings}) if(NOT warning MATCHES "^-Wno-") diff --git a/cmake/templates/OpenCVConfig.cmake.in b/cmake/templates/OpenCVConfig.cmake.in index 4cf87536d8..d31e1b6a61 100644 --- a/cmake/templates/OpenCVConfig.cmake.in +++ b/cmake/templates/OpenCVConfig.cmake.in @@ -42,8 +42,9 @@ set(OpenCV_COMPUTE_CAPABILITIES @OpenCV_CUDA_CC_CONFIGCMAKE@) set(OpenCV_CUDA_VERSION @OpenCV_CUDA_VERSION@) -set(OpenCV_USE_CUBLAS @HAVE_CUBLAS@) -set(OpenCV_USE_CUFFT @HAVE_CUFFT@) +set(OpenCV_USE_CUBLAS @HAVE_CUBLAS@) +set(OpenCV_USE_CUFFT @HAVE_CUFFT@) +set(OpenCV_USE_NVCUVID @HAVE_NVCUVID@) # Android API level from which OpenCV has been compiled is remembered set(OpenCV_ANDROID_NATIVE_API_LEVEL @OpenCV_ANDROID_NATIVE_API_LEVEL_CONFIGCMAKE@) @@ -218,13 +219,14 @@ foreach(__opttype OPT DBG) else() #TODO: duplicates are annoying but they should not be the problem endif() - # fix hard coded paths for CUDA libraries under Windows - if(WIN32 AND OpenCV_CUDA_VERSION AND NOT OpenCV_SHARED) + + # CUDA + if(OpenCV_CUDA_VERSION AND WIN32 AND NOT OpenCV_SHARED) if(NOT CUDA_FOUND) find_package(CUDA ${OpenCV_CUDA_VERSION} EXACT REQUIRED) 
else() if(NOT CUDA_VERSION_STRING VERSION_EQUAL OpenCV_CUDA_VERSION) - message(FATAL_ERROR "OpenCV static library compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}") + message(FATAL_ERROR "OpenCV static library was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}") endif() endif() @@ -238,6 +240,13 @@ foreach(__opttype OPT DBG) list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUFFT_LIBRARIES}) endif() + if(OpenCV_USE_NVCUVID) + list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvid_LIBRARIES}) + endif() + + if(WIN32) + list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvenc_LIBRARIES}) + endif() endif() endforeach() diff --git a/doc/tutorials/definitions/tocDefinitions.rst b/doc/tutorials/definitions/tocDefinitions.rst index c556a394ed..ffe763fd16 100644 --- a/doc/tutorials/definitions/tocDefinitions.rst +++ b/doc/tutorials/definitions/tocDefinitions.rst @@ -8,4 +8,5 @@ .. |Author_FernandoI| unicode:: Fernando U+0020 Iglesias U+0020 Garc U+00ED a .. |Author_EduardF| unicode:: Eduard U+0020 Feicho .. |Author_AlexB| unicode:: Alexandre U+0020 Benoit - +.. |Author_EricCh| unicode:: Eric U+0020 Christiansen +.. |Author_AndreyP| unicode:: Andrey U+0020 Pavlenko diff --git a/doc/tutorials/introduction/desktop_java/images/Java_logo.png b/doc/tutorials/introduction/desktop_java/images/Java_logo.png new file mode 100644 index 0000000000..2114751896 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/Java_logo.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/ant_output.png b/doc/tutorials/introduction/desktop_java/images/ant_output.png new file mode 100644 index 0000000000..a658fd7853 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/ant_output.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/cmake_output.png b/doc/tutorials/introduction/desktop_java/images/cmake_output.png new file mode 100644 index 0000000000..bab140d6a4 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/cmake_output.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_main_class.png b/doc/tutorials/introduction/desktop_java/images/eclipse_main_class.png new file mode 100644 index 0000000000..84c152e6da Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_main_class.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_new_java_prj.png b/doc/tutorials/introduction/desktop_java/images/eclipse_new_java_prj.png new file mode 100644 index 0000000000..34e03972e7 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_new_java_prj.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_run.png b/doc/tutorials/introduction/desktop_java/images/eclipse_run.png new file mode 100644 index 0000000000..2efc8e6715 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_run.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib.png new file mode 100644 index 0000000000..11694526ac Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib2.png 
b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib2.png new file mode 100644 index 0000000000..2b9ec5c3c1 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib2.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib3.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib3.png new file mode 100644 index 0000000000..4bf83ee033 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib3.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib4.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib4.png new file mode 100644 index 0000000000..c3f353155b Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib4.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib5.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib5.png new file mode 100644 index 0000000000..ed79d92d41 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib5.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib6.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib6.png new file mode 100644 index 0000000000..3a98e38b1b Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib6.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib7.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib7.png new file mode 100644 index 0000000000..019432016a Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib7.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib8.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib8.png new file mode 100644 index 0000000000..5650aa79a4 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib8.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/faceDetection.png b/doc/tutorials/introduction/desktop_java/images/faceDetection.png new file mode 100644 index 0000000000..a7c97421e1 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/faceDetection.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/lena.png b/doc/tutorials/introduction/desktop_java/images/lena.png new file mode 100644 index 0000000000..68342fae53 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/lena.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/sbt_eclipse.png b/doc/tutorials/introduction/desktop_java/images/sbt_eclipse.png new file mode 100644 index 0000000000..cd532d7f54 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/sbt_eclipse.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/sbt_run.png b/doc/tutorials/introduction/desktop_java/images/sbt_run.png new file mode 100644 index 0000000000..b2cbdd47e1 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/sbt_run.png differ diff --git a/doc/tutorials/introduction/desktop_java/images/sbt_run_face.png b/doc/tutorials/introduction/desktop_java/images/sbt_run_face.png new file mode 100644 index 0000000000..7e105f1000 Binary files /dev/null and b/doc/tutorials/introduction/desktop_java/images/sbt_run_face.png differ diff --git 
a/doc/tutorials/introduction/desktop_java/java_dev_intro.rst b/doc/tutorials/introduction/desktop_java/java_dev_intro.rst new file mode 100644 index 0000000000..9e5e9510c3 --- /dev/null +++ b/doc/tutorials/introduction/desktop_java/java_dev_intro.rst @@ -0,0 +1,523 @@ + +.. _Java_Dev_Intro: + + +Introduction to Java Development +******************************** + +Last updated: 12 February, 2013. + +As of OpenCV 2.4.4, OpenCV supports desktop Java development using nearly the same interface as for +Android development. This guide will help you to create your first Java (or Scala) application using OpenCV. +We will use either `Eclipse `_, `Apache Ant `_ or the +`Simple Build Tool (SBT) `_ to build the application. + +For further reading after this guide, look at the :ref:`Android_Dev_Intro` tutorials. + +What we'll do in this guide +*************************** + +In this guide, we will: + +* Get OpenCV with desktop Java support + +* Create an ``Ant``, ``Eclipse`` or ``SBT`` project + +* Write a simple OpenCV application in Java or Scala + +The same process was used to create the samples in the :file:`samples/java` folder of the OpenCV repository, +so consult those files if you get lost. + +Get OpenCV with desktop Java support +************************************ + +Starting from version 2.4.4 OpenCV includes desktop Java bindings. +The most simple way to get it is downloading the appropriate package of **version 2.4.4 or higher** from the +`OpenCV SourceForge repository `_. + +.. note:: Windows users can find the prebuilt files needed for Java development in the + :file:`opencv/build/java/` folder inside the package. + For other OSes it's required to build OpenCV from sources. + +Another option to get OpenCV sources is to clone `OpenCV git repository +`_. +In order to build OpenCV with Java bindings you need :abbr:`JDK (Java Development Kit)` +(we recommend `Oracle/Sun JDK 6 or 7 `_), +`Apache Ant `_ and `Python` v2.6 or higher to be installed. + +Build OpenCV +############ + +Let's build OpenCV: + + .. code-block:: bash + + git clone git://github.com/Itseez/opencv.git + cd opencv + git checkout 2.4 + mkdir build + cd build + +Generate a Makefile or a MS Visual Studio* solution, or whatever you use for +building executables in your system: + + .. code-block:: bash + + cmake -DBUILD_SHARED_LIBS=OFF .. + +or + + .. code-block:: bat + + cmake -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 10" .. + +.. note:: When OpenCV is built as a set of **static** libraries (``-DBUILD_SHARED_LIBS=OFF`` option) + the Java bindings dynamic library is all-sufficient, + i.e. doesn't depend on other OpenCV libs, but includes all the OpenCV code inside. + +Examine the output of CMake and ensure ``java`` is one of the modules "To be built". +If not, it's likely you're missing a dependency. You should troubleshoot by looking +through the CMake output for any Java-related tools that aren't found and installing them. + + .. image:: images/cmake_output.png + :alt: CMake output + :align: center + +Now start the build: + + .. code-block:: bash + + make -j8 + +or + + .. code-block:: bat + + msbuild /m OpenCV.sln /t:Build /p:Configuration=Release /v:m + +Besides all this will create a ``jar`` containing the Java interface (:file:`bin/opencv_2.4.4.jar`) +and a native dynamic library containing Java bindings and all the OpenCV stuff +(:file:`bin/Release/opencv_java244.dll` or :file:`bin/libopencv_java244.so` respectively). +We'll use these files later. 
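+
+Before moving on, it may be worth a quick sanity check that both artifacts were really produced.
+A minimal check could look like this (the file names below are only examples for version 2.4.4 and an
+out-of-source build in :file:`build/`; they will differ for other versions, platforms and configurations):
+
+ .. code-block:: bash
+
+    ls bin/opencv-244.jar                       # the jar with the Java interface
+    ls bin/libopencv_java244.so                 # the native bindings (bin/Release/opencv_java244.dll on Windows)
+    jar tf bin/opencv-244.jar | grep Mat.class  # the Java API classes should be listed inside the jar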
+ +Create a simple Java sample and an Ant build file for it +******************************************************** + +.. note:: + The described sample is provided with OpenCV library in the :file:`opencv/samples/java/ant` folder. + +* Create a folder where you'll develop this sample application. + +* In this folder create an XML file with the following content using any text editor: + + .. code-block:: xml + :linenos: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .. note:: + This XML file can be reused for building other Java applications. + It describes a common folder structure in the lines 3 - 12 and common targets + for compiling and running the application. + + When reusing this XML don't forget to modify the project name in the line 1, + that is also the name of the `main` class (line 14). + The paths to OpenCV `jar` and `jni lib` are expected as parameters + (``"${ocvJarDir}"`` in line 5 and ``"${ocvLibDir}"`` in line 37), but + you can hardcode these paths for your convenience. + See `Ant documentation `_ for detailed description + of its build file format. + +* Create an :file:`src` folder next to the :file:`build.xml` file and a :file:`SimpleSample.java` file in it. + +* Put the following Java code into the :file:`SimpleSample.java` file: + .. code-block:: java + + import org.opencv.core.Mat; + import org.opencv.core.CvType; + import org.opencv.core.Scalar; + + class SimpleSample { + + static{ System.loadLibrary("opencv_java244"); } + + public static void main(String[] args) { + Mat m = new Mat(5, 10, CvType.CV_8UC1, new Scalar(0)); + System.out.println("OpenCV Mat: " + m); + Mat mr1 = m.row(1); + mr1.setTo(new Scalar(1)); + Mat mc5 = m.col(5); + mc5.setTo(new Scalar(5)); + System.out.println("OpenCV Mat data:\n" + m.dump()); + } + + } + +* Run the following command in console in the folder containing :file:`build.xml`: + .. code-block:: bash + + ant -DocvJarDir=path/to/dir/containing/opencv-244.jar -DocvLibDir=path/to/dir/containing/opencv_java244/native/library + + For example: + + .. code-block:: bat + + ant -DocvJarDir=X:\opencv-2.4.4\bin -DocvLibDir=X:\opencv-2.4.4\bin\Release + + The command should initiate [re]building and running the sample. + You should see on the screen something like this: + + .. image:: images/ant_output.png + :alt: run app with Ant + :align: center + +Create a simple Java project in Eclipse +*************************************** + +Now let's look at the possiblity of using OpenCV in Java when developing in Eclipse IDE. + +* Create a new Eclipse workspace +* Create a new Java project via :guilabel:`File --> New --> Java Project` + + .. image:: images/eclipse_new_java_prj.png + :alt: Eclipse: new Java project + :align: center + + Call it say "HelloCV". + +* Open :guilabel:`Java Build Path` tab on :guilabel:`Project Properties` dialog + and configure additional library (OpenCV) reference (jar and native library location): + + .. image:: images/eclipse_user_lib.png + :alt: Eclipse: external JAR + :align: center + + ` ` + + .. image:: images/eclipse_user_lib2.png + :alt: Eclipse: external JAR + :align: center + + ` ` + + .. image:: images/eclipse_user_lib3.png + :alt: Eclipse: external JAR + :align: center + + ` ` + + .. image:: images/eclipse_user_lib4.png + :alt: Eclipse: external JAR + :align: center + + ` ` + + .. image:: images/eclipse_user_lib5.png + :alt: Eclipse: external JAR + :align: center + + ` ` + + .. 
image:: images/eclipse_user_lib6.png + :alt: Eclipse: external JAR + :align: center + + ` ` + + .. image:: images/eclipse_user_lib7.png + :alt: Eclipse: external JAR + :align: center + + ` ` + + .. image:: images/eclipse_user_lib8.png + :alt: Eclipse: external JAR + :align: center + + ` ` + +* Add a new Java class (say ``Main``) containing the application entry: + + .. image:: images/eclipse_main_class.png + :alt: Eclipse: Main class + :align: center + +* Put some simple OpenCV calls there, e.g.: + .. code-block:: java + + import org.opencv.core.CvType; + import org.opencv.core.Mat; + + public class Main { + public static void main(String[] args) { + System.loadLibrary("opencv_java244"); + Mat m = Mat.eye(3, 3, CvType.CV_8UC1); + System.out.println("m = " + m.dump()); + } + } + +* Press :guilabel:`Run` button and find the identity matrix content in the Eclipse ``Console`` window. + + .. image:: images/eclipse_run.png + :alt: Eclipse: run + :align: center + +Create an SBT project and samples in Java and Scala +*************************************************** + +Now we'll create a simple Java application using SBT. This serves as a brief introduction to +those unfamiliar with this build tool. We're using SBT because it is particularly easy and powerful. + +First, download and install `SBT `_ using the instructions on its `web site `_. + +Next, navigate to a new directory where you'd like the application source to live (outside :file:`opencv` dir). +Let's call it "JavaSample" and create a directory for it: + + .. code-block:: bash + + cd + mkdir JavaSample + +Now we will create the necessary folders and an SBT project: + + .. code-block:: bash + + cd JavaSample + mkdir -p src/main/java # This is where SBT expects to find Java sources + mkdir project # This is where the build definitions live + +Now open :file:`project/build.scala` in your favorite editor and paste the following. +It defines your project: + + .. code-block:: scala + + import sbt._ + import Keys._ + + object JavaSampleBuild extends Build { + def scalaSettings = Seq( + scalaVersion := "2.10.0", + scalacOptions ++= Seq( + "-optimize", + "-unchecked", + "-deprecation" + ) + ) + + def buildSettings = + Project.defaultSettings ++ + scalaSettings + + lazy val root = { + val settings = buildSettings ++ Seq(name := "JavaSample") + Project(id = "JavaSample", base = file("."), settings = settings) + } + } + +Now edit :file:`project/plugins.sbt` and paste the following. +This will enable auto-generation of an Eclipse project: + + .. code-block:: scala + + addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.1.0") + +Now run ``sbt`` from the :file:`JavaSample` root and from within SBT run ``eclipse`` to generate an eclipse project: + + .. code-block:: bash + + sbt # Starts the sbt console + > eclipse # Running "eclipse" from within the sbt console + +You should see something like this: + + .. image:: images/sbt_eclipse.png + :alt: SBT output + :align: center + +You can now import the SBT project to Eclipse using :guilabel:`Import ... -> Existing projects into workspace`. +Whether you actually do this is optional for the guide; +we'll be using SBT to build the project, so if you choose to use Eclipse it will just serve as a text editor. + +To test that everything is working, create a simple "Hello OpenCV" application. +Do this by creating a file :file:`src/main/java/HelloOpenCV.java` with the following contents: + + .. 
code-block:: java + + public class HelloOpenCV { + public static void main(String[] args) { + System.out.println("Hello, OpenCV"); + } + } + +Now execute ``run`` from the sbt console, or more concisely, run ``sbt run`` from the command line: + + .. code-block:: bash + + sbt run + +You should see something like this: + + .. image:: images/sbt_run.png + :alt: SBT run + :align: center + +Copy the OpenCV jar and write a simple application +******************************************************** + +Now we'll create a simple face detection application using OpenCV. + +First, create a :file:`lib/` folder and copy the OpenCV jar into it. +By default, SBT adds jars in the lib folder to the Java library search path. +You can optionally rerun ``sbt eclipse`` to update your Eclipse project. + + .. code-block:: bash + + mkdir lib + cp /build/bin/opencv_.jar lib/ + sbt eclipse + +Next, create the directory src/main/resources and download this Lena image into it: + + .. image:: images/lena.png + :alt: Lena + :align: center + +Make sure it's called :file:`"lena.png"`. +Items in the resources directory are available to the Java application at runtime. + +Next, copy :file:`lbpcascade_frontalface.xml` from :file:`opencv/data/` into the :file:`resources` +directory: + + .. code-block:: bash + + cp /data/lbpcascades/lbpcascade_frontalface.xml src/main/resources/ + +Now modify src/main/java/HelloOpenCV.java so it contains the following Java code: + +.. code-block:: java + + import org.opencv.core.Core; + import org.opencv.core.Mat; + import org.opencv.core.MatOfRect; + import org.opencv.core.Point; + import org.opencv.core.Rect; + import org.opencv.core.Scalar; + import org.opencv.highgui.Highgui; + import org.opencv.objdetect.CascadeClassifier; + + // + // Detects faces in an image, draws boxes around them, and writes the results + // to "faceDetection.png". + // + class DetectFaceDemo { + public void run() { + System.out.println("\nRunning DetectFaceDemo"); + + // Create a face detector from the cascade file in the resources + // directory. + CascadeClassifier faceDetector = new CascadeClassifier(getClass().getResource("/lbpcascade_frontalface.xml").getPath()); + Mat image = Highgui.imread(getClass().getResource("/lena.png").getPath()); + + // Detect faces in the image. + // MatOfRect is a special container class for Rect. + MatOfRect faceDetections = new MatOfRect(); + faceDetector.detectMultiScale(image, faceDetections); + + System.out.println(String.format("Detected %s faces", faceDetections.toArray().length)); + + // Draw a bounding box around each face. + for (Rect rect : faceDetections.toArray()) { + Core.rectangle(image, new Point(rect.x, rect.y), new Point(rect.x + rect.width, rect.y + rect.height), new Scalar(0, 255, 0)); + } + + // Save the visualized detection. + String filename = "faceDetection.png"; + System.out.println(String.format("Writing %s", filename)); + Highgui.imwrite(filename, image); + } + } + + public class HelloOpenCV { + public static void main(String[] args) { + System.out.println("Hello, OpenCV"); + + // Load the native library. + System.loadLibrary("opencv_java244"); + new DetectFaceDemo().run(); + } + } + +Note the call to ``System.loadLibrary("opencv_java244")``. +This command must be executed exactly once per Java process prior to using any native OpenCV methods. +If you don't call it, you will get ``UnsatisfiedLink errors``. +You will also get errors if you try to load OpenCV when it has already been loaded. + +Now run the face detection app using ``sbt run``: + + .. 
code-block:: bash + + sbt run + +You should see something like this: + + .. image:: images/sbt_run_face.png + :alt: SBT run + :align: center + +It should also write the following image to :file:`faceDetection.png`: + + .. image:: images/faceDetection.png + :alt: Detected face + :align: center + +You're done! +Now you have a sample Java application working with OpenCV, so you can start the work on your own. +We wish you good luck and many years of joyful life! diff --git a/doc/tutorials/introduction/table_of_content_introduction/images/Java_logo.png b/doc/tutorials/introduction/table_of_content_introduction/images/Java_logo.png new file mode 100644 index 0000000000..2114751896 Binary files /dev/null and b/doc/tutorials/introduction/table_of_content_introduction/images/Java_logo.png differ diff --git a/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst b/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst index d918c14972..504e5e5639 100644 --- a/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst +++ b/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst @@ -101,6 +101,26 @@ Here you can read tutorials about how to set up your computer to work with the O :height: 90pt :width: 90pt +* **Desktop Java** + + .. tabularcolumns:: m{100pt} m{300pt} + .. cssclass:: toctableopencv + + ================ ================================================= + |JavaLogo| **Title:** :ref:`Java_Dev_Intro` + + *Compatibility:* > OpenCV 2.4.4 + + *Authors:* |Author_EricCh| and |Author_AndreyP| + + Explains how to build and run a simple desktop Java application using Eclipse, Ant or the Simple Build Tool (SBT). + + ================ ================================================= + + .. |JavaLogo| image:: images/Java_logo.png + :height: 90pt + :width: 90pt + * **Android** .. tabularcolumns:: m{100pt} m{300pt} @@ -238,10 +258,11 @@ Here you can read tutorials about how to set up your computer to work with the O ../linux_eclipse/linux_eclipse ../windows_install/windows_install ../windows_visual_studio_Opencv/windows_visual_studio_Opencv + ../desktop_java/java_dev_intro ../android_binary_package/android_dev_intro ../android_binary_package/O4A_SDK ../android_binary_package/dev_with_OCV_on_Android ../ios_install/ios_install ../display_image/display_image ../load_save_image/load_save_image - ../how_to_write_a_tutorial/how_to_write_a_tutorial \ No newline at end of file + ../how_to_write_a_tutorial/how_to_write_a_tutorial diff --git a/modules/core/doc/utility_and_system_functions_and_macros.rst b/modules/core/doc/utility_and_system_functions_and_macros.rst index 861f98bfc8..9055415f13 100644 --- a/modules/core/doc/utility_and_system_functions_and_macros.rst +++ b/modules/core/doc/utility_and_system_functions_and_macros.rst @@ -311,13 +311,34 @@ Returns true if the specified feature is supported by the host hardware. The function returns true if the host hardware supports the specified feature. When user calls ``setUseOptimized(false)``, the subsequent calls to ``checkHardwareSupport()`` will return false until ``setUseOptimized(true)`` is called. This way user can dynamically switch on and off the optimized code in OpenCV. + + +getNumberOfCPUs +----------------- +Returns the number of logical CPUs available for the process. + +.. ocv:function:: int getNumberOfCPUs() + + + getNumThreads -------------- -Returns the number of threads used by OpenCV. 
+----------------- +Returns the number of threads used by OpenCV for parallel regions. +Always returns 1 if OpenCV is built without threading support. .. ocv:function:: int getNumThreads() -The function returns the number of threads that is used by OpenCV. +The exact meaning of the return value depends on the threading framework used by the OpenCV library: + + * **TBB** – The number of threads that OpenCV will try to use for parallel regions. + If there is any ``tbb::thread_scheduler_init`` in user code conflicting with OpenCV, then the + function returns the default number of threads used by the TBB library. + * **OpenMP** – An upper bound on the number of threads that could be used to form a new team. + * **Concurrency** – The number of threads that OpenCV will try to use for parallel regions. + * **GCD** – Unsupported; returns the GCD thread pool limit (512) for compatibility. + * **C=** – The number of threads that OpenCV will try to use for parallel regions, + if ``setNumThreads`` was called with ``threads > 0`` beforehand; + otherwise returns the number of logical CPUs available for the process. .. seealso:: :ocv:func:`setNumThreads`, @@ -326,16 +347,24 @@ The function returns the number of threads that is used by OpenCV. getThreadNum ------------- -Returns the index of the currently executed thread. +---------------- +Returns the index of the currently executed thread within the current parallel region. +Always returns 0 if called outside of a parallel region. .. ocv:function:: int getThreadNum() -The function returns a 0-based index of the currently executed thread. The function is only valid inside a parallel OpenMP region. When OpenCV is built without OpenMP support, the function always returns 0. +The exact meaning of the return value depends on the threading framework used by the OpenCV library: + + * **TBB** – Unsupported with the current TBB 4.1 release. It may be supported in a future release. + * **OpenMP** – The thread number, within the current team, of the calling thread. + * **Concurrency** – An ID for the virtual processor that the current context is executing + on (0 for the master thread and a unique number for the other threads, but not necessarily 1, 2, 3, ...). + * **GCD** – The system ID of the calling thread. Never returns 0 inside a parallel region. + * **C=** – The index of the current parallel task. .. seealso:: :ocv:func:`setNumThreads`, - :ocv:func:`getNumThreads` . + :ocv:func:`getNumThreads` @@ -410,13 +439,25 @@ This operation is used in the simplest or most complex image processing function setNumThreads ----------------- -Sets the number of threads used by OpenCV. +OpenCV will try to set the number of threads for the next parallel region. +If ``threads == 0``, OpenCV will disable threading optimizations and run all its +functions sequentially. Passing ``threads < 0`` will reset the number of threads to the system default. +This function must be called outside of a parallel region. -.. ocv:function:: void setNumThreads(int nthreads) - :param nthreads: Number of threads used by OpenCV. +.. ocv:function:: void setNumThreads(int threads) + + :param threads: Number of threads used by OpenCV. -The function sets the number of threads used by OpenCV in parallel OpenMP regions. If ``nthreads=0`` , the function uses the default number of threads that is usually equal to the number of the processing cores. +OpenCV will try to run its functions with the specified number of threads, but +the actual behaviour depends on the framework: + + * **TBB** – User-defined parallel constructions will run with the same number of threads, + unless a different number is specified.
If late on user creates own scheduler, OpenCV will be use it. + * **OpenMP** – No special defined behaviour. + * **Concurrency** – If ``threads == 1``, OpenCV will disable threading optimizations + and run it's functions sequentially. + * **GCD** – Supports only values <= 0. + * **C=** – No special defined behaviour. .. seealso:: :ocv:func:`getNumThreads`, diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp index 2bc8a05880..0b2a845ac1 100644 --- a/modules/core/src/parallel.cpp +++ b/modules/core/src/parallel.cpp @@ -305,7 +305,9 @@ int cv::getNumThreads(void) #elif defined HAVE_CSTRIPES - return cv::getNumberOfCPUs(); + return numThreads > 0 + ? numThreads + : cv::getNumberOfCPUs(); #elif defined HAVE_OPENMP @@ -491,4 +493,4 @@ CV_IMPL int cvGetNumThreads() CV_IMPL int cvGetThreadNum() { return cv::getThreadNum(); -} \ No newline at end of file +} diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 0c9f709b15..26bf624c66 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -21,7 +21,7 @@ source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs}) source_group("Device" FILES ${lib_device_hdrs}) source_group("Device\\Detail" FILES ${lib_device_hdrs_detail}) -if (HAVE_CUDA) +if(HAVE_CUDA) file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*") file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu") set(ncv_files ${ncv_srcs} ${ncv_cuda}) @@ -104,3 +104,7 @@ ocv_add_accuracy_tests(FILES "Include" ${test_hdrs} FILES "Src" ${test_srcs} ${nvidia}) ocv_add_perf_tests() + +if(HAVE_CUDA) + add_subdirectory(perf4au) +endif() diff --git a/modules/gpu/doc/data_structures.rst b/modules/gpu/doc/data_structures.rst index 68e702a793..1291cf9bb6 100644 --- a/modules/gpu/doc/data_structures.rst +++ b/modules/gpu/doc/data_structures.rst @@ -271,41 +271,37 @@ This class encapsulates a queue of asynchronous calls. Some functions have overl class CV_EXPORTS Stream { public: - Stream(); - ~Stream(); + Stream(); + ~Stream(); - Stream(const Stream&); - Stream& operator=(const Stream&); + Stream(const Stream&); + Stream& operator=(const Stream&); - bool queryIfComplete(); - void waitForCompletion(); + bool queryIfComplete(); + void waitForCompletion(); - //! downloads asynchronously. - // Warning! cv::Mat must point to page locked memory - (i.e. to CudaMem data or to its subMat) - void enqueueDownload(const GpuMat& src, CudaMem& dst); - void enqueueDownload(const GpuMat& src, Mat& dst); + void enqueueDownload(const GpuMat& src, CudaMem& dst); + void enqueueDownload(const GpuMat& src, Mat& dst); - //! uploads asynchronously. - // Warning! cv::Mat must point to page locked memory - (i.e. 
to CudaMem data or to its ROI) - void enqueueUpload(const CudaMem& src, GpuMat& dst); - void enqueueUpload(const Mat& src, GpuMat& dst); + void enqueueUpload(const CudaMem& src, GpuMat& dst); + void enqueueUpload(const Mat& src, GpuMat& dst); - void enqueueCopy(const GpuMat& src, GpuMat& dst); + void enqueueCopy(const GpuMat& src, GpuMat& dst); - void enqueueMemSet(const GpuMat& src, Scalar val); - void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask); + void enqueueMemSet(const GpuMat& src, Scalar val); + void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask); - // converts matrix type, ex from float to uchar depending on type - void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, - double a = 1, double b = 0); + void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, + double a = 1, double b = 0); + + typedef void (*StreamCallback)(Stream& stream, int status, void* userData); + void enqueueHostCallback(StreamCallback callback, void* userData); }; gpu::Stream::queryIfComplete --------------------------------- +---------------------------- Returns ``true`` if the current stream queue is finished. Otherwise, it returns false. .. ocv:function:: bool gpu::Stream::queryIfComplete() @@ -313,13 +309,73 @@ Returns ``true`` if the current stream queue is finished. Otherwise, it returns gpu::Stream::waitForCompletion ----------------------------------- +------------------------------ Blocks the current CPU thread until all operations in the stream are complete. .. ocv:function:: void gpu::Stream::waitForCompletion() +gpu::Stream::enqueueDownload +---------------------------- +Copies data from device to host. + +.. ocv:function:: void gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) + +.. ocv:function:: void gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst) + +.. note:: ``cv::Mat`` must point to page locked memory (i.e. to ``CudaMem`` data or to its subMat) or must be registered with :ocv:func:`gpu::registerPageLocked` . + + + +gpu::Stream::enqueueUpload +-------------------------- +Copies data from host to device. + +.. ocv:function:: void gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst) + +.. ocv:function:: void gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) + +.. note:: ``cv::Mat`` must point to page locked memory (i.e. to ``CudaMem`` data or to its subMat) or must be registered with :ocv:func:`gpu::registerPageLocked` . + + + +gpu::Stream::enqueueCopy +------------------------ +Copies data from device to device. + +.. ocv:function:: void gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) + + + +gpu::Stream::enqueueMemSet +-------------------------- +Initializes or sets device memory to a value. + +.. ocv:function:: void gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val) + +.. ocv:function:: void gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask) + + + +gpu::Stream::enqueueConvert +--------------------------- +Converts matrix type, ex from float to uchar depending on type. + +.. ocv:function:: void gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0) + + + +gpu::Stream::enqueueHostCallback +-------------------------------- +Adds a callback to be called on the host after all currently enqueued items in the stream have completed. + +.. ocv:function:: void gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData) + +.. note:: Callbacks must not make any CUDA API calls. 
Callbacks must not perform any synchronization that may depend on outstanding device work or other callbacks that are not mandated to run earlier. Callbacks without a mandated order (in independent streams) execute in undefined order and may be serialized. + + + gpu::StreamAccessor ------------------- .. ocv:struct:: gpu::StreamAccessor diff --git a/modules/gpu/doc/matrix_reductions.rst b/modules/gpu/doc/matrix_reductions.rst index 538267eb7a..e9229f8a81 100644 --- a/modules/gpu/doc/matrix_reductions.rst +++ b/modules/gpu/doc/matrix_reductions.rst @@ -32,6 +32,8 @@ Returns the norm of a matrix (or difference of two matrices). .. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, GpuMat& buf) +.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf) + .. ocv:function:: double gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2) :param src1: Source matrix. Any matrices except 64F are supported. @@ -40,6 +42,8 @@ Returns the norm of a matrix (or difference of two matrices). :param normType: Norm type. ``NORM_L1`` , ``NORM_L2`` , and ``NORM_INF`` are supported for now. + :param mask: optional operation mask; it must have the same size as ``src1`` and ``CV_8UC1`` type. + :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically. .. seealso:: :ocv:func:`norm` @@ -54,8 +58,12 @@ Returns the sum of matrix elements. .. ocv:function:: Scalar gpu::sum(const GpuMat& src, GpuMat& buf) +.. ocv:function:: Scalar gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf) + :param src: Source image of any depth except for ``CV_64F`` . + :param mask: optional operation mask; it must have the same size as ``src1`` and ``CV_8UC1`` type. + :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically. .. seealso:: :ocv:func:`sum` @@ -70,8 +78,12 @@ Returns the sum of absolute values for matrix elements. .. ocv:function:: Scalar gpu::absSum(const GpuMat& src, GpuMat& buf) +.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf) + :param src: Source image of any depth except for ``CV_64F`` . + :param mask: optional operation mask; it must have the same size as ``src1`` and ``CV_8UC1`` type. + :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically. @@ -84,8 +96,12 @@ Returns the squared sum of matrix elements. .. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, GpuMat& buf) +.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf) + :param src: Source image of any depth except for ``CV_64F`` . + :param mask: optional operation mask; it must have the same size as ``src1`` and ``CV_8UC1`` type. + :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically. diff --git a/modules/gpu/doc/operations_on_matrices.rst b/modules/gpu/doc/operations_on_matrices.rst index 7f586a1b02..d1762f442a 100644 --- a/modules/gpu/doc/operations_on_matrices.rst +++ b/modules/gpu/doc/operations_on_matrices.rst @@ -242,3 +242,33 @@ Converts polar coordinates into Cartesian. :param stream: Stream for the asynchronous version. .. seealso:: :ocv:func:`polarToCart` + + + +gpu::normalize +-------------- +Normalizes the norm or value range of an array. + +.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat()) + +.. 
ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf) + + :param src: input array. + + :param dst: output array of the same size as ``src`` . + + :param alpha: norm value to normalize to or the lower range boundary in case of the range normalization. + + :param beta: upper range boundary in case of the range normalization; it is not used for the norm normalization. + + :param normType: normalization type (see the details below). + + :param dtype: when negative, the output array has the same type as ``src``; otherwise, it has the same number of channels as ``src`` and the depth ``=CV_MAT_DEPTH(dtype)``. + + :param mask: optional operation mask. + + :param norm_buf: Optional buffer to avoid extra memory allocations. It is resized automatically. + + :param cvt_buf: Optional buffer to avoid extra memory allocations. It is resized automatically. + +.. seealso:: :ocv:func:`normalize` diff --git a/modules/gpu/include/opencv2/gpu/device/warp.hpp b/modules/gpu/include/opencv2/gpu/device/warp.hpp index d4b0b8d8f7..0f1dc794ab 100644 --- a/modules/gpu/include/opencv2/gpu/device/warp.hpp +++ b/modules/gpu/include/opencv2/gpu/device/warp.hpp @@ -97,6 +97,25 @@ namespace cv { namespace gpu { namespace device return out; } + template + static __device__ __forceinline__ T reduce(volatile T *ptr, BinOp op) + { + const unsigned int lane = laneId(); + + if (lane < 16) + { + T partial = ptr[lane]; + + ptr[lane] = partial = op(partial, ptr[lane + 16]); + ptr[lane] = partial = op(partial, ptr[lane + 8]); + ptr[lane] = partial = op(partial, ptr[lane + 4]); + ptr[lane] = partial = op(partial, ptr[lane + 2]); + ptr[lane] = partial = op(partial, ptr[lane + 1]); + } + + return *ptr; + } + template static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value) { @@ -109,4 +128,4 @@ namespace cv { namespace gpu { namespace device }; }}} // namespace cv { namespace gpu { namespace device -#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */ \ No newline at end of file +#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */ diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index ed32326227..5da7f4fa93 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -145,43 +145,49 @@ public: ~Stream(); Stream(const Stream&); - Stream& operator=(const Stream&); + Stream& operator =(const Stream&); bool queryIfComplete(); void waitForCompletion(); - //! downloads asynchronously. + //! downloads asynchronously // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat) void enqueueDownload(const GpuMat& src, CudaMem& dst); void enqueueDownload(const GpuMat& src, Mat& dst); - //! uploads asynchronously. + //! uploads asynchronously // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI) void enqueueUpload(const CudaMem& src, GpuMat& dst); void enqueueUpload(const Mat& src, GpuMat& dst); + //! copy asynchronously void enqueueCopy(const GpuMat& src, GpuMat& dst); + //! memory set asynchronously void enqueueMemSet(GpuMat& src, Scalar val); void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask); - // converts matrix type, ex from float to uchar depending on type - void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0); + //! 
converts matrix type, ex from float to uchar depending on type + void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0); + + //! adds a callback to be called on the host after all currently enqueued items in the stream have completed + typedef void (*StreamCallback)(Stream& stream, int status, void* userData); + void enqueueHostCallback(StreamCallback callback, void* userData); static Stream& Null(); operator bool() const; private: + struct Impl; + + explicit Stream(Impl* impl); void create(); void release(); - struct Impl; Impl *impl; friend struct StreamAccessor; - - explicit Stream(Impl* impl); }; @@ -459,6 +465,12 @@ CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, //! supports only floating-point source CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null()); +//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values +CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0, + int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat()); +CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double a, double b, + int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf); + //////////////////////////// Per-element operations //////////////////////////////////// @@ -910,11 +922,8 @@ CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuM //! supports NORM_INF, NORM_L1, NORM_L2 //! supports all matrices except 64F CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2); - -//! computes norm of array -//! supports NORM_INF, NORM_L1, NORM_L2 -//! supports all matrices except 64F CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf); +CV_EXPORTS double norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf); //! computes norm of the difference between two arrays //! supports NORM_INF, NORM_L1, NORM_L2 @@ -924,45 +933,33 @@ CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM //! computes sum of array elements //! supports only single channel images CV_EXPORTS Scalar sum(const GpuMat& src); - -//! computes sum of array elements -//! supports only single channel images CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf); +CV_EXPORTS Scalar sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf); //! computes sum of array elements absolute values //! supports only single channel images CV_EXPORTS Scalar absSum(const GpuMat& src); - -//! computes sum of array elements absolute values -//! supports only single channel images CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf); +CV_EXPORTS Scalar absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf); //! computes squared sum of array elements //! supports only single channel images CV_EXPORTS Scalar sqrSum(const GpuMat& src); - -//! computes squared sum of array elements -//! supports only single channel images CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf); +CV_EXPORTS Scalar sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf); //! finds global minimum and maximum array elements and returns their values CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat()); - -//! 
finds global minimum and maximum array elements and returns their values CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf); //! finds global minimum and maximum array elements and returns their values with locations CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, const GpuMat& mask=GpuMat()); - -//! finds global minimum and maximum array elements and returns their values with locations CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf); //! counts non-zero array elements CV_EXPORTS int countNonZero(const GpuMat& src); - -//! counts non-zero array elements CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf); //! reduces a matrix to a vector diff --git a/modules/gpu/misc/carma.toolchain.cmake b/modules/gpu/misc/carma.toolchain.cmake deleted file mode 100644 index 18f0e0f934..0000000000 --- a/modules/gpu/misc/carma.toolchain.cmake +++ /dev/null @@ -1,26 +0,0 @@ -set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_SYSTEM_VERSION 1) -set(CMAKE_SYSTEM_PROCESSOR arm) - -set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc-4.5) -set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++-4.5) - -#suppress compiller varning -set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-psabi" ) -set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-psabi" ) - -# can be any other plases -set(__arm_linux_eabi_root /usr/arm-linux-gnueabi) - -set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${__arm_linux_eabi_root}) - -if(EXISTS ${CUDA_TOOLKIT_ROOT_DIR}) - set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${CUDA_TOOLKIT_ROOT_DIR}) -endif() - -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) - -set(CARMA 1) -add_definitions(-DCARMA) diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp index c78bfd6e20..b97c4999cd 100644 --- a/modules/gpu/perf/perf_core.cpp +++ b/modules/gpu/perf/perf_core.cpp @@ -1631,7 +1631,7 @@ PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - TEST_CYCLE() dst = cv::gpu::norm(d_src, normType, d_buf); + TEST_CYCLE() dst = cv::gpu::norm(d_src, normType, cv::gpu::GpuMat(), d_buf); } else { @@ -1701,7 +1701,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - TEST_CYCLE() dst = cv::gpu::sum(d_src, d_buf); + TEST_CYCLE() dst = cv::gpu::sum(d_src, cv::gpu::GpuMat(), d_buf); } else { @@ -1736,7 +1736,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - TEST_CYCLE() dst = cv::gpu::absSum(d_src, d_buf); + TEST_CYCLE() dst = cv::gpu::absSum(d_src, cv::gpu::GpuMat(), d_buf); SANITY_CHECK(dst, 1e-6); } @@ -1770,7 +1770,7 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine( cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_buf; - TEST_CYCLE() dst = cv::gpu::sqrSum(d_src, d_buf); + TEST_CYCLE() dst = cv::gpu::sqrSum(d_src, cv::gpu::GpuMat(), d_buf); SANITY_CHECK(dst, 1e-6); } @@ -1926,4 +1926,48 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine( } } +////////////////////////////////////////////////////////////////////// +// Normalize + +DEF_PARAM_TEST(Sz_Depth_NormType, cv::Size, MatDepth, NormType); + +PERF_TEST_P(Sz_Depth_NormType, Core_Normalize, Combine( + GPU_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F, CV_64F), + Values(NormType(cv::NORM_INF), + 
NormType(cv::NORM_L1), + NormType(cv::NORM_L2), + NormType(cv::NORM_MINMAX)) + )) +{ + cv::Size size = GET_PARAM(0); + int type = GET_PARAM(1); + int norm_type = GET_PARAM(2); + + double alpha = 1; + double beta = 0; + + cv::Mat src(size, type); + fillRandom(src); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat d_dst; + cv::gpu::GpuMat d_norm_buf, d_cvt_buf; + + TEST_CYCLE() cv::gpu::normalize(d_src, d_dst, alpha, beta, norm_type, type, cv::gpu::GpuMat(), d_norm_buf, d_cvt_buf); + + GPU_SANITY_CHECK(d_dst, 1); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::normalize(src, dst, alpha, beta, norm_type, type); + + CPU_SANITY_CHECK(dst, 1); + } +} + } // namespace diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp index bf2fd99c6e..83213a1613 100644 --- a/modules/gpu/perf/perf_video.cpp +++ b/modules/gpu/perf/perf_video.cpp @@ -431,13 +431,13 @@ PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1, { cv::Mat flow; - cv::OpticalFlowDual_TVL1 alg; + cv::Ptr alg = cv::createOptFlow_DualTVL1(); - alg(frame0, frame1, flow); + alg->calc(frame0, frame1, flow); TEST_CYCLE() { - alg(frame0, frame1, flow); + alg->calc(frame0, frame1, flow); } CPU_SANITY_CHECK(flow); diff --git a/modules/gpu/perf/utility.hpp b/modules/gpu/perf/utility.hpp index 09b84f53aa..6782b93768 100644 --- a/modules/gpu/perf/utility.hpp +++ b/modules/gpu/perf/utility.hpp @@ -17,7 +17,7 @@ CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONS CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA) #define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all()) -CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING) +CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING, cv::NORM_MINMAX) const int Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4; CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA) diff --git a/modules/gpu/perf4au/CMakeLists.txt b/modules/gpu/perf4au/CMakeLists.txt new file mode 100644 index 0000000000..376e7b2706 --- /dev/null +++ b/modules/gpu/perf4au/CMakeLists.txt @@ -0,0 +1,27 @@ +set(PERF4AU_REQUIRED_DEPS opencv_core opencv_imgproc opencv_highgui opencv_video opencv_legacy opencv_gpu opencv_ts) + +ocv_check_dependencies(${PERF4AU_REQUIRED_DEPS}) + +set(the_target gpu_perf4au) +project(${the_target}) + +ocv_include_modules(${PERF4AU_REQUIRED_DEPS}) + +if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function") +endif() + +file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp) +add_executable(${the_target} ${srcs}) + +target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS}) + +if(ENABLE_SOLUTION_FOLDERS) + set_target_properties(${the_target} PROPERTIES FOLDER "tests performance") +endif() + +if(WIN32) + if(MSVC AND NOT BUILD_SHARED_LIBS) + set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG") + endif() +endif() diff --git a/modules/gpu/perf4au/im1_1280x800.jpg b/modules/gpu/perf4au/im1_1280x800.jpg new file mode 100644 index 0000000000..bdbbd4aee9 Binary files /dev/null and b/modules/gpu/perf4au/im1_1280x800.jpg differ diff --git a/modules/gpu/perf4au/im2_1280x800.jpg b/modules/gpu/perf4au/im2_1280x800.jpg new file mode 100644 index 0000000000..ae49640a95 Binary files /dev/null and b/modules/gpu/perf4au/im2_1280x800.jpg differ diff --git a/modules/gpu/perf4au/main.cpp b/modules/gpu/perf4au/main.cpp new 
file mode 100644 index 0000000000..80d97ea806 --- /dev/null +++ b/modules/gpu/perf4au/main.cpp @@ -0,0 +1,490 @@ +#include +#ifdef HAVE_CVCONFIG_H +#include "cvconfig.h" +#endif +#include "opencv2/core/core.hpp" +#include "opencv2/gpu/gpu.hpp" +#include "opencv2/highgui/highgui.hpp" +#include "opencv2/video/video.hpp" +#include "opencv2/legacy/legacy.hpp" +#include "opencv2/ts/ts.hpp" +#include "opencv2/ts/ts_perf.hpp" + +static void printOsInfo() +{ +#if defined _WIN32 +# if defined _WIN64 + printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout); +# else + printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout); +# endif +#elif defined linux +# if defined _LP64 + printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout); +# else + printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout); +# endif +#elif defined __APPLE__ +# if defined _LP64 + printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout); +# else + printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout); +# endif +#endif +} + +static void printCudaInfo() +{ + const int deviceCount = cv::gpu::getCudaEnabledDeviceCount(); + + printf("[----------]\n"); fflush(stdout); + printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout); + printf("[----------]\n"); fflush(stdout); + + for (int i = 0; i < deviceCount; ++i) + { + cv::gpu::DeviceInfo info(i); + + printf("[----------]\n"); fflush(stdout); + printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()); fflush(stdout); + printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout); + printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout); + printf("[ ] \tTotal memory: %d Mb\n", static_cast(static_cast(info.totalMemory() / 1024.0) / 1024.0)); fflush(stdout); + printf("[ ] \tFree memory: %d Mb\n", static_cast(static_cast(info.freeMemory() / 1024.0) / 1024.0)); fflush(stdout); + if (!info.isCompatible()) + printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n"); + printf("[----------]\n"); fflush(stdout); + } +} + +int main(int argc, char* argv[]) +{ + printOsInfo(); + printCudaInfo(); + + perf::Regression::Init("nv_perf_test"); + perf::TestBase::Init(argc, argv); + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} + +#define DEF_PARAM_TEST(name, ...) 
typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name +#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name + +////////////////////////////////////////////////////////// +// HoughLinesP + +DEF_PARAM_TEST_1(Image, std::string); + +PERF_TEST_P(Image, HoughLinesP, testing::Values(std::string("im1_1280x800.jpg"))) +{ + declare.time(30.0); + + std::string fileName = GetParam(); + + const float rho = 1.f; + const float theta = 1.f; + const int threshold = 40; + const int minLineLenght = 20; + const int maxLineGap = 5; + + cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_image(image); + cv::gpu::GpuMat d_lines; + cv::gpu::HoughLinesBuf d_buf; + + cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); + } + } + else + { + cv::Mat mask; + cv::Canny(image, mask, 50, 100); + + std::vector lines; + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + + TEST_CYCLE() + { + cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// GoodFeaturesToTrack + +DEF_PARAM_TEST(Image_Depth, std::string, perf::MatDepth); + +PERF_TEST_P(Image_Depth, GoodFeaturesToTrack, + testing::Combine( + testing::Values(std::string("im1_1280x800.jpg")), + testing::Values(CV_8U, CV_16U) + )) +{ + declare.time(60); + + const std::string fileName = std::tr1::get<0>(GetParam()); + const int depth = std::tr1::get<1>(GetParam()); + + const int maxCorners = 5000; + const double qualityLevel = 0.05; + const int minDistance = 5; + const int blockSize = 3; + const bool useHarrisDetector = true; + const double k = 0.05; + + cv::Mat src = cv::imread(fileName, cv::IMREAD_GRAYSCALE); + if (src.empty()) + FAIL() << "Unable to load source image [" << fileName << "]"; + + if (depth != CV_8U) + src.convertTo(src, depth); + + cv::Mat mask(src.size(), CV_8UC1, cv::Scalar::all(1)); + mask(cv::Rect(0, 0, 100, 100)).setTo(cv::Scalar::all(0)); + + if (PERF_RUN_GPU()) + { + cv::gpu::GoodFeaturesToTrackDetector_GPU d_detector(maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, k); + + cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat d_mask(mask); + cv::gpu::GpuMat d_pts; + + d_detector(d_src, d_pts, d_mask); + + TEST_CYCLE() + { + d_detector(d_src, d_pts, d_mask); + } + } + else + { + if (depth != CV_8U) + FAIL() << "Unsupported depth"; + + cv::Mat pts; + + cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k); + + TEST_CYCLE() + { + cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// OpticalFlowPyrLKSparse + +typedef std::pair string_pair; + +DEF_PARAM_TEST(ImagePair_Depth_GraySource, string_pair, perf::MatDepth, bool); + +PERF_TEST_P(ImagePair_Depth_GraySource, OpticalFlowPyrLKSparse, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(CV_8U, CV_16U), + testing::Bool() + )) +{ + declare.time(60); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const int depth = std::tr1::get<1>(GetParam()); + const bool graySource = std::tr1::get<2>(GetParam()); + + 
// PyrLK params + const cv::Size winSize(15, 15); + const int maxLevel = 5; + const cv::TermCriteria criteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 0.01); + + // GoodFeaturesToTrack params + const int maxCorners = 5000; + const double qualityLevel = 0.05; + const int minDistance = 5; + const int blockSize = 3; + const bool useHarrisDetector = true; + const double k = 0.05; + + cv::Mat src1 = cv::imread(fileNames.first, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + if (src1.empty()) + FAIL() << "Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + cv::Mat gray_src; + if (graySource) + gray_src = src1; + else + cv::cvtColor(src1, gray_src, cv::COLOR_BGR2GRAY); + + cv::Mat pts; + cv::goodFeaturesToTrack(gray_src, pts, maxCorners, qualityLevel, minDistance, cv::noArray(), blockSize, useHarrisDetector, k); + + if (depth != CV_8U) + { + src1.convertTo(src1, depth); + src2.convertTo(src2, depth); + } + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_pts(pts.reshape(2, 1)); + cv::gpu::GpuMat d_nextPts; + cv::gpu::GpuMat d_status; + + cv::gpu::PyrLKOpticalFlow d_pyrLK; + d_pyrLK.winSize = winSize; + d_pyrLK.maxLevel = maxLevel; + d_pyrLK.iters = criteria.maxCount; + d_pyrLK.useInitialFlow = false; + + d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status); + + TEST_CYCLE() + { + d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status); + } + } + else + { + if (depth != CV_8U) + FAIL() << "Unsupported depth"; + + cv::Mat nextPts; + cv::Mat status; + + cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria); + + TEST_CYCLE() + { + cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// OpticalFlowFarneback + +DEF_PARAM_TEST(ImagePair_Depth, string_pair, perf::MatDepth); + +PERF_TEST_P(ImagePair_Depth, OpticalFlowFarneback, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(CV_8U, CV_16U) + )) +{ + declare.time(500); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const int depth = std::tr1::get<1>(GetParam()); + + const double pyrScale = 0.5; + const int numLevels = 6; + const int winSize = 7; + const int numIters = 15; + const int polyN = 7; + const double polySigma = 1.5; + const int flags = cv::OPTFLOW_USE_INITIAL_FLOW; + + cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); + if (src1.empty()) + FAIL() << "Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + if (depth != CV_8U) + { + src1.convertTo(src1, depth); + src2.convertTo(src2, depth); + } + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_u(src1.size(), CV_32FC1, cv::Scalar::all(0)); + cv::gpu::GpuMat d_v(src1.size(), CV_32FC1, cv::Scalar::all(0)); + + cv::gpu::FarnebackOpticalFlow d_farneback; + d_farneback.pyrScale = pyrScale; + d_farneback.numLevels = numLevels; + d_farneback.winSize = winSize; + d_farneback.numIters = numIters; + 
d_farneback.polyN = polyN; + d_farneback.polySigma = polySigma; + d_farneback.flags = flags; + + d_farneback(d_src1, d_src2, d_u, d_v); + + TEST_CYCLE_N(10) + { + d_farneback(d_src1, d_src2, d_u, d_v); + } + } + else + { + if (depth != CV_8U) + FAIL() << "Unsupported depth"; + + cv::Mat flow(src1.size(), CV_32FC2, cv::Scalar::all(0)); + + cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags); + + TEST_CYCLE_N(10) + { + cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags); + } + } + + SANITY_CHECK(0); +} + +////////////////////////////////////////////////////////// +// OpticalFlowBM + +void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr, + cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious, + cv::Mat& velx, cv::Mat& vely) +{ + cv::Size sz((curr.cols - bSize.width + shiftSize.width)/shiftSize.width, (curr.rows - bSize.height + shiftSize.height)/shiftSize.height); + + velx.create(sz, CV_32FC1); + vely.create(sz, CV_32FC1); + + CvMat cvprev = prev; + CvMat cvcurr = curr; + + CvMat cvvelx = velx; + CvMat cvvely = vely; + + cvCalcOpticalFlowBM(&cvprev, &cvcurr, bSize, shiftSize, maxRange, usePrevious, &cvvelx, &cvvely); +} + +DEF_PARAM_TEST(ImagePair_BlockSize_ShiftSize_MaxRange, string_pair, cv::Size, cv::Size, cv::Size); + +PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, OpticalFlowBM, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(cv::Size(16, 16)), + testing::Values(cv::Size(2, 2)), + testing::Values(cv::Size(16, 16)) + )) +{ + declare.time(3000); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const cv::Size block_size = std::tr1::get<1>(GetParam()); + const cv::Size shift_size = std::tr1::get<2>(GetParam()); + const cv::Size max_range = std::tr1::get<3>(GetParam()); + + cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); + if (src1.empty()) + FAIL() << "Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_velx, d_vely, buf; + + cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + + TEST_CYCLE_N(10) + { + cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf); + } + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE_N(10) + { + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + } + } + + SANITY_CHECK(0); +} + +PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, FastOpticalFlowBM, + testing::Combine( + testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), + testing::Values(cv::Size(16, 16)), + testing::Values(cv::Size(1, 1)), + testing::Values(cv::Size(16, 16)) + )) +{ + declare.time(3000); + + const string_pair fileNames = std::tr1::get<0>(GetParam()); + const cv::Size block_size = std::tr1::get<1>(GetParam()); + const cv::Size shift_size = std::tr1::get<2>(GetParam()); + const cv::Size max_range = std::tr1::get<3>(GetParam()); + + cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); + if (src1.empty()) + FAIL() << 
"Unable to load source image [" << fileNames.first << "]"; + + cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); + if (src2.empty()) + FAIL() << "Unable to load source image [" << fileNames.second << "]"; + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_src1(src1); + cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat d_velx, d_vely; + + cv::gpu::FastOpticalFlowBM fastBM; + + fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width); + + TEST_CYCLE_N(10) + { + fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width); + } + } + else + { + cv::Mat velx, vely; + + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + + TEST_CYCLE_N(10) + { + calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); + } + } + + SANITY_CHECK(0); +} diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 242febded9..7e0aaab680 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -59,6 +59,8 @@ void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { thr void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); } void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); } void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); } +void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_nogpu(); } +void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ @@ -529,4 +531,47 @@ void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream)); } +//////////////////////////////////////////////////////////////////////// +// normalize + +void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask) +{ + GpuMat norm_buf; + GpuMat cvt_buf; + normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf); +} + +void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf) +{ + double scale = 1, shift = 0; + if (norm_type == NORM_MINMAX) + { + double smin = 0, smax = 0; + double dmin = std::min(a, b), dmax = std::max(a, b); + minMax(src, &smin, &smax, mask, norm_buf); + scale = (dmax - dmin) * (smax - smin > numeric_limits::epsilon() ? 1.0 / (smax - smin) : 0.0); + shift = dmin - smin * scale; + } + else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF) + { + scale = norm(src, norm_type, mask, norm_buf); + scale = scale > numeric_limits::epsilon() ? 
a / scale : 0.0; + shift = 0; + } + else + { + CV_Error(CV_StsBadArg, "Unknown/unsupported norm type"); + } + + if (mask.empty()) + { + src.convertTo(dst, dtype, scale, shift); + } + else + { + src.convertTo(cvt_buf, dtype, scale, shift); + cvt_buf.copyTo(dst, mask); + } +} + #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu index cbc471c815..cfcbb57340 100644 --- a/modules/gpu/src/cuda/matrix_reductions.cu +++ b/modules/gpu/src/cuda/matrix_reductions.cu @@ -352,8 +352,8 @@ namespace sum } }; - template - __global__ void kernel(const PtrStepSz src, result_type* result, const Op op, const int twidth, const int theight) + template + __global__ void kernel(const PtrStepSz src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight) { typedef typename VecTraits::elem_type T; typedef typename VecTraits::elem_type R; @@ -375,9 +375,11 @@ namespace sum for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x) { - const src_type srcVal = ptr[x]; - - sum = sum + op(saturate_cast(srcVal)); + if (mask(y, x)) + { + const src_type srcVal = ptr[x]; + sum = sum + op(saturate_cast(srcVal)); + } } } @@ -410,7 +412,7 @@ namespace sum } template class Op> - void caller(PtrStepSzb src_, void* buf_, double* out) + void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask) { typedef typename TypeVec::vec_type src_type; typedef typename TypeVec::vec_type result_type; @@ -426,7 +428,10 @@ namespace sum Op op; - kernel<<>>(src, buf, op, twidth, theight); + if (mask.data) + kernel<<>>(src, buf, SingleMask(mask), op, twidth, theight); + else + kernel<<>>(src, buf, WithOutMask(), op, twidth, theight); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -450,88 +455,88 @@ namespace sum template <> struct SumType { typedef double R; }; template - void run(PtrStepSzb src, void* buf, double* out) + void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask) { typedef typename SumType::R R; - caller(src, buf, out); + caller(src, buf, out, mask); } - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb 
mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); - template void run(PtrStepSzb src, void* buf, double* out); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); template - void runAbs(PtrStepSzb src, void* buf, double* out) + void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask) { typedef typename SumType::R R; - caller(src, buf, out); + caller(src, buf, out, mask); } - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); + template void 
runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); - template void runAbs(PtrStepSzb src, void* buf, double* out); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); template struct Sqr : unary_function { @@ -542,45 +547,45 @@ namespace sum }; template - void runSqr(PtrStepSzb src, void* buf, double* out) + void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask) { - caller(src, buf, out); + caller(src, buf, out, mask); } - template void runSqr(PtrStepSzb src, void* buf, double* out); - template 
void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); - template void 
runSqr(PtrStepSzb src, void* buf, double* out); - template void runSqr(PtrStepSzb src, void* buf, double* out); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); + template void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask); } ///////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp index 5e6e4c3ea0..f9fbe820eb 100644 --- a/modules/gpu/src/cudastream.cpp +++ b/modules/gpu/src/cudastream.cpp @@ -42,51 +42,37 @@ #include "precomp.hpp" +using namespace std; using namespace cv; using namespace cv::gpu; -#if defined HAVE_CUDA - -struct Stream::Impl -{ - static cudaStream_t getStream(const Impl* impl) { return impl ? impl->stream : 0; } - cudaStream_t stream; - int ref_counter; -}; - -#include "opencv2/gpu/stream_accessor.hpp" - -CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) -{ - return Stream::Impl::getStream(stream.impl); -}; - -#endif /* !defined (HAVE_CUDA) */ - - #if !defined (HAVE_CUDA) -void cv::gpu::Stream::create() { throw_nogpu(); } -void cv::gpu::Stream::release() { throw_nogpu(); } -cv::gpu::Stream::Stream() : impl(0) { throw_nogpu(); } -cv::gpu::Stream::~Stream() { throw_nogpu(); } -cv::gpu::Stream::Stream(const Stream& /*stream*/) { throw_nogpu(); } -Stream& cv::gpu::Stream::operator=(const Stream& /*stream*/) { throw_nogpu(); return *this; } -bool cv::gpu::Stream::queryIfComplete() { throw_nogpu(); return true; } +cv::gpu::Stream::Stream() { throw_nogpu(); } +cv::gpu::Stream::~Stream() {} +cv::gpu::Stream::Stream(const Stream&) { throw_nogpu(); } +Stream& cv::gpu::Stream::operator=(const Stream&) { throw_nogpu(); return *this; } +bool cv::gpu::Stream::queryIfComplete() { throw_nogpu(); return false; } void cv::gpu::Stream::waitForCompletion() { throw_nogpu(); } -void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, Mat& /*dst*/) { throw_nogpu(); } -void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, CudaMem& /*dst*/) { throw_nogpu(); } -void cv::gpu::Stream::enqueueUpload(const CudaMem& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); } -void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); } -void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); } -void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); } -void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); } -void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); } +void cv::gpu::Stream::enqueueDownload(const GpuMat&, Mat&) { throw_nogpu(); } +void cv::gpu::Stream::enqueueDownload(const GpuMat&, CudaMem&) { throw_nogpu(); } +void cv::gpu::Stream::enqueueUpload(const CudaMem&, GpuMat&) { throw_nogpu(); } +void cv::gpu::Stream::enqueueUpload(const Mat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::Stream::enqueueCopy(const GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar) { throw_nogpu(); } +void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar, const GpuMat&) { throw_nogpu(); } +void cv::gpu::Stream::enqueueConvert(const GpuMat&, GpuMat&, int, double, double) { throw_nogpu(); } +void 
cv::gpu::Stream::enqueueHostCallback(StreamCallback, void*) { throw_nogpu(); } Stream& cv::gpu::Stream::Null() { throw_nogpu(); static Stream s; return s; } cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; } +cv::gpu::Stream::Stream(Impl*) { throw_nogpu(); } +void cv::gpu::Stream::create() { throw_nogpu(); } +void cv::gpu::Stream::release() { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ +#include "opencv2/gpu/stream_accessor.hpp" + namespace cv { namespace gpu { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); @@ -95,14 +81,247 @@ namespace cv { namespace gpu void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); }} +struct Stream::Impl +{ + static cudaStream_t getStream(const Impl* impl) + { + return impl ? impl->stream : 0; + } + + cudaStream_t stream; + int ref_counter; +}; + +cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) +{ + return Stream::Impl::getStream(stream.impl); +} + +cv::gpu::Stream::Stream() : impl(0) +{ + create(); +} + +cv::gpu::Stream::~Stream() +{ + release(); +} + +cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl) +{ + if (impl) + CV_XADD(&impl->ref_counter, 1); +} + +Stream& cv::gpu::Stream::operator =(const Stream& stream) +{ + if (this != &stream) + { + release(); + impl = stream.impl; + if (impl) + CV_XADD(&impl->ref_counter, 1); + } + + return *this; +} + +bool cv::gpu::Stream::queryIfComplete() +{ + cudaStream_t stream = Impl::getStream(impl); + cudaError_t err = cudaStreamQuery(stream); + + if (err == cudaErrorNotReady || err == cudaSuccess) + return err == cudaSuccess; + + cudaSafeCall(err); + return false; +} + +void cv::gpu::Stream::waitForCompletion() +{ + cudaStream_t stream = Impl::getStream(impl); + cudaSafeCall( cudaStreamSynchronize(stream) ); +} + +void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst) +{ + // if not -> allocation will be done, but after that dst will not point to page locked memory + CV_Assert( src.size() == dst.size() && src.type() == dst.type() ); + + cudaStream_t stream = Impl::getStream(impl); + size_t bwidth = src.cols * src.elemSize(); + cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) ); +} + +void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) +{ + dst.create(src.size(), src.type(), CudaMem::ALLOC_PAGE_LOCKED); + + cudaStream_t stream = Impl::getStream(impl); + size_t bwidth = src.cols * src.elemSize(); + cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) ); +} + +void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst) +{ + dst.create(src.size(), src.type()); + + cudaStream_t stream = Impl::getStream(impl); + size_t bwidth = src.cols * src.elemSize(); + cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) ); +} + +void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) +{ + dst.create(src.size(), src.type()); + + cudaStream_t stream = Impl::getStream(impl); + size_t bwidth = src.cols * src.elemSize(); + cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) ); +} + +void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) +{ + dst.create(src.size(), src.type()); + + cudaStream_t stream = Impl::getStream(impl); + size_t bwidth = src.cols * src.elemSize(); + 
cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToDevice, stream) ); +} + +void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val) +{ + const int sdepth = src.depth(); + + if (sdepth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + cudaStream_t stream = Impl::getStream(impl); + + if (val[0] == 0.0 && val[1] == 0.0 && val[2] == 0.0 && val[3] == 0.0) + { + cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, stream) ); + return; + } + + if (sdepth == CV_8U) + { + int cn = src.channels(); + + if (cn == 1 || (cn == 2 && val[0] == val[1]) || (cn == 3 && val[0] == val[1] && val[0] == val[2]) || (cn == 4 && val[0] == val[1] && val[0] == val[2] && val[0] == val[3])) + { + int ival = saturate_cast(val[0]); + cudaSafeCall( cudaMemset2DAsync(src.data, src.step, ival, src.cols * src.elemSize(), src.rows, stream) ); + return; + } + } + + setTo(src, val, stream); +} + +void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask) +{ + const int sdepth = src.depth(); + + if (sdepth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + CV_Assert(mask.type() == CV_8UC1); + + cudaStream_t stream = Impl::getStream(impl); + + setTo(src, val, mask, stream); +} + +void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double alpha, double beta) +{ + if (dtype < 0) + dtype = src.type(); + else + dtype = CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()); + + const int sdepth = src.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + + if (sdepth == CV_64F || ddepth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + bool noScale = fabs(alpha - 1) < numeric_limits::epsilon() && fabs(beta) < numeric_limits::epsilon(); + + if (sdepth == ddepth && noScale) + { + enqueueCopy(src, dst); + return; + } + + dst.create(src.size(), dtype); + + cudaStream_t stream = Impl::getStream(impl); + convertTo(src, dst, alpha, beta, stream); +} + +#if CUDA_VERSION >= 5000 + namespace { - template void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k) + struct CallbackData { - dst.create(src.size(), src.type()); - size_t bwidth = src.cols * src.elemSize(); - cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) ); + cv::gpu::Stream::StreamCallback callback; + void* userData; + Stream stream; }; + + void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData) + { + CallbackData* data = reinterpret_cast(userData); + data->callback(data->stream, static_cast(status), data->userData); + delete data; + } +} + +#endif + +void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData) +{ +#if CUDA_VERSION >= 5000 + CallbackData* data = new CallbackData; + data->callback = callback; + data->userData = userData; + data->stream = *this; + + cudaStream_t stream = Impl::getStream(impl); + + cudaSafeCall( cudaStreamAddCallback(stream, cudaStreamCallback, data, 0) ); +#else + (void) callback; + (void) userData; + CV_Error(CV_StsNotImplemented, "This function requires CUDA 5.0"); +#endif +} + +cv::gpu::Stream& cv::gpu::Stream::Null() +{ + static Stream s((Impl*) 0); + return s; +} + +cv::gpu::Stream::operator bool() const +{ + return impl && impl->stream; +} + 
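[Editor's sketch] The enqueueHostCallback added above is a thin wrapper over CUDA 5.0's cudaStreamAddCallback. The following minimal usage sketch is not part of the patch; it only relies on the cv::gpu::Stream methods and the CudaMem / ALLOC_PAGE_LOCKED pattern that the new test_stream.cpp below also uses, and the names onStreamDone and asyncConvertExample are illustrative.

// A minimal usage sketch, not part of the patch: assumes this branch's
// cv::gpu API and a build with CUDA >= 5.0.
#include <cstdio>
#include <opencv2/gpu/gpu.hpp>

static void onStreamDone(cv::gpu::Stream&, int status, void* userData)
{
    // status carries the cudaError_t reported by the driver (0 == cudaSuccess).
    // The callback runs on a driver thread, so keep it short and CUDA-free.
    std::printf("stream '%s' finished, status = %d\n", (const char*)userData, status);
}

void asyncConvertExample(const cv::Mat& host)
{
    // Page-locked staging buffer so the asynchronous copies can actually overlap.
    cv::gpu::CudaMem src;
    src.create(host.size(), host.type(), cv::gpu::CudaMem::ALLOC_PAGE_LOCKED);
    host.copyTo(src.createMatHeader());

    cv::gpu::Stream stream;
    cv::gpu::GpuMat d_src, d_dst;
    cv::gpu::CudaMem dst;

    stream.enqueueUpload(src, d_src);             // host -> device
    stream.enqueueConvert(d_src, d_dst, CV_32F);  // convert on the device
    stream.enqueueDownload(d_dst, dst);           // device -> host (page-locked)
    stream.enqueueHostCallback(onStreamDone, (void*)"asyncConvertExample");

    stream.waitForCompletion(); // kept only to make the sketch self-contained
}

In a real pipeline the callback itself would typically signal the host side instead of blocking on waitForCompletion.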
+cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_) +{ } void cv::gpu::Stream::create() @@ -113,7 +332,7 @@ void cv::gpu::Stream::create() cudaStream_t stream; cudaSafeCall( cudaStreamCreate( &stream ) ); - impl = (Stream::Impl*)fastMalloc(sizeof(Stream::Impl)); + impl = (Stream::Impl*) fastMalloc(sizeof(Stream::Impl)); impl->stream = stream; impl->ref_counter = 1; @@ -121,133 +340,11 @@ void cv::gpu::Stream::create() void cv::gpu::Stream::release() { - if( impl && CV_XADD(&impl->ref_counter, -1) == 1 ) + if (impl && CV_XADD(&impl->ref_counter, -1) == 1) { - cudaSafeCall( cudaStreamDestroy( impl->stream ) ); - cv::fastFree( impl ); + cudaSafeCall( cudaStreamDestroy(impl->stream) ); + cv::fastFree(impl); } } -cv::gpu::Stream::Stream() : impl(0) { create(); } -cv::gpu::Stream::~Stream() { release(); } - -cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl) -{ - if( impl ) - CV_XADD(&impl->ref_counter, 1); -} -Stream& cv::gpu::Stream::operator=(const Stream& stream) -{ - if( this != &stream ) - { - if( stream.impl ) - CV_XADD(&stream.impl->ref_counter, 1); - - release(); - impl = stream.impl; - } - return *this; -} - -bool cv::gpu::Stream::queryIfComplete() -{ - cudaError_t err = cudaStreamQuery( Impl::getStream(impl) ); - - if (err == cudaErrorNotReady || err == cudaSuccess) - return err == cudaSuccess; - - cudaSafeCall(err); - return false; -} - -void cv::gpu::Stream::waitForCompletion() { cudaSafeCall( cudaStreamSynchronize( Impl::getStream(impl) ) ); } - -void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst) -{ - // if not -> allocation will be done, but after that dst will not point to page locked memory - CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() ); - devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost); -} -void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost); } - -void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(src, dst, Impl::getStream(impl), cudaMemcpyHostToDevice); } -void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyHostToDevice); } -void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToDevice); } - -void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s) -{ - CV_Assert((src.depth() != CV_64F) || - (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); - - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) - { - cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, Impl::getStream(impl)) ); - return; - } - if (src.depth() == CV_8U) - { - int cn = src.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - cudaSafeCall( cudaMemset2DAsync(src.data, src.step, val, src.cols * src.elemSize(), src.rows, Impl::getStream(impl)) ); - return; - } - } - - setTo(src, s, Impl::getStream(impl)); -} - -void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask) -{ - CV_Assert((src.depth() != CV_64F) || - (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); - - CV_Assert(mask.type() == CV_8UC1); - - setTo(src, val, mask, Impl::getStream(impl)); -} - -void cv::gpu::Stream::enqueueConvert(const 
GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta) -{ - CV_Assert((src.depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) || - (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); - - bool noScale = fabs(alpha-1) < std::numeric_limits::epsilon() && fabs(beta) < std::numeric_limits::epsilon(); - - if( rtype < 0 ) - rtype = src.type(); - else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.channels()); - - int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype); - if( sdepth == ddepth && noScale ) - { - src.copyTo(dst); - return; - } - - GpuMat temp; - const GpuMat* psrc = &src; - if( sdepth != ddepth && psrc == &dst ) - psrc = &(temp = src); - - dst.create( src.size(), rtype ); - convertTo(src, dst, alpha, beta, Impl::getStream(impl)); -} - -cv::gpu::Stream::operator bool() const -{ - return impl && impl->stream; -} - -cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_) {} - -cv::gpu::Stream& cv::gpu::Stream::Null() -{ - static Stream s((Impl*)0); - return s; -} - #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index 09cf01850e..fecb717cd9 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -121,7 +121,9 @@ void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, f buf.accum.setTo(Scalar::all(0)); DeviceInfo devInfo; - linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20)); + cudaDeviceProp prop; + cudaSafeCall(cudaGetDeviceProperties(&prop, devInfo.deviceID())); + linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, prop.sharedMemPerBlock, devInfo.supports(FEATURE_SET_COMPUTE_20)); ensureSizeIsEnough(2, maxLines, CV_32FC2, lines); @@ -194,7 +196,9 @@ void cv::gpu::HoughLinesP(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, buf.accum.setTo(Scalar::all(0)); DeviceInfo devInfo; - linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20)); + cudaDeviceProp prop; + cudaSafeCall(cudaGetDeviceProperties(&prop, devInfo.deviceID())); + linesAccum_gpu(srcPoints, pointsCount, buf.accum, rho, theta, prop.sharedMemPerBlock, devInfo.supports(FEATURE_SET_COMPUTE_20)); ensureSizeIsEnough(1, maxLines, CV_32SC4, lines); diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp index 4295644c7a..67e65fc585 100644 --- a/modules/gpu/src/matrix_reductions.cpp +++ b/modules/gpu/src/matrix_reductions.cpp @@ -51,13 +51,17 @@ void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); } void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_nogpu(); } double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; } double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_nogpu(); return 0.0; } +double cv::gpu::norm(const GpuMat&, int, const GpuMat&, GpuMat&) { throw_nogpu(); return 0.0; } double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; } Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); } Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } +Scalar cv::gpu::sum(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } Scalar cv::gpu::absSum(const GpuMat&) { throw_nogpu(); return Scalar(); } Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } +Scalar cv::gpu::absSum(const GpuMat&, const GpuMat&, GpuMat&) { 
throw_nogpu(); return Scalar(); } Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); } Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } +Scalar cv::gpu::sqrSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); } void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); } void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); } @@ -150,24 +154,30 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat double cv::gpu::norm(const GpuMat& src, int normType) { GpuMat buf; - return norm(src, normType, buf); + return norm(src, normType, GpuMat(), buf); } double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf) +{ + return norm(src, normType, GpuMat(), buf); +} + +double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat& buf) { CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2); + CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1)); GpuMat src_single_channel = src.reshape(1); if (normType == NORM_L1) - return absSum(src_single_channel, buf)[0]; + return absSum(src_single_channel, mask, buf)[0]; if (normType == NORM_L2) - return std::sqrt(sqrSum(src_single_channel, buf)[0]); + return std::sqrt(sqrSum(src_single_channel, mask, buf)[0]); // NORM_INF double min_val, max_val; - minMax(src_single_channel, &min_val, &max_val, GpuMat(), buf); + minMax(src_single_channel, &min_val, &max_val, mask, buf); return std::max(std::abs(min_val), std::abs(max_val)); } @@ -209,24 +219,29 @@ namespace sum void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows); template - void run(PtrStepSzb src, void* buf, double* sum); + void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask); template - void runAbs(PtrStepSzb src, void* buf, double* sum); + void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask); template - void runSqr(PtrStepSzb src, void* buf, double* sum); + void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask); } Scalar cv::gpu::sum(const GpuMat& src) { GpuMat buf; - return sum(src, buf); + return sum(src, GpuMat(), buf); } Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) { - typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum); + return sum(src, GpuMat(), buf); +} + +Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf) +{ + typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask); static const func_t funcs[7][5] = { {0, ::sum::run, ::sum::run, ::sum::run, ::sum::run}, @@ -238,6 +253,8 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) {0, ::sum::run, ::sum::run, ::sum::run, ::sum::run} }; + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + if (src.depth() == CV_64F) { if (!deviceSupports(NATIVE_DOUBLE)) @@ -252,7 +269,7 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) const func_t func = funcs[src.depth()][src.channels()]; double result[4]; - func(src, buf.data, result); + func(src, buf.data, result, mask); return Scalar(result[0], result[1], result[2], result[3]); } @@ -260,12 +277,17 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) Scalar cv::gpu::absSum(const GpuMat& src) { GpuMat buf; - return absSum(src, buf); + return absSum(src, GpuMat(), buf); } Scalar 
cv::gpu::absSum(const GpuMat& src, GpuMat& buf) { - typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum); + return absSum(src, GpuMat(), buf); +} + +Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf) +{ + typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask); static const func_t funcs[7][5] = { {0, ::sum::runAbs, ::sum::runAbs, ::sum::runAbs, ::sum::runAbs}, @@ -277,6 +299,8 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf) {0, ::sum::runAbs, ::sum::runAbs, ::sum::runAbs, ::sum::runAbs} }; + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + if (src.depth() == CV_64F) { if (!deviceSupports(NATIVE_DOUBLE)) @@ -291,7 +315,7 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf) const func_t func = funcs[src.depth()][src.channels()]; double result[4]; - func(src, buf.data, result); + func(src, buf.data, result, mask); return Scalar(result[0], result[1], result[2], result[3]); } @@ -299,12 +323,17 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf) Scalar cv::gpu::sqrSum(const GpuMat& src) { GpuMat buf; - return sqrSum(src, buf); + return sqrSum(src, GpuMat(), buf); } Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) { - typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum); + return sqrSum(src, GpuMat(), buf); +} + +Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf) +{ + typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask); static const func_t funcs[7][5] = { {0, ::sum::runSqr, ::sum::runSqr, ::sum::runSqr, ::sum::runSqr}, @@ -316,6 +345,8 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) {0, ::sum::runSqr, ::sum::runSqr, ::sum::runSqr, ::sum::runSqr} }; + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + if (src.depth() == CV_64F) { if (!deviceSupports(NATIVE_DOUBLE)) @@ -330,7 +361,7 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) const func_t func = funcs[src.depth()][src.channels()]; double result[4]; - func(src, buf.data, result); + func(src, buf.data, result, mask); return Scalar(result[0], result[1], result[2], result[3]); } diff --git a/modules/gpu/test/test_core.cpp b/modules/gpu/test/test_core.cpp index e6745abe33..6c046fca7b 100644 --- a/modules/gpu/test/test_core.cpp +++ b/modules/gpu/test/test_core.cpp @@ -2918,10 +2918,12 @@ PARAM_TEST_CASE(Norm, cv::gpu::DeviceInfo, cv::Size, MatDepth, NormCode, UseRoi) GPU_TEST_P(Norm, Accuracy) { cv::Mat src = randomMat(size, depth); + cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); - double val = cv::gpu::norm(loadMat(src, useRoi), normCode); + cv::gpu::GpuMat d_buf; + double val = cv::gpu::norm(loadMat(src, useRoi), normCode, loadMat(mask, useRoi), d_buf); - double val_gold = cv::norm(src, normCode); + double val_gold = cv::norm(src, normCode, mask); EXPECT_NEAR(val_gold, val, depth < CV_32F ? 
0.0 : 1.0); } @@ -3538,4 +3540,70 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine( ALL_REDUCE_CODES, WHOLE_SUBMAT)); +////////////////////////////////////////////////////////////////////////////// +// Normalize + +PARAM_TEST_CASE(Normalize, cv::gpu::DeviceInfo, cv::Size, MatDepth, NormCode, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + int type; + int norm_type; + bool useRoi; + + double alpha; + double beta; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + norm_type = GET_PARAM(3); + useRoi = GET_PARAM(4); + + cv::gpu::setDevice(devInfo.deviceID()); + + alpha = 1; + beta = 0; + } + +}; + +GPU_TEST_P(Normalize, WithOutMask) +{ + cv::Mat src = randomMat(size, type); + + cv::gpu::GpuMat dst = createMat(size, type, useRoi); + cv::gpu::normalize(loadMat(src, useRoi), dst, alpha, beta, norm_type, type); + + cv::Mat dst_gold; + cv::normalize(src, dst_gold, alpha, beta, norm_type, type); + + EXPECT_MAT_NEAR(dst_gold, dst, 1e-6); +} + +GPU_TEST_P(Normalize, WithMask) +{ + cv::Mat src = randomMat(size, type); + cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); + + cv::gpu::GpuMat dst = createMat(size, type, useRoi); + dst.setTo(cv::Scalar::all(0)); + cv::gpu::normalize(loadMat(src, useRoi), dst, alpha, beta, norm_type, type, loadMat(mask, useRoi)); + + cv::Mat dst_gold(size, type); + dst_gold.setTo(cv::Scalar::all(0)); + cv::normalize(src, dst_gold, alpha, beta, norm_type, type, mask); + + EXPECT_MAT_NEAR(dst_gold, dst, 1e-6); +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Normalize, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + ALL_DEPTH, + testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF), NormCode(cv::NORM_MINMAX)), + WHOLE_SUBMAT)); + #endif // HAVE_CUDA diff --git a/modules/gpu/test/test_optflow.cpp b/modules/gpu/test/test_optflow.cpp index c93ebbe19e..a97516f6d1 100644 --- a/modules/gpu/test/test_optflow.cpp +++ b/modules/gpu/test/test_optflow.cpp @@ -431,9 +431,9 @@ GPU_TEST_P(OpticalFlowDual_TVL1, Accuracy) cv::gpu::GpuMat d_flowy = createMat(frame0.size(), CV_32FC1, useRoi); d_alg(loadMat(frame0, useRoi), loadMat(frame1, useRoi), d_flowx, d_flowy); - cv::OpticalFlowDual_TVL1 alg; + cv::Ptr alg = cv::createOptFlow_DualTVL1(); cv::Mat flow; - alg(frame0, frame1, flow); + alg->calc(frame0, frame1, flow); cv::Mat gold[2]; cv::split(flow, gold); diff --git a/modules/gpu/test/test_stream.cpp b/modules/gpu/test/test_stream.cpp new file mode 100644 index 0000000000..4adac41292 --- /dev/null +++ b/modules/gpu/test/test_stream.cpp @@ -0,0 +1,130 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "test_precomp.hpp" + +#ifdef HAVE_CUDA + +#if CUDA_VERSION >= 5000 + +struct Async : testing::TestWithParam +{ + cv::gpu::CudaMem src; + cv::gpu::GpuMat d_src; + + cv::gpu::CudaMem dst; + cv::gpu::GpuMat d_dst; + + virtual void SetUp() + { + cv::gpu::DeviceInfo devInfo = GetParam(); + cv::gpu::setDevice(devInfo.deviceID()); + + cv::Mat m = randomMat(cv::Size(128, 128), CV_8UC1); + src.create(m.size(), m.type(), cv::gpu::CudaMem::ALLOC_PAGE_LOCKED); + m.copyTo(src.createMatHeader()); + } +}; + +void checkMemSet(cv::gpu::Stream&, int status, void* userData) +{ + ASSERT_EQ(cudaSuccess, status); + + Async* test = reinterpret_cast(userData); + + cv::Mat src = test->src; + cv::Mat dst = test->dst; + + cv::Mat dst_gold = cv::Mat::zeros(src.size(), src.type()); + + ASSERT_MAT_NEAR(dst_gold, dst, 0); +} + +GPU_TEST_P(Async, MemSet) +{ + cv::gpu::Stream stream; + + d_dst.upload(src); + + stream.enqueueMemSet(d_dst, cv::Scalar::all(0)); + stream.enqueueDownload(d_dst, dst); + + Async* test = this; + stream.enqueueHostCallback(checkMemSet, test); + + stream.waitForCompletion(); +} + +void checkConvert(cv::gpu::Stream&, int status, void* userData) +{ + ASSERT_EQ(cudaSuccess, status); + + Async* test = reinterpret_cast(userData); + + cv::Mat src = test->src; + cv::Mat dst = test->dst; + + cv::Mat dst_gold; + src.convertTo(dst_gold, CV_32S); + + ASSERT_MAT_NEAR(dst_gold, dst, 0); +} + +GPU_TEST_P(Async, Convert) +{ + cv::gpu::Stream stream; + + stream.enqueueUpload(src, d_src); + stream.enqueueConvert(d_src, d_dst, CV_32S); + stream.enqueueDownload(d_dst, dst); + + Async* test = this; + stream.enqueueHostCallback(checkConvert, test); + + stream.waitForCompletion(); +} + +INSTANTIATE_TEST_CASE_P(GPU_Stream, Async, ALL_DEVICES); + +#endif + +#endif // HAVE_CUDA diff --git a/modules/highgui/src/cap_ffmpeg.cpp b/modules/highgui/src/cap_ffmpeg.cpp index 0a2f0a308d..6029fa2e0e 100644 --- a/modules/highgui/src/cap_ffmpeg.cpp +++ b/modules/highgui/src/cap_ffmpeg.cpp @@ -57,21 +57,42 @@ static CvCreateVideoWriter_Plugin icvCreateVideoWriter_FFMPEG_p = 0; static CvReleaseVideoWriter_Plugin icvReleaseVideoWriter_FFMPEG_p = 0; static CvWriteFrame_Plugin icvWriteFrame_FFMPEG_p = 0; -static void -icvInitFFMPEG(void) +static cv::Mutex _icvInitFFMPEG_mutex; + +class icvInitFFMPEG { - static int ffmpegInitialized = 0; - if( 
!ffmpegInitialized ) +public: + static void Init() + { + cv::AutoLock al(_icvInitFFMPEG_mutex); + static icvInitFFMPEG init; + } + +private: + #if defined WIN32 || defined _WIN32 + HMODULE icvFFOpenCV; + + ~icvInitFFMPEG() + { + if (icvFFOpenCV) + { + FreeLibrary(icvFFOpenCV); + icvFFOpenCV = 0; + } + } + #endif + + icvInitFFMPEG() { #if defined WIN32 || defined _WIN32 const char* module_name = "opencv_ffmpeg" - CVAUX_STR(CV_VERSION_EPOCH) CVAUX_STR(CV_VERSION_MAJOR) CVAUX_STR(CV_VERSION_MINOR) + CVAUX_STR(CV_MAJOR_VERSION) CVAUX_STR(CV_MINOR_VERSION) CVAUX_STR(CV_SUBMINOR_VERSION) #if (defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__) "_64" #endif ".dll"; - static HMODULE icvFFOpenCV = LoadLibrary( module_name ); + icvFFOpenCV = LoadLibrary( module_name ); if( icvFFOpenCV ) { icvCreateFileCapture_FFMPEG_p = @@ -93,18 +114,24 @@ icvInitFFMPEG(void) icvWriteFrame_FFMPEG_p = (CvWriteFrame_Plugin)GetProcAddress(icvFFOpenCV, "cvWriteFrame_FFMPEG"); - if( icvCreateFileCapture_FFMPEG_p == NULL || - icvReleaseCapture_FFMPEG_p == NULL || - icvGrabFrame_FFMPEG_p == NULL || - icvRetrieveFrame_FFMPEG_p == NULL || - icvSetCaptureProperty_FFMPEG_p == NULL || - icvGetCaptureProperty_FFMPEG_p == NULL || - icvCreateVideoWriter_FFMPEG_p == NULL || - icvReleaseVideoWriter_FFMPEG_p == NULL || - icvWriteFrame_FFMPEG_p == NULL ) +#if 0 + if( icvCreateFileCapture_FFMPEG_p != 0 && + icvReleaseCapture_FFMPEG_p != 0 && + icvGrabFrame_FFMPEG_p != 0 && + icvRetrieveFrame_FFMPEG_p != 0 && + icvSetCaptureProperty_FFMPEG_p != 0 && + icvGetCaptureProperty_FFMPEG_p != 0 && + icvCreateVideoWriter_FFMPEG_p != 0 && + icvReleaseVideoWriter_FFMPEG_p != 0 && + icvWriteFrame_FFMPEG_p != 0 ) { - fprintf(stderr, "Failed to load FFMPEG plugin: module handle=%p\n", icvFFOpenCV); + printf("Successfully initialized ffmpeg plugin!\n"); } + else + { + printf("Failed to load FFMPEG plugin: module handle=%p\n", icvFFOpenCV); + } +#endif } #elif defined HAVE_FFMPEG icvCreateFileCapture_FFMPEG_p = (CvCreateFileCapture_Plugin)cvCreateFileCapture_FFMPEG; @@ -117,13 +144,12 @@ icvInitFFMPEG(void) icvReleaseVideoWriter_FFMPEG_p = (CvReleaseVideoWriter_Plugin)cvReleaseVideoWriter_FFMPEG; icvWriteFrame_FFMPEG_p = (CvWriteFrame_Plugin)cvWriteFrame_FFMPEG; #endif - - ffmpegInitialized = 1; } -} +}; -class CvCapture_FFMPEG_proxy : public CvCapture +class CvCapture_FFMPEG_proxy : + public CvCapture { public: CvCapture_FFMPEG_proxy() { ffmpegCapture = 0; } @@ -146,18 +172,18 @@ public: unsigned char* data = 0; int step=0, width=0, height=0, cn=0; - if(!ffmpegCapture || - !icvRetrieveFrame_FFMPEG_p(ffmpegCapture,&data,&step,&width,&height,&cn)) - return 0; + if (!ffmpegCapture || + !icvRetrieveFrame_FFMPEG_p(ffmpegCapture, &data, &step, &width, &height, &cn)) + return 0; cvInitImageHeader(&frame, cvSize(width, height), 8, cn); cvSetData(&frame, data, step); return &frame; } virtual bool open( const char* filename ) { + icvInitFFMPEG::Init(); close(); - icvInitFFMPEG(); if( !icvCreateFileCapture_FFMPEG_p ) return false; ffmpegCapture = icvCreateFileCapture_FFMPEG_p( filename ); @@ -190,8 +216,8 @@ CvCapture* cvCreateFileCapture_FFMPEG_proxy(const char * filename) #endif } - -class CvVideoWriter_FFMPEG_proxy : public CvVideoWriter +class CvVideoWriter_FFMPEG_proxy : + public CvVideoWriter { public: CvVideoWriter_FFMPEG_proxy() { ffmpegWriter = 0; } @@ -208,8 +234,8 @@ public: } virtual bool open( const char* filename, int fourcc, double fps, CvSize frameSize, bool isColor ) { + icvInitFFMPEG::Init(); close(); - 
icvInitFFMPEG(); if( !icvCreateVideoWriter_FFMPEG_p ) return false; ffmpegWriter = icvCreateVideoWriter_FFMPEG_p( filename, fourcc, fps, frameSize.width, frameSize.height, isColor ); diff --git a/modules/highgui/src/cap_ffmpeg_impl.hpp b/modules/highgui/src/cap_ffmpeg_impl.hpp index a6952a2cba..8b571dda1d 100644 --- a/modules/highgui/src/cap_ffmpeg_impl.hpp +++ b/modules/highgui/src/cap_ffmpeg_impl.hpp @@ -332,28 +332,187 @@ void CvCapture_FFMPEG::close() #define AVSEEK_FLAG_ANY 1 #endif -static void icvInitFFMPEG_internal() +class ImplMutex { - static volatile bool initialized = false; - if( !initialized ) +public: + ImplMutex() { init(); } + ~ImplMutex() { destroy(); } + + void init(); + void destroy(); + + void lock(); + bool trylock(); + void unlock(); + + struct Impl; +protected: + Impl* impl; + +private: + ImplMutex(const ImplMutex&); + ImplMutex& operator = (const ImplMutex& m); +}; + +#if defined WIN32 || defined _WIN32 || defined WINCE + +struct ImplMutex::Impl +{ + void init() { InitializeCriticalSection(&cs); refcount = 1; } + void destroy() { DeleteCriticalSection(&cs); } + + void lock() { EnterCriticalSection(&cs); } + bool trylock() { return TryEnterCriticalSection(&cs) != 0; } + void unlock() { LeaveCriticalSection(&cs); } + + CRITICAL_SECTION cs; + int refcount; +}; + +#ifndef __GNUC__ +static int _interlockedExchangeAdd(int* addr, int delta) +{ +#if defined _MSC_VER && _MSC_VER >= 1500 + return (int)_InterlockedExchangeAdd((long volatile*)addr, delta); +#else + return (int)InterlockedExchangeAdd((long volatile*)addr, delta); +#endif +} +#endif // __GNUC__ + +#elif defined __APPLE__ + +#include + +struct ImplMutex::Impl +{ + void init() { sl = OS_SPINLOCK_INIT; refcount = 1; } + void destroy() { } + + void lock() { OSSpinLockLock(&sl); } + bool trylock() { return OSSpinLockTry(&sl); } + void unlock() { OSSpinLockUnlock(&sl); } + + OSSpinLock sl; + int refcount; +}; + +#elif defined __linux__ && !defined ANDROID + +struct ImplMutex::Impl +{ + void init() { pthread_spin_init(&sl, 0); refcount = 1; } + void destroy() { pthread_spin_destroy(&sl); } + + void lock() { pthread_spin_lock(&sl); } + bool trylock() { return pthread_spin_trylock(&sl) == 0; } + void unlock() { pthread_spin_unlock(&sl); } + + pthread_spinlock_t sl; + int refcount; +}; + +#else + +struct ImplMutex::Impl +{ + void init() { pthread_mutex_init(&sl, 0); refcount = 1; } + void destroy() { pthread_mutex_destroy(&sl); } + + void lock() { pthread_mutex_lock(&sl); } + bool trylock() { return pthread_mutex_trylock(&sl) == 0; } + void unlock() { pthread_mutex_unlock(&sl); } + + pthread_mutex_t sl; + int refcount; +}; + +#endif + +void ImplMutex::init() +{ + impl = (Impl*)malloc(sizeof(Impl)); + impl->init(); +} +void ImplMutex::destroy() +{ + impl->destroy(); + free(impl); + impl = NULL; +} +void ImplMutex::lock() { impl->lock(); } +void ImplMutex::unlock() { impl->unlock(); } +bool ImplMutex::trylock() { return impl->trylock(); } + +static int LockCallBack(void **mutex, AVLockOp op) +{ + ImplMutex* localMutex = reinterpret_cast(*mutex); + switch (op) { + case AV_LOCK_CREATE: + localMutex = reinterpret_cast(malloc(sizeof(ImplMutex))); + localMutex->init(); + *mutex = localMutex; + if (!*mutex) + return 1; + break; + + case AV_LOCK_OBTAIN: + localMutex->lock(); + break; + + case AV_LOCK_RELEASE: + localMutex->unlock(); + break; + + case AV_LOCK_DESTROY: + localMutex->destroy(); + free(localMutex); + localMutex = NULL; + break; + } + return 0; +} + +static ImplMutex _mutex; +static bool _initialized = false; + +class 
InternalFFMpegRegister +{ +public: + InternalFFMpegRegister() + { + _mutex.lock(); + if (!_initialized) + { #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(53, 13, 0) - avformat_network_init(); + avformat_network_init(); #endif - /* register all codecs, demux and protocols */ - av_register_all(); + /* register all codecs, demux and protocols */ + av_register_all(); - av_log_set_level(AV_LOG_ERROR); + /* register a callback function for synchronization */ + av_lockmgr_register(&LockCallBack); - initialized = true; + av_log_set_level(AV_LOG_ERROR); + + _initialized = true; + } + _mutex.unlock(); } -} + + ~InternalFFMpegRegister() + { + _initialized = false; + av_lockmgr_register(NULL); + } +}; + +static InternalFFMpegRegister _init; bool CvCapture_FFMPEG::open( const char* _filename ) { - icvInitFFMPEG_internal(); - unsigned i; bool valid = false; @@ -365,7 +524,8 @@ bool CvCapture_FFMPEG::open( const char* _filename ) int err = av_open_input_file(&ic, _filename, NULL, 0, NULL); #endif - if (err < 0) { + if (err < 0) + { CV_WARN("Error opening file"); goto exit_func; } @@ -375,7 +535,8 @@ bool CvCapture_FFMPEG::open( const char* _filename ) #else av_find_stream_info(ic); #endif - if (err < 0) { + if (err < 0) + { CV_WARN("Could not find codec parameters"); goto exit_func; } @@ -387,17 +548,18 @@ bool CvCapture_FFMPEG::open( const char* _filename ) AVCodecContext *enc = &ic->streams[i]->codec; #endif -#ifdef FF_API_THREAD_INIT - avcodec_thread_init(enc, get_number_of_cpus()); -#else +//#ifdef FF_API_THREAD_INIT +// avcodec_thread_init(enc, get_number_of_cpus()); +//#else enc->thread_count = get_number_of_cpus(); -#endif +//#endif #if LIBAVFORMAT_BUILD < CALC_FFMPEG_VERSION(53, 2, 0) #define AVMEDIA_TYPE_VIDEO CODEC_TYPE_VIDEO #endif - if( AVMEDIA_TYPE_VIDEO == enc->codec_type && video_stream < 0) { + if( AVMEDIA_TYPE_VIDEO == enc->codec_type && video_stream < 0) + { AVCodec *codec = avcodec_find_decoder(enc->codec_id); if (!codec || #if LIBAVCODEC_VERSION_INT >= ((53<<16)+(8<<8)+0) @@ -405,7 +567,8 @@ bool CvCapture_FFMPEG::open( const char* _filename ) #else avcodec_open(enc, codec) #endif - < 0) goto exit_func; + < 0) + goto exit_func; video_stream = i; video_st = ic->streams[i]; @@ -1279,8 +1442,6 @@ void CvVideoWriter_FFMPEG::close() bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc, double fps, int width, int height, bool is_color ) { - icvInitFFMPEG_internal(); - CodecID codec_id = CODEC_ID_NONE; int err, codec_pix_fmt; double bitrate_scale = 1; @@ -1499,6 +1660,7 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc, frame_width = width; frame_height = height; ok = true; + return true; } @@ -1510,6 +1672,7 @@ CvCapture_FFMPEG* cvCreateFileCapture_FFMPEG( const char* filename ) capture->init(); if( capture->open( filename )) return capture; + capture->close(); free(capture); return 0; @@ -1558,7 +1721,6 @@ CvVideoWriter_FFMPEG* cvCreateVideoWriter_FFMPEG( const char* filename, int four return 0; } - void cvReleaseVideoWriter_FFMPEG( CvVideoWriter_FFMPEG** writer ) { if( writer && *writer ) @@ -1749,11 +1911,6 @@ bool OutputMediaStream_FFMPEG::open(const char* fileName, int width, int height, oc_ = 0; video_st_ = 0; - // tell FFMPEG to register codecs - av_register_all(); - - av_log_set_level(AV_LOG_ERROR); - // auto detect the output format from the name and fourcc code #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(53, 2, 0) fmt_ = av_guess_format(NULL, fileName, NULL); @@ -1934,11 +2091,6 @@ bool InputMediaStream_FFMPEG::open(const char* fileName, int* 
codec, int* chroma avformat_network_init(); #endif - // register all codecs, demux and protocols - av_register_all(); - - av_log_set_level(AV_LOG_ERROR); - #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(53, 6, 0) err = avformat_open_input(&ctx_, fileName, 0, 0); #else diff --git a/modules/highgui/test/test_ffmpeg.cpp b/modules/highgui/test/test_ffmpeg.cpp index 0702cb1de4..53065462a2 100644 --- a/modules/highgui/test/test_ffmpeg.cpp +++ b/modules/highgui/test/test_ffmpeg.cpp @@ -207,7 +207,6 @@ public: } } - private: std::vector* writers; std::vector* files; @@ -241,18 +240,18 @@ public: virtual void operator() (const Range& range) const { - if((range.start + 1) != range.end) - return; - - VideoWriter* writer = writers->operator[](range.start); - CV_Assert(writer != NULL); - CV_Assert(writer->isOpened()); - - Mat frame(CreateVideoWriterInvoker::FrameSize, CV_8UC3); - for (unsigned int i = 0; i < FrameCount; ++i) + for (int j = range.start; j < range.end; ++j) { - GenerateFrame(frame, i); - writer->operator<< (frame); + VideoWriter* writer = writers->operator[](j); + CV_Assert(writer != NULL); + CV_Assert(writer->isOpened()); + + Mat frame(CreateVideoWriterInvoker::FrameSize, CV_8UC3); + for (unsigned int i = 0; i < FrameCount; ++i) + { + GenerateFrame(frame, i); + writer->operator<< (frame); + } } } @@ -305,47 +304,47 @@ public: virtual void operator() (const Range& range) const { - if((range.start + 1) != range.end) - return; - - VideoCapture* capture = readers->operator[](range.start); - CV_Assert(capture != NULL); - CV_Assert(capture->isOpened()); - - const static double eps = 23.0; - unsigned int frameCount = static_cast(capture->get(CV_CAP_PROP_FRAME_COUNT)); - CV_Assert(frameCount == WriteVideo_Invoker::FrameCount); - Mat reference(CreateVideoWriterInvoker::FrameSize, CV_8UC3); - - for (unsigned int i = 0; i < frameCount && next; ++i) + for (int j = range.start; j < range.end; ++j) { - Mat actual; - (*capture) >> actual; + VideoCapture* capture = readers->operator[](j); + CV_Assert(capture != NULL); + CV_Assert(capture->isOpened()); - WriteVideo_Invoker::GenerateFrame(reference, i); + const static double eps = 23.0; + unsigned int frameCount = static_cast(capture->get(CV_CAP_PROP_FRAME_COUNT)); + CV_Assert(frameCount == WriteVideo_Invoker::FrameCount); + Mat reference(CreateVideoWriterInvoker::FrameSize, CV_8UC3); - EXPECT_EQ(reference.cols, actual.cols); - EXPECT_EQ(reference.rows, actual.rows); - EXPECT_EQ(reference.depth(), actual.depth()); - EXPECT_EQ(reference.channels(), actual.channels()); - - double psnr = PSNR(actual, reference); - if (psnr < eps) + for (unsigned int i = 0; i < frameCount && next; ++i) { -#define SUM cvtest::TS::SUMMARY - ts->printf(SUM, "\nPSNR: %lf\n", psnr); - ts->printf(SUM, "Video #: %d\n", range.start); - ts->printf(SUM, "Frame #: %d\n", i); -#undef SUM - ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY); - ts->set_gtest_status(); + Mat actual; + (*capture) >> actual; - Mat diff; - absdiff(actual, reference, diff); + WriteVideo_Invoker::GenerateFrame(reference, i); - EXPECT_EQ(countNonZero(diff.reshape(1) > 1), 0); + EXPECT_EQ(reference.cols, actual.cols); + EXPECT_EQ(reference.rows, actual.rows); + EXPECT_EQ(reference.depth(), actual.depth()); + EXPECT_EQ(reference.channels(), actual.channels()); - next = false; + double psnr = PSNR(actual, reference); + if (psnr < eps) + { + #define SUM cvtest::TS::SUMMARY + ts->printf(SUM, "\nPSNR: %lf\n", psnr); + ts->printf(SUM, "Video #: %d\n", range.start); + ts->printf(SUM, "Frame #: %d\n", i); + #undef SUM 
+ ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY); + ts->set_gtest_status(); + + Mat diff; + absdiff(actual, reference, diff); + + EXPECT_EQ(countNonZero(diff.reshape(1) > 1), 0); + + next = false; + } } } } @@ -359,7 +358,7 @@ private: bool ReadImageAndTest::next; -TEST(Highgui_Video_parallel_writers_and_readers, DISABLED_accuracy) +TEST(Highgui_Video_parallel_writers_and_readers, accuracy) { const unsigned int threadsCount = 4; cvtest::TS* ts = cvtest::TS::ptr(); diff --git a/modules/imgproc/doc/filtering.rst b/modules/imgproc/doc/filtering.rst old mode 100644 new mode 100755 index 5f4e0438f3..0b1f9a34e4 --- a/modules/imgproc/doc/filtering.rst +++ b/modules/imgproc/doc/filtering.rst @@ -487,6 +487,8 @@ Blurs an image using the box filter. :param dst: output image of the same size and type as ``src``. + :param ddepth: the output image depth (-1 to use ``src.depth()``). + :param ksize: blurring kernel size. :param anchor: anchor point; default value ``Point(-1,-1)`` means that the anchor is at the kernel center. diff --git a/modules/video/doc/motion_analysis_and_object_tracking.rst b/modules/video/doc/motion_analysis_and_object_tracking.rst index 3674a5d579..158e71d68e 100644 --- a/modules/video/doc/motion_analysis_and_object_tracking.rst +++ b/modules/video/doc/motion_analysis_and_object_tracking.rst @@ -641,11 +641,11 @@ See [Tao2012]_. And site of project - http://graphics.berkeley.edu/papers/Tao-SA -OpticalFlowDual_TVL1 --------------------- +createOptFlow_DualTVL1 +---------------------- "Dual TV L1" Optical Flow Algorithm. -.. ocv:class:: OpticalFlowDual_TVL12 +.. ocv:function:: Ptr createOptFlow_DualTVL1() The class implements the "Dual TV L1" optical flow algorithm described in [Zach2007]_ and [Javier2012]_ . @@ -683,11 +683,11 @@ Here are important members of the class that control the algorithm, which you ca -OpticalFlowDual_TVL1::operator() --------------------------------- +DenseOpticalFlow::calc +-------------------------- Calculates an optical flow. -.. ocv:function:: void OpticalFlowDual_TVL1::operator ()(InputArray I0, InputArray I1, InputOutputArray flow) +.. ocv:function:: void DenseOpticalFlow::calc(InputArray I0, InputArray I1, InputOutputArray flow) :param prev: first 8-bit single-channel input image. @@ -697,11 +697,11 @@ Calculates an optical flow. -OpticalFlowDual_TVL1::collectGarbage ------------------------------------- +DenseOpticalFlow::collectGarbage +-------------------------------- Releases all inner buffers. -.. ocv:function:: void OpticalFlowDual_TVL1::collectGarbage() +.. ocv:function:: void DenseOpticalFlow::collectGarbage() diff --git a/modules/video/include/opencv2/video/tracking.hpp b/modules/video/include/opencv2/video/tracking.hpp index be9407aca6..b49747de73 100644 --- a/modules/video/include/opencv2/video/tracking.hpp +++ b/modules/video/include/opencv2/video/tracking.hpp @@ -352,104 +352,19 @@ CV_EXPORTS_W void calcOpticalFlowSF(InputArray from, double upscale_sigma_color, double speed_up_thr); +class CV_EXPORTS DenseOpticalFlow : public Algorithm +{ +public: + virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow) = 0; + virtual void collectGarbage() = 0; +}; + // Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method // // see reference: // [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow". // [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation". 
-class CV_EXPORTS OpticalFlowDual_TVL1 -{ -public: - OpticalFlowDual_TVL1(); - - void operator ()(InputArray I0, InputArray I1, InputOutputArray flow); - - void collectGarbage(); - - /** - * Time step of the numerical scheme. - */ - double tau; - - /** - * Weight parameter for the data term, attachment parameter. - * This is the most relevant parameter, which determines the smoothness of the output. - * The smaller this parameter is, the smoother the solutions we obtain. - * It depends on the range of motions of the images, so its value should be adapted to each image sequence. - */ - double lambda; - - /** - * Weight parameter for (u - v)^2, tightness parameter. - * It serves as a link between the attachment and the regularization terms. - * In theory, it should have a small value in order to maintain both parts in correspondence. - * The method is stable for a large range of values of this parameter. - */ - double theta; - - /** - * Number of scales used to create the pyramid of images. - */ - int nscales; - - /** - * Number of warpings per scale. - * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale. - * This is a parameter that assures the stability of the method. - * It also affects the running time, so it is a compromise between speed and accuracy. - */ - int warps; - - /** - * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time. - * A small value will yield more accurate solutions at the expense of a slower convergence. - */ - double epsilon; - - /** - * Stopping criterion iterations number used in the numerical scheme. - */ - int iterations; - - bool useInitialFlow; - -private: - void procOneScale(const Mat_& I0, const Mat_& I1, Mat_& u1, Mat_& u2); - - std::vector > I0s; - std::vector > I1s; - std::vector > u1s; - std::vector > u2s; - - Mat_ I1x_buf; - Mat_ I1y_buf; - - Mat_ flowMap1_buf; - Mat_ flowMap2_buf; - - Mat_ I1w_buf; - Mat_ I1wx_buf; - Mat_ I1wy_buf; - - Mat_ grad_buf; - Mat_ rho_c_buf; - - Mat_ v1_buf; - Mat_ v2_buf; - - Mat_ p11_buf; - Mat_ p12_buf; - Mat_ p21_buf; - Mat_ p22_buf; - - Mat_ div_p1_buf; - Mat_ div_p2_buf; - - Mat_ u1x_buf; - Mat_ u1y_buf; - Mat_ u2x_buf; - Mat_ u2y_buf; -}; +CV_EXPORTS Ptr createOptFlow_DualTVL1(); } diff --git a/modules/video/perf/perf_tvl1optflow.cpp b/modules/video/perf/perf_tvl1optflow.cpp index 2014130a85..ad90915128 100644 --- a/modules/video/perf/perf_tvl1optflow.cpp +++ b/modules/video/perf/perf_tvl1optflow.cpp @@ -22,12 +22,9 @@ PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1, testing::Values(impair("cv/optflow/ Mat flow; - OpticalFlowDual_TVL1 tvl1; + Ptr tvl1 = createOptFlow_DualTVL1(); - TEST_CYCLE() - { - tvl1(frame1, frame2, flow); - } + TEST_CYCLE_N(10) tvl1->calc(frame1, frame2, flow); SANITY_CHECK(flow, 0.5); } diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp index 2f8034adaa..bff1d7ec0d 100644 --- a/modules/video/src/tvl1flow.cpp +++ b/modules/video/src/tvl1flow.cpp @@ -77,7 +77,67 @@ using namespace std; using namespace cv; -cv::OpticalFlowDual_TVL1::OpticalFlowDual_TVL1() +namespace { + +class OpticalFlowDual_TVL1 : public DenseOpticalFlow +{ +public: + OpticalFlowDual_TVL1(); + + void calc(InputArray I0, InputArray I1, InputOutputArray flow); + void collectGarbage(); + + AlgorithmInfo* info() const; + +protected: + double tau; + double lambda; + double theta; + int nscales; + int warps; + double epsilon; + int iterations; + bool useInitialFlow; + +private: + void procOneScale(const Mat_& I0, 
const Mat_& I1, Mat_& u1, Mat_& u2); + + std::vector > I0s; + std::vector > I1s; + std::vector > u1s; + std::vector > u2s; + + Mat_ I1x_buf; + Mat_ I1y_buf; + + Mat_ flowMap1_buf; + Mat_ flowMap2_buf; + + Mat_ I1w_buf; + Mat_ I1wx_buf; + Mat_ I1wy_buf; + + Mat_ grad_buf; + Mat_ rho_c_buf; + + Mat_ v1_buf; + Mat_ v2_buf; + + Mat_ p11_buf; + Mat_ p12_buf; + Mat_ p21_buf; + Mat_ p22_buf; + + Mat_ div_p1_buf; + Mat_ div_p2_buf; + + Mat_ u1x_buf; + Mat_ u1y_buf; + Mat_ u2x_buf; + Mat_ u2y_buf; +}; + +OpticalFlowDual_TVL1::OpticalFlowDual_TVL1() { tau = 0.25; lambda = 0.15; @@ -89,7 +149,7 @@ cv::OpticalFlowDual_TVL1::OpticalFlowDual_TVL1() useInitialFlow = false; } -void cv::OpticalFlowDual_TVL1::operator ()(InputArray _I0, InputArray _I1, InputOutputArray _flow) +void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow) { Mat I0 = _I0.getMat(); Mat I1 = _I1.getMat(); @@ -195,542 +255,539 @@ void cv::OpticalFlowDual_TVL1::operator ()(InputArray _I0, InputArray _I1, Input merge(uxy, 2, _flow); } -namespace +//////////////////////////////////////////////////////////// +// buildFlowMap + +struct BuildFlowMapBody : ParallelLoopBody { - //////////////////////////////////////////////////////////// - // buildFlowMap + void operator() (const Range& range) const; - struct BuildFlowMapBody : ParallelLoopBody + Mat_ u1; + Mat_ u2; + mutable Mat_ map1; + mutable Mat_ map2; +}; + +void BuildFlowMapBody::operator() (const Range& range) const +{ + for (int y = range.start; y < range.end; ++y) { - void operator() (const Range& range) const; + const float* u1Row = u1[y]; + const float* u2Row = u2[y]; - Mat_ u1; - Mat_ u2; - mutable Mat_ map1; - mutable Mat_ map2; - }; + float* map1Row = map1[y]; + float* map2Row = map2[y]; - void BuildFlowMapBody::operator() (const Range& range) const - { - for (int y = range.start; y < range.end; ++y) + for (int x = 0; x < u1.cols; ++x) { - const float* u1Row = u1[y]; - const float* u2Row = u2[y]; - - float* map1Row = map1[y]; - float* map2Row = map2[y]; - - for (int x = 0; x < u1.cols; ++x) - { - map1Row[x] = x + u1Row[x]; - map2Row[x] = y + u2Row[x]; - } + map1Row[x] = x + u1Row[x]; + map2Row[x] = y + u2Row[x]; } } - - void buildFlowMap(const Mat_& u1, const Mat_& u2, Mat_& map1, Mat_& map2) - { - CV_DbgAssert( u2.size() == u1.size() ); - CV_DbgAssert( map1.size() == u1.size() ); - CV_DbgAssert( map2.size() == u1.size() ); - - BuildFlowMapBody body; - - body.u1 = u1; - body.u2 = u2; - body.map1 = map1; - body.map2 = map2; - - parallel_for_(Range(0, u1.rows), body); - } - - //////////////////////////////////////////////////////////// - // centeredGradient - - struct CenteredGradientBody : ParallelLoopBody - { - void operator() (const Range& range) const; - - Mat_ src; - mutable Mat_ dx; - mutable Mat_ dy; - }; - - void CenteredGradientBody::operator() (const Range& range) const - { - const int last_col = src.cols - 1; - - for (int y = range.start; y < range.end; ++y) - { - const float* srcPrevRow = src[y - 1]; - const float* srcCurRow = src[y]; - const float* srcNextRow = src[y + 1]; - - float* dxRow = dx[y]; - float* dyRow = dy[y]; - - for (int x = 1; x < last_col; ++x) - { - dxRow[x] = 0.5f * (srcCurRow[x + 1] - srcCurRow[x - 1]); - dyRow[x] = 0.5f * (srcNextRow[x] - srcPrevRow[x]); - } - } - } - - void centeredGradient(const Mat_& src, Mat_& dx, Mat_& dy) - { - CV_DbgAssert( src.rows > 2 && src.cols > 2 ); - CV_DbgAssert( dx.size() == src.size() ); - CV_DbgAssert( dy.size() == src.size() ); - - const int last_row = src.rows - 1; - const int 
last_col = src.cols - 1; - - // compute the gradient on the center body of the image - { - CenteredGradientBody body; - - body.src = src; - body.dx = dx; - body.dy = dy; - - parallel_for_(Range(1, last_row), body); - } - - // compute the gradient on the first and last rows - for (int x = 1; x < last_col; ++x) - { - dx(0, x) = 0.5f * (src(0, x + 1) - src(0, x - 1)); - dy(0, x) = 0.5f * (src(1, x) - src(0, x)); - - dx(last_row, x) = 0.5f * (src(last_row, x + 1) - src(last_row, x - 1)); - dy(last_row, x) = 0.5f * (src(last_row, x) - src(last_row - 1, x)); - } - - // compute the gradient on the first and last columns - for (int y = 1; y < last_row; ++y) - { - dx(y, 0) = 0.5f * (src(y, 1) - src(y, 0)); - dy(y, 0) = 0.5f * (src(y + 1, 0) - src(y - 1, 0)); - - dx(y, last_col) = 0.5f * (src(y, last_col) - src(y, last_col - 1)); - dy(y, last_col) = 0.5f * (src(y + 1, last_col) - src(y - 1, last_col)); - } - - // compute the gradient at the four corners - dx(0, 0) = 0.5f * (src(0, 1) - src(0, 0)); - dy(0, 0) = 0.5f * (src(1, 0) - src(0, 0)); - - dx(0, last_col) = 0.5f * (src(0, last_col) - src(0, last_col - 1)); - dy(0, last_col) = 0.5f * (src(1, last_col) - src(0, last_col)); - - dx(last_row, 0) = 0.5f * (src(last_row, 1) - src(last_row, 0)); - dy(last_row, 0) = 0.5f * (src(last_row, 0) - src(last_row - 1, 0)); - - dx(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row, last_col - 1)); - dy(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row - 1, last_col)); - } - - //////////////////////////////////////////////////////////// - // forwardGradient - - struct ForwardGradientBody : ParallelLoopBody - { - void operator() (const Range& range) const; - - Mat_ src; - mutable Mat_ dx; - mutable Mat_ dy; - }; - - void ForwardGradientBody::operator() (const Range& range) const - { - const int last_col = src.cols - 1; - - for (int y = range.start; y < range.end; ++y) - { - const float* srcCurRow = src[y]; - const float* srcNextRow = src[y + 1]; - - float* dxRow = dx[y]; - float* dyRow = dy[y]; - - for (int x = 0; x < last_col; ++x) - { - dxRow[x] = srcCurRow[x + 1] - srcCurRow[x]; - dyRow[x] = srcNextRow[x] - srcCurRow[x]; - } - } - } - - void forwardGradient(const Mat_& src, Mat_& dx, Mat_& dy) - { - CV_DbgAssert( src.rows > 2 && src.cols > 2 ); - CV_DbgAssert( dx.size() == src.size() ); - CV_DbgAssert( dy.size() == src.size() ); - - const int last_row = src.rows - 1; - const int last_col = src.cols - 1; - - // compute the gradient on the central body of the image - { - ForwardGradientBody body; - - body.src = src; - body.dx = dx; - body.dy = dy; - - parallel_for_(Range(0, last_row), body); - } - - // compute the gradient on the last row - for (int x = 0; x < last_col; ++x) - { - dx(last_row, x) = src(last_row, x + 1) - src(last_row, x); - dy(last_row, x) = 0.0f; - } - - // compute the gradient on the last column - for (int y = 0; y < last_row; ++y) - { - dx(y, last_col) = 0.0f; - dy(y, last_col) = src(y + 1, last_col) - src(y, last_col); - } - - dx(last_row, last_col) = 0.0f; - dy(last_row, last_col) = 0.0f; - } - - //////////////////////////////////////////////////////////// - // divergence - - struct DivergenceBody : ParallelLoopBody - { - void operator() (const Range& range) const; - - Mat_ v1; - Mat_ v2; - mutable Mat_ div; - }; - - void DivergenceBody::operator() (const Range& range) const - { - for (int y = range.start; y < range.end; ++y) - { - const float* v1Row = v1[y]; - const float* v2PrevRow = v2[y - 1]; - const float* v2CurRow = v2[y]; - - float* divRow = 
div[y]; - - for(int x = 1; x < v1.cols; ++x) - { - const float v1x = v1Row[x] - v1Row[x - 1]; - const float v2y = v2CurRow[x] - v2PrevRow[x]; - - divRow[x] = v1x + v2y; - } - } - } - - void divergence(const Mat_& v1, const Mat_& v2, Mat_& div) - { - CV_DbgAssert( v1.rows > 2 && v1.cols > 2 ); - CV_DbgAssert( v2.size() == v1.size() ); - CV_DbgAssert( div.size() == v1.size() ); - - { - DivergenceBody body; - - body.v1 = v1; - body.v2 = v2; - body.div = div; - - parallel_for_(Range(1, v1.rows), body); - } - - // compute the divergence on the first row - for(int x = 1; x < v1.cols; ++x) - div(0, x) = v1(0, x) - v1(0, x - 1) + v2(0, x); - - // compute the divergence on the first column - for (int y = 1; y < v1.rows; ++y) - div(y, 0) = v1(y, 0) + v2(y, 0) - v2(y - 1, 0); - - div(0, 0) = v1(0, 0) + v2(0, 0); - } - - //////////////////////////////////////////////////////////// - // calcGradRho - - struct CalcGradRhoBody : ParallelLoopBody - { - void operator() (const Range& range) const; - - Mat_ I0; - Mat_ I1w; - Mat_ I1wx; - Mat_ I1wy; - Mat_ u1; - Mat_ u2; - mutable Mat_ grad; - mutable Mat_ rho_c; - }; - - void CalcGradRhoBody::operator() (const Range& range) const - { - for (int y = range.start; y < range.end; ++y) - { - const float* I0Row = I0[y]; - const float* I1wRow = I1w[y]; - const float* I1wxRow = I1wx[y]; - const float* I1wyRow = I1wy[y]; - const float* u1Row = u1[y]; - const float* u2Row = u2[y]; - - float* gradRow = grad[y]; - float* rhoRow = rho_c[y]; - - for (int x = 0; x < I0.cols; ++x) - { - const float Ix2 = I1wxRow[x] * I1wxRow[x]; - const float Iy2 = I1wyRow[x] * I1wyRow[x]; - - // store the |Grad(I1)|^2 - gradRow[x] = Ix2 + Iy2; - - // compute the constant part of the rho function - rhoRow[x] = (I1wRow[x] - I1wxRow[x] * u1Row[x] - I1wyRow[x] * u2Row[x] - I0Row[x]); - } - } - } - - void calcGradRho(const Mat_& I0, const Mat_& I1w, const Mat_& I1wx, const Mat_& I1wy, const Mat_& u1, const Mat_& u2, - Mat_& grad, Mat_& rho_c) - { - CV_DbgAssert( I1w.size() == I0.size() ); - CV_DbgAssert( I1wx.size() == I0.size() ); - CV_DbgAssert( I1wy.size() == I0.size() ); - CV_DbgAssert( u1.size() == I0.size() ); - CV_DbgAssert( u2.size() == I0.size() ); - CV_DbgAssert( grad.size() == I0.size() ); - CV_DbgAssert( rho_c.size() == I0.size() ); - - CalcGradRhoBody body; - - body.I0 = I0; - body.I1w = I1w; - body.I1wx = I1wx; - body.I1wy = I1wy; - body.u1 = u1; - body.u2 = u2; - body.grad = grad; - body.rho_c = rho_c; - - parallel_for_(Range(0, I0.rows), body); - } - - //////////////////////////////////////////////////////////// - // estimateV - - struct EstimateVBody : ParallelLoopBody - { - void operator() (const Range& range) const; - - Mat_ I1wx; - Mat_ I1wy; - Mat_ u1; - Mat_ u2; - Mat_ grad; - Mat_ rho_c; - mutable Mat_ v1; - mutable Mat_ v2; - float l_t; - }; - - void EstimateVBody::operator() (const Range& range) const - { - for (int y = range.start; y < range.end; ++y) - { - const float* I1wxRow = I1wx[y]; - const float* I1wyRow = I1wy[y]; - const float* u1Row = u1[y]; - const float* u2Row = u2[y]; - const float* gradRow = grad[y]; - const float* rhoRow = rho_c[y]; - - float* v1Row = v1[y]; - float* v2Row = v2[y]; - - for (int x = 0; x < I1wx.cols; ++x) - { - const float rho = rhoRow[x] + (I1wxRow[x] * u1Row[x] + I1wyRow[x] * u2Row[x]); - - float d1 = 0.0f; - float d2 = 0.0f; - - if (rho < -l_t * gradRow[x]) - { - d1 = l_t * I1wxRow[x]; - d2 = l_t * I1wyRow[x]; - } - else if (rho > l_t * gradRow[x]) - { - d1 = -l_t * I1wxRow[x]; - d2 = -l_t * I1wyRow[x]; - } - else if (gradRow[x] > 
numeric_limits::epsilon()) - { - float fi = -rho / gradRow[x]; - d1 = fi * I1wxRow[x]; - d2 = fi * I1wyRow[x]; - } - - v1Row[x] = u1Row[x] + d1; - v2Row[x] = u2Row[x] + d2; - } - } - } - - void estimateV(const Mat_& I1wx, const Mat_& I1wy, const Mat_& u1, const Mat_& u2, const Mat_& grad, const Mat_& rho_c, - Mat_& v1, Mat_& v2, float l_t) - { - CV_DbgAssert( I1wy.size() == I1wx.size() ); - CV_DbgAssert( u1.size() == I1wx.size() ); - CV_DbgAssert( u2.size() == I1wx.size() ); - CV_DbgAssert( grad.size() == I1wx.size() ); - CV_DbgAssert( rho_c.size() == I1wx.size() ); - CV_DbgAssert( v1.size() == I1wx.size() ); - CV_DbgAssert( v2.size() == I1wx.size() ); - - EstimateVBody body; - - body.I1wx = I1wx; - body.I1wy = I1wy; - body.u1 = u1; - body.u2 = u2; - body.grad = grad; - body.rho_c = rho_c; - body.v1 = v1; - body.v2 = v2; - body.l_t = l_t; - - parallel_for_(Range(0, I1wx.rows), body); - } - - //////////////////////////////////////////////////////////// - // estimateU - - float estimateU(const Mat_& v1, const Mat_& v2, const Mat_& div_p1, const Mat_& div_p2, Mat_& u1, Mat_& u2, float theta) - { - CV_DbgAssert( v2.size() == v1.size() ); - CV_DbgAssert( div_p1.size() == v1.size() ); - CV_DbgAssert( div_p2.size() == v1.size() ); - CV_DbgAssert( u1.size() == v1.size() ); - CV_DbgAssert( u2.size() == v1.size() ); - - float error = 0.0f; - for (int y = 0; y < v1.rows; ++y) - { - const float* v1Row = v1[y]; - const float* v2Row = v2[y]; - const float* divP1Row = div_p1[y]; - const float* divP2Row = div_p2[y]; - - float* u1Row = u1[y]; - float* u2Row = u2[y]; - - for (int x = 0; x < v1.cols; ++x) - { - const float u1k = u1Row[x]; - const float u2k = u2Row[x]; - - u1Row[x] = v1Row[x] + theta * divP1Row[x]; - u2Row[x] = v2Row[x] + theta * divP2Row[x]; - - error += (u1Row[x] - u1k) * (u1Row[x] - u1k) + (u2Row[x] - u2k) * (u2Row[x] - u2k); - } - } - - return error; - } - - //////////////////////////////////////////////////////////// - // estimateDualVariables - - struct EstimateDualVariablesBody : ParallelLoopBody - { - void operator() (const Range& range) const; - - Mat_ u1x; - Mat_ u1y; - Mat_ u2x; - Mat_ u2y; - mutable Mat_ p11; - mutable Mat_ p12; - mutable Mat_ p21; - mutable Mat_ p22; - float taut; - }; - - void EstimateDualVariablesBody::operator() (const Range& range) const - { - for (int y = range.start; y < range.end; ++y) - { - const float* u1xRow = u1x[y]; - const float* u1yRow = u1y[y]; - const float* u2xRow = u2x[y]; - const float* u2yRow = u2y[y]; - - float* p11Row = p11[y]; - float* p12Row = p12[y]; - float* p21Row = p21[y]; - float* p22Row = p22[y]; - - for (int x = 0; x < u1x.cols; ++x) - { - const float g1 = static_cast(hypot(u1xRow[x], u1yRow[x])); - const float g2 = static_cast(hypot(u2xRow[x], u2yRow[x])); - - const float ng1 = 1.0f + taut * g1; - const float ng2 = 1.0f + taut * g2; - - p11Row[x] = (p11Row[x] + taut * u1xRow[x]) / ng1; - p12Row[x] = (p12Row[x] + taut * u1yRow[x]) / ng1; - p21Row[x] = (p21Row[x] + taut * u2xRow[x]) / ng2; - p22Row[x] = (p22Row[x] + taut * u2yRow[x]) / ng2; - } - } - } - - void estimateDualVariables(const Mat_& u1x, const Mat_& u1y, const Mat_& u2x, const Mat_& u2y, - Mat_& p11, Mat_& p12, Mat_& p21, Mat_& p22, float taut) - { - CV_DbgAssert( u1y.size() == u1x.size() ); - CV_DbgAssert( u2x.size() == u1x.size() ); - CV_DbgAssert( u2y.size() == u1x.size() ); - CV_DbgAssert( p11.size() == u1x.size() ); - CV_DbgAssert( p12.size() == u1x.size() ); - CV_DbgAssert( p21.size() == u1x.size() ); - CV_DbgAssert( p22.size() == u1x.size() ); - - 
EstimateDualVariablesBody body; - - body.u1x = u1x; - body.u1y = u1y; - body.u2x = u2x; - body.u2y = u2y; - body.p11 = p11; - body.p12 = p12; - body.p21 = p21; - body.p22 = p22; - body.taut = taut; - - parallel_for_(Range(0, u1x.rows), body); - } } -void cv::OpticalFlowDual_TVL1::procOneScale(const Mat_& I0, const Mat_& I1, Mat_& u1, Mat_& u2) +void buildFlowMap(const Mat_& u1, const Mat_& u2, Mat_& map1, Mat_& map2) +{ + CV_DbgAssert( u2.size() == u1.size() ); + CV_DbgAssert( map1.size() == u1.size() ); + CV_DbgAssert( map2.size() == u1.size() ); + + BuildFlowMapBody body; + + body.u1 = u1; + body.u2 = u2; + body.map1 = map1; + body.map2 = map2; + + parallel_for_(Range(0, u1.rows), body); +} + +//////////////////////////////////////////////////////////// +// centeredGradient + +struct CenteredGradientBody : ParallelLoopBody +{ + void operator() (const Range& range) const; + + Mat_ src; + mutable Mat_ dx; + mutable Mat_ dy; +}; + +void CenteredGradientBody::operator() (const Range& range) const +{ + const int last_col = src.cols - 1; + + for (int y = range.start; y < range.end; ++y) + { + const float* srcPrevRow = src[y - 1]; + const float* srcCurRow = src[y]; + const float* srcNextRow = src[y + 1]; + + float* dxRow = dx[y]; + float* dyRow = dy[y]; + + for (int x = 1; x < last_col; ++x) + { + dxRow[x] = 0.5f * (srcCurRow[x + 1] - srcCurRow[x - 1]); + dyRow[x] = 0.5f * (srcNextRow[x] - srcPrevRow[x]); + } + } +} + +void centeredGradient(const Mat_& src, Mat_& dx, Mat_& dy) +{ + CV_DbgAssert( src.rows > 2 && src.cols > 2 ); + CV_DbgAssert( dx.size() == src.size() ); + CV_DbgAssert( dy.size() == src.size() ); + + const int last_row = src.rows - 1; + const int last_col = src.cols - 1; + + // compute the gradient on the center body of the image + { + CenteredGradientBody body; + + body.src = src; + body.dx = dx; + body.dy = dy; + + parallel_for_(Range(1, last_row), body); + } + + // compute the gradient on the first and last rows + for (int x = 1; x < last_col; ++x) + { + dx(0, x) = 0.5f * (src(0, x + 1) - src(0, x - 1)); + dy(0, x) = 0.5f * (src(1, x) - src(0, x)); + + dx(last_row, x) = 0.5f * (src(last_row, x + 1) - src(last_row, x - 1)); + dy(last_row, x) = 0.5f * (src(last_row, x) - src(last_row - 1, x)); + } + + // compute the gradient on the first and last columns + for (int y = 1; y < last_row; ++y) + { + dx(y, 0) = 0.5f * (src(y, 1) - src(y, 0)); + dy(y, 0) = 0.5f * (src(y + 1, 0) - src(y - 1, 0)); + + dx(y, last_col) = 0.5f * (src(y, last_col) - src(y, last_col - 1)); + dy(y, last_col) = 0.5f * (src(y + 1, last_col) - src(y - 1, last_col)); + } + + // compute the gradient at the four corners + dx(0, 0) = 0.5f * (src(0, 1) - src(0, 0)); + dy(0, 0) = 0.5f * (src(1, 0) - src(0, 0)); + + dx(0, last_col) = 0.5f * (src(0, last_col) - src(0, last_col - 1)); + dy(0, last_col) = 0.5f * (src(1, last_col) - src(0, last_col)); + + dx(last_row, 0) = 0.5f * (src(last_row, 1) - src(last_row, 0)); + dy(last_row, 0) = 0.5f * (src(last_row, 0) - src(last_row - 1, 0)); + + dx(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row, last_col - 1)); + dy(last_row, last_col) = 0.5f * (src(last_row, last_col) - src(last_row - 1, last_col)); +} + +//////////////////////////////////////////////////////////// +// forwardGradient + +struct ForwardGradientBody : ParallelLoopBody +{ + void operator() (const Range& range) const; + + Mat_ src; + mutable Mat_ dx; + mutable Mat_ dy; +}; + +void ForwardGradientBody::operator() (const Range& range) const +{ + const int last_col = src.cols - 1; + + for (int 
y = range.start; y < range.end; ++y) + { + const float* srcCurRow = src[y]; + const float* srcNextRow = src[y + 1]; + + float* dxRow = dx[y]; + float* dyRow = dy[y]; + + for (int x = 0; x < last_col; ++x) + { + dxRow[x] = srcCurRow[x + 1] - srcCurRow[x]; + dyRow[x] = srcNextRow[x] - srcCurRow[x]; + } + } +} + +void forwardGradient(const Mat_& src, Mat_& dx, Mat_& dy) +{ + CV_DbgAssert( src.rows > 2 && src.cols > 2 ); + CV_DbgAssert( dx.size() == src.size() ); + CV_DbgAssert( dy.size() == src.size() ); + + const int last_row = src.rows - 1; + const int last_col = src.cols - 1; + + // compute the gradient on the central body of the image + { + ForwardGradientBody body; + + body.src = src; + body.dx = dx; + body.dy = dy; + + parallel_for_(Range(0, last_row), body); + } + + // compute the gradient on the last row + for (int x = 0; x < last_col; ++x) + { + dx(last_row, x) = src(last_row, x + 1) - src(last_row, x); + dy(last_row, x) = 0.0f; + } + + // compute the gradient on the last column + for (int y = 0; y < last_row; ++y) + { + dx(y, last_col) = 0.0f; + dy(y, last_col) = src(y + 1, last_col) - src(y, last_col); + } + + dx(last_row, last_col) = 0.0f; + dy(last_row, last_col) = 0.0f; +} + +//////////////////////////////////////////////////////////// +// divergence + +struct DivergenceBody : ParallelLoopBody +{ + void operator() (const Range& range) const; + + Mat_ v1; + Mat_ v2; + mutable Mat_ div; +}; + +void DivergenceBody::operator() (const Range& range) const +{ + for (int y = range.start; y < range.end; ++y) + { + const float* v1Row = v1[y]; + const float* v2PrevRow = v2[y - 1]; + const float* v2CurRow = v2[y]; + + float* divRow = div[y]; + + for(int x = 1; x < v1.cols; ++x) + { + const float v1x = v1Row[x] - v1Row[x - 1]; + const float v2y = v2CurRow[x] - v2PrevRow[x]; + + divRow[x] = v1x + v2y; + } + } +} + +void divergence(const Mat_& v1, const Mat_& v2, Mat_& div) +{ + CV_DbgAssert( v1.rows > 2 && v1.cols > 2 ); + CV_DbgAssert( v2.size() == v1.size() ); + CV_DbgAssert( div.size() == v1.size() ); + + { + DivergenceBody body; + + body.v1 = v1; + body.v2 = v2; + body.div = div; + + parallel_for_(Range(1, v1.rows), body); + } + + // compute the divergence on the first row + for(int x = 1; x < v1.cols; ++x) + div(0, x) = v1(0, x) - v1(0, x - 1) + v2(0, x); + + // compute the divergence on the first column + for (int y = 1; y < v1.rows; ++y) + div(y, 0) = v1(y, 0) + v2(y, 0) - v2(y - 1, 0); + + div(0, 0) = v1(0, 0) + v2(0, 0); +} + +//////////////////////////////////////////////////////////// +// calcGradRho + +struct CalcGradRhoBody : ParallelLoopBody +{ + void operator() (const Range& range) const; + + Mat_ I0; + Mat_ I1w; + Mat_ I1wx; + Mat_ I1wy; + Mat_ u1; + Mat_ u2; + mutable Mat_ grad; + mutable Mat_ rho_c; +}; + +void CalcGradRhoBody::operator() (const Range& range) const +{ + for (int y = range.start; y < range.end; ++y) + { + const float* I0Row = I0[y]; + const float* I1wRow = I1w[y]; + const float* I1wxRow = I1wx[y]; + const float* I1wyRow = I1wy[y]; + const float* u1Row = u1[y]; + const float* u2Row = u2[y]; + + float* gradRow = grad[y]; + float* rhoRow = rho_c[y]; + + for (int x = 0; x < I0.cols; ++x) + { + const float Ix2 = I1wxRow[x] * I1wxRow[x]; + const float Iy2 = I1wyRow[x] * I1wyRow[x]; + + // store the |Grad(I1)|^2 + gradRow[x] = Ix2 + Iy2; + + // compute the constant part of the rho function + rhoRow[x] = (I1wRow[x] - I1wxRow[x] * u1Row[x] - I1wyRow[x] * u2Row[x] - I0Row[x]); + } + } +} + +void calcGradRho(const Mat_& I0, const Mat_& I1w, const Mat_& I1wx, const Mat_& 
I1wy, const Mat_& u1, const Mat_& u2, + Mat_& grad, Mat_& rho_c) +{ + CV_DbgAssert( I1w.size() == I0.size() ); + CV_DbgAssert( I1wx.size() == I0.size() ); + CV_DbgAssert( I1wy.size() == I0.size() ); + CV_DbgAssert( u1.size() == I0.size() ); + CV_DbgAssert( u2.size() == I0.size() ); + CV_DbgAssert( grad.size() == I0.size() ); + CV_DbgAssert( rho_c.size() == I0.size() ); + + CalcGradRhoBody body; + + body.I0 = I0; + body.I1w = I1w; + body.I1wx = I1wx; + body.I1wy = I1wy; + body.u1 = u1; + body.u2 = u2; + body.grad = grad; + body.rho_c = rho_c; + + parallel_for_(Range(0, I0.rows), body); +} + +//////////////////////////////////////////////////////////// +// estimateV + +struct EstimateVBody : ParallelLoopBody +{ + void operator() (const Range& range) const; + + Mat_ I1wx; + Mat_ I1wy; + Mat_ u1; + Mat_ u2; + Mat_ grad; + Mat_ rho_c; + mutable Mat_ v1; + mutable Mat_ v2; + float l_t; +}; + +void EstimateVBody::operator() (const Range& range) const +{ + for (int y = range.start; y < range.end; ++y) + { + const float* I1wxRow = I1wx[y]; + const float* I1wyRow = I1wy[y]; + const float* u1Row = u1[y]; + const float* u2Row = u2[y]; + const float* gradRow = grad[y]; + const float* rhoRow = rho_c[y]; + + float* v1Row = v1[y]; + float* v2Row = v2[y]; + + for (int x = 0; x < I1wx.cols; ++x) + { + const float rho = rhoRow[x] + (I1wxRow[x] * u1Row[x] + I1wyRow[x] * u2Row[x]); + + float d1 = 0.0f; + float d2 = 0.0f; + + if (rho < -l_t * gradRow[x]) + { + d1 = l_t * I1wxRow[x]; + d2 = l_t * I1wyRow[x]; + } + else if (rho > l_t * gradRow[x]) + { + d1 = -l_t * I1wxRow[x]; + d2 = -l_t * I1wyRow[x]; + } + else if (gradRow[x] > numeric_limits::epsilon()) + { + float fi = -rho / gradRow[x]; + d1 = fi * I1wxRow[x]; + d2 = fi * I1wyRow[x]; + } + + v1Row[x] = u1Row[x] + d1; + v2Row[x] = u2Row[x] + d2; + } + } +} + +void estimateV(const Mat_& I1wx, const Mat_& I1wy, const Mat_& u1, const Mat_& u2, const Mat_& grad, const Mat_& rho_c, + Mat_& v1, Mat_& v2, float l_t) +{ + CV_DbgAssert( I1wy.size() == I1wx.size() ); + CV_DbgAssert( u1.size() == I1wx.size() ); + CV_DbgAssert( u2.size() == I1wx.size() ); + CV_DbgAssert( grad.size() == I1wx.size() ); + CV_DbgAssert( rho_c.size() == I1wx.size() ); + CV_DbgAssert( v1.size() == I1wx.size() ); + CV_DbgAssert( v2.size() == I1wx.size() ); + + EstimateVBody body; + + body.I1wx = I1wx; + body.I1wy = I1wy; + body.u1 = u1; + body.u2 = u2; + body.grad = grad; + body.rho_c = rho_c; + body.v1 = v1; + body.v2 = v2; + body.l_t = l_t; + + parallel_for_(Range(0, I1wx.rows), body); +} + +//////////////////////////////////////////////////////////// +// estimateU + +float estimateU(const Mat_& v1, const Mat_& v2, const Mat_& div_p1, const Mat_& div_p2, Mat_& u1, Mat_& u2, float theta) +{ + CV_DbgAssert( v2.size() == v1.size() ); + CV_DbgAssert( div_p1.size() == v1.size() ); + CV_DbgAssert( div_p2.size() == v1.size() ); + CV_DbgAssert( u1.size() == v1.size() ); + CV_DbgAssert( u2.size() == v1.size() ); + + float error = 0.0f; + for (int y = 0; y < v1.rows; ++y) + { + const float* v1Row = v1[y]; + const float* v2Row = v2[y]; + const float* divP1Row = div_p1[y]; + const float* divP2Row = div_p2[y]; + + float* u1Row = u1[y]; + float* u2Row = u2[y]; + + for (int x = 0; x < v1.cols; ++x) + { + const float u1k = u1Row[x]; + const float u2k = u2Row[x]; + + u1Row[x] = v1Row[x] + theta * divP1Row[x]; + u2Row[x] = v2Row[x] + theta * divP2Row[x]; + + error += (u1Row[x] - u1k) * (u1Row[x] - u1k) + (u2Row[x] - u2k) * (u2Row[x] - u2k); + } + } + + return error; +} + 
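Editor's note (sketch, not part of the patch): taken together, the estimateV, estimateU and estimateDualVariables hunks in this file implement one iteration of the Zach-Pock-Bischof duality-based TV-L1 scheme. Writing \(\rho(u)\) for the precomputed rho_c value plus \(\nabla I_1^w \cdot u\), the thresholding in estimateV, the primal update in estimateU and the dual update in estimateDualVariables correspond to

\[
v = u +
\begin{cases}
\lambda\theta\,\nabla I_1^w & \text{if } \rho(u) < -\lambda\theta\,\lvert\nabla I_1^w\rvert^2,\\
-\lambda\theta\,\nabla I_1^w & \text{if } \rho(u) > \lambda\theta\,\lvert\nabla I_1^w\rvert^2,\\
-\rho(u)\,\nabla I_1^w / \lvert\nabla I_1^w\rvert^2 & \text{otherwise},
\end{cases}
\qquad
u^{k+1} = v + \theta\,\operatorname{div} p^k,
\qquad
p^{k+1} = \frac{p^k + (\tau/\theta)\,\nabla u^{k+1}}{1 + (\tau/\theta)\,\lvert\nabla u^{k+1}\rvert},
\]

where the identifications l_t = \(\lambda\theta\) and taut = \(\tau/\theta\) are inferred from the surrounding code rather than stated in these hunks. estimateU also accumulates \(\sum (u^{k+1}-u^k)^2\) over the image, which procOneScale compares against \(\epsilon^2 \cdot \text{area}\) as the stopping criterion.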
+//////////////////////////////////////////////////////////// +// estimateDualVariables + +struct EstimateDualVariablesBody : ParallelLoopBody +{ + void operator() (const Range& range) const; + + Mat_ u1x; + Mat_ u1y; + Mat_ u2x; + Mat_ u2y; + mutable Mat_ p11; + mutable Mat_ p12; + mutable Mat_ p21; + mutable Mat_ p22; + float taut; +}; + +void EstimateDualVariablesBody::operator() (const Range& range) const +{ + for (int y = range.start; y < range.end; ++y) + { + const float* u1xRow = u1x[y]; + const float* u1yRow = u1y[y]; + const float* u2xRow = u2x[y]; + const float* u2yRow = u2y[y]; + + float* p11Row = p11[y]; + float* p12Row = p12[y]; + float* p21Row = p21[y]; + float* p22Row = p22[y]; + + for (int x = 0; x < u1x.cols; ++x) + { + const float g1 = static_cast(hypot(u1xRow[x], u1yRow[x])); + const float g2 = static_cast(hypot(u2xRow[x], u2yRow[x])); + + const float ng1 = 1.0f + taut * g1; + const float ng2 = 1.0f + taut * g2; + + p11Row[x] = (p11Row[x] + taut * u1xRow[x]) / ng1; + p12Row[x] = (p12Row[x] + taut * u1yRow[x]) / ng1; + p21Row[x] = (p21Row[x] + taut * u2xRow[x]) / ng2; + p22Row[x] = (p22Row[x] + taut * u2yRow[x]) / ng2; + } + } +} + +void estimateDualVariables(const Mat_& u1x, const Mat_& u1y, const Mat_& u2x, const Mat_& u2y, + Mat_& p11, Mat_& p12, Mat_& p21, Mat_& p22, float taut) +{ + CV_DbgAssert( u1y.size() == u1x.size() ); + CV_DbgAssert( u2x.size() == u1x.size() ); + CV_DbgAssert( u2y.size() == u1x.size() ); + CV_DbgAssert( p11.size() == u1x.size() ); + CV_DbgAssert( p12.size() == u1x.size() ); + CV_DbgAssert( p21.size() == u1x.size() ); + CV_DbgAssert( p22.size() == u1x.size() ); + + EstimateDualVariablesBody body; + + body.u1x = u1x; + body.u1y = u1y; + body.u2x = u2x; + body.u2y = u2y; + body.p11 = p11; + body.p12 = p12; + body.p21 = p21; + body.p22 = p22; + body.taut = taut; + + parallel_for_(Range(0, u1x.rows), body); +} + +void OpticalFlowDual_TVL1::procOneScale(const Mat_& I0, const Mat_& I1, Mat_& u1, Mat_& u2) { const float scaledEpsilon = static_cast(epsilon * epsilon * I0.size().area()); @@ -818,21 +875,12 @@ void cv::OpticalFlowDual_TVL1::procOneScale(const Mat_& I0, const Mat_ void releaseVector(vector& v) - { - vector empty; - empty.swap(v); - } -} - -void cv::OpticalFlowDual_TVL1::collectGarbage() -{ - releaseVector(I0s); - releaseVector(I1s); - releaseVector(u1s); - releaseVector(u2s); + I0s.clear(); + I1s.clear(); + u1s.clear(); + u2s.clear(); I1x_buf.release(); I1y_buf.release(); @@ -863,3 +911,27 @@ void cv::OpticalFlowDual_TVL1::collectGarbage() u2x_buf.release(); u2y_buf.release(); } + +CV_INIT_ALGORITHM(OpticalFlowDual_TVL1, "DenseOpticalFlow.DualTVL1", + obj.info()->addParam(obj, "tau", obj.tau, false, 0, 0, + "Time step of the numerical scheme"); + obj.info()->addParam(obj, "lambda", obj.lambda, false, 0, 0, + "Weight parameter for the data term, attachment parameter"); + obj.info()->addParam(obj, "theta", obj.theta, false, 0, 0, + "Weight parameter for (u - v)^2, tightness parameter"); + obj.info()->addParam(obj, "nscales", obj.nscales, false, 0, 0, + "Number of scales used to create the pyramid of images"); + obj.info()->addParam(obj, "warps", obj.warps, false, 0, 0, + "Number of warpings per scale"); + obj.info()->addParam(obj, "epsilon", obj.epsilon, false, 0, 0, + "Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time"); + obj.info()->addParam(obj, "iterations", obj.iterations, false, 0, 0, + "Stopping criterion iterations number used in the numerical scheme"); + 
obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow));
+
+} // namespace
+
+Ptr<DenseOpticalFlow> cv::createOptFlow_DualTVL1()
+{
+    return new OpticalFlowDual_TVL1;
+}
diff --git a/modules/video/test/test_tvl1optflow.cpp b/modules/video/test/test_tvl1optflow.cpp
index b5688d35ee..804eae8b62 100644
--- a/modules/video/test/test_tvl1optflow.cpp
+++ b/modules/video/test/test_tvl1optflow.cpp
@@ -152,9 +152,9 @@ TEST(Video_calcOpticalFlowDual_TVL1, Regression)
     ASSERT_FALSE(frame2.empty());
 
     Mat_<Point2f> flow;
-    OpticalFlowDual_TVL1 tvl1;
+    Ptr<DenseOpticalFlow> tvl1 = createOptFlow_DualTVL1();
 
-    tvl1(frame1, frame2, flow);
+    tvl1->calc(frame1, frame2, flow);
 
 #ifdef DUMP
     writeOpticalFlowToFile(flow, gold_flow_path);
diff --git a/platforms/linux/arm-gnueabi.toolchain.cmake b/platforms/linux/arm-gnueabi.toolchain.cmake
index caa81456db..c6b0469ad8 100644
--- a/platforms/linux/arm-gnueabi.toolchain.cmake
+++ b/platforms/linux/arm-gnueabi.toolchain.cmake
@@ -2,11 +2,7 @@ set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_VERSION 1)
 set(CMAKE_SYSTEM_PROCESSOR arm)
 
-if (CARMA)
-  set(GCC_COMPILER_VERSION "4.5" CACHE STRING "GCC Compiler version")
-else()
-  set(GCC_COMPILER_VERSION "4.6" CACHE STRING "GCC Compiler version")
-endif()
+set(GCC_COMPILER_VERSION "4.6" CACHE STRING "GCC Compiler version")
 
 set(FLOAT_ABI_SUFFIX "")
 
@@ -53,10 +49,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
 
-if (CARMA)
-  add_definitions(-DCARMA)
-endif()
-
 # macro to find programs on the host OS
 macro( find_host_program )
  set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
diff --git a/platforms/linux/scripts/cmake_carma.sh b/platforms/linux/scripts/cmake_carma.sh
index 1c8db98046..791bb67cab 100755
--- a/platforms/linux/scripts/cmake_carma.sh
+++ b/platforms/linux/scripts/cmake_carma.sh
@@ -3,6 +3,9 @@
 mkdir -p build_carma
 cd build_carma
 
-cmake -DSOFTFP=ON -DCARMA=ON -DWITH_TBB=ON -DBUILD_TBB=ON -DUSE_NEON=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/arm-linux-gnueabi/cuda/ \
--DCUDA_ARCH_BIN="2.1(2.0)" -DCUDA_ARCH_PTX="" -DCMAKE_SKIP_RPATH=ON -DWITH_CUDA=ON -DWITH_CUBLAS=ON \
+cmake \
+-DGCC_COMPILER_VERSION="4.5" -DSOFTFP=ON -DUSE_NEON=ON -DCMAKE_SKIP_RPATH=ON \
+-DCUDA_TOOLKIT_ROOT_DIR=/usr/arm-linux-gnueabi/cuda/ -DCUDA_ARCH_BIN="2.1(2.0)" -DCUDA_ARCH_PTX="" \
+-DWITH_TBB=ON -DWITH_CUBLAS=ON \
+-DBUILD_ZLIB=ON -DBUILD_TIFF=ON -DBUILD_JASPER=ON -DBUILD_JPEG=ON -DBUILD_PNG=ON -DBUILD_OPENEXR=ON -DBUILD_TBB=ON \
 -DCMAKE_TOOLCHAIN_FILE=../arm-gnueabi.toolchain.cmake $@ ../../..
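Editor's note (sketch, not part of the patch): after this refactoring the TV-L1 solver is obtained through the cv::Algorithm factory instead of instantiating OpticalFlowDual_TVL1 directly, as the test hunk above and the sample hunk below show. A minimal usage sketch under that assumption; the include path, image file names and parameter values are illustrative only, while the parameter names come from the CV_INIT_ALGORITHM block above:

// Sketch of the refactored API; not part of this patch.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/video/tracking.hpp>   // assumed location of createOptFlow_DualTVL1 in this 2.4-era tree

int main()
{
    cv::Mat frame0 = cv::imread("frame0.png", cv::IMREAD_GRAYSCALE);
    cv::Mat frame1 = cv::imread("frame1.png", cv::IMREAD_GRAYSCALE);
    if (frame0.empty() || frame1.empty())
        return 1;

    // The solver is created through the cv::Algorithm factory...
    cv::Ptr<cv::DenseOpticalFlow> tvl1 = cv::createOptFlow_DualTVL1();

    // ...and the parameters registered via CV_INIT_ALGORITHM are reachable
    // through the generic Algorithm get/set interface.
    tvl1->set("epsilon", 0.01);
    tvl1->set("warps", 5);

    cv::Mat_<cv::Point2f> flow;
    tvl1->calc(frame0, frame1, flow);   // replaces the old tvl1(frame0, frame1, flow) functor call

    return 0;
}

The regression test above exercises exactly this path with the default parameters.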
diff --git a/samples/cpp/tvl1_optical_flow.cpp b/samples/cpp/tvl1_optical_flow.cpp index d9a57a2163..4f9a02b5a9 100644 --- a/samples/cpp/tvl1_optical_flow.cpp +++ b/samples/cpp/tvl1_optical_flow.cpp @@ -173,10 +173,10 @@ int main(int argc, const char* argv[]) } Mat_ flow; - OpticalFlowDual_TVL1 tvl1; + Ptr tvl1 = createOptFlow_DualTVL1(); const double start = (double)getTickCount(); - tvl1(frame0, frame1, flow); + tvl1->calc(frame0, frame1, flow); const double timeSec = (getTickCount() - start) / getTickFrequency(); cout << "calcOpticalFlowDual_TVL1 : " << timeSec << " sec" << endl; diff --git a/samples/java/ant/build.xml b/samples/java/ant/build.xml new file mode 100644 index 0000000000..924af1f131 --- /dev/null +++ b/samples/java/ant/build.xml @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/samples/java/ant/src/SimpleSample.java b/samples/java/ant/src/SimpleSample.java new file mode 100644 index 0000000000..990536f2b8 --- /dev/null +++ b/samples/java/ant/src/SimpleSample.java @@ -0,0 +1,19 @@ +import org.opencv.core.Mat; +import org.opencv.core.CvType; +import org.opencv.core.Scalar; + +class SimpleSample { + + static{ System.loadLibrary("opencv_java244"); } + + public static void main(String[] args) { + Mat m = new Mat(5, 10, CvType.CV_8UC1, new Scalar(0)); + System.out.println("OpenCV Mat: " + m); + Mat mr1 = m.row(1); + mr1.setTo(new Scalar(1)); + Mat mc5 = m.col(5); + mc5.setTo(new Scalar(5)); + System.out.println("OpenCV Mat data:\n" + m.dump()); + } + +} diff --git a/samples/java/eclipse/HelloCV/.classpath b/samples/java/eclipse/HelloCV/.classpath new file mode 100644 index 0000000000..645263d832 --- /dev/null +++ b/samples/java/eclipse/HelloCV/.classpath @@ -0,0 +1,7 @@ + + + + + + + diff --git a/samples/java/eclipse/HelloCV/.project b/samples/java/eclipse/HelloCV/.project new file mode 100644 index 0000000000..b1df1de5e5 --- /dev/null +++ b/samples/java/eclipse/HelloCV/.project @@ -0,0 +1,17 @@ + + + HelloCV + + + + + + org.eclipse.jdt.core.javabuilder + + + + + + org.eclipse.jdt.core.javanature + + diff --git a/samples/java/eclipse/HelloCV/.settings/org.eclipse.jdt.core.prefs b/samples/java/eclipse/HelloCV/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000000..7341ab1683 --- /dev/null +++ b/samples/java/eclipse/HelloCV/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,11 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=1.7 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.source=1.7 diff --git a/samples/java/eclipse/HelloCV/src/Main.java b/samples/java/eclipse/HelloCV/src/Main.java new file mode 100644 index 0000000000..0e9bb5898f --- /dev/null +++ b/samples/java/eclipse/HelloCV/src/Main.java @@ -0,0 +1,12 @@ +import org.opencv.core.CvType; +import org.opencv.core.Mat; + +public class Main { + + public static void main(String[] args) { + System.loadLibrary("opencv_java244"); + Mat m = Mat.eye(3, 3, CvType.CV_8UC1); + System.out.println("m = " + m.dump()); + } + +} diff --git 
a/samples/java/sbt/README b/samples/java/sbt/README new file mode 100644 index 0000000000..263ff54749 --- /dev/null +++ b/samples/java/sbt/README @@ -0,0 +1,13 @@ +A demo of the Java wrapper for OpenCV with two examples: +1) feature detection and matching and +2) face detection. +The examples are coded in Scala and Java. +Anyone familiar with Java should be able to read the Scala examples. +Please feel free to contribute code examples in Scala or Java, or any JVM language. + +To run the examples: +1) Install OpenCV and copy the OpenCV jar to lib/. + This jar must match the native libraries installed in your system. + If this isn't the case, you may get a java.lang.UnsatisfiedLinkError at runtime. +2) Go to the root directory and type "sbt/sbt run". + This should generate images in your current directory. diff --git a/samples/java/sbt/lib/copy_opencv_jar_here b/samples/java/sbt/lib/copy_opencv_jar_here new file mode 100644 index 0000000000..e69de29bb2 diff --git a/samples/java/sbt/project/build.scala b/samples/java/sbt/project/build.scala new file mode 100644 index 0000000000..5a7380b964 --- /dev/null +++ b/samples/java/sbt/project/build.scala @@ -0,0 +1,22 @@ +import sbt._ +import Keys._ + +object OpenCVJavaDemoBuild extends Build { + def scalaSettings = Seq( + scalaVersion := "2.10.0", + scalacOptions ++= Seq( + "-optimize", + "-unchecked", + "-deprecation" + ) + ) + + def buildSettings = + Project.defaultSettings ++ + scalaSettings + + lazy val root = { + val settings = buildSettings ++ Seq(name := "OpenCVJavaDemo") + Project(id = "OpenCVJavaDemo", base = file("."), settings = settings) + } +} diff --git a/samples/java/sbt/project/plugins.sbt b/samples/java/sbt/project/plugins.sbt new file mode 100644 index 0000000000..c2371be434 --- /dev/null +++ b/samples/java/sbt/project/plugins.sbt @@ -0,0 +1 @@ +addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.1.0") diff --git a/samples/java/sbt/sbt/sbt b/samples/java/sbt/sbt/sbt new file mode 100644 index 0000000000..99ae7ec258 --- /dev/null +++ b/samples/java/sbt/sbt/sbt @@ -0,0 +1 @@ +java -Xms512M -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=384M -jar `dirname $0`/sbt-launch.jar "$@" \ No newline at end of file diff --git a/samples/java/sbt/sbt/sbt-launch.jar b/samples/java/sbt/sbt/sbt-launch.jar new file mode 100644 index 0000000000..06ad8d8805 Binary files /dev/null and b/samples/java/sbt/sbt/sbt-launch.jar differ diff --git a/samples/java/sbt/src/main/java/DetectFaceDemo.java b/samples/java/sbt/src/main/java/DetectFaceDemo.java new file mode 100644 index 0000000000..fb08567237 --- /dev/null +++ b/samples/java/sbt/src/main/java/DetectFaceDemo.java @@ -0,0 +1,44 @@ +import org.opencv.core.Core; +import org.opencv.core.Mat; +import org.opencv.core.MatOfRect; +import org.opencv.core.Point; +import org.opencv.core.Rect; +import org.opencv.core.Scalar; +import org.opencv.highgui.Highgui; +import org.opencv.objdetect.CascadeClassifier; + +/* + * Detects faces in an image, draws boxes around them, and writes the results + * to "faceDetection.png". + */ +public class DetectFaceDemo { + public void run() { + System.out.println("\nRunning DetectFaceDemo"); + + // Create a face detector from the cascade file in the resources + // directory. + CascadeClassifier faceDetector = new CascadeClassifier(getClass() + .getResource("/lbpcascade_frontalface.xml").getPath()); + Mat image = Highgui.imread(getClass().getResource( + "/AverageMaleFace.jpg").getPath()); + + // Detect faces in the image. 
+ // MatOfRect is a special container class for Rect. + MatOfRect faceDetections = new MatOfRect(); + faceDetector.detectMultiScale(image, faceDetections); + + System.out.println(String.format("Detected %s faces", + faceDetections.toArray().length)); + + // Draw a bounding box around each face. + for (Rect rect : faceDetections.toArray()) { + Core.rectangle(image, new Point(rect.x, rect.y), new Point(rect.x + + rect.width, rect.y + rect.height), new Scalar(0, 255, 0)); + } + + // Save the visualized detection. + String filename = "faceDetection.png"; + System.out.println(String.format("Writing %s", filename)); + Highgui.imwrite(filename, image); + } +} \ No newline at end of file diff --git a/samples/java/sbt/src/main/resources/AverageMaleFace.jpg b/samples/java/sbt/src/main/resources/AverageMaleFace.jpg new file mode 100644 index 0000000000..3b96e03b87 Binary files /dev/null and b/samples/java/sbt/src/main/resources/AverageMaleFace.jpg differ diff --git a/samples/java/sbt/src/main/resources/img1.png b/samples/java/sbt/src/main/resources/img1.png new file mode 100644 index 0000000000..93cd5945a7 Binary files /dev/null and b/samples/java/sbt/src/main/resources/img1.png differ diff --git a/samples/java/sbt/src/main/resources/img2.png b/samples/java/sbt/src/main/resources/img2.png new file mode 100644 index 0000000000..41959c66f2 Binary files /dev/null and b/samples/java/sbt/src/main/resources/img2.png differ diff --git a/samples/java/sbt/src/main/scala/Main.scala b/samples/java/sbt/src/main/scala/Main.scala new file mode 100644 index 0000000000..4a68d144a4 --- /dev/null +++ b/samples/java/sbt/src/main/scala/Main.scala @@ -0,0 +1,20 @@ +/* + * The main runner for the Java demos. + * Demos whose name begins with "Scala" are written in the Scala language, + * demonstrating the generic nature of the interface. + * The other demos are in Java. + * Currently, all demos are run, sequentially. + * + * You're invited to submit your own examples, in any JVM language of + * your choosing so long as you can get them to build. + */ +object Main extends App { + // We must load the native library before using any OpenCV functions. + // You must load this library _exactly once_ per Java invocation. + // If you load it more than once, you will get a java.lang.UnsatisfiedLinkError. + System.loadLibrary("opencv_java") + + ScalaCorrespondenceMatchingDemo.run() + ScalaDetectFaceDemo.run() + new DetectFaceDemo().run() +} diff --git a/samples/java/sbt/src/main/scala/ScalaCorrespondenceMatchingDemo.scala b/samples/java/sbt/src/main/scala/ScalaCorrespondenceMatchingDemo.scala new file mode 100644 index 0000000000..30ab0553e3 --- /dev/null +++ b/samples/java/sbt/src/main/scala/ScalaCorrespondenceMatchingDemo.scala @@ -0,0 +1,69 @@ +import org.opencv.highgui.Highgui +import org.opencv.features2d.DescriptorExtractor +import org.opencv.features2d.Features2d +import org.opencv.core.MatOfKeyPoint +import org.opencv.core.Mat +import org.opencv.features2d.FeatureDetector +import org.opencv.features2d.DescriptorMatcher +import org.opencv.core.MatOfDMatch +import reflect._ + +/* + * Finds corresponding points between a pair of images using local descriptors. + * The correspondences are visualized in the image "scalaCorrespondences.png", + * which is written to disk. + */ +object ScalaCorrespondenceMatchingDemo { + def run() { + println(s"\nRunning ${classTag[this.type].toString.replace("$", "")}") + + // Detects keypoints and extracts descriptors in a given image of type Mat. 
+ def detectAndExtract(mat: Mat) = { + // A special container class for KeyPoint. + val keyPoints = new MatOfKeyPoint + // We're using the SURF detector. + val detector = FeatureDetector.create(FeatureDetector.SURF) + detector.detect(mat, keyPoints) + + println(s"There were ${keyPoints.toArray.size} KeyPoints detected") + + // Let's just use the best KeyPoints. + val sorted = keyPoints.toArray.sortBy(_.response).reverse.take(50) + // There isn't a constructor that takes Array[KeyPoint], so we unpack + // the array and use the constructor that can take any number of + // arguments. + val bestKeyPoints: MatOfKeyPoint = new MatOfKeyPoint(sorted: _*) + + // We're using the SURF descriptor. + val extractor = DescriptorExtractor.create(DescriptorExtractor.SURF) + val descriptors = new Mat + extractor.compute(mat, bestKeyPoints, descriptors) + + println(s"${descriptors.rows} descriptors were extracted, each with dimension ${descriptors.cols}") + + (bestKeyPoints, descriptors) + } + + // Load the images from the |resources| directory. + val leftImage = Highgui.imread(getClass.getResource("/img1.png").getPath) + val rightImage = Highgui.imread(getClass.getResource("/img2.png").getPath) + + // Detect KeyPoints and extract descriptors. + val (leftKeyPoints, leftDescriptors) = detectAndExtract(leftImage) + val (rightKeyPoints, rightDescriptors) = detectAndExtract(rightImage) + + // Match the descriptors. + val matcher = DescriptorMatcher.create(DescriptorMatcher.BRUTEFORCE) + // A special container class for DMatch. + val dmatches = new MatOfDMatch + // The backticks are because "match" is a keyword in Scala. + matcher.`match`(leftDescriptors, rightDescriptors, dmatches) + + // Visualize the matches and save the visualization. + val correspondenceImage = new Mat + Features2d.drawMatches(leftImage, leftKeyPoints, rightImage, rightKeyPoints, dmatches, correspondenceImage) + val filename = "scalaCorrespondences.png" + println(s"Writing ${filename}") + assert(Highgui.imwrite(filename, correspondenceImage)) + } +} \ No newline at end of file diff --git a/samples/java/sbt/src/main/scala/ScalaDetectFaceDemo.scala b/samples/java/sbt/src/main/scala/ScalaDetectFaceDemo.scala new file mode 100644 index 0000000000..a35eeb7784 --- /dev/null +++ b/samples/java/sbt/src/main/scala/ScalaDetectFaceDemo.scala @@ -0,0 +1,43 @@ +import org.opencv.core.Core +import org.opencv.core.MatOfRect +import org.opencv.core.Point +import org.opencv.core.Scalar +import org.opencv.highgui.Highgui +import org.opencv.objdetect.CascadeClassifier +import reflect._ + +/* + * Detects faces in an image, draws boxes around them, and writes the results + * to "scalaFaceDetection.png". + */ +object ScalaDetectFaceDemo { + def run() { + println(s"\nRunning ${classTag[this.type].toString.replace("$", "")}") + + // Create a face detector from the cascade file in the resources directory. + val faceDetector = new CascadeClassifier(getClass.getResource("/lbpcascade_frontalface.xml").getPath) + val image = Highgui.imread(getClass.getResource("/AverageMaleFace.jpg").getPath) + + // Detect faces in the image. + // MatOfRect is a special container class for Rect. + val faceDetections = new MatOfRect + faceDetector.detectMultiScale(image, faceDetections) + + println(s"Detected ${faceDetections.toArray.size} faces") + + // Draw a bounding box around each face. 
+    for (rect <- faceDetections.toArray) {
+      Core.rectangle(
+        image,
+        new Point(rect.x, rect.y),
+        new Point(rect.x + rect.width,
+          rect.y + rect.height),
+        new Scalar(0, 255, 0))
+    }
+
+    // Save the visualized detection.
+    val filename = "scalaFaceDetection.png"
+    println(s"Writing ${filename}")
+    assert(Highgui.imwrite(filename, image))
+  }
+}
\ No newline at end of file