Merge pull request #1561 from alalek:ocl_refactoring

2025-07-31 09:57:28 +08:00 · 2013-10-04 11:50:03 +04:00 · 2013-10-04 11:50:03 +04:00 · 8224f9843e
commit 8224f9843e
parent 8e75947a7d d74ca7b39d
69 changed files with 2994 additions and 2166 deletions
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -445,19 +445,20 @@ macro(ocv_glob_module_sources)
    source_group("Src\\Cuda"      FILES ${lib_cuda_srcs} ${lib_cuda_hdrs})
  endif()

+  source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
+
  file(GLOB cl_kernels "src/opencl/*.cl")

  if(HAVE_OPENCL AND cl_kernels)
    ocv_include_directories(${OPENCL_INCLUDE_DIRS})
    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp"
-      COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
+      COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
      DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
-    source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
-    list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
+    source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
+    list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
  endif()

-  source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
  source_group("Include" FILES ${lib_hdrs})
  source_group("Include\\detail" FILES ${lib_hdrs_detail})

--- a/cmake/cl2cpp.cmake
+++ b/cmake/cl2cpp.cmake
@ -1,6 +1,12 @@
 file(GLOB cl_list "${CL_DIR}/*.cl" )
+list(SORT cl_list)

-file(WRITE ${OUTPUT} "// This file is auto-generated. Do not edit!
+string(REPLACE ".cpp" ".hpp" OUTPUT_HPP "${OUTPUT}")
+get_filename_component(OUTPUT_HPP_NAME "${OUTPUT_HPP}" NAME)
+
+set(STR_CPP "// This file is auto-generated. Do not edit!
+
+#include \"${OUTPUT_HPP_NAME}\"

 namespace cv
 {
@ -8,6 +14,15 @@ namespace ocl
 {
 ")

+set(STR_HPP "// This file is auto-generated. Do not edit!
+
+namespace cv
+{
+namespace ocl
+{
+
+")
+
 foreach(cl ${cl_list})
  get_filename_component(cl_filename "${cl}" NAME_WE)
  #message("${cl_filename}")
@ -29,7 +44,22 @@ foreach(cl ${cl_list})

  string(REGEX REPLACE "\"$" "" lines "${lines}") # unneeded " at the eof

-  file(APPEND ${OUTPUT} "const char* ${cl_filename}=\"${lines};\n")
+  string(MD5 hash "${lines}")
+
+  set(STR_CPP "${STR_CPP}const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
+  set(STR_HPP "${STR_HPP}extern const struct ProgramEntry ${cl_filename};\n")
 endforeach()

-file(APPEND ${OUTPUT} "}\n}\n")
+set(STR_CPP "${STR_CPP}}\n}\n")
+set(STR_HPP "${STR_HPP}}\n}\n")
+
+file(WRITE "${OUTPUT}" "${STR_CPP}")
+
+if(EXISTS "${OUTPUT_HPP}")
+  file(READ "${OUTPUT_HPP}" hpp_lines)
+endif()
+if("${hpp_lines}" STREQUAL "${STR_HPP}")
+  message(STATUS "${OUTPUT_HPP} contains same content")
+else()
+  file(WRITE "${OUTPUT_HPP}" "${STR_HPP}")
+endif()
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@ -43,27 +43,24 @@
 //
 //M*/
 #include "precomp.hpp"
-#include <cstdio>

 #ifdef HAVE_OPENCV_OCL
+#include <cstdio>
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;

 namespace cv
 {
    namespace ocl
    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *surf;
-
-        const char noImage2dOption [] = "-D DISABLE_IMAGE2D";
+        static const char noImage2dOption[] = "-D DISABLE_IMAGE2D";

        static bool use_image2d = false;

-        static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
-            size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels, int depth)
+        static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
+            size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
        {
            char optBuf [100] = {0};
            char * optBufPtr = optBuf;
@ -74,7 +71,7 @@ namespace cv
            }
            cl_kernel kernel;
            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr);
-            size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>(kernel);
+            size_t wave_size = queryWaveFrontSize(kernel);
            CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
            sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr);
@ -486,26 +483,26 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i

    Context *clCxt = det.clCxt;
    string kernelName = "icvCalcLayerDetAndTrace";
-    vector< pair<size_t, const void *> > args;
+    std::vector< std::pair<size_t, const void *> > args;

    if(sumTex)
    {
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
    }
    else
    {
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
    }
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&c_layer_rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&c_layer_rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));

    size_t localThreads[3]  = {16, 16, 1};
    size_t globalThreads[3] =
@ -524,35 +521,35 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat

    Context *clCxt = det.clCxt;
    string kernelName = useMask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
-    vector< pair<size_t, const void *> > args;
+    std::vector< std::pair<size_t, const void *> > args;

-    args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&counterOffset));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&nLayers));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&layer_cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&maxCandidates));
-    args.push_back( make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&counterOffset));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nLayers));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&maxCandidates));
+    args.push_back( std::make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));

    if(useMask)
    {
        if(maskSumTex)
        {
-            args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex));
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex));
        }
        else
        {
-            args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
        }
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
    }
    size_t localThreads[3]  = {16, 16, 1};
    size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
@ -568,19 +565,19 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
 {
    Context *clCxt = det.clCxt;
    string kernelName = "icvInterpolateKeypoint";
-    vector< pair<size_t, const void *> > args;
+    std::vector< std::pair<size_t, const void *> > args;

-    args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&counters_.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&max_features));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counters_.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&max_features));

    size_t localThreads[3]  = {3, 3, 3};
    size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
@ -593,21 +590,21 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
    Context *clCxt = counters.clCxt;
    string kernelName = "icvCalcOrientation";

-    vector< pair<size_t, const void *> > args;
+    std::vector< std::pair<size_t, const void *> > args;

    if(sumTex)
    {
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
    }
    else
    {
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
    }
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));

    size_t localThreads[3]  = {32, 4, 1};
    size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
@ -620,11 +617,11 @@ void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
    Context *clCxt = counters.clCxt;
    string kernelName = "icvSetUpright";

-    vector< pair<size_t, const void *> > args;
+    std::vector< std::pair<size_t, const void *> > args;

-    args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&nFeatures));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nFeatures));

    size_t localThreads[3]  = {256, 1, 1};
    size_t globalThreads[3] = {saturate_cast<size_t>(nFeatures), 1, 1};
@ -638,7 +635,7 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
    // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
    Context *clCxt = descriptors.clCxt;
    string kernelName;
-    vector< pair<size_t, const void *> > args;
+    std::vector< std::pair<size_t, const void *> > args;
    size_t localThreads[3]  = {1, 1, 1};
    size_t globalThreads[3] = {1, 1, 1};

@ -655,19 +652,19 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.clear();
        if(imgTex)
        {
-            args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
        }
        else
        {
-            args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
        }
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));

        openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);

@ -680,8 +677,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        globalThreads[1] = localThreads[1];

        args.clear();
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));

        openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
    }
@ -698,19 +695,19 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.clear();
        if(imgTex)
        {
-            args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
        }
        else
        {
-            args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
        }
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));

        openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);

@ -723,8 +720,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        globalThreads[1] = localThreads[1];

        args.clear();
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));

        openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
    }
--- a/modules/ocl/doc/structures_and_utility_functions.rst
+++ b/modules/ocl/doc/structures_and_utility_functions.rst
@ -3,56 +3,40 @@ Data Structures and Utility Functions

 .. highlight:: cpp

-ocl::Info
-------------
-.. ocv:class:: ocl::Info
+ocl::getOpenCLPlatforms
+-----------------------
+Returns the list of OpenCL platforms

-this class should be maintained by the user and be passed to getDevice
+.. ocv:function:: int ocl::getOpenCLPlatforms( PlatformsInfo& platforms )

-ocl::getDevice
------------------
+    :param platforms: Output variable
+
+ocl::getOpenCLDevices
+---------------------
 Returns the list of devices

-.. ocv:function:: int ocl::getDevice( std::vector<Info> & oclinfo, int devicetype=CVCL_DEVICE_TYPE_GPU )
+.. ocv:function:: int ocl::getOpenCLDevices( DevicesInfo& devices, int deviceType = CVCL_DEVICE_TYPE_GPU, const PlatformInfo* platform = NULL )

-    :param oclinfo: Output vector of ``ocl::Info`` structures
+    :param devices: Output variable

-    :param devicetype: One of ``CVCL_DEVICE_TYPE_GPU``, ``CVCL_DEVICE_TYPE_CPU`` or ``CVCL_DEVICE_TYPE_DEFAULT``.
+    :param deviceType: Bitmask of ``CVCL_DEVICE_TYPE_GPU``, ``CVCL_DEVICE_TYPE_CPU`` or ``CVCL_DEVICE_TYPE_DEFAULT``.

-the function must be called before any other ``cv::ocl`` functions; it initializes ocl runtime.
+    :param platform: Specifies preferrable platform

 ocl::setDevice
+--------------
+Returns void
+
+.. ocv:function:: void ocl::setDevice( const DeviceInfo* info )
+
+    :param info: device info
+
+ocl::setBinaryPath
 ------------------
 Returns void

-.. ocv:function:: void ocl::setDevice( Info &oclinfo, int devnum = 0 )
-
-    :param oclinfo: Output vector of ``ocl::Info`` structures
-
-    :param devnum: the selected OpenCL device under this platform.
-
-ocl::setBinpath
------------------
-Returns void
-
-.. ocv:function:: void ocl::setBinpath(const char *path)
+.. ocv:function:: void ocl::setBinaryPath(const char *path)

    :param path: the path of OpenCL kernel binaries

 If you call this function and set a valid path, the OCL module will save the compiled kernel to the address in the first time and reload the binary since that. It can save compilation time at the runtime.
-
-ocl::getoclContext
----------------------
-Returns the pointer to the opencl context
-
-.. ocv:function:: void* ocl::getoclContext()
-
-Thefunction are used to get opencl context so that opencv can interactive with other opencl program.
-
-ocl::getoclCommandQueue
--------------------------
-Returns the pointer to the opencl command queue
-
-.. ocv:function:: void* ocl::getoclCommandQueue()
-
-Thefunction are used to get opencl command queue so that opencv can interactive with other opencl program.
--- a/modules/ocl/include/opencv2/ocl/cl_runtime/cl_runtime_opencl11_wrappers.hpp
+++ b/modules/ocl/include/opencv2/ocl/cl_runtime/cl_runtime_opencl11_wrappers.hpp
@ -0,0 +1,231 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
+#define __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
+
+// generated by parser_cl.py
+#undef clGetPlatformIDs
+#define clGetPlatformIDs clGetPlatformIDs_fn
+inline cl_int clGetPlatformIDs(cl_uint p0, cl_platform_id* p1, cl_uint* p2) { return clGetPlatformIDs_pfn(p0, p1, p2); }
+#undef clGetPlatformInfo
+#define clGetPlatformInfo clGetPlatformInfo_fn
+inline cl_int clGetPlatformInfo(cl_platform_id p0, cl_platform_info p1, size_t p2, void* p3, size_t* p4) { return clGetPlatformInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetDeviceIDs
+#define clGetDeviceIDs clGetDeviceIDs_fn
+inline cl_int clGetDeviceIDs(cl_platform_id p0, cl_device_type p1, cl_uint p2, cl_device_id* p3, cl_uint* p4) { return clGetDeviceIDs_pfn(p0, p1, p2, p3, p4); }
+#undef clGetDeviceInfo
+#define clGetDeviceInfo clGetDeviceInfo_fn
+inline cl_int clGetDeviceInfo(cl_device_id p0, cl_device_info p1, size_t p2, void* p3, size_t* p4) { return clGetDeviceInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateContext
+#define clCreateContext clCreateContext_fn
+inline cl_context clCreateContext(const cl_context_properties* p0, cl_uint p1, const cl_device_id* p2, void (CL_CALLBACK*p3) (const char*, const void*, size_t, void*), void* p4, cl_int* p5) { return clCreateContext_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateContextFromType
+#define clCreateContextFromType clCreateContextFromType_fn
+inline cl_context clCreateContextFromType(const cl_context_properties* p0, cl_device_type p1, void (CL_CALLBACK*p2) (const char*, const void*, size_t, void*), void* p3, cl_int* p4) { return clCreateContextFromType_pfn(p0, p1, p2, p3, p4); }
+#undef clRetainContext
+#define clRetainContext clRetainContext_fn
+inline cl_int clRetainContext(cl_context p0) { return clRetainContext_pfn(p0); }
+#undef clReleaseContext
+#define clReleaseContext clReleaseContext_fn
+inline cl_int clReleaseContext(cl_context p0) { return clReleaseContext_pfn(p0); }
+#undef clGetContextInfo
+#define clGetContextInfo clGetContextInfo_fn
+inline cl_int clGetContextInfo(cl_context p0, cl_context_info p1, size_t p2, void* p3, size_t* p4) { return clGetContextInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateCommandQueue
+#define clCreateCommandQueue clCreateCommandQueue_fn
+inline cl_command_queue clCreateCommandQueue(cl_context p0, cl_device_id p1, cl_command_queue_properties p2, cl_int* p3) { return clCreateCommandQueue_pfn(p0, p1, p2, p3); }
+#undef clRetainCommandQueue
+#define clRetainCommandQueue clRetainCommandQueue_fn
+inline cl_int clRetainCommandQueue(cl_command_queue p0) { return clRetainCommandQueue_pfn(p0); }
+#undef clReleaseCommandQueue
+#define clReleaseCommandQueue clReleaseCommandQueue_fn
+inline cl_int clReleaseCommandQueue(cl_command_queue p0) { return clReleaseCommandQueue_pfn(p0); }
+#undef clGetCommandQueueInfo
+#define clGetCommandQueueInfo clGetCommandQueueInfo_fn
+inline cl_int clGetCommandQueueInfo(cl_command_queue p0, cl_command_queue_info p1, size_t p2, void* p3, size_t* p4) { return clGetCommandQueueInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clSetCommandQueueProperty
+#define clSetCommandQueueProperty clSetCommandQueueProperty_fn
+inline cl_int clSetCommandQueueProperty(cl_command_queue p0, cl_command_queue_properties p1, cl_bool p2, cl_command_queue_properties* p3) { return clSetCommandQueueProperty_pfn(p0, p1, p2, p3); }
+#undef clCreateBuffer
+#define clCreateBuffer clCreateBuffer_fn
+inline cl_mem clCreateBuffer(cl_context p0, cl_mem_flags p1, size_t p2, void* p3, cl_int* p4) { return clCreateBuffer_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateSubBuffer
+#define clCreateSubBuffer clCreateSubBuffer_fn
+inline cl_mem clCreateSubBuffer(cl_mem p0, cl_mem_flags p1, cl_buffer_create_type p2, const void* p3, cl_int* p4) { return clCreateSubBuffer_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateImage2D
+#define clCreateImage2D clCreateImage2D_fn
+inline cl_mem clCreateImage2D(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, size_t p3, size_t p4, size_t p5, void* p6, cl_int* p7) { return clCreateImage2D_pfn(p0, p1, p2, p3, p4, p5, p6, p7); }
+#undef clCreateImage3D
+#define clCreateImage3D clCreateImage3D_fn
+inline cl_mem clCreateImage3D(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, size_t p3, size_t p4, size_t p5, size_t p6, size_t p7, void* p8, cl_int* p9) { return clCreateImage3D_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clRetainMemObject
+#define clRetainMemObject clRetainMemObject_fn
+inline cl_int clRetainMemObject(cl_mem p0) { return clRetainMemObject_pfn(p0); }
+#undef clReleaseMemObject
+#define clReleaseMemObject clReleaseMemObject_fn
+inline cl_int clReleaseMemObject(cl_mem p0) { return clReleaseMemObject_pfn(p0); }
+#undef clGetSupportedImageFormats
+#define clGetSupportedImageFormats clGetSupportedImageFormats_fn
+inline cl_int clGetSupportedImageFormats(cl_context p0, cl_mem_flags p1, cl_mem_object_type p2, cl_uint p3, cl_image_format* p4, cl_uint* p5) { return clGetSupportedImageFormats_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetMemObjectInfo
+#define clGetMemObjectInfo clGetMemObjectInfo_fn
+inline cl_int clGetMemObjectInfo(cl_mem p0, cl_mem_info p1, size_t p2, void* p3, size_t* p4) { return clGetMemObjectInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetImageInfo
+#define clGetImageInfo clGetImageInfo_fn
+inline cl_int clGetImageInfo(cl_mem p0, cl_image_info p1, size_t p2, void* p3, size_t* p4) { return clGetImageInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clSetMemObjectDestructorCallback
+#define clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback_fn
+inline cl_int clSetMemObjectDestructorCallback(cl_mem p0, void (CL_CALLBACK*p1) (cl_mem, void*), void* p2) { return clSetMemObjectDestructorCallback_pfn(p0, p1, p2); }
+#undef clCreateSampler
+#define clCreateSampler clCreateSampler_fn
+inline cl_sampler clCreateSampler(cl_context p0, cl_bool p1, cl_addressing_mode p2, cl_filter_mode p3, cl_int* p4) { return clCreateSampler_pfn(p0, p1, p2, p3, p4); }
+#undef clRetainSampler
+#define clRetainSampler clRetainSampler_fn
+inline cl_int clRetainSampler(cl_sampler p0) { return clRetainSampler_pfn(p0); }
+#undef clReleaseSampler
+#define clReleaseSampler clReleaseSampler_fn
+inline cl_int clReleaseSampler(cl_sampler p0) { return clReleaseSampler_pfn(p0); }
+#undef clGetSamplerInfo
+#define clGetSamplerInfo clGetSamplerInfo_fn
+inline cl_int clGetSamplerInfo(cl_sampler p0, cl_sampler_info p1, size_t p2, void* p3, size_t* p4) { return clGetSamplerInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateProgramWithSource
+#define clCreateProgramWithSource clCreateProgramWithSource_fn
+inline cl_program clCreateProgramWithSource(cl_context p0, cl_uint p1, const char** p2, const size_t* p3, cl_int* p4) { return clCreateProgramWithSource_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateProgramWithBinary
+#define clCreateProgramWithBinary clCreateProgramWithBinary_fn
+inline cl_program clCreateProgramWithBinary(cl_context p0, cl_uint p1, const cl_device_id* p2, const size_t* p3, const unsigned char** p4, cl_int* p5, cl_int* p6) { return clCreateProgramWithBinary_pfn(p0, p1, p2, p3, p4, p5, p6); }
+#undef clRetainProgram
+#define clRetainProgram clRetainProgram_fn
+inline cl_int clRetainProgram(cl_program p0) { return clRetainProgram_pfn(p0); }
+#undef clReleaseProgram
+#define clReleaseProgram clReleaseProgram_fn
+inline cl_int clReleaseProgram(cl_program p0) { return clReleaseProgram_pfn(p0); }
+#undef clBuildProgram
+#define clBuildProgram clBuildProgram_fn
+inline cl_int clBuildProgram(cl_program p0, cl_uint p1, const cl_device_id* p2, const char* p3, void (CL_CALLBACK*p4) (cl_program, void*), void* p5) { return clBuildProgram_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clUnloadCompiler
+#define clUnloadCompiler clUnloadCompiler_fn
+inline cl_int clUnloadCompiler() { return clUnloadCompiler_pfn(); }
+#undef clGetProgramInfo
+#define clGetProgramInfo clGetProgramInfo_fn
+inline cl_int clGetProgramInfo(cl_program p0, cl_program_info p1, size_t p2, void* p3, size_t* p4) { return clGetProgramInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetProgramBuildInfo
+#define clGetProgramBuildInfo clGetProgramBuildInfo_fn
+inline cl_int clGetProgramBuildInfo(cl_program p0, cl_device_id p1, cl_program_build_info p2, size_t p3, void* p4, size_t* p5) { return clGetProgramBuildInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateKernel
+#define clCreateKernel clCreateKernel_fn
+inline cl_kernel clCreateKernel(cl_program p0, const char* p1, cl_int* p2) { return clCreateKernel_pfn(p0, p1, p2); }
+#undef clCreateKernelsInProgram
+#define clCreateKernelsInProgram clCreateKernelsInProgram_fn
+inline cl_int clCreateKernelsInProgram(cl_program p0, cl_uint p1, cl_kernel* p2, cl_uint* p3) { return clCreateKernelsInProgram_pfn(p0, p1, p2, p3); }
+#undef clRetainKernel
+#define clRetainKernel clRetainKernel_fn
+inline cl_int clRetainKernel(cl_kernel p0) { return clRetainKernel_pfn(p0); }
+#undef clReleaseKernel
+#define clReleaseKernel clReleaseKernel_fn
+inline cl_int clReleaseKernel(cl_kernel p0) { return clReleaseKernel_pfn(p0); }
+#undef clSetKernelArg
+#define clSetKernelArg clSetKernelArg_fn
+inline cl_int clSetKernelArg(cl_kernel p0, cl_uint p1, size_t p2, const void* p3) { return clSetKernelArg_pfn(p0, p1, p2, p3); }
+#undef clGetKernelInfo
+#define clGetKernelInfo clGetKernelInfo_fn
+inline cl_int clGetKernelInfo(cl_kernel p0, cl_kernel_info p1, size_t p2, void* p3, size_t* p4) { return clGetKernelInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetKernelWorkGroupInfo
+#define clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo_fn
+inline cl_int clGetKernelWorkGroupInfo(cl_kernel p0, cl_device_id p1, cl_kernel_work_group_info p2, size_t p3, void* p4, size_t* p5) { return clGetKernelWorkGroupInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clWaitForEvents
+#define clWaitForEvents clWaitForEvents_fn
+inline cl_int clWaitForEvents(cl_uint p0, const cl_event* p1) { return clWaitForEvents_pfn(p0, p1); }
+#undef clGetEventInfo
+#define clGetEventInfo clGetEventInfo_fn
+inline cl_int clGetEventInfo(cl_event p0, cl_event_info p1, size_t p2, void* p3, size_t* p4) { return clGetEventInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateUserEvent
+#define clCreateUserEvent clCreateUserEvent_fn
+inline cl_event clCreateUserEvent(cl_context p0, cl_int* p1) { return clCreateUserEvent_pfn(p0, p1); }
+#undef clRetainEvent
+#define clRetainEvent clRetainEvent_fn
+inline cl_int clRetainEvent(cl_event p0) { return clRetainEvent_pfn(p0); }
+#undef clReleaseEvent
+#define clReleaseEvent clReleaseEvent_fn
+inline cl_int clReleaseEvent(cl_event p0) { return clReleaseEvent_pfn(p0); }
+#undef clSetUserEventStatus
+#define clSetUserEventStatus clSetUserEventStatus_fn
+inline cl_int clSetUserEventStatus(cl_event p0, cl_int p1) { return clSetUserEventStatus_pfn(p0, p1); }
+#undef clSetEventCallback
+#define clSetEventCallback clSetEventCallback_fn
+inline cl_int clSetEventCallback(cl_event p0, cl_int p1, void (CL_CALLBACK*p2) (cl_event, cl_int, void*), void* p3) { return clSetEventCallback_pfn(p0, p1, p2, p3); }
+#undef clGetEventProfilingInfo
+#define clGetEventProfilingInfo clGetEventProfilingInfo_fn
+inline cl_int clGetEventProfilingInfo(cl_event p0, cl_profiling_info p1, size_t p2, void* p3, size_t* p4) { return clGetEventProfilingInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clFlush
+#define clFlush clFlush_fn
+inline cl_int clFlush(cl_command_queue p0) { return clFlush_pfn(p0); }
+#undef clFinish
+#define clFinish clFinish_fn
+inline cl_int clFinish(cl_command_queue p0) { return clFinish_pfn(p0); }
+#undef clEnqueueReadBuffer
+#define clEnqueueReadBuffer clEnqueueReadBuffer_fn
+inline cl_int clEnqueueReadBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, size_t p3, size_t p4, void* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueReadBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueReadBufferRect
+#define clEnqueueReadBufferRect clEnqueueReadBufferRect_fn
+inline cl_int clEnqueueReadBufferRect(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, void* p10, cl_uint p11, const cl_event* p12, cl_event* p13) { return clEnqueueReadBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); }
+#undef clEnqueueWriteBuffer
+#define clEnqueueWriteBuffer clEnqueueWriteBuffer_fn
+inline cl_int clEnqueueWriteBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, size_t p3, size_t p4, const void* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueWriteBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueWriteBufferRect
+#define clEnqueueWriteBufferRect clEnqueueWriteBufferRect_fn
+inline cl_int clEnqueueWriteBufferRect(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, const void* p10, cl_uint p11, const cl_event* p12, cl_event* p13) { return clEnqueueWriteBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); }
+#undef clEnqueueCopyBuffer
+#define clEnqueueCopyBuffer clEnqueueCopyBuffer_fn
+inline cl_int clEnqueueCopyBuffer(cl_command_queue p0, cl_mem p1, cl_mem p2, size_t p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyBufferRect
+#define clEnqueueCopyBufferRect clEnqueueCopyBufferRect_fn
+inline cl_int clEnqueueCopyBufferRect(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, cl_uint p10, const cl_event* p11, cl_event* p12) { return clEnqueueCopyBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12); }
+#undef clEnqueueReadImage
+#define clEnqueueReadImage clEnqueueReadImage_fn
+inline cl_int clEnqueueReadImage(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, size_t p5, size_t p6, void* p7, cl_uint p8, const cl_event* p9, cl_event* p10) { return clEnqueueReadImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); }
+#undef clEnqueueWriteImage
+#define clEnqueueWriteImage clEnqueueWriteImage_fn
+inline cl_int clEnqueueWriteImage(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, size_t p5, size_t p6, const void* p7, cl_uint p8, const cl_event* p9, cl_event* p10) { return clEnqueueWriteImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); }
+#undef clEnqueueCopyImage
+#define clEnqueueCopyImage clEnqueueCopyImage_fn
+inline cl_int clEnqueueCopyImage(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyImageToBuffer
+#define clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer_fn
+inline cl_int clEnqueueCopyImageToBuffer(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyImageToBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyBufferToImage
+#define clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage_fn
+inline cl_int clEnqueueCopyBufferToImage(cl_command_queue p0, cl_mem p1, cl_mem p2, size_t p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyBufferToImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueMapBuffer
+#define clEnqueueMapBuffer clEnqueueMapBuffer_fn
+inline void* clEnqueueMapBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, cl_map_flags p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8, cl_int* p9) { return clEnqueueMapBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clEnqueueMapImage
+#define clEnqueueMapImage clEnqueueMapImage_fn
+inline void* clEnqueueMapImage(cl_command_queue p0, cl_mem p1, cl_bool p2, cl_map_flags p3, const size_t* p4, const size_t* p5, size_t* p6, size_t* p7, cl_uint p8, const cl_event* p9, cl_event* p10, cl_int* p11) { return clEnqueueMapImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11); }
+#undef clEnqueueUnmapMemObject
+#define clEnqueueUnmapMemObject clEnqueueUnmapMemObject_fn
+inline cl_int clEnqueueUnmapMemObject(cl_command_queue p0, cl_mem p1, void* p2, cl_uint p3, const cl_event* p4, cl_event* p5) { return clEnqueueUnmapMemObject_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clEnqueueNDRangeKernel
+#define clEnqueueNDRangeKernel clEnqueueNDRangeKernel_fn
+inline cl_int clEnqueueNDRangeKernel(cl_command_queue p0, cl_kernel p1, cl_uint p2, const size_t* p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueNDRangeKernel_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueTask
+#define clEnqueueTask clEnqueueTask_fn
+inline cl_int clEnqueueTask(cl_command_queue p0, cl_kernel p1, cl_uint p2, const cl_event* p3, cl_event* p4) { return clEnqueueTask_pfn(p0, p1, p2, p3, p4); }
+#undef clEnqueueNativeKernel
+#define clEnqueueNativeKernel clEnqueueNativeKernel_fn
+inline cl_int clEnqueueNativeKernel(cl_command_queue p0, void (CL_CALLBACK*p1) (void*), void* p2, size_t p3, cl_uint p4, const cl_mem* p5, const void** p6, cl_uint p7, const cl_event* p8, cl_event* p9) { return clEnqueueNativeKernel_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clEnqueueMarker
+#define clEnqueueMarker clEnqueueMarker_fn
+inline cl_int clEnqueueMarker(cl_command_queue p0, cl_event* p1) { return clEnqueueMarker_pfn(p0, p1); }
+#undef clEnqueueWaitForEvents
+#define clEnqueueWaitForEvents clEnqueueWaitForEvents_fn
+inline cl_int clEnqueueWaitForEvents(cl_command_queue p0, cl_uint p1, const cl_event* p2) { return clEnqueueWaitForEvents_pfn(p0, p1, p2); }
+#undef clEnqueueBarrier
+#define clEnqueueBarrier clEnqueueBarrier_fn
+inline cl_int clEnqueueBarrier(cl_command_queue p0) { return clEnqueueBarrier_pfn(p0); }
+#undef clGetExtensionFunctionAddress
+#define clGetExtensionFunctionAddress clGetExtensionFunctionAddress_fn
+inline void* clGetExtensionFunctionAddress(const char* p0) { return clGetExtensionFunctionAddress_pfn(p0); }
+
+#endif // __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
--- a/modules/ocl/include/opencv2/ocl/cl_runtime/cl_runtime_opencl12_wrappers.hpp
+++ b/modules/ocl/include/opencv2/ocl/cl_runtime/cl_runtime_opencl12_wrappers.hpp
@ -0,0 +1,273 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
+#define __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
+
+// generated by parser_cl.py
+#undef clGetPlatformIDs
+#define clGetPlatformIDs clGetPlatformIDs_fn
+inline cl_int clGetPlatformIDs(cl_uint p0, cl_platform_id* p1, cl_uint* p2) { return clGetPlatformIDs_pfn(p0, p1, p2); }
+#undef clGetPlatformInfo
+#define clGetPlatformInfo clGetPlatformInfo_fn
+inline cl_int clGetPlatformInfo(cl_platform_id p0, cl_platform_info p1, size_t p2, void* p3, size_t* p4) { return clGetPlatformInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetDeviceIDs
+#define clGetDeviceIDs clGetDeviceIDs_fn
+inline cl_int clGetDeviceIDs(cl_platform_id p0, cl_device_type p1, cl_uint p2, cl_device_id* p3, cl_uint* p4) { return clGetDeviceIDs_pfn(p0, p1, p2, p3, p4); }
+#undef clGetDeviceInfo
+#define clGetDeviceInfo clGetDeviceInfo_fn
+inline cl_int clGetDeviceInfo(cl_device_id p0, cl_device_info p1, size_t p2, void* p3, size_t* p4) { return clGetDeviceInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateSubDevices
+#define clCreateSubDevices clCreateSubDevices_fn
+inline cl_int clCreateSubDevices(cl_device_id p0, const cl_device_partition_property* p1, cl_uint p2, cl_device_id* p3, cl_uint* p4) { return clCreateSubDevices_pfn(p0, p1, p2, p3, p4); }
+#undef clRetainDevice
+#define clRetainDevice clRetainDevice_fn
+inline cl_int clRetainDevice(cl_device_id p0) { return clRetainDevice_pfn(p0); }
+#undef clReleaseDevice
+#define clReleaseDevice clReleaseDevice_fn
+inline cl_int clReleaseDevice(cl_device_id p0) { return clReleaseDevice_pfn(p0); }
+#undef clCreateContext
+#define clCreateContext clCreateContext_fn
+inline cl_context clCreateContext(const cl_context_properties* p0, cl_uint p1, const cl_device_id* p2, void (CL_CALLBACK*p3) (const char*, const void*, size_t, void*), void* p4, cl_int* p5) { return clCreateContext_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateContextFromType
+#define clCreateContextFromType clCreateContextFromType_fn
+inline cl_context clCreateContextFromType(const cl_context_properties* p0, cl_device_type p1, void (CL_CALLBACK*p2) (const char*, const void*, size_t, void*), void* p3, cl_int* p4) { return clCreateContextFromType_pfn(p0, p1, p2, p3, p4); }
+#undef clRetainContext
+#define clRetainContext clRetainContext_fn
+inline cl_int clRetainContext(cl_context p0) { return clRetainContext_pfn(p0); }
+#undef clReleaseContext
+#define clReleaseContext clReleaseContext_fn
+inline cl_int clReleaseContext(cl_context p0) { return clReleaseContext_pfn(p0); }
+#undef clGetContextInfo
+#define clGetContextInfo clGetContextInfo_fn
+inline cl_int clGetContextInfo(cl_context p0, cl_context_info p1, size_t p2, void* p3, size_t* p4) { return clGetContextInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateCommandQueue
+#define clCreateCommandQueue clCreateCommandQueue_fn
+inline cl_command_queue clCreateCommandQueue(cl_context p0, cl_device_id p1, cl_command_queue_properties p2, cl_int* p3) { return clCreateCommandQueue_pfn(p0, p1, p2, p3); }
+#undef clRetainCommandQueue
+#define clRetainCommandQueue clRetainCommandQueue_fn
+inline cl_int clRetainCommandQueue(cl_command_queue p0) { return clRetainCommandQueue_pfn(p0); }
+#undef clReleaseCommandQueue
+#define clReleaseCommandQueue clReleaseCommandQueue_fn
+inline cl_int clReleaseCommandQueue(cl_command_queue p0) { return clReleaseCommandQueue_pfn(p0); }
+#undef clGetCommandQueueInfo
+#define clGetCommandQueueInfo clGetCommandQueueInfo_fn
+inline cl_int clGetCommandQueueInfo(cl_command_queue p0, cl_command_queue_info p1, size_t p2, void* p3, size_t* p4) { return clGetCommandQueueInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateBuffer
+#define clCreateBuffer clCreateBuffer_fn
+inline cl_mem clCreateBuffer(cl_context p0, cl_mem_flags p1, size_t p2, void* p3, cl_int* p4) { return clCreateBuffer_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateSubBuffer
+#define clCreateSubBuffer clCreateSubBuffer_fn
+inline cl_mem clCreateSubBuffer(cl_mem p0, cl_mem_flags p1, cl_buffer_create_type p2, const void* p3, cl_int* p4) { return clCreateSubBuffer_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateImage
+#define clCreateImage clCreateImage_fn
+inline cl_mem clCreateImage(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, const cl_image_desc* p3, void* p4, cl_int* p5) { return clCreateImage_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clRetainMemObject
+#define clRetainMemObject clRetainMemObject_fn
+inline cl_int clRetainMemObject(cl_mem p0) { return clRetainMemObject_pfn(p0); }
+#undef clReleaseMemObject
+#define clReleaseMemObject clReleaseMemObject_fn
+inline cl_int clReleaseMemObject(cl_mem p0) { return clReleaseMemObject_pfn(p0); }
+#undef clGetSupportedImageFormats
+#define clGetSupportedImageFormats clGetSupportedImageFormats_fn
+inline cl_int clGetSupportedImageFormats(cl_context p0, cl_mem_flags p1, cl_mem_object_type p2, cl_uint p3, cl_image_format* p4, cl_uint* p5) { return clGetSupportedImageFormats_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetMemObjectInfo
+#define clGetMemObjectInfo clGetMemObjectInfo_fn
+inline cl_int clGetMemObjectInfo(cl_mem p0, cl_mem_info p1, size_t p2, void* p3, size_t* p4) { return clGetMemObjectInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetImageInfo
+#define clGetImageInfo clGetImageInfo_fn
+inline cl_int clGetImageInfo(cl_mem p0, cl_image_info p1, size_t p2, void* p3, size_t* p4) { return clGetImageInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clSetMemObjectDestructorCallback
+#define clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback_fn
+inline cl_int clSetMemObjectDestructorCallback(cl_mem p0, void (CL_CALLBACK*p1) (cl_mem, void*), void* p2) { return clSetMemObjectDestructorCallback_pfn(p0, p1, p2); }
+#undef clCreateSampler
+#define clCreateSampler clCreateSampler_fn
+inline cl_sampler clCreateSampler(cl_context p0, cl_bool p1, cl_addressing_mode p2, cl_filter_mode p3, cl_int* p4) { return clCreateSampler_pfn(p0, p1, p2, p3, p4); }
+#undef clRetainSampler
+#define clRetainSampler clRetainSampler_fn
+inline cl_int clRetainSampler(cl_sampler p0) { return clRetainSampler_pfn(p0); }
+#undef clReleaseSampler
+#define clReleaseSampler clReleaseSampler_fn
+inline cl_int clReleaseSampler(cl_sampler p0) { return clReleaseSampler_pfn(p0); }
+#undef clGetSamplerInfo
+#define clGetSamplerInfo clGetSamplerInfo_fn
+inline cl_int clGetSamplerInfo(cl_sampler p0, cl_sampler_info p1, size_t p2, void* p3, size_t* p4) { return clGetSamplerInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateProgramWithSource
+#define clCreateProgramWithSource clCreateProgramWithSource_fn
+inline cl_program clCreateProgramWithSource(cl_context p0, cl_uint p1, const char** p2, const size_t* p3, cl_int* p4) { return clCreateProgramWithSource_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateProgramWithBinary
+#define clCreateProgramWithBinary clCreateProgramWithBinary_fn
+inline cl_program clCreateProgramWithBinary(cl_context p0, cl_uint p1, const cl_device_id* p2, const size_t* p3, const unsigned char** p4, cl_int* p5, cl_int* p6) { return clCreateProgramWithBinary_pfn(p0, p1, p2, p3, p4, p5, p6); }
+#undef clCreateProgramWithBuiltInKernels
+#define clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels_fn
+inline cl_program clCreateProgramWithBuiltInKernels(cl_context p0, cl_uint p1, const cl_device_id* p2, const char* p3, cl_int* p4) { return clCreateProgramWithBuiltInKernels_pfn(p0, p1, p2, p3, p4); }
+#undef clRetainProgram
+#define clRetainProgram clRetainProgram_fn
+inline cl_int clRetainProgram(cl_program p0) { return clRetainProgram_pfn(p0); }
+#undef clReleaseProgram
+#define clReleaseProgram clReleaseProgram_fn
+inline cl_int clReleaseProgram(cl_program p0) { return clReleaseProgram_pfn(p0); }
+#undef clBuildProgram
+#define clBuildProgram clBuildProgram_fn
+inline cl_int clBuildProgram(cl_program p0, cl_uint p1, const cl_device_id* p2, const char* p3, void (CL_CALLBACK*p4) (cl_program, void*), void* p5) { return clBuildProgram_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCompileProgram
+#define clCompileProgram clCompileProgram_fn
+inline cl_int clCompileProgram(cl_program p0, cl_uint p1, const cl_device_id* p2, const char* p3, cl_uint p4, const cl_program* p5, const char** p6, void (CL_CALLBACK*p7) (cl_program, void*), void* p8) { return clCompileProgram_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clLinkProgram
+#define clLinkProgram clLinkProgram_fn
+inline cl_program clLinkProgram(cl_context p0, cl_uint p1, const cl_device_id* p2, const char* p3, cl_uint p4, const cl_program* p5, void (CL_CALLBACK*p6) (cl_program, void*), void* p7, cl_int* p8) { return clLinkProgram_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clUnloadPlatformCompiler
+#define clUnloadPlatformCompiler clUnloadPlatformCompiler_fn
+inline cl_int clUnloadPlatformCompiler(cl_platform_id p0) { return clUnloadPlatformCompiler_pfn(p0); }
+#undef clGetProgramInfo
+#define clGetProgramInfo clGetProgramInfo_fn
+inline cl_int clGetProgramInfo(cl_program p0, cl_program_info p1, size_t p2, void* p3, size_t* p4) { return clGetProgramInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetProgramBuildInfo
+#define clGetProgramBuildInfo clGetProgramBuildInfo_fn
+inline cl_int clGetProgramBuildInfo(cl_program p0, cl_device_id p1, cl_program_build_info p2, size_t p3, void* p4, size_t* p5) { return clGetProgramBuildInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateKernel
+#define clCreateKernel clCreateKernel_fn
+inline cl_kernel clCreateKernel(cl_program p0, const char* p1, cl_int* p2) { return clCreateKernel_pfn(p0, p1, p2); }
+#undef clCreateKernelsInProgram
+#define clCreateKernelsInProgram clCreateKernelsInProgram_fn
+inline cl_int clCreateKernelsInProgram(cl_program p0, cl_uint p1, cl_kernel* p2, cl_uint* p3) { return clCreateKernelsInProgram_pfn(p0, p1, p2, p3); }
+#undef clRetainKernel
+#define clRetainKernel clRetainKernel_fn
+inline cl_int clRetainKernel(cl_kernel p0) { return clRetainKernel_pfn(p0); }
+#undef clReleaseKernel
+#define clReleaseKernel clReleaseKernel_fn
+inline cl_int clReleaseKernel(cl_kernel p0) { return clReleaseKernel_pfn(p0); }
+#undef clSetKernelArg
+#define clSetKernelArg clSetKernelArg_fn
+inline cl_int clSetKernelArg(cl_kernel p0, cl_uint p1, size_t p2, const void* p3) { return clSetKernelArg_pfn(p0, p1, p2, p3); }
+#undef clGetKernelInfo
+#define clGetKernelInfo clGetKernelInfo_fn
+inline cl_int clGetKernelInfo(cl_kernel p0, cl_kernel_info p1, size_t p2, void* p3, size_t* p4) { return clGetKernelInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetKernelArgInfo
+#define clGetKernelArgInfo clGetKernelArgInfo_fn
+inline cl_int clGetKernelArgInfo(cl_kernel p0, cl_uint p1, cl_kernel_arg_info p2, size_t p3, void* p4, size_t* p5) { return clGetKernelArgInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetKernelWorkGroupInfo
+#define clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo_fn
+inline cl_int clGetKernelWorkGroupInfo(cl_kernel p0, cl_device_id p1, cl_kernel_work_group_info p2, size_t p3, void* p4, size_t* p5) { return clGetKernelWorkGroupInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clWaitForEvents
+#define clWaitForEvents clWaitForEvents_fn
+inline cl_int clWaitForEvents(cl_uint p0, const cl_event* p1) { return clWaitForEvents_pfn(p0, p1); }
+#undef clGetEventInfo
+#define clGetEventInfo clGetEventInfo_fn
+inline cl_int clGetEventInfo(cl_event p0, cl_event_info p1, size_t p2, void* p3, size_t* p4) { return clGetEventInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateUserEvent
+#define clCreateUserEvent clCreateUserEvent_fn
+inline cl_event clCreateUserEvent(cl_context p0, cl_int* p1) { return clCreateUserEvent_pfn(p0, p1); }
+#undef clRetainEvent
+#define clRetainEvent clRetainEvent_fn
+inline cl_int clRetainEvent(cl_event p0) { return clRetainEvent_pfn(p0); }
+#undef clReleaseEvent
+#define clReleaseEvent clReleaseEvent_fn
+inline cl_int clReleaseEvent(cl_event p0) { return clReleaseEvent_pfn(p0); }
+#undef clSetUserEventStatus
+#define clSetUserEventStatus clSetUserEventStatus_fn
+inline cl_int clSetUserEventStatus(cl_event p0, cl_int p1) { return clSetUserEventStatus_pfn(p0, p1); }
+#undef clSetEventCallback
+#define clSetEventCallback clSetEventCallback_fn
+inline cl_int clSetEventCallback(cl_event p0, cl_int p1, void (CL_CALLBACK*p2) (cl_event, cl_int, void*), void* p3) { return clSetEventCallback_pfn(p0, p1, p2, p3); }
+#undef clGetEventProfilingInfo
+#define clGetEventProfilingInfo clGetEventProfilingInfo_fn
+inline cl_int clGetEventProfilingInfo(cl_event p0, cl_profiling_info p1, size_t p2, void* p3, size_t* p4) { return clGetEventProfilingInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clFlush
+#define clFlush clFlush_fn
+inline cl_int clFlush(cl_command_queue p0) { return clFlush_pfn(p0); }
+#undef clFinish
+#define clFinish clFinish_fn
+inline cl_int clFinish(cl_command_queue p0) { return clFinish_pfn(p0); }
+#undef clEnqueueReadBuffer
+#define clEnqueueReadBuffer clEnqueueReadBuffer_fn
+inline cl_int clEnqueueReadBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, size_t p3, size_t p4, void* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueReadBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueReadBufferRect
+#define clEnqueueReadBufferRect clEnqueueReadBufferRect_fn
+inline cl_int clEnqueueReadBufferRect(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, void* p10, cl_uint p11, const cl_event* p12, cl_event* p13) { return clEnqueueReadBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); }
+#undef clEnqueueWriteBuffer
+#define clEnqueueWriteBuffer clEnqueueWriteBuffer_fn
+inline cl_int clEnqueueWriteBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, size_t p3, size_t p4, const void* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueWriteBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueWriteBufferRect
+#define clEnqueueWriteBufferRect clEnqueueWriteBufferRect_fn
+inline cl_int clEnqueueWriteBufferRect(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, const void* p10, cl_uint p11, const cl_event* p12, cl_event* p13) { return clEnqueueWriteBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); }
+#undef clEnqueueFillBuffer
+#define clEnqueueFillBuffer clEnqueueFillBuffer_fn
+inline cl_int clEnqueueFillBuffer(cl_command_queue p0, cl_mem p1, const void* p2, size_t p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueFillBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyBuffer
+#define clEnqueueCopyBuffer clEnqueueCopyBuffer_fn
+inline cl_int clEnqueueCopyBuffer(cl_command_queue p0, cl_mem p1, cl_mem p2, size_t p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyBufferRect
+#define clEnqueueCopyBufferRect clEnqueueCopyBufferRect_fn
+inline cl_int clEnqueueCopyBufferRect(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, cl_uint p10, const cl_event* p11, cl_event* p12) { return clEnqueueCopyBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12); }
+#undef clEnqueueReadImage
+#define clEnqueueReadImage clEnqueueReadImage_fn
+inline cl_int clEnqueueReadImage(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, size_t p5, size_t p6, void* p7, cl_uint p8, const cl_event* p9, cl_event* p10) { return clEnqueueReadImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); }
+#undef clEnqueueWriteImage
+#define clEnqueueWriteImage clEnqueueWriteImage_fn
+inline cl_int clEnqueueWriteImage(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, size_t p5, size_t p6, const void* p7, cl_uint p8, const cl_event* p9, cl_event* p10) { return clEnqueueWriteImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); }
+#undef clEnqueueFillImage
+#define clEnqueueFillImage clEnqueueFillImage_fn
+inline cl_int clEnqueueFillImage(cl_command_queue p0, cl_mem p1, const void* p2, const size_t* p3, const size_t* p4, cl_uint p5, const cl_event* p6, cl_event* p7) { return clEnqueueFillImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7); }
+#undef clEnqueueCopyImage
+#define clEnqueueCopyImage clEnqueueCopyImage_fn
+inline cl_int clEnqueueCopyImage(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyImageToBuffer
+#define clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer_fn
+inline cl_int clEnqueueCopyImageToBuffer(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyImageToBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyBufferToImage
+#define clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage_fn
+inline cl_int clEnqueueCopyBufferToImage(cl_command_queue p0, cl_mem p1, cl_mem p2, size_t p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyBufferToImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueMapBuffer
+#define clEnqueueMapBuffer clEnqueueMapBuffer_fn
+inline void* clEnqueueMapBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, cl_map_flags p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8, cl_int* p9) { return clEnqueueMapBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clEnqueueMapImage
+#define clEnqueueMapImage clEnqueueMapImage_fn
+inline void* clEnqueueMapImage(cl_command_queue p0, cl_mem p1, cl_bool p2, cl_map_flags p3, const size_t* p4, const size_t* p5, size_t* p6, size_t* p7, cl_uint p8, const cl_event* p9, cl_event* p10, cl_int* p11) { return clEnqueueMapImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11); }
+#undef clEnqueueUnmapMemObject
+#define clEnqueueUnmapMemObject clEnqueueUnmapMemObject_fn
+inline cl_int clEnqueueUnmapMemObject(cl_command_queue p0, cl_mem p1, void* p2, cl_uint p3, const cl_event* p4, cl_event* p5) { return clEnqueueUnmapMemObject_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clEnqueueMigrateMemObjects
+#define clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects_fn
+inline cl_int clEnqueueMigrateMemObjects(cl_command_queue p0, cl_uint p1, const cl_mem* p2, cl_mem_migration_flags p3, cl_uint p4, const cl_event* p5, cl_event* p6) { return clEnqueueMigrateMemObjects_pfn(p0, p1, p2, p3, p4, p5, p6); }
+#undef clEnqueueNDRangeKernel
+#define clEnqueueNDRangeKernel clEnqueueNDRangeKernel_fn
+inline cl_int clEnqueueNDRangeKernel(cl_command_queue p0, cl_kernel p1, cl_uint p2, const size_t* p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueNDRangeKernel_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueTask
+#define clEnqueueTask clEnqueueTask_fn
+inline cl_int clEnqueueTask(cl_command_queue p0, cl_kernel p1, cl_uint p2, const cl_event* p3, cl_event* p4) { return clEnqueueTask_pfn(p0, p1, p2, p3, p4); }
+#undef clEnqueueNativeKernel
+#define clEnqueueNativeKernel clEnqueueNativeKernel_fn
+inline cl_int clEnqueueNativeKernel(cl_command_queue p0, void (CL_CALLBACK*p1) (void*), void* p2, size_t p3, cl_uint p4, const cl_mem* p5, const void** p6, cl_uint p7, const cl_event* p8, cl_event* p9) { return clEnqueueNativeKernel_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clEnqueueMarkerWithWaitList
+#define clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList_fn
+inline cl_int clEnqueueMarkerWithWaitList(cl_command_queue p0, cl_uint p1, const cl_event* p2, cl_event* p3) { return clEnqueueMarkerWithWaitList_pfn(p0, p1, p2, p3); }
+#undef clEnqueueBarrierWithWaitList
+#define clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList_fn
+inline cl_int clEnqueueBarrierWithWaitList(cl_command_queue p0, cl_uint p1, const cl_event* p2, cl_event* p3) { return clEnqueueBarrierWithWaitList_pfn(p0, p1, p2, p3); }
+#undef clGetExtensionFunctionAddressForPlatform
+#define clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform_fn
+inline void* clGetExtensionFunctionAddressForPlatform(cl_platform_id p0, const char* p1) { return clGetExtensionFunctionAddressForPlatform_pfn(p0, p1); }
+#undef clCreateImage2D
+#define clCreateImage2D clCreateImage2D_fn
+inline cl_mem clCreateImage2D(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, size_t p3, size_t p4, size_t p5, void* p6, cl_int* p7) { return clCreateImage2D_pfn(p0, p1, p2, p3, p4, p5, p6, p7); }
+#undef clCreateImage3D
+#define clCreateImage3D clCreateImage3D_fn
+inline cl_mem clCreateImage3D(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, size_t p3, size_t p4, size_t p5, size_t p6, size_t p7, void* p8, cl_int* p9) { return clCreateImage3D_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clEnqueueMarker
+#define clEnqueueMarker clEnqueueMarker_fn
+inline cl_int clEnqueueMarker(cl_command_queue p0, cl_event* p1) { return clEnqueueMarker_pfn(p0, p1); }
+#undef clEnqueueWaitForEvents
+#define clEnqueueWaitForEvents clEnqueueWaitForEvents_fn
+inline cl_int clEnqueueWaitForEvents(cl_command_queue p0, cl_uint p1, const cl_event* p2) { return clEnqueueWaitForEvents_pfn(p0, p1, p2); }
+#undef clEnqueueBarrier
+#define clEnqueueBarrier clEnqueueBarrier_fn
+inline cl_int clEnqueueBarrier(cl_command_queue p0) { return clEnqueueBarrier_pfn(p0); }
+#undef clUnloadCompiler
+#define clUnloadCompiler clUnloadCompiler_fn
+inline cl_int clUnloadCompiler() { return clUnloadCompiler_pfn(); }
+#undef clGetExtensionFunctionAddress
+#define clGetExtensionFunctionAddress clGetExtensionFunctionAddress_fn
+inline void* clGetExtensionFunctionAddress(const char* p0) { return clGetExtensionFunctionAddress_pfn(p0); }
+
+#endif // __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@ -57,8 +57,7 @@ namespace cv
 {
    namespace ocl
    {
-        using std::auto_ptr;
-        enum
+        enum DeviceType
        {
            CVCL_DEVICE_TYPE_DEFAULT     = (1 << 0),
            CVCL_DEVICE_TYPE_CPU         = (1 << 1),
@ -93,94 +92,112 @@ namespace cv
        //return -1 if the target type is unsupported, otherwise return 0
        CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT);

-        //this class contains ocl runtime information
-        class CV_EXPORTS Info
+        // these classes contain OpenCL runtime information
+
+        struct PlatformInfo;
+
+        struct DeviceInfo
        {
-        public:
-            struct Impl;
-            Impl *impl;
+            int _id; // reserved, don't use it

-            Info();
-            Info(const Info &m);
-            ~Info();
-            void release();
-            Info &operator = (const Info &m);
-            std::vector<string> DeviceName;
+            DeviceType deviceType;
+            std::string deviceProfile;
+            std::string deviceVersion;
+            std::string deviceName;
+            std::string deviceVendor;
+            int deviceVendorId;
+            std::string deviceDriverVersion;
+            std::string deviceExtensions;
+
+            size_t maxWorkGroupSize;
+            std::vector<size_t> maxWorkItemSizes;
+            int maxComputeUnits;
+            size_t localMemorySize;
+
+            int deviceVersionMajor;
+            int deviceVersionMinor;
+
+            bool haveDoubleSupport;
+            bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0
+
+            std::string compilationExtraOptions;
+
+            const PlatformInfo* platform;
+
+            DeviceInfo();
        };
+
+        struct PlatformInfo
+        {
+            int _id; // reserved, don't use it
+
+            std::string platformProfile;
+            std::string platformVersion;
+            std::string platformName;
+            std::string platformVendor;
+            std::string platformExtensons;
+
+            int platformVersionMajor;
+            int platformVersionMinor;
+
+            std::vector<const DeviceInfo*> devices;
+
+            PlatformInfo();
+        };
+
        //////////////////////////////// Initialization & Info ////////////////////////
-        //this function may be obsoleted
-        //CV_EXPORTS cl_device_id getDevice();
-        //the function must be called before any other cv::ocl::functions, it initialize ocl runtime
-        //each Info relates to an OpenCL platform
-        //there is one or more devices in each platform, each one has a separate name
-        CV_EXPORTS int getDevice(std::vector<Info> &oclinfo, int devicetype = CVCL_DEVICE_TYPE_GPU);
+        typedef std::vector<const PlatformInfo*> PlatformsInfo;

-        //set device you want to use, optional function after getDevice be called
-        //the devnum is the index of the selected device in DeviceName vector of INfo
-        CV_EXPORTS void setDevice(Info &oclinfo, int devnum = 0);
+        CV_EXPORTS int getOpenCLPlatforms(PlatformsInfo& platforms);

-        //The two functions below enable other opencl program to use ocl module's cl_context and cl_command_queue
-        //returns cl_context *
-        CV_EXPORTS void* getoclContext();
-        //returns cl_command_queue *
-        CV_EXPORTS void* getoclCommandQueue();
+        typedef std::vector<const DeviceInfo*> DevicesInfo;

-        //explicit call clFinish. The global command queue will be used.
-        CV_EXPORTS void finish();
+        CV_EXPORTS int getOpenCLDevices(DevicesInfo& devices, int deviceType = CVCL_DEVICE_TYPE_GPU,
+                const PlatformInfo* platform = NULL);

-        //this function enable ocl module to use customized cl_context and cl_command_queue
-        //getDevice also need to be called before this function
-        CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0);
-
-        //returns true when global OpenCL context is initialized
-        CV_EXPORTS bool initialized();
+        // set device you want to use
+        CV_EXPORTS void setDevice(const DeviceInfo* info);

        //////////////////////////////// Error handling ////////////////////////
        CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);

-        //////////////////////////////// OpenCL context ////////////////////////
-        //This is a global singleton class used to represent a OpenCL context.
+        enum FEATURE_TYPE
+        {
+            FEATURE_CL_DOUBLE = 1,
+            FEATURE_CL_UNIFIED_MEM,
+            FEATURE_CL_VER_1_2
+        };
+
+        // Represents OpenCL context, interface
        class CV_EXPORTS Context
        {
        protected:
-            Context();
-            friend class auto_ptr<Context>;
-            friend bool initialized();
-        private:
-            static auto_ptr<Context> clCxt;
-            static int val;
+            Context() { }
+            ~Context() { }
        public:
-            ~Context();
-            void release();
-            Info::Impl* impl;
-
            static Context* getContext();
-            static void setContext(Info &oclinfo);

-            enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2};
-            bool supportsFeature(int ftype) const;
-            size_t computeUnits() const;
-            void* oclContext();
-            void* oclCommandQueue();
+            bool supportsFeature(FEATURE_TYPE featureType) const;
+            const DeviceInfo& getDeviceInfo() const;
+
+            const void* getOpenCLContextPtr() const;
+            const void* getOpenCLCommandQueuePtr() const;
+            const void* getOpenCLDeviceIDPtr() const;
        };

-        //! Calls a kernel, by string. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
-        CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt ,
-                                                        const char **source, string kernelName,
-                                                        size_t globalThreads[3], size_t localThreads[3],
-                                                        std::vector< std::pair<size_t, const void *> > &args,
-                                                        int channels, int depth, const char *build_options,
-                                                        bool finish = true, bool measureKernelTime = false,
-                                                        bool cleanUp = true);
+        inline const void *getClContextPtr()
+        {
+            return Context::getContext()->getOpenCLContextPtr();
+        }

-        //! Calls a kernel, by file. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
-        CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt ,
-                                                        const char **fileName, const int numFiles, string kernelName,
-                                                        size_t globalThreads[3], size_t localThreads[3],
-                                                        std::vector< std::pair<size_t, const void *> > &args,
-                                                        int channels, int depth, const char *build_options,
-                                                        bool finish = true, bool measureKernelTime = false,
-                                                        bool cleanUp = true);
+        inline const void *getClCommandQueuePtr()
+        {
+            return Context::getContext()->getOpenCLCommandQueuePtr();
+        }
+
+        bool CV_EXPORTS supportsFeature(FEATURE_TYPE featureType);
+
+        void CV_EXPORTS finish();

        //! Enable or disable OpenCL program binary caching onto local disk
        // After a program (*.cl files in opencl/ folder) is built at runtime, we allow the
@ -198,12 +215,11 @@ namespace cv
            CACHE_DEBUG   = 0x1 << 0, // cache OpenCL binary when built in debug mode (only work with MSVC)
            CACHE_RELEASE = 0x1 << 1, // default behavior, only cache when built in release mode (only work with MSVC)
            CACHE_ALL     = CACHE_DEBUG | CACHE_RELEASE, // always cache opencl binary
-            CACHE_UPDATE  = 0x1 << 2  // if the binary cache file with the same name is already on the disk, it will be updated.
        };
        CV_EXPORTS void setBinaryDiskCache(int mode = CACHE_RELEASE, cv::String path = "./");

        //! set where binary cache to be saved to
-        CV_EXPORTS void setBinpath(const char *path);
+        CV_EXPORTS void setBinaryPath(const char *path);

        class CV_EXPORTS oclMatExpr;
        //////////////////////////////// oclMat ////////////////////////////////
@ -384,7 +400,7 @@ namespace cv
            uchar *dataend;

            //! OpenCL context associated with the oclMat object.
-            Context *clCxt;
+            Context *clCxt; // TODO clCtx
            //add offset for handle ROI, calculated in byte
            int offset;
            //add wholerows and wholecols for the whole matrix, datastart and dataend are no longer used
@ -1879,11 +1895,6 @@ namespace cv
            oclMat temp5;
        };

-        static inline size_t divUp(size_t total, size_t grain)
-        {
-            return (total + grain - 1) / grain;
-        }
-
        /*!***************K Nearest Neighbour*************!*/
        class CV_EXPORTS KNearestNeighbour: public CvKNearest
        {
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@ -52,120 +52,162 @@

 namespace cv
 {
-    namespace ocl
+namespace ocl
+{
+
+struct ProgramEntry
+{
+    const char* name;
+    const char* programStr;
+    const char* programHash;
+};
+
+inline cl_device_id getClDeviceID(const Context *ctx)
+{
+    return *(cl_device_id*)(ctx->getOpenCLDeviceIDPtr());
+}
+
+inline cl_context getClContext(const Context *ctx)
+{
+    return *(cl_context*)(ctx->getOpenCLContextPtr());
+}
+
+inline cl_command_queue getClCommandQueue(const Context *ctx)
+{
+    return *(cl_command_queue*)(ctx->getOpenCLCommandQueuePtr());
+}
+
+enum openCLMemcpyKind
+{
+    clMemcpyHostToDevice = 0,
+    clMemcpyDeviceToHost,
+    clMemcpyDeviceToDevice
+};
+///////////////////////////OpenCL call wrappers////////////////////////////
+void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
+        size_t widthInBytes, size_t height);
+void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
+        size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
+void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
+        const void *src, size_t spitch,
+        size_t width, size_t height, openCLMemcpyKind kind, int channels = -1);
+void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
+        const void *src, size_t spitch,
+        size_t width, size_t height, int src_offset);
+void CV_EXPORTS openCLFree(void *devPtr);
+cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
+void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
+cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
+        const cv::ocl::ProgramEntry* source, std::string kernelName);
+cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
+        const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options);
+void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
+void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
+        int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
+void CV_EXPORTS openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName,
+        size_t globalThreads[3], size_t localThreads[3],
+        std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
+void CV_EXPORTS openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, size_t globalThreads[3],
+        size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth);
+void CV_EXPORTS openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, size_t globalThreads[3],
+        size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
+        int depth, const char *build_options);
+
+cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value,
+        const size_t size);
+
+cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
+
+enum FLUSH_MODE
+{
+    CLFINISH = 0,
+    CLFLUSH,
+    DISABLE
+};
+
+void CV_EXPORTS openCLExecuteKernel2(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, size_t globalThreads[3],
+        size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
+void CV_EXPORTS openCLExecuteKernel2(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, size_t globalThreads[3],
+        size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
+        int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
+
+// bind oclMat to OpenCL image textures
+// note:
+//   1. there is no memory management. User need to explicitly release the resource
+//   2. for faster clamping, there is no buffer padding for the constructed texture
+cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
+void CV_EXPORTS releaseTexture(cl_mem& texture);
+
+//Represents an image texture object
+class CV_EXPORTS TextureCL
+{
+public:
+    TextureCL(cl_mem tex, int r, int c, int t)
+        : tex_(tex), rows(r), cols(c), type(t) {}
+    ~TextureCL()
    {
-        enum openCLMemcpyKind
-        {
-            clMemcpyHostToDevice = 0,
-            clMemcpyDeviceToHost,
-            clMemcpyDeviceToDevice
-        };
-        ///////////////////////////OpenCL call wrappers////////////////////////////
-        void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
-                                          size_t widthInBytes, size_t height);
-        void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
-                                            size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
-        void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
-                                       const void *src, size_t spitch,
-                                       size_t width, size_t height, openCLMemcpyKind kind, int channels = -1);
-        void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
-                                           const void *src, size_t spitch,
-                                           size_t width, size_t height, int src_offset);
-        void CV_EXPORTS openCLFree(void *devPtr);
-        cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
-        void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
-        cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
-                                                       const char **source, std::string kernelName);
-        cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
-                                                       const char **source, std::string kernelName, const char *build_options);
-        void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
-        void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
-                                 int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
-        void CV_EXPORTS openCLExecuteKernel_(Context *clCxt , const char **source, std::string kernelName,
-                                  size_t globalThreads[3], size_t localThreads[3],
-                                  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
-        void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
-                                 size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth);
-        void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
-                                 size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-                                 int depth, const char *build_options);
+        openCLFree(tex_);
+    }
+    operator cl_mem()
+    {
+        return tex_;
+    }
+    cl_mem const tex_;
+    const int rows;
+    const int cols;
+    const int type;
+private:
+    //disable assignment
+    void operator=(const TextureCL&);
+};
+// bind oclMat to OpenCL image textures and retunrs an TextureCL object
+// note:
+//   for faster clamping, there is no buffer padding for the constructed texture
+Ptr<TextureCL> CV_EXPORTS bindTexturePtr(const oclMat &mat);

-        cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value,
-                             const size_t size);
+// returns whether the current context supports image2d_t format or not
+bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());

-        cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
+bool CV_EXPORTS isCpuDevice();

-        int CV_EXPORTS savetofile(const Context *clcxt,  cl_program &program, const char *fileName);
+size_t CV_EXPORTS queryWaveFrontSize(cl_kernel kernel);

-        enum FLUSH_MODE
-        {
-            CLFINISH = 0,
-            CLFLUSH,
-            DISABLE
-        };

-        void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
-                                  size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
-        void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
-                                  size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-                                  int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
-        // bind oclMat to OpenCL image textures
-        // note:
-        //   1. there is no memory management. User need to explicitly release the resource
-        //   2. for faster clamping, there is no buffer padding for the constructed texture
-        cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
-        void CV_EXPORTS releaseTexture(cl_mem& texture);

-        //Represents an image texture object
-        class CV_EXPORTS TextureCL
-        {
-        public:
-            TextureCL(cl_mem tex, int r, int c, int t)
-                : tex_(tex), rows(r), cols(c), type(t) {}
-            ~TextureCL()
-            {
-                openCLFree(tex_);
-            }
-            operator cl_mem()
-            {
-                return tex_;
-            }
-            cl_mem const tex_;
-            const int rows;
-            const int cols;
-            const int type;
-        private:
-            //disable assignment
-            void operator=(const TextureCL&);
-        };
-        // bind oclMat to OpenCL image textures and retunrs an TextureCL object
-        // note:
-        //   for faster clamping, there is no buffer padding for the constructed texture
-        Ptr<TextureCL> CV_EXPORTS bindTexturePtr(const oclMat &mat);
+inline size_t divUp(size_t total, size_t grain)
+{
+    return (total + grain - 1) / grain;
+}

-        // returns whether the current context supports image2d_t format or not
-        bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
+inline size_t roundUp(size_t sz, size_t n)
+{
+    // we don't assume that n is a power of 2 (see alignSize)
+    // equal to divUp(sz, n) * n
+    size_t t = sz + n - 1;
+    size_t rem = t % n;
+    size_t result = t - rem;
+    return result;
+}

-        // the enums are used to query device information
-        // currently only support wavefront size queries
-        enum DEVICE_INFO
-        {
-            WAVEFRONT_SIZE,             //in AMD speak
-            IS_CPU_DEVICE               //check if the device is CPU
-        };
-        template<DEVICE_INFO _it, typename _ty>
-        _ty queryDeviceInfo(cl_kernel kernel = NULL);
+//! Calls a kernel, by string. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
+CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt,
+        const cv::ocl::ProgramEntry* source, string kernelName,
+        size_t globalThreads[3], size_t localThreads[3],
+        std::vector< std::pair<size_t, const void *> > &args,
+        int channels, int depth, const char *build_options,
+        bool finish = true, bool measureKernelTime = false,
+        bool cleanUp = true);

-        template<>
-        int CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel);
-        template<>
-        size_t CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel);
-        template<>
-        bool CV_EXPORTS queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel kernel);
-
-        unsigned long CV_EXPORTS queryLocalMemInfo();
-    }//namespace ocl
+//! Calls a kernel, by file. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
+CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt,
+        const cv::ocl::ProgramEntry* source, const int numFiles, string kernelName,
+        size_t globalThreads[3], size_t localThreads[3],
+        std::vector< std::pair<size_t, const void *> > &args,
+        int channels, int depth, const char *build_options,
+        bool finish = true, bool measureKernelTime = false,
+        bool cleanUp = true);

+}//namespace ocl
 }//namespace cv

 #endif //__OPENCV_OCL_PRIVATE_UTIL__
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@ -51,45 +51,59 @@ const char * impls[] =
 #endif
 };

+using namespace cv::ocl;
+
 int main(int argc, char ** argv)
 {
    const char * keys =
        "{ h | help     | false              | print help message }"
        "{ t | type     | gpu                | set device type:cpu or gpu}"
-        "{ p | platform | 0                  | set platform id }"
+        "{ p | platform | -1                 | set platform id }"
        "{ d | device   | 0                  | set device id }";

-    CommandLineParser cmd(argc, argv, keys);
-    if (cmd.get<bool>("help"))
+    if (getenv("OPENCV_OPENCL_DEVICE") == NULL) // TODO Remove this after buildbot updates
    {
-        cout << "Available options besides google test option:" << endl;
-        cmd.printParams();
-        return 0;
+        CommandLineParser cmd(argc, argv, keys);
+        if (cmd.get<bool>("help"))
+        {
+            cout << "Available options besides google test option:" << endl;
+            cmd.printParams();
+            return 0;
+        }
+
+        string type = cmd.get<string>("type");
+        int pid = cmd.get<int>("platform");
+        int device = cmd.get<int>("device");
+
+        int flag = type == "cpu" ? cv::ocl::CVCL_DEVICE_TYPE_CPU :
+                                   cv::ocl::CVCL_DEVICE_TYPE_GPU;
+
+        cv::ocl::PlatformsInfo platformsInfo;
+        cv::ocl::getOpenCLPlatforms(platformsInfo);
+        if (pid >= (int)platformsInfo.size())
+        {
+            std::cout << "platform is invalid\n";
+            return 1;
+        }
+
+        cv::ocl::DevicesInfo devicesInfo;
+        int devnums = cv::ocl::getOpenCLDevices(devicesInfo, flag, (pid < 0) ? NULL : platformsInfo[pid]);
+        if (device < 0 || device >= devnums)
+        {
+            std::cout << "device/platform invalid\n";
+            return 1;
+        }
+
+        cv::ocl::setDevice(devicesInfo[device]);
    }

-    string type = cmd.get<string>("type");
-    unsigned int pid = cmd.get<unsigned int>("platform");
-    int device = cmd.get<int>("device");
+    const DeviceInfo& deviceInfo = cv::ocl::Context::getContext()->getDeviceInfo();

-    int flag = type == "cpu" ? cv::ocl::CVCL_DEVICE_TYPE_CPU :
-                               cv::ocl::CVCL_DEVICE_TYPE_GPU;
-
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = cv::ocl::getDevice(oclinfo, flag);
-    if (devnums <= device || device < 0)
-    {
-        std::cout << "device invalid\n";
-        return -1;
-    }
-
-    if (pid >= oclinfo.size())
-    {
-        std::cout << "platform invalid\n";
-        return -1;
-    }
-
-    cv::ocl::setDevice(oclinfo[pid], device);
-    cv::ocl::setBinaryDiskCache(cv::ocl::CACHE_UPDATE);
+    cout << "Device type: " << (deviceInfo.deviceType == CVCL_DEVICE_TYPE_CPU ?
+                "CPU" :
+                (deviceInfo.deviceType == CVCL_DEVICE_TYPE_GPU ? "GPU" : "unknown")) << endl
+         << "Platform name: " << deviceInfo.platform->platformName << endl
+         << "Device name: " << deviceInfo.deviceName << endl;

    CV_PERF_TEST_MAIN_INTERNALS(ocl, impls)
 }
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@ -51,50 +51,10 @@
 //M*/

 #include "precomp.hpp"
-#include <iomanip>
-
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
-
-namespace cv
-{
-    namespace ocl
-    {
-        //////////////////////////////// OpenCL kernel strings /////////////////////
-
-        extern const char *arithm_absdiff_nonsaturate;
-        extern const char *arithm_nonzero;
-        extern const char *arithm_sum;
-        extern const char *arithm_minMax;
-        extern const char *arithm_minMaxLoc;
-        extern const char *arithm_minMaxLoc_mask;
-        extern const char *arithm_LUT;
-        extern const char *arithm_add;
-        extern const char *arithm_add_mask;
-        extern const char *arithm_add_scalar;
-        extern const char *arithm_add_scalar_mask;
-        extern const char *arithm_bitwise_binary;
-        extern const char *arithm_bitwise_binary_mask;
-        extern const char *arithm_bitwise_binary_scalar;
-        extern const char *arithm_bitwise_binary_scalar_mask;
-        extern const char *arithm_bitwise_not;
-        extern const char *arithm_compare;
-        extern const char *arithm_transpose;
-        extern const char *arithm_flip;
-        extern const char *arithm_flip_rc;
-        extern const char *arithm_magnitude;
-        extern const char *arithm_cartToPolar;
-        extern const char *arithm_polarToCart;
-        extern const char *arithm_exp;
-        extern const char *arithm_log;
-        extern const char *arithm_addWeighted;
-        extern const char *arithm_phase;
-        extern const char *arithm_pow;
-        extern const char *arithm_setidentity;
-    }
-}

 //////////////////////////////////////////////////////////////////////////////
 /////////////////////// add subtract multiply divide /////////////////////////
@ -106,7 +66,7 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
                            oclMat &dst, int op_type, bool use_scalar = false)
 {
    Context *clCxt = src1.clCxt;
-    bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE);
    if (!hasDouble && (src1.depth() == CV_64F || src2.depth() == CV_64F || dst.depth() == CV_64F))
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
@ -264,7 +224,7 @@ void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst)
 //////////////////////////////////////////////////////////////////////////////

 static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpOp,
-                        string kernelName, const char **kernelString)
+                        string kernelName, const cv::ocl::ProgramEntry* source)
 {
    CV_Assert(src1.type() == src2.type());
    dst.create(src1.size(), CV_8UC1);
@ -295,13 +255,13 @@ static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int
    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));

-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads,
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads,
                        args, -1, -1, buildOptions.c_str());
 }

 void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
    {
        cout << "Selected device do not support double" << endl;
        return;
@ -358,7 +318,7 @@ Scalar arithmetic_sum(const oclMat &src, int type, int ddepth)
 {
    CV_Assert(src.step % src.elemSize() == 0);

-    size_t groupnum = src.clCxt->computeUnits();
+    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
    CV_Assert(groupnum != 0);

    int dbsize = groupnum * src.oclchannels();
@ -385,7 +345,7 @@ typedef Scalar (*sumFunc)(const oclMat &src, int type, int ddepth);

 Scalar cv::ocl::sum(const oclMat &src)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
    }
@ -396,7 +356,7 @@ Scalar cv::ocl::sum(const oclMat &src)
        arithmetic_sum<double>
    };

-    bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = src.clCxt->supportsFeature(FEATURE_CL_DOUBLE);
    int ddepth = std::max(src.depth(), CV_32S);
    if (!hasDouble && ddepth == CV_64F)
        ddepth = CV_32F;
@ -407,7 +367,7 @@ Scalar cv::ocl::sum(const oclMat &src)

 Scalar cv::ocl::absSum(const oclMat &src)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
    }
@ -418,7 +378,7 @@ Scalar cv::ocl::absSum(const oclMat &src)
        arithmetic_sum<double>
    };

-    bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = src.clCxt->supportsFeature(FEATURE_CL_DOUBLE);
    int ddepth = std::max(src.depth(), CV_32S);
    if (!hasDouble && ddepth == CV_64F)
        ddepth = CV_32F;
@ -429,7 +389,7 @@ Scalar cv::ocl::absSum(const oclMat &src)

 Scalar cv::ocl::sqrSum(const oclMat &src)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
    }
@ -440,7 +400,7 @@ Scalar cv::ocl::sqrSum(const oclMat &src)
        arithmetic_sum<double>
    };

-    bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = src.clCxt->supportsFeature(FEATURE_CL_DOUBLE);
    int ddepth = src.depth() <= CV_32S ? CV_32S : (hasDouble ? CV_64F : CV_32F);

    sumFunc func = functab[ddepth - CV_32S];
@ -524,7 +484,7 @@ template <typename T, typename WT>
 void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
                                             const oclMat &mask, oclMat &buf)
 {
-    size_t groupnum = src.clCxt->computeUnits();
+    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
    CV_Assert(groupnum != 0);

    int dbsize = groupnum * 2 * src.elemSize();
@ -566,7 +526,7 @@ void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, cons
    if (minVal == NULL && maxVal == NULL)
        return;

-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
    }
@ -654,7 +614,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
    CV_Assert(!src1.empty());
    CV_Assert(src2.empty() || (src1.type() == src2.type() && src1.size() == src2.size()));

-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
    }
@ -699,7 +659,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)

 static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kernelName)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
@ -746,7 +706,7 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kern

 static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kernelName, bool isVertical)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
@ -792,9 +752,9 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kern

    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));

-    const char **kernelString = isVertical ? &arithm_flip_rc : &arithm_flip;
+    const cv::ocl::ProgramEntry* source = isVertical ? &arithm_flip_rc : &arithm_flip;

-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth);
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth);
 }

 void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
@ -860,10 +820,10 @@ void cv::ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)
 //////////////////////////////// exp log /////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////

-static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString)
+static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
 {
    Context  *clCxt = src.clCxt;
-    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
@ -893,7 +853,7 @@ static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernel
    args.push_back( make_pair( sizeof(cl_int), (void *)&srcstep1 ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 ));

-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads,
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads,
                        args, src.oclchannels(), -1, buildOptions.c_str());
 }

@ -913,7 +873,7 @@ void cv::ocl::log(const oclMat &src, oclMat &dst)

 static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
@ -955,9 +915,9 @@ void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst)
    arithmetic_magnitude_phase_run(src1, src2, dst, "arithm_magnitude");
 }

-static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString)
+static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
@ -985,7 +945,7 @@ static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat
    args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));

-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
 }

 void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle, bool angleInDegrees)
@ -1004,7 +964,7 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle, bool angleI
 static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart,
                                string kernelName, bool angleInDegrees)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
@ -1057,7 +1017,7 @@ void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat
 static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees,
                        string kernelName)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
@ -1176,7 +1136,7 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
                          Point *minLoc, Point *maxLoc, const oclMat &mask)
 {
    CV_Assert(src.oclchannels() == 1);
-    size_t groupnum = src.clCxt->computeUnits();
+    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
    CV_Assert(groupnum != 0);
    int minloc = -1 , maxloc = -1;
    int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ;
@ -1238,7 +1198,7 @@ typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal,
 void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
                        Point *minLoc, Point *maxLoc, const oclMat &mask)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
        return;
@ -1251,7 +1211,7 @@ void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
    };

    minMaxLocFunc func;
-    func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
+    func = functab[(int)src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)];
    func(src, minVal, maxVal, minLoc, maxLoc, mask);
 }

@ -1296,12 +1256,12 @@ int cv::ocl::countNonZero(const oclMat &src)
    CV_Assert(src.channels() == 1);

    Context *clCxt = src.clCxt;
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "selected device doesn't support double");
    }

-    size_t groupnum = src.clCxt->computeUnits();
+    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
    CV_Assert(groupnum != 0);
    int dbsize = groupnum;

@ -1327,7 +1287,7 @@ int cv::ocl::countNonZero(const oclMat &src)
 ////////////////////////////////bitwise_op////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////

-static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString)
+static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
 {
    dst.create(src1.size(), src1.type());

@ -1361,7 +1321,7 @@ static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName
    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));

-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
 }

 enum { AND = 0, OR, XOR };
@ -1370,7 +1330,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
                               oclMat &dst, int operationType)
 {
    Context  *clCxt = src1.clCxt;
-    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F)
+    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
    {
        cout << "Selected device does not support double" << endl;
        return;
@ -1442,7 +1402,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca

 void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
    {
        cout << "Selected device does not support double" << endl;
        return;
@ -1571,7 +1531,7 @@ oclMatExpr::operator oclMat() const
 static void transpose_run(const oclMat &src, oclMat &dst, string kernelName, bool inplace = false)
 {
    Context  *clCxt = src.clCxt;
-    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
@ -1623,7 +1583,7 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst)
 void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst)
 {
    Context *clCxt = src1.clCxt;
-    bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE);
    if (!hasDouble && src1.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
@ -1688,7 +1648,7 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
 /////////////////////////////////// Pow //////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////

-static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString)
+static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
 {
    CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows);
    CV_Assert(src1.type() == dst.type());
@ -1718,17 +1678,17 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));

    float pf = static_cast<float>(p);
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
        args.push_back( make_pair( sizeof(cl_float), (void *)&pf ));
    else
        args.push_back( make_pair( sizeof(cl_double), (void *)&p ));

-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
 }

 void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
 {
-    if (!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F)
+    if (!x.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && x.type() == CV_64F)
    {
        cout << "Selected device do not support double" << endl;
        return;
@ -1748,7 +1708,7 @@ void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
 void cv::ocl::setIdentity(oclMat& src, const Scalar & scalar)
 {
    Context  *clCxt = Context::getContext();
-    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
--- a/modules/ocl/src/bgfg_mog.cpp
+++ b/modules/ocl/src/bgfg_mog.cpp
@ -44,14 +44,15 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 using namespace cv;
 using namespace cv::ocl;
+
 namespace cv
 {
    namespace ocl
    {
-        extern const char* bgfg_mog;
-
        typedef struct _contant_struct
        {
            cl_float c_Tb;
@ -392,7 +393,7 @@ void cv::ocl::device::mog::loadConstants(float Tb, float TB, float Tg, float var
    constants->c_tau = tau;
    constants->c_shadowVal = shadowVal;

-    cl_constants = load_constant(*((cl_context*)getoclContext()), *((cl_command_queue*)getoclCommandQueue()),
+    cl_constants = load_constant(*((cl_context*)getClContextPtr()), *((cl_command_queue*)getClCommandQueuePtr()),
        (void *)constants, sizeof(_contant_struct));
 }

@ -635,4 +636,4 @@ void cv::ocl::MOG2::release()
    mean_.release();

    bgmodelUsedModes_.release();
-}
+}
--- a/modules/ocl/src/blend.cpp
+++ b/modules/ocl/src/blend.cpp
@ -44,20 +44,10 @@
 //M*/

 #include "precomp.hpp"
-#include <iomanip>
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
-
-namespace cv
-{
-    namespace ocl
-    {
-        ////////////////////////////////////OpenCL kernel strings//////////////////////////
-        extern const char *blend_linear;
-    }
-}

 void cv::ocl::blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2,
                          oclMat &result)
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@ -45,23 +45,15 @@
 //M*/

 #include "precomp.hpp"
-
 #include <functional>
 #include <iterator>
 #include <vector>
+#include "opencl_kernels.hpp"
+
 using namespace cv;
 using namespace cv::ocl;
 using namespace std;

-namespace cv
-{
-    namespace ocl
-    {
-        ////////////////////////////////////OpenCL kernel strings//////////////////////////
-        extern const char *brute_force_match;
-    }
-}
-
 static const int OPT_SIZE = 100;

 static const char * T_ARR [] = {
@ -245,7 +237,7 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, const oclM
 {
    const oclMat zeroMask;
    const oclMat &tempMask = mask.data ? mask : zeroMask;
-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType);
@ -265,7 +257,7 @@ static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, co
 {
    const oclMat zeroMask;
    const oclMat &tempMask = mask.data ? mask : zeroMask;
-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
@ -286,7 +278,7 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, float maxD
 {
    const oclMat zeroMask;
    const oclMat &tempMask = mask.data ? mask : zeroMask;
-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
@ -469,7 +461,7 @@ static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, con
 static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                      const oclMat &trainIdx, const oclMat &distance, int distType)
 {
-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
    if (query.cols <= 64)
    {
        knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType);
--- a/modules/ocl/src/build_warps.cpp
+++ b/modules/ocl/src/build_warps.cpp
@ -44,19 +44,10 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
-
-namespace cv
-{
-    namespace ocl
-    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *build_warps;
-    }
-}

 //////////////////////////////////////////////////////////////////////////////
 // buildWarpPlaneMaps
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@ -44,19 +44,10 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
-
-namespace cv
-{
-    namespace ocl
-    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *imgproc_canny;
-    }
-}

 cv::ocl::CannyBuf::CannyBuf(const oclMat &dx_, const oclMat &dy_) : dx(dx_), dy(dy_), counter(NULL)
 {
@ -98,7 +89,7 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size)
    {
        openCLFree(counter);
    }
-    counter = clCreateBuffer( *((cl_context*)getoclContext()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err );
+    counter = clCreateBuffer( *((cl_context*)getClContextPtr()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err );
    openCLSafeCall(err);
 }

@ -354,7 +345,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in
 void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
 {
    unsigned int count;
-    openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL));
+    openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL));
    Context *clCxt = map.clCxt;
    string kernelName = "edgesHysteresisGlobal";
    vector< pair<size_t, const void *> > args;
@ -363,7 +354,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
    int count_i[1] = {0};
    while(count > 0)
    {
-        openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
+        openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));

        args.clear();
        size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1};
@ -378,7 +369,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
        args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));

        openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
-        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
+        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
        std::swap(st1, st2);
    }
 }
--- a/modules/ocl/src/cl_context.cpp
+++ b/modules/ocl/src/cl_context.cpp
@ -0,0 +1,756 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Guoping Long, longguoping@gmail.com
+//    Niko Li, newlife20080214@gmail.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+#include <fstream>
+#include "cl_programcache.hpp"
+
+// workaround for OpenCL C++ bindings
+#if defined(HAVE_OPENCL12)
+#include "opencv2/ocl/cl_runtime/cl_runtime_opencl12_wrappers.hpp"
+#elif defined(HAVE_OPENCL11)
+#include "opencv2/ocl/cl_runtime/cl_runtime_opencl11_wrappers.hpp"
+#else
+#error Invalid OpenCL configuration
+#endif
+
+#if defined _MSC_VER && _MSC_VER >= 1200
+#pragma warning( disable: 4100 4101 4127 4244 4267 4510 4512 4610)
+#endif
+#undef __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
+
+namespace cv {
+namespace ocl {
+
+struct PlatformInfoImpl
+{
+    cl_platform_id platform_id;
+
+    std::vector<int> deviceIDs;
+
+    PlatformInfo info;
+
+    PlatformInfoImpl()
+        : platform_id(NULL)
+    {
+    }
+};
+
+struct DeviceInfoImpl
+{
+    cl_platform_id platform_id;
+    cl_device_id device_id;
+
+    DeviceInfo info;
+
+    DeviceInfoImpl()
+        : platform_id(NULL), device_id(NULL)
+    {
+    }
+};
+
+static std::vector<PlatformInfoImpl> global_platforms;
+static std::vector<DeviceInfoImpl> global_devices;
+
+static bool parseOpenCLVersion(const std::string& versionStr, int& major, int& minor)
+{
+    size_t p0 = versionStr.find(' ');
+    while (true)
+    {
+        if (p0 == std::string::npos)
+            break;
+        if (p0 + 1 >= versionStr.length())
+            break;
+        char c = versionStr[p0 + 1];
+        if (isdigit(c))
+            break;
+        p0 = versionStr.find(' ', p0 + 1);
+    }
+    size_t p1 = versionStr.find('.', p0);
+    size_t p2 = versionStr.find(' ', p1);
+    if (p0 == std::string::npos || p1 == std::string::npos || p2 == std::string::npos)
+    {
+        major = 0;
+        minor = 0;
+        return false;
+    }
+    std::string majorStr = versionStr.substr(p0 + 1, p1 - p0 - 1);
+    std::string minorStr = versionStr.substr(p1 + 1, p2 - p1 - 1);
+    major = atoi(majorStr.c_str());
+    minor = atoi(minorStr.c_str());
+    return true;
+}
+
+static void split(const std::string &s, char delim, std::vector<std::string> &elems) {
+    std::stringstream ss(s);
+    std::string item;
+    while (std::getline(ss, item, delim)) {
+        elems.push_back(item);
+    }
+}
+
+static std::vector<std::string> split(const std::string &s, char delim) {
+    std::vector<std::string> elems;
+    split(s, delim, elems);
+    return elems;
+}
+
+// Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
+// Sample: AMD:GPU:
+// Sample: AMD:GPU:Tahiti
+// Sample: :GPU|CPU: = '' = ':' = '::'
+static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
+        std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
+{
+    std::string deviceTypesStr;
+    size_t p0 = configurationStr.find(':');
+    if (p0 != std::string::npos)
+    {
+        size_t p1 = configurationStr.find(':', p0 + 1);
+        if (p1 != std::string::npos)
+        {
+            size_t p2 = configurationStr.find(':', p1 + 1);
+            if (p2 != std::string::npos)
+            {
+                std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
+                return false;
+            }
+            else
+            {
+                // assume platform + device types + device name/id
+                platform = configurationStr.substr(0, p0);
+                deviceTypesStr = configurationStr.substr(p0 + 1, p1 - (p0 + 1));
+                deviceNameOrID = configurationStr.substr(p1 + 1, configurationStr.length() - (p1 + 1));
+            }
+        }
+        else
+        {
+            // assume platform + device types
+            platform = configurationStr.substr(0, p0);
+            deviceTypesStr = configurationStr.substr(p0 + 1, configurationStr.length() - (p0 + 1));
+        }
+    }
+    else
+    {
+        // assume only platform
+        platform = configurationStr;
+    }
+    deviceTypes = split(deviceTypesStr, '|');
+    return true;
+}
+
+static bool __deviceSelected = false;
+static bool selectOpenCLDevice()
+{
+    __deviceSelected = true;
+
+    std::string platform;
+    std::vector<std::string> deviceTypes;
+    std::string deviceName;
+    const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
+    if (configuration)
+    {
+        if (!parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName))
+            return false;
+    }
+
+    bool isID = false;
+    int deviceID = -1;
+    if (deviceName.length() == 1)
+    // We limit ID range to 0..9, because we want to write:
+    // - '2500' to mean i5-2500
+    // - '8350' to mean AMD FX-8350
+    // - '650' to mean GeForce 650
+    // To extend ID range change condition to '> 0'
+    {
+        isID = true;
+        for (size_t i = 0; i < deviceName.length(); i++)
+        {
+            if (!isdigit(deviceName[i]))
+            {
+                isID = false;
+                break;
+            }
+        }
+        if (isID)
+        {
+            deviceID = atoi(deviceName.c_str());
+            CV_Assert(deviceID >= 0);
+        }
+    }
+
+    const PlatformInfo* platformInfo = NULL;
+    if (platform.length() > 0)
+    {
+        PlatformsInfo platforms;
+        getOpenCLPlatforms(platforms);
+        for (size_t i = 0; i < platforms.size(); i++)
+        {
+            if (platforms[i]->platformName.find(platform) != std::string::npos)
+            {
+                platformInfo = platforms[i];
+                break;
+            }
+        }
+        if (platformInfo == NULL)
+        {
+            std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
+            goto not_found;
+        }
+    }
+
+    if (deviceTypes.size() == 0)
+    {
+        if (!isID)
+        {
+            deviceTypes.push_back("GPU");
+            deviceTypes.push_back("CPU");
+        }
+        else
+        {
+            deviceTypes.push_back("ALL");
+        }
+    }
+    for (size_t t = 0; t < deviceTypes.size(); t++)
+    {
+        int deviceType = 0;
+        if (deviceTypes[t] == "GPU")
+        {
+            deviceType = CVCL_DEVICE_TYPE_GPU;
+        }
+        else if (deviceTypes[t] == "CPU")
+        {
+            deviceType = CVCL_DEVICE_TYPE_CPU;
+        }
+        else if (deviceTypes[t] == "ACCELERATOR")
+        {
+            deviceType = CVCL_DEVICE_TYPE_ACCELERATOR;
+        }
+        else if (deviceTypes[t] == "ALL")
+        {
+            deviceType = CVCL_DEVICE_TYPE_ALL;
+        }
+        else
+        {
+            std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
+            goto not_found;
+        }
+
+        DevicesInfo devices;
+        getOpenCLDevices(devices, deviceType, platformInfo);
+
+        for (size_t i = (isID ? deviceID : 0);
+             (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
+             i++)
+        {
+            if (isID || devices[i]->deviceName.find(deviceName) != std::string::npos)
+            {
+                // check for OpenCL 1.1
+                if (devices[i]->deviceVersionMajor < 1 ||
+                        (devices[i]->deviceVersionMajor == 1 && devices[i]->deviceVersionMinor < 1))
+                {
+                    std::cerr << "Skip unsupported version of OpenCL device: " << devices[i]->deviceName
+                            << "(" << devices[i]->platform->platformName << ")" << std::endl;
+                    continue; // unsupported version of device, skip it
+                }
+                try
+                {
+                    setDevice(devices[i]);
+                }
+                catch (...)
+                {
+                    std::cerr << "ERROR: Can't select OpenCL device: " << devices[i]->deviceName
+                            << "(" << devices[i]->platform->platformName << ")" << std::endl;
+                    goto not_found;
+                }
+                return true;
+            }
+        }
+    }
+not_found:
+    std::cerr << "ERROR: Required OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
+            << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
+            << "    Device types: ";
+    for (size_t t = 0; t < deviceTypes.size(); t++)
+    {
+        std::cerr << deviceTypes[t] << " ";
+    }
+    std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
+    return false;
+}
+
+static cv::Mutex __initializedMutex;
+static bool __initialized = false;
+static int initializeOpenCLDevices()
+{
+    assert(!__initialized);
+    __initialized = true;
+
+    assert(global_devices.size() == 0);
+
+    std::vector<cl::Platform> platforms;
+    try
+    {
+        openCLSafeCall(cl::Platform::get(&platforms));
+    }
+    catch (cv::Exception& e)
+    {
+        return 0; // OpenCL not found
+    }
+
+    global_platforms.resize(platforms.size());
+
+    for (size_t i = 0; i < platforms.size(); ++i)
+    {
+        PlatformInfoImpl& platformInfo = global_platforms[i];
+        platformInfo.info._id = i;
+
+        cl::Platform& platform = platforms[i];
+
+        platformInfo.platform_id = platform();
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_PROFILE, &platformInfo.info.platformProfile));
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_VERSION, &platformInfo.info.platformVersion));
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_NAME, &platformInfo.info.platformName));
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_VENDOR, &platformInfo.info.platformVendor));
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_EXTENSIONS, &platformInfo.info.platformExtensons));
+
+        parseOpenCLVersion(platformInfo.info.platformVersion,
+                platformInfo.info.platformVersionMajor, platformInfo.info.platformVersionMinor);
+
+        std::vector<cl::Device> devices;
+        cl_int status = platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+        if(status != CL_DEVICE_NOT_FOUND)
+            openCLVerifyCall(status);
+
+        if(devices.size() > 0)
+        {
+            int baseIndx = global_devices.size();
+            global_devices.resize(baseIndx + devices.size());
+            platformInfo.deviceIDs.resize(devices.size());
+            platformInfo.info.devices.resize(devices.size());
+
+            for(size_t j = 0; j < devices.size(); ++j)
+            {
+                cl::Device& device = devices[j];
+
+                DeviceInfoImpl& deviceInfo = global_devices[baseIndx + j];
+                deviceInfo.info._id = baseIndx + j;
+                deviceInfo.platform_id = platform();
+                deviceInfo.device_id = device();
+
+                deviceInfo.info.platform = &platformInfo.info;
+                platformInfo.deviceIDs[j] = deviceInfo.info._id;
+
+                cl_device_type type = cl_device_type(-1);
+                openCLSafeCall(device.getInfo(CL_DEVICE_TYPE, &type));
+                deviceInfo.info.deviceType = DeviceType(type);
+
+                openCLSafeCall(device.getInfo(CL_DEVICE_PROFILE, &deviceInfo.info.deviceProfile));
+                openCLSafeCall(device.getInfo(CL_DEVICE_VERSION, &deviceInfo.info.deviceVersion));
+                openCLSafeCall(device.getInfo(CL_DEVICE_NAME, &deviceInfo.info.deviceName));
+                openCLSafeCall(device.getInfo(CL_DEVICE_VENDOR, &deviceInfo.info.deviceVendor));
+                cl_uint vendorID = 0;
+                openCLSafeCall(device.getInfo(CL_DEVICE_VENDOR_ID, &vendorID));
+                deviceInfo.info.deviceVendorId = vendorID;
+                openCLSafeCall(device.getInfo(CL_DRIVER_VERSION, &deviceInfo.info.deviceDriverVersion));
+                openCLSafeCall(device.getInfo(CL_DEVICE_EXTENSIONS, &deviceInfo.info.deviceExtensions));
+
+                parseOpenCLVersion(deviceInfo.info.deviceVersion,
+                        deviceInfo.info.deviceVersionMajor, deviceInfo.info.deviceVersionMinor);
+
+                size_t maxWorkGroupSize = 0;
+                openCLSafeCall(device.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &maxWorkGroupSize));
+                deviceInfo.info.maxWorkGroupSize = maxWorkGroupSize;
+
+                cl_uint maxDimensions = 0;
+                openCLSafeCall(device.getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &maxDimensions));
+                std::vector<size_t> maxWorkItemSizes(maxDimensions);
+                openCLSafeCall(clGetDeviceInfo(device(), CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions,
+                        (void *)&maxWorkItemSizes[0], 0));
+                deviceInfo.info.maxWorkItemSizes = maxWorkItemSizes;
+
+                cl_uint maxComputeUnits = 0;
+                openCLSafeCall(device.getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &maxComputeUnits));
+                deviceInfo.info.maxComputeUnits = maxComputeUnits;
+
+                cl_ulong localMemorySize = 0;
+                openCLSafeCall(device.getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &localMemorySize));
+                deviceInfo.info.localMemorySize = (size_t)localMemorySize;
+
+
+                cl_bool unifiedMemory = false;
+                openCLSafeCall(device.getInfo(CL_DEVICE_HOST_UNIFIED_MEMORY, &unifiedMemory));
+                deviceInfo.info.isUnifiedMemory = unifiedMemory != 0;
+
+                //initialize extra options for compilation. Currently only fp64 is included.
+                //Assume 4KB is enough to store all possible extensions.
+                openCLSafeCall(device.getInfo(CL_DEVICE_EXTENSIONS, &deviceInfo.info.deviceExtensions));
+
+                size_t fp64_khr = deviceInfo.info.deviceExtensions.find("cl_khr_fp64");
+                if(fp64_khr != std::string::npos)
+                {
+                    deviceInfo.info.compilationExtraOptions += "-D DOUBLE_SUPPORT";
+                    deviceInfo.info.haveDoubleSupport = true;
+                }
+                else
+                {
+                    deviceInfo.info.haveDoubleSupport = false;
+                }
+            }
+        }
+    }
+
+    for (size_t i = 0; i < platforms.size(); ++i)
+    {
+        PlatformInfoImpl& platformInfo = global_platforms[i];
+        for(size_t j = 0; j < platformInfo.deviceIDs.size(); ++j)
+        {
+            DeviceInfoImpl& deviceInfo = global_devices[platformInfo.deviceIDs[j]];
+            platformInfo.info.devices[j] = &deviceInfo.info;
+        }
+    }
+
+    return global_devices.size();
+}
+
+
+DeviceInfo::DeviceInfo()
+    : _id(-1), deviceType(DeviceType(0)),
+      deviceVendorId(-1),
+      maxWorkGroupSize(0), maxComputeUnits(0), localMemorySize(0),
+      deviceVersionMajor(0), deviceVersionMinor(0),
+      haveDoubleSupport(false), isUnifiedMemory(false),
+      platform(NULL)
+{
+    // nothing
+}
+
+PlatformInfo::PlatformInfo()
+    : _id(-1),
+      platformVersionMajor(0), platformVersionMinor(0)
+{
+    // nothing
+}
+
+//////////////////////////////// OpenCL context ////////////////////////
+//This is a global singleton class used to represent a OpenCL context.
+class ContextImpl : public Context
+{
+public:
+    const cl_device_id clDeviceID;
+    cl_context clContext;
+    cl_command_queue clCmdQueue;
+    const DeviceInfo& deviceInfo;
+
+protected:
+    ContextImpl(const DeviceInfo& deviceInfo, cl_device_id clDeviceID)
+        : clDeviceID(clDeviceID), clContext(NULL), clCmdQueue(NULL), deviceInfo(deviceInfo)
+    {
+        // nothing
+    }
+    ~ContextImpl();
+public:
+    static void setContext(const DeviceInfo* deviceInfo);
+
+    bool supportsFeature(FEATURE_TYPE featureType) const;
+
+    static void cleanupContext(void);
+};
+
+static cv::Mutex currentContextMutex;
+static ContextImpl* currentContext = NULL;
+
+Context* Context::getContext()
+{
+    if (currentContext == NULL)
+    {
+        if (!__initialized || !__deviceSelected)
+        {
+            cv::AutoLock lock(__initializedMutex);
+            if (!__initialized)
+            {
+                if (initializeOpenCLDevices() == 0)
+                {
+                    CV_Error(CV_GpuNotSupported, "OpenCL not available");
+                }
+            }
+            if (!__deviceSelected)
+            {
+                if (!selectOpenCLDevice())
+                {
+                    CV_Error(CV_GpuNotSupported, "Can't select OpenCL device");
+                }
+            }
+        }
+        CV_Assert(currentContext != NULL);
+    }
+    return currentContext;
+}
+
+bool Context::supportsFeature(FEATURE_TYPE featureType) const
+{
+    return ((ContextImpl*)this)->supportsFeature(featureType);
+}
+
+const DeviceInfo& Context::getDeviceInfo() const
+{
+    return ((ContextImpl*)this)->deviceInfo;
+}
+
+const void* Context::getOpenCLContextPtr() const
+{
+    return &(((ContextImpl*)this)->clContext);
+}
+
+const void* Context::getOpenCLCommandQueuePtr() const
+{
+    return &(((ContextImpl*)this)->clCmdQueue);
+}
+
+const void* Context::getOpenCLDeviceIDPtr() const
+{
+    return &(((ContextImpl*)this)->clDeviceID);
+}
+
+
+bool ContextImpl::supportsFeature(FEATURE_TYPE featureType) const
+{
+    switch (featureType)
+    {
+    case FEATURE_CL_DOUBLE:
+        return deviceInfo.haveDoubleSupport;
+    case FEATURE_CL_UNIFIED_MEM:
+        return deviceInfo.isUnifiedMemory;
+    case FEATURE_CL_VER_1_2:
+        return deviceInfo.deviceVersionMajor > 1 || (deviceInfo.deviceVersionMajor == 1 && deviceInfo.deviceVersionMinor >= 2);
+    }
+    CV_Error(CV_StsBadArg, "Invalid feature type");
+    return false;
+}
+
+#if defined(WIN32)
+static bool __termination = false;
+#endif
+
+ContextImpl::~ContextImpl()
+{
+#ifdef WIN32
+    // if process is on termination stage (ExitProcess was called and other threads were terminated)
+    // then disable command queue release because it may cause program hang
+    if (!__termination)
+#endif
+    {
+        if(clCmdQueue)
+        {
+            openCLSafeCall(clReleaseCommandQueue(clCmdQueue)); // some cleanup problems are here
+        }
+
+        if(clContext)
+        {
+            openCLSafeCall(clReleaseContext(clContext));
+        }
+    }
+    clCmdQueue = NULL;
+    clContext = NULL;
+}
+
+void fft_teardown();
+void clBlasTeardown();
+
+void ContextImpl::cleanupContext(void)
+{
+    fft_teardown();
+    clBlasTeardown();
+
+    cv::AutoLock lock(currentContextMutex);
+    if (currentContext)
+        delete currentContext;
+    currentContext = NULL;
+}
+
+void ContextImpl::setContext(const DeviceInfo* deviceInfo)
+{
+    CV_Assert(deviceInfo->_id >= 0 && deviceInfo->_id < (int)global_devices.size());
+
+    {
+        cv::AutoLock lock(currentContextMutex);
+        if (currentContext)
+        {
+            if (currentContext->deviceInfo._id == deviceInfo->_id)
+                return;
+        }
+    }
+
+    DeviceInfoImpl& infoImpl = global_devices[deviceInfo->_id];
+    CV_Assert(deviceInfo == &infoImpl.info);
+
+    cl_int status = 0;
+    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(infoImpl.platform_id), 0 };
+    cl_context clContext = clCreateContext(cps, 1, &infoImpl.device_id, NULL, NULL, &status);
+    openCLVerifyCall(status);
+    // TODO add CL_QUEUE_PROFILING_ENABLE
+    cl_command_queue clCmdQueue = clCreateCommandQueue(clContext, infoImpl.device_id, 0, &status);
+    openCLVerifyCall(status);
+
+    ContextImpl* ctx = new ContextImpl(infoImpl.info, infoImpl.device_id);
+    ctx->clCmdQueue = clCmdQueue;
+    ctx->clContext = clContext;
+
+    ContextImpl* old = NULL;
+    {
+        cv::AutoLock lock(currentContextMutex);
+        old = currentContext;
+        currentContext = ctx;
+    }
+    if (old != NULL)
+    {
+        delete old;
+    }
+}
+
+int getOpenCLPlatforms(PlatformsInfo& platforms)
+{
+    if (!__initialized)
+        initializeOpenCLDevices();
+
+    platforms.clear();
+
+    for (size_t id = 0; id < global_platforms.size(); ++id)
+    {
+        PlatformInfoImpl& impl = global_platforms[id];
+        platforms.push_back(&impl.info);
+    }
+
+    return platforms.size();
+}
+
+int getOpenCLDevices(std::vector<const DeviceInfo*> &devices, int deviceType, const PlatformInfo* platform)
+{
+    if (!__initialized)
+        initializeOpenCLDevices();
+
+    devices.clear();
+
+    switch(deviceType)
+    {
+    case CVCL_DEVICE_TYPE_DEFAULT:
+    case CVCL_DEVICE_TYPE_CPU:
+    case CVCL_DEVICE_TYPE_GPU:
+    case CVCL_DEVICE_TYPE_ACCELERATOR:
+    case CVCL_DEVICE_TYPE_ALL:
+        break;
+    default:
+        return 0;
+    }
+
+    if (platform == NULL)
+    {
+        for (size_t id = 0; id < global_devices.size(); ++id)
+        {
+            DeviceInfoImpl& deviceInfo = global_devices[id];
+            if (((int)deviceInfo.info.deviceType & deviceType) != 0)
+            {
+                devices.push_back(&deviceInfo.info);
+            }
+        }
+    }
+    else
+    {
+        for (size_t id = 0; id < platform->devices.size(); ++id)
+        {
+            const DeviceInfo* deviceInfo = platform->devices[id];
+            if (((int)deviceInfo->deviceType & deviceType) == deviceType)
+            {
+                devices.push_back(deviceInfo);
+            }
+        }
+    }
+
+    return (int)devices.size();
+}
+
+void setDevice(const DeviceInfo* info)
+{
+    if (!__deviceSelected)
+        __deviceSelected = true;
+
+    ContextImpl::setContext(info);
+}
+
+bool supportsFeature(FEATURE_TYPE featureType)
+{
+    return Context::getContext()->supportsFeature(featureType);
+}
+
+struct __Module
+{
+    __Module() { /* moved to Context::getContext(): initializeOpenCLDevices(); */ }
+    ~__Module() { ContextImpl::cleanupContext(); }
+};
+static __Module __module;
+
+
+} // namespace ocl
+} // namespace cv
+
+
+#if defined(WIN32) && defined(CVAPI_EXPORTS)
+
+extern "C"
+BOOL WINAPI DllMain(HINSTANCE /*hInst*/, DWORD fdwReason, LPVOID lpReserved)
+{
+    if (fdwReason == DLL_PROCESS_DETACH)
+    {
+        if (lpReserved != NULL) // called after ExitProcess() call
+            cv::ocl::__termination = true;
+    }
+    return TRUE;
+}
+
+#endif
--- a/modules/ocl/src/cl_operations.cpp
+++ b/modules/ocl/src/cl_operations.cpp
@ -0,0 +1,409 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Guoping Long, longguoping@gmail.com
+//    Niko Li, newlife20080214@gmail.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+#include <fstream>
+#include "cl_programcache.hpp"
+
+//#define PRINT_KERNEL_RUN_TIME
+#define RUN_TIMES 100
+#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
+#define CL_MEM_USE_PERSISTENT_MEM_AMD 0
+#endif
+//#define AMD_DOUBLE_DIFFER
+
+namespace cv {
+namespace ocl {
+
+DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
+DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
+int gDevMemTypeValueMap[5] = {0,
+                              CL_MEM_ALLOC_HOST_PTR,
+                              CL_MEM_USE_HOST_PTR,
+                              CL_MEM_COPY_HOST_PTR,
+                              CL_MEM_USE_PERSISTENT_MEM_AMD};
+int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
+
+void finish()
+{
+    clFinish(getClCommandQueue(Context::getContext()));
+}
+
+bool isCpuDevice()
+{
+    const DeviceInfo& info = Context::getContext()->getDeviceInfo();
+    return (info.deviceType == CVCL_DEVICE_TYPE_CPU);
+}
+
+size_t queryWaveFrontSize(cl_kernel kernel)
+{
+    const DeviceInfo& info = Context::getContext()->getDeviceInfo();
+    if (info.deviceType == CVCL_DEVICE_TYPE_CPU)
+        return 1;
+    size_t wavefront = 0;
+    CV_Assert(kernel != NULL);
+    openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(Context::getContext()),
+            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wavefront, NULL));
+    return wavefront;
+}
+
+
+void openCLReadBuffer(Context *ctx, cl_mem dst_buffer, void *host_buffer, size_t size)
+{
+    cl_int status;
+    status = clEnqueueReadBuffer(getClCommandQueue(ctx), dst_buffer, CL_TRUE, 0,
+                                 size, host_buffer, 0, NULL, NULL);
+    openCLVerifyCall(status);
+}
+
+cl_mem openCLCreateBuffer(Context *ctx, size_t flag , size_t size)
+{
+    cl_int status;
+    cl_mem buffer = clCreateBuffer(getClContext(ctx), (cl_mem_flags)flag, size, NULL, &status);
+    openCLVerifyCall(status);
+    return buffer;
+}
+
+void openCLMallocPitch(Context *ctx, void **dev_ptr, size_t *pitch,
+                       size_t widthInBytes, size_t height)
+{
+    openCLMallocPitchEx(ctx, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
+}
+
+void openCLMallocPitchEx(Context *ctx, void **dev_ptr, size_t *pitch,
+                       size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
+{
+    cl_int status;
+    *dev_ptr = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
+                              widthInBytes * height, 0, &status);
+    openCLVerifyCall(status);
+    *pitch = widthInBytes;
+}
+
+void openCLMemcpy2D(Context *ctx, void *dst, size_t dpitch,
+                    const void *src, size_t spitch,
+                    size_t width, size_t height, openCLMemcpyKind kind, int channels)
+{
+    size_t buffer_origin[3] = {0, 0, 0};
+    size_t host_origin[3] = {0, 0, 0};
+    size_t region[3] = {width, height, 1};
+    if(kind == clMemcpyHostToDevice)
+    {
+        if(dpitch == width || channels == 3 || height == 1)
+        {
+            openCLSafeCall(clEnqueueWriteBuffer(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
+                                                0, width * height, src, 0, NULL, NULL));
+        }
+        else
+        {
+            openCLSafeCall(clEnqueueWriteBufferRect(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
+                                                    buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
+        }
+    }
+    else if(kind == clMemcpyDeviceToHost)
+    {
+        if(spitch == width || channels == 3 || height == 1)
+        {
+            openCLSafeCall(clEnqueueReadBuffer(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
+                                               0, width * height, dst, 0, NULL, NULL));
+        }
+        else
+        {
+            openCLSafeCall(clEnqueueReadBufferRect(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
+                                                   buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
+        }
+    }
+}
+
+void openCLCopyBuffer2D(Context *ctx, void *dst, size_t dpitch, int dst_offset,
+                        const void *src, size_t spitch,
+                        size_t width, size_t height, int src_offset)
+{
+    size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
+    size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
+    size_t region[3] = {width, height, 1};
+
+    openCLSafeCall(clEnqueueCopyBufferRect(getClCommandQueue(ctx), (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
+                                           region, spitch, 0, dpitch, 0, 0, 0, 0));
+}
+
+void openCLFree(void *devPtr)
+{
+    openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
+}
+
+cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName)
+{
+    return openCLGetKernelFromSource(ctx, source, kernelName, NULL);
+}
+
+cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
+                                    const char *build_options)
+{
+    cl_kernel kernel;
+    cl_int status = 0;
+    CV_Assert(ProgramCache::getProgramCache() != NULL);
+    cl_program program = ProgramCache::getProgramCache()->getProgram(ctx, source, build_options);
+    CV_Assert(program != NULL);
+    kernel = clCreateKernel(program, kernelName.c_str(), &status);
+    openCLVerifyCall(status);
+    openCLVerifyCall(clReleaseProgram(program));
+    return kernel;
+}
+
+void openCLVerifyKernel(const Context *ctx, cl_kernel kernel, size_t *localThreads)
+{
+    size_t kernelWorkGroupSize;
+    openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(ctx),
+                                            CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
+    CV_Assert( localThreads[0] <= ctx->getDeviceInfo().maxWorkItemSizes[0] );
+    CV_Assert( localThreads[1] <= ctx->getDeviceInfo().maxWorkItemSizes[1] );
+    CV_Assert( localThreads[2] <= ctx->getDeviceInfo().maxWorkItemSizes[2] );
+    CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
+    CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= ctx->getDeviceInfo().maxWorkGroupSize );
+}
+
+#ifdef PRINT_KERNEL_RUN_TIME
+static double total_execute_time = 0;
+static double total_kernel_time = 0;
+#endif
+void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
+                          size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
+                          int depth, const char *build_options)
+{
+    //construct kernel name
+    //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
+    //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
+    stringstream idxStr;
+    if(channels != -1)
+        idxStr << "_C" << channels;
+    if(depth != -1)
+        idxStr << "_D" << depth;
+    kernelName += idxStr.str();
+
+    cl_kernel kernel;
+    kernel = openCLGetKernelFromSource(ctx, source, kernelName, build_options);
+
+    if ( localThreads != NULL)
+    {
+        globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
+        globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
+        globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
+
+        cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
+    }
+    for(size_t i = 0; i < args.size(); i ++)
+        openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
+
+#ifndef PRINT_KERNEL_RUN_TIME
+    openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
+                                          localThreads, 0, NULL, NULL));
+#else
+    cl_event event = NULL;
+    openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
+                                          localThreads, 0, NULL, &event));
+
+    cl_ulong start_time, end_time, queue_time;
+    double execute_time = 0;
+    double total_time   = 0;
+
+    openCLSafeCall(clWaitForEvents(1, &event));
+    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
+                                           sizeof(cl_ulong), &start_time, 0));
+
+    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
+                                           sizeof(cl_ulong), &end_time, 0));
+
+    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
+                                           sizeof(cl_ulong), &queue_time, 0));
+
+    execute_time = (double)(end_time - start_time) / (1000 * 1000);
+    total_time = (double)(end_time - queue_time) / (1000 * 1000);
+
+    total_execute_time += execute_time;
+    total_kernel_time += total_time;
+    clReleaseEvent(event);
+#endif
+
+    clFlush(getClCommandQueue(ctx));
+    openCLSafeCall(clReleaseKernel(kernel));
+}
+
+void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
+                         size_t globalThreads[3], size_t localThreads[3],
+                         vector< pair<size_t, const void *> > &args, int channels, int depth)
+{
+    openCLExecuteKernel(ctx, source, kernelName, globalThreads, localThreads, args,
+                        channels, depth, NULL);
+}
+void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
+                         size_t globalThreads[3], size_t localThreads[3],
+                         vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
+
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+    openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
+                         build_options);
+#else
+    string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
+    cout << endl;
+    cout << "Function Name: " << kernelName;
+    if(depth >= 0)
+        cout << " |data type: " << data_type[depth];
+    cout << " |channels: " << channels;
+    cout << " |Time Unit: " << "ms" << endl;
+
+    total_execute_time = 0;
+    total_kernel_time = 0;
+    cout << "-------------------------------------" << endl;
+
+    cout << setiosflags(ios::left) << setw(15) << "excute time";
+    cout << setiosflags(ios::left) << setw(15) << "lauch time";
+    cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
+    int i = 0;
+    for(i = 0; i < RUN_TIMES; i++)
+        openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
+                             build_options);
+
+    cout << "average kernel excute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
+    cout << "average kernel total time:  " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
+#endif
+}
+
+double openCLExecuteKernelInterop(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
+                         size_t globalThreads[3], size_t localThreads[3],
+                         vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
+                         bool finish, bool measureKernelTime, bool cleanUp)
+
+{
+    //construct kernel name
+    //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
+    //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
+    stringstream idxStr;
+    if(channels != -1)
+        idxStr << "_C" << channels;
+    if(depth != -1)
+        idxStr << "_D" << depth;
+    kernelName += idxStr.str();
+
+    cl_kernel kernel;
+    kernel = openCLGetKernelFromSource(ctx, source, kernelName, build_options);
+
+    double kernelTime = 0.0;
+
+    if( globalThreads != NULL)
+    {
+        if ( localThreads != NULL)
+        {
+            globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
+            globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
+            globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
+
+            //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
+            cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
+        }
+        for(size_t i = 0; i < args.size(); i ++)
+            openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
+
+        if(measureKernelTime == false)
+        {
+            openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
+                            localThreads, 0, NULL, NULL));
+        }
+        else
+        {
+            cl_event event = NULL;
+            openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
+                            localThreads, 0, NULL, &event));
+
+            cl_ulong end_time, queue_time;
+
+            openCLSafeCall(clWaitForEvents(1, &event));
+
+            openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
+                            sizeof(cl_ulong), &end_time, 0));
+
+            openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
+                            sizeof(cl_ulong), &queue_time, 0));
+
+            kernelTime = (double)(end_time - queue_time) / (1000 * 1000);
+
+            clReleaseEvent(event);
+        }
+    }
+
+    if(finish)
+    {
+        clFinish(getClCommandQueue(ctx));
+    }
+
+    if(cleanUp)
+    {
+        openCLSafeCall(clReleaseKernel(kernel));
+    }
+
+    return kernelTime;
+}
+
+cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
+                     const size_t size)
+{
+    int status;
+    cl_mem con_struct;
+
+    con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
+    openCLSafeCall(status);
+
+    openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
+                                        value, 0, 0, 0));
+
+    return con_struct;
+}
+
+}//namespace ocl
+}//namespace cv
--- a/modules/ocl/src/cl_programcache.cpp
+++ b/modules/ocl/src/cl_programcache.cpp
@ -0,0 +1,530 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Guoping Long, longguoping@gmail.com
+//    Niko Li, newlife20080214@gmail.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+#include <fstream>
+#include "cl_programcache.hpp"
+
+// workaround for OpenCL C++ bindings
+#if defined(HAVE_OPENCL12)
+#include "opencv2/ocl/cl_runtime/cl_runtime_opencl12_wrappers.hpp"
+#elif defined(HAVE_OPENCL11)
+#include "opencv2/ocl/cl_runtime/cl_runtime_opencl11_wrappers.hpp"
+#else
+#error Invalid OpenCL configuration
+#endif
+
+#if defined _MSC_VER && _MSC_VER >= 1200
+#  pragma warning( disable: 4100 4244 4267 4510 4512 4610)
+#endif
+#undef __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
+
+namespace cv { namespace ocl {
+
+#define MAX_PROG_CACHE_SIZE 1024
+/*
+ * The binary caching system to eliminate redundant program source compilation.
+ * Strictly, this is not a cache because we do not implement evictions right now.
+ * We shall add such features to trade-off memory consumption and performance when necessary.
+ */
+
+cv::Mutex ProgramCache::mutexFiles;
+cv::Mutex ProgramCache::mutexCache;
+
+std::auto_ptr<ProgramCache> _programCache;
+ProgramCache* ProgramCache::getProgramCache()
+{
+    if (NULL == _programCache.get())
+        _programCache.reset(new ProgramCache());
+    return _programCache.get();
+}
+
+ProgramCache::ProgramCache()
+{
+    codeCache.clear();
+    cacheSize = 0;
+}
+
+ProgramCache::~ProgramCache()
+{
+    releaseProgram();
+}
+
+cl_program ProgramCache::progLookup(const string& srcsign)
+{
+    map<string, cl_program>::iterator iter;
+    iter = codeCache.find(srcsign);
+    if(iter != codeCache.end())
+        return iter->second;
+    else
+        return NULL;
+}
+
+void ProgramCache::addProgram(const string& srcsign, cl_program program)
+{
+    if (!progLookup(srcsign))
+    {
+        clRetainProgram(program);
+        codeCache.insert(map<string, cl_program>::value_type(srcsign, program));
+    }
+}
+
+void ProgramCache::releaseProgram()
+{
+    map<string, cl_program>::iterator iter;
+    for(iter = codeCache.begin(); iter != codeCache.end(); iter++)
+    {
+        openCLSafeCall(clReleaseProgram(iter->second));
+    }
+    codeCache.clear();
+    cacheSize = 0;
+}
+
+static int enable_disk_cache = true ||
+#ifdef _DEBUG
+        false;
+#else
+        true;
+#endif
+static String binpath = "";
+
+void setBinaryDiskCache(int mode, String path)
+{
+    enable_disk_cache = 0;
+    binpath = "";
+
+    if(mode == CACHE_NONE)
+    {
+        return;
+    }
+    enable_disk_cache =
+#ifdef _DEBUG
+        (mode & CACHE_DEBUG)   == CACHE_DEBUG;
+#else
+        (mode & CACHE_RELEASE) == CACHE_RELEASE;
+#endif
+    if(enable_disk_cache && !path.empty())
+    {
+        binpath = path;
+    }
+}
+
+void setBinaryPath(const char *path)
+{
+    binpath = path;
+}
+
+static const int MAX_ENTRIES = 64;
+
+struct ProgramFileCache
+{
+    struct CV_DECL_ALIGNED(1) ProgramFileHeader
+    {
+        int hashLength;
+        //char hash[];
+    };
+
+    struct CV_DECL_ALIGNED(1) ProgramFileTable
+    {
+        int numberOfEntries;
+        //int firstEntryOffset[];
+    };
+
+    struct CV_DECL_ALIGNED(1) ProgramFileConfigurationEntry
+    {
+        int nextEntry;
+        int dataSize;
+        int optionsLength;
+        //char options[];
+        // char data[];
+    };
+
+    string fileName_;
+    const char* hash_;
+    std::fstream f;
+
+    ProgramFileCache(const string& fileName, const char* hash)
+        : fileName_(fileName), hash_(hash)
+    {
+        if (hash_ != NULL)
+        {
+            f.open(fileName_.c_str(), ios::in|ios::out|ios::binary);
+            if(f.is_open())
+            {
+                int hashLength = 0;
+                f.read((char*)&hashLength, sizeof(int));
+                std::vector<char> fhash(hashLength + 1);
+                f.read(&fhash[0], hashLength);
+                if (f.eof() || strncmp(hash_, &fhash[0], hashLength) != 0)
+                {
+                    f.close();
+                    remove(fileName_.c_str());
+                    return;
+                }
+            }
+        }
+    }
+
+    int getHash(const string& options)
+    {
+        int hash = 0;
+        for (size_t i = 0; i < options.length(); i++)
+        {
+            hash = (hash << 2) ^ (hash >> 17) ^ options[i];
+        }
+        return (hash + (hash >> 16)) & (MAX_ENTRIES - 1);
+    }
+
+    bool readConfigurationFromFile(const string& options, std::vector<char>& buf)
+    {
+        if (hash_ == NULL)
+            return false;
+
+        if (!f.is_open())
+            return false;
+
+        f.seekg(0, std::fstream::end);
+        size_t fileSize = (size_t)f.tellg();
+        if (fileSize == 0)
+        {
+            std::cerr << "Invalid file (empty): " << fileName_ << std::endl;
+            f.close();
+            remove(fileName_.c_str());
+            return false;
+        }
+        f.seekg(0, std::fstream::beg);
+
+        int hashLength = 0;
+        f.read((char*)&hashLength, sizeof(int));
+        CV_Assert(hashLength > 0);
+        f.seekg(sizeof(hashLength) + hashLength, std::fstream::beg);
+
+        int numberOfEntries = 0;
+        f.read((char*)&numberOfEntries, sizeof(int));
+        CV_Assert(numberOfEntries > 0);
+        if (numberOfEntries != MAX_ENTRIES)
+        {
+            std::cerr << "Invalid file: " << fileName_ << std::endl;
+            f.close();
+            remove(fileName_.c_str());
+            return false;
+        }
+
+        std::vector<int> firstEntryOffset(numberOfEntries);
+        f.read((char*)&firstEntryOffset[0], sizeof(int)*numberOfEntries);
+
+        int entryNum = getHash(options);
+
+        int entryOffset = firstEntryOffset[entryNum];
+        ProgramFileConfigurationEntry entry;
+        while (entryOffset > 0)
+        {
+            f.seekg(entryOffset, std::fstream::beg);
+            assert(sizeof(entry) == sizeof(int)*3);
+            f.read((char*)&entry, sizeof(entry));
+            std::vector<char> foptions(entry.optionsLength);
+            if ((int)options.length() == entry.optionsLength)
+            {
+                if (entry.optionsLength > 0)
+                    f.read(&foptions[0], entry.optionsLength);
+                if (memcmp(&foptions[0], options.c_str(), entry.optionsLength) == 0)
+                {
+                    buf.resize(entry.dataSize);
+                    f.read(&buf[0], entry.dataSize);
+                    f.seekg(0, std::fstream::beg);
+                    return true;
+                }
+            }
+            if (entry.nextEntry <= 0)
+                break;
+            entryOffset = entry.nextEntry;
+        }
+        return false;
+    }
+
+    bool writeConfigurationToFile(const string& options, std::vector<char>& buf)
+    {
+        if (hash_ == NULL)
+            return true; // don't save dynamic kernels
+
+        if (!f.is_open())
+        {
+            f.open(fileName_.c_str(), ios::in|ios::out|ios::binary);
+            if (!f.is_open())
+            {
+                f.open(fileName_.c_str(), ios::out|ios::binary);
+                if (!f.is_open())
+                    return false;
+            }
+        }
+
+        f.seekg(0, std::fstream::end);
+        size_t fileSize = (size_t)f.tellg();
+        if (fileSize == 0)
+        {
+            f.seekp(0, std::fstream::beg);
+            int hashLength = strlen(hash_);
+            f.write((char*)&hashLength, sizeof(int));
+            f.write(hash_, hashLength);
+
+            int numberOfEntries = MAX_ENTRIES;
+            f.write((char*)&numberOfEntries, sizeof(int));
+            std::vector<int> firstEntryOffset(MAX_ENTRIES, 0);
+            f.write((char*)&firstEntryOffset[0], sizeof(int)*numberOfEntries);
+            f.close();
+            f.open(fileName_.c_str(), ios::in|ios::out|ios::binary);
+            CV_Assert(f.is_open());
+            f.seekg(0, std::fstream::end);
+            fileSize = (size_t)f.tellg();
+        }
+        f.seekg(0, std::fstream::beg);
+
+        int hashLength = 0;
+        f.read((char*)&hashLength, sizeof(int));
+        CV_Assert(hashLength > 0);
+        f.seekg(sizeof(hashLength) + hashLength, std::fstream::beg);
+
+        int numberOfEntries = 0;
+        f.read((char*)&numberOfEntries, sizeof(int));
+        CV_Assert(numberOfEntries > 0);
+        if (numberOfEntries != MAX_ENTRIES)
+        {
+            std::cerr << "Invalid file: " << fileName_ << std::endl;
+            f.close();
+            remove(fileName_.c_str());
+            return false;
+        }
+
+        size_t tableEntriesOffset = (size_t)f.tellg();
+        std::vector<int> firstEntryOffset(numberOfEntries);
+        f.read((char*)&firstEntryOffset[0], sizeof(int)*numberOfEntries);
+
+        int entryNum = getHash(options);
+
+        int entryOffset = firstEntryOffset[entryNum];
+        ProgramFileConfigurationEntry entry;
+        while (entryOffset > 0)
+        {
+            f.seekg(entryOffset, std::fstream::beg);
+            assert(sizeof(entry) == sizeof(int)*3);
+            f.read((char*)&entry, sizeof(entry));
+            std::vector<char> foptions(entry.optionsLength);
+            if ((int)options.length() == entry.optionsLength)
+            {
+                if (entry.optionsLength > 0)
+                    f.read(&foptions[0], entry.optionsLength);
+                CV_Assert(memcmp(&foptions, options.c_str(), entry.optionsLength) != 0);
+            }
+            if (entry.nextEntry <= 0)
+                break;
+            entryOffset = entry.nextEntry;
+        }
+        if (entryOffset > 0)
+        {
+            f.seekp(entryOffset, std::fstream::beg);
+            entry.nextEntry = fileSize;
+            f.write((char*)&entry, sizeof(entry));
+        }
+        else
+        {
+            firstEntryOffset[entryNum] = fileSize;
+            f.seekp(tableEntriesOffset, std::fstream::beg);
+            f.write((char*)&firstEntryOffset[0], sizeof(int)*numberOfEntries);
+        }
+        f.seekp(fileSize, std::fstream::beg);
+        entry.nextEntry = 0;
+        entry.dataSize = buf.size();
+        entry.optionsLength = options.length();
+        f.write((char*)&entry, sizeof(entry));
+        f.write(options.c_str(), entry.optionsLength);
+        f.write(&buf[0], entry.dataSize);
+        return true;
+    }
+
+    cl_program getOrBuildProgram(const Context* ctx, const cv::ocl::ProgramEntry* source, const string& options)
+    {
+        cl_int status = 0;
+        cl_program program = NULL;
+        std::vector<char> binary;
+        if (!enable_disk_cache || !readConfigurationFromFile(options, binary))
+        {
+            program = clCreateProgramWithSource(getClContext(ctx), 1, (const char**)&source->programStr, NULL, &status);
+            openCLVerifyCall(status);
+            cl_device_id device = getClDeviceID(ctx);
+            status = clBuildProgram(program, 1, &device, options.c_str(), NULL, NULL);
+            if(status == CL_SUCCESS)
+            {
+                if (enable_disk_cache)
+                {
+                    size_t binarySize;
+                    openCLSafeCall(clGetProgramInfo(program,
+                                            CL_PROGRAM_BINARY_SIZES,
+                                            sizeof(size_t),
+                                            &binarySize, NULL));
+
+                    std::vector<char> binary(binarySize);
+
+                    char* ptr = &binary[0];
+                    openCLSafeCall(clGetProgramInfo(program,
+                                            CL_PROGRAM_BINARIES,
+                                            sizeof(char*),
+                                            &ptr,
+                                            NULL));
+
+                    if (!writeConfigurationToFile(options, binary))
+                    {
+                        std::cerr << "Can't write data to file: " << fileName_ << std::endl;
+                    }
+                }
+            }
+        }
+        else
+        {
+            cl_device_id device = getClDeviceID(ctx);
+            size_t size = binary.size();
+            const char* ptr = &binary[0];
+            program = clCreateProgramWithBinary(getClContext(ctx),
+                    1, &device,
+                    (const size_t *)&size, (const unsigned char **)&ptr,
+                    NULL, &status);
+            openCLVerifyCall(status);
+            status = clBuildProgram(program, 1, &device, options.c_str(), NULL, NULL);
+        }
+
+        if(status != CL_SUCCESS)
+        {
+            if(status == CL_BUILD_PROGRAM_FAILURE)
+            {
+                cl_int logStatus;
+                char *buildLog = NULL;
+                size_t buildLogSize = 0;
+                logStatus = clGetProgramBuildInfo(program,
+                        getClDeviceID(ctx), CL_PROGRAM_BUILD_LOG, buildLogSize,
+                        buildLog, &buildLogSize);
+                if(logStatus != CL_SUCCESS)
+                    std::cout << "Failed to build the program and get the build info." << endl;
+                buildLog = new char[buildLogSize];
+                CV_DbgAssert(!!buildLog);
+                memset(buildLog, 0, buildLogSize);
+                openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx),
+                                                     CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
+                std::cout << "\nBUILD LOG: " << options << "\n";
+                std::cout << buildLog << endl;
+                delete [] buildLog;
+            }
+            openCLVerifyCall(status);
+        }
+        return program;
+    }
+};
+
+cl_program ProgramCache::getProgram(const Context *ctx, const cv::ocl::ProgramEntry* source,
+                                    const char *build_options)
+{
+    stringstream src_sign;
+
+    src_sign << (int64)(source->programStr);
+    src_sign << getClContext(ctx);
+    if (NULL != build_options)
+    {
+        src_sign << "_" << build_options;
+    }
+
+    {
+        cv::AutoLock lockCache(mutexCache);
+        cl_program program = ProgramCache::getProgramCache()->progLookup(src_sign.str());
+        if (!!program)
+        {
+            clRetainProgram(program);
+            return program;
+        }
+    }
+
+    cv::AutoLock lockCache(mutexFiles);
+
+    // second check
+    {
+        cv::AutoLock lockCache(mutexCache);
+        cl_program program = ProgramCache::getProgramCache()->progLookup(src_sign.str());
+        if (!!program)
+        {
+            clRetainProgram(program);
+            return program;
+        }
+    }
+
+    string all_build_options;
+    if (!ctx->getDeviceInfo().compilationExtraOptions.empty())
+        all_build_options += ctx->getDeviceInfo().compilationExtraOptions;
+    if (build_options != NULL)
+    {
+        all_build_options += " ";
+        all_build_options += build_options;
+    }
+    const DeviceInfo& devInfo = ctx->getDeviceInfo();
+    string filename = binpath + (source->name ? source->name : "NULL") + "_" + devInfo.platform->platformName + "_" + devInfo.deviceName + ".clb";
+
+    ProgramFileCache programFileCache(filename, source->programHash);
+    cl_program program = programFileCache.getOrBuildProgram(ctx, source, all_build_options);
+
+    //Cache the binary for future use if build_options is null
+    if( (this->cacheSize += 1) < MAX_PROG_CACHE_SIZE)
+    {
+        cv::AutoLock lockCache(mutexCache);
+        this->addProgram(src_sign.str(), program);
+    }
+    else
+    {
+        cout << "Warning: code cache has been full.\n";
+    }
+    return program;
+}
+
+} // namespace ocl
+} // namespace cv
--- a/modules/ocl/src/cl_programcache.hpp
+++ b/modules/ocl/src/cl_programcache.hpp
@ -44,47 +44,43 @@

 #include "precomp.hpp"

-using namespace cv;
-using namespace cv::ocl;
-using namespace std;
-using std::cout;
-using std::endl;
+namespace cv {
+namespace ocl {

-namespace cv
+class ProgramCache
 {
-    namespace ocl
-    {
-        class ProgramCache
-        {
-        protected:
-            ProgramCache();
-            friend class auto_ptr<ProgramCache>;
-            static auto_ptr<ProgramCache> programCache;
+protected:
+    ProgramCache();
+    ~ProgramCache();
+    friend class std::auto_ptr<ProgramCache>;
+public:
+    static ProgramCache *getProgramCache();

-        public:
-            ~ProgramCache();
-            static ProgramCache *getProgramCache()
-            {
-                if( NULL == programCache.get())
-                    programCache.reset(new ProgramCache());
-                return programCache.get();
-            }
+    cl_program getProgram(const Context *ctx, const cv::ocl::ProgramEntry* source,
+                          const char *build_options);

-            //lookup the binary given the file name
-            cl_program progLookup(string srcsign);
+    void releaseProgram();
+protected:
+    //lookup the binary given the file name
+    // (with acquired mutexCache)
+    cl_program progLookup(const string& srcsign);

-            //add program to the cache
-            void addProgram(string srcsign, cl_program program);
-            void releaseProgram();
+    //add program to the cache
+    // (with acquired mutexCache)
+    void addProgram(const string& srcsign, cl_program program);

-            map <string, cl_program> codeCache;
-            unsigned int cacheSize;
-            //The presumed watermark for the cache volume (256MB). Is it enough?
-            //We may need more delicate algorithms when necessary later.
-            //Right now, let's just leave it along.
-            static const unsigned MAX_PROG_CACHE_SIZE = 1024;
-        };
+    map <string, cl_program> codeCache;
+    unsigned int cacheSize;

-    }//namespace ocl
+    //The presumed watermark for the cache volume (256MB). Is it enough?
+    //We may need more delicate algorithms when necessary later.
+    //Right now, let's just leave it along.
+    static const unsigned MAX_PROG_CACHE_SIZE = 1024;

+    // acquire both mutexes in this order: 1) mutexFiles 2) mutexCache
+    static cv::Mutex mutexFiles;
+    static cv::Mutex mutexCache;
+};
+
+}//namespace ocl
 }//namespace cv
--- a/modules/ocl/src/cl_runtime/generator/common.py
+++ b/modules/ocl/src/cl_runtime/generator/common.py
@ -182,6 +182,29 @@ def generateTemplates(sz, lprefix, switch_name, calling_convention=''):
        print '};'
        print ''

+@outputToString
+def generateInlineWrappers(fns):
+    print '// generated by %s' % os.path.basename(sys.argv[0])
+    for fn in fns:
+        print '#undef %s' % (fn['name'])
+        print '#define %s %s_fn' % (fn['name'], fn['name'])
+        params = []
+        call_params = []
+        for i in range(0, len(fn['params'])):
+            t = fn['params'][i]
+            if t.find('*)') >= 0:
+                p = re.sub(r'\*\)', (' *p%d)' % i), t, 1)
+                params.append(p)
+            else:
+                params.append('%s p%d' % (t, i))
+            call_params.append('p%d' % (i))
+
+        if len(fn['ret']) == 1 and fn['ret'][0] == 'void':
+            print 'inline void %s(%s) { %s_pfn(%s); }' \
+                    % (fn['name'], ', '.join(params), fn['name'], ', '.join(call_params))
+        else:
+            print 'inline %s %s(%s) { return %s_pfn(%s); }' \
+                    % (' '.join(fn['ret']), fn['name'], ', '.join(params), fn['name'], ', '.join(call_params))

 def ProcessTemplate(inputFile, ctx, noteLine='//\n// AUTOGENERATED, DO NOT EDIT\n//'):
    f = open(inputFile, "r")
--- a/modules/ocl/src/cl_runtime/generator/parser_cl.py
+++ b/modules/ocl/src/cl_runtime/generator/parser_cl.py
@ -10,6 +10,7 @@ try:
    if len(sys.argv) > 1:
        outfile = open('../../../include/opencv2/ocl/cl_runtime/' + sys.argv[1] + '.hpp', "w")
        outfile_impl = open('../' + sys.argv[1] + '_impl.hpp', "w")
+        outfile_wrappers = open('../../../include/opencv2/ocl/cl_runtime/' + sys.argv[1] + '_wrappers.hpp', "w")
        if len(sys.argv) > 2:
            f = open(sys.argv[2], "r")
        else:
@ -102,6 +103,11 @@ ctx['CL_FN_DECLARATIONS'] = generateFnDeclaration(fns)
 sys.stdout = outfile
 ProcessTemplate('template/cl_runtime_opencl.hpp.in', ctx)

+ctx['CL_FN_INLINE_WRAPPERS'] = generateInlineWrappers(fns)
+
+sys.stdout = outfile_wrappers
+ProcessTemplate('template/cl_runtime_opencl_wrappers.hpp.in', ctx)
+
 ctx['CL_FN_ENUMS'] = generateEnums(fns)
 ctx['CL_FN_NAMES'] = generateNames(fns)
 ctx['CL_FN_DEFINITIONS'] = generateFnDefinition(fns)
--- a/modules/ocl/src/cl_runtime/generator/template/cl_runtime_opencl_wrappers.hpp.in
+++ b/modules/ocl/src/cl_runtime/generator/template/cl_runtime_opencl_wrappers.hpp.in
@ -0,0 +1,6 @@
+#ifndef __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
+#define __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
+
+@CL_FN_INLINE_WRAPPERS@
+
+#endif // __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
--- a/modules/ocl/src/color.cpp
+++ b/modules/ocl/src/color.cpp
@ -45,6 +45,7 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
@ -57,14 +58,6 @@ using namespace cv::ocl;
 #define FLT_EPSILON     1.192092896e-07F
 #endif

-namespace cv
-{
-namespace ocl
-{
-extern const char *cvt_color;
-}
-}
-
 namespace
 {
 void RGB2Gray_caller(const oclMat &src, oclMat &dst, int bidx)
--- a/modules/ocl/src/columnsum.cpp
+++ b/modules/ocl/src/columnsum.cpp
@ -43,20 +43,11 @@
 //
 //M*/

-#include <iomanip>
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
-
-namespace cv
-{
-    namespace ocl
-    {
-        extern const char *imgproc_columnsum;
-    }
-}

 void cv::ocl::columnSum(const oclMat &src, oclMat &dst)
 {
--- a/modules/ocl/src/error.cpp
+++ b/modules/ocl/src/error.cpp
@ -152,19 +152,19 @@ namespace cv
            case CL_INVALID_GLOBAL_WORK_SIZE:
                return "CL_INVALID_GLOBAL_WORK_SIZE";
                //case CL_INVALID_PROPERTY:
-                //	return "CL_INVALID_PROPERTY";
+                //    return "CL_INVALID_PROPERTY";
                //case CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR:
-                //	return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
+                //    return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
                //case CL_PLATFORM_NOT_FOUND_KHR:
-                //	return "CL_PLATFORM_NOT_FOUND_KHR";
-                //	//case CL_INVALID_PROPERTY_EXT:
-                //	//    return "CL_INVALID_PROPERTY_EXT";
+                //    return "CL_PLATFORM_NOT_FOUND_KHR";
+                //    //case CL_INVALID_PROPERTY_EXT:
+                //    //    return "CL_INVALID_PROPERTY_EXT";
                //case CL_DEVICE_PARTITION_FAILED_EXT:
-                //	return "CL_DEVICE_PARTITION_FAILED_EXT";
+                //    return "CL_DEVICE_PARTITION_FAILED_EXT";
                //case CL_INVALID_PARTITION_COUNT_EXT:
-                //	return "CL_INVALID_PARTITION_COUNT_EXT";
+                //    return "CL_INVALID_PARTITION_COUNT_EXT";
                //default:
-                //	return "unknown error code";
+                //    return "unknown error code";
            }
            static char buf[256];
            sprintf(buf, "%d", err);
--- a/modules/ocl/src/fft.cpp
+++ b/modules/ocl/src/fft.cpp
@ -42,12 +42,10 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#include <iomanip>
 #include "precomp.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;

 #if !defined HAVE_CLAMDFFT
 void cv::ocl::dft(const oclMat&, oclMat&, Size, int)
@ -156,25 +154,25 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
 {
    fft_setup();

-    bool is_1d_input	= (_dft_size.height == 1);
-    int is_row_dft		= flags & DFT_ROWS;
+    bool is_1d_input    = (_dft_size.height == 1);
+    int is_row_dft        = flags & DFT_ROWS;
    int is_scaled_dft   = flags & DFT_SCALE;
-    int is_inverse		= flags & DFT_INVERSE;
+    int is_inverse        = flags & DFT_INVERSE;

-    //clAmdFftResultLocation	place;
-    clAmdFftLayout			inLayout;
-    clAmdFftLayout			outLayout;
-    clAmdFftDim				dim = is_1d_input || is_row_dft ? CLFFT_1D : CLFFT_2D;
+    //clAmdFftResultLocation    place;
+    clAmdFftLayout            inLayout;
+    clAmdFftLayout            outLayout;
+    clAmdFftDim                dim = is_1d_input || is_row_dft ? CLFFT_1D : CLFFT_2D;

-    size_t batchSize		 = is_row_dft ? dft_size.height : 1;
+    size_t batchSize         = is_row_dft ? dft_size.height : 1;
    size_t clLengthsIn[ 3 ]  = {1, 1, 1};
    size_t clStridesIn[ 3 ]  = {1, 1, 1};
    //size_t clLengthsOut[ 3 ] = {1, 1, 1};
    size_t clStridesOut[ 3 ] = {1, 1, 1};
-    clLengthsIn[0]			 = dft_size.width;
-    clLengthsIn[1]			 = is_row_dft ? 1 : dft_size.height;
-    clStridesIn[0]			 = 1;
-    clStridesOut[0]			 = 1;
+    clLengthsIn[0]             = dft_size.width;
+    clLengthsIn[1]             = is_row_dft ? 1 : dft_size.height;
+    clStridesIn[0]             = 1;
+    clStridesOut[0]             = 1;

    switch(_type)
    {
@ -206,7 +204,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
    clStridesIn[2]  = is_row_dft ? clStridesIn[1]  : dft_size.width * clStridesIn[1];
    clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];

-    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getoclContext(), dim, clLengthsIn ) );
+    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getClContextPtr(), dim, clLengthsIn ) );

    openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
    openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
@ -220,7 +218,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
    openCLSafeCall( clAmdFftSetPlanScale  ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) );

    //ready to bake
-    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getoclCommandQueue(), NULL, NULL ) );
+    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getClCommandQueuePtr(), NULL, NULL ) );
 }
 cv::ocl::FftPlan::~FftPlan()
 {
@ -296,12 +294,12 @@ void cv::ocl::dft(const oclMat &src, oclMat &dst, Size dft_size, int flags)
    // similar assertions with cuda module
    CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);

-    //bool is_1d_input	= (src.rows == 1);
-    //int is_row_dft		= flags & DFT_ROWS;
-    //int is_scaled_dft		= flags & DFT_SCALE;
-    int is_inverse			= flags & DFT_INVERSE;
-    bool is_complex_input	= src.channels() == 2;
-    bool is_complex_output	= !(flags & DFT_REAL_OUTPUT);
+    //bool is_1d_input    = (src.rows == 1);
+    //int is_row_dft        = flags & DFT_ROWS;
+    //int is_scaled_dft        = flags & DFT_SCALE;
+    int is_inverse = flags & DFT_INVERSE;
+    bool is_complex_input = src.channels() == 2;
+    bool is_complex_output = !(flags & DFT_REAL_OUTPUT);


    // We don't support real-to-real transform
@ -338,10 +336,10 @@ void cv::ocl::dft(const oclMat &src, oclMat &dst, Size dft_size, int flags)
    if (buffersize)
    {
        cl_int medstatus;
-        clMedBuffer = clCreateBuffer ( (cl_context)src.clCxt->oclContext(), CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+        clMedBuffer = clCreateBuffer ( *(cl_context*)(src.clCxt->getOpenCLContextPtr()), CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
        openCLSafeCall( medstatus );
    }
-    cl_command_queue clq = (cl_command_queue)src.clCxt->oclCommandQueue();
+    cl_command_queue clq = *(cl_command_queue*)(src.clCxt->getOpenCLCommandQueuePtr());
    openCLSafeCall( clAmdFftEnqueueTransform( plHandle,
        is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD,
        1,
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@ -48,26 +48,11 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

-using namespace std;
 using namespace cv;
 using namespace cv::ocl;

-//helper routines
-namespace cv
-{
-namespace ocl
-{
-///////////////////////////OpenCL kernel strings///////////////////////////
-extern const char *filtering_boxFilter;
-extern const char *filter_sep_row;
-extern const char *filter_sep_col;
-extern const char *filtering_laplacian;
-extern const char *filtering_morph;
-extern const char *filtering_adaptive_bilateral;
-}
-}
-
 namespace
 {
 inline void normalizeAnchor(int &anchor, int ksize)
@ -1430,7 +1415,7 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,

 void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
        return;
--- a/modules/ocl/src/gemm.cpp
+++ b/modules/ocl/src/gemm.cpp
@ -43,7 +43,6 @@
 //
 //M*/

-#include <iomanip>
 #include "precomp.hpp"

 namespace cv { namespace ocl {
@ -134,7 +133,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
    int offb    = src2.offset;
    int offc    = dst.offset;

-    cl_command_queue clq = (cl_command_queue)src1.clCxt->oclCommandQueue();
+    cl_command_queue clq = *(cl_command_queue*)src1.clCxt->getOpenCLCommandQueuePtr();
    switch(src1.type())
    {
    case CV_32FC1:
--- a/modules/ocl/src/gftt.cpp
+++ b/modules/ocl/src/gftt.cpp
@ -42,23 +42,14 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#include <iomanip>
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;

 static bool use_cpu_sorter = true;

-namespace cv
-{
-    namespace ocl
-    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *imgproc_gftt;
-    }
-}
-
 namespace
 {
 enum SortMethod
@ -338,7 +329,7 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &poin
    CV_DbgAssert(points.type() == CV_32FC2);
    points_v.resize(points.cols);
    openCLSafeCall(clEnqueueReadBuffer(
-        *reinterpret_cast<cl_command_queue*>(getoclCommandQueue()),
+        *(cl_command_queue*)getClCommandQueuePtr(),
        reinterpret_cast<cl_mem>(points.data),
        CL_TRUE,
        0,
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@ -49,24 +49,10 @@
 //M*/

 #include "precomp.hpp"
-#include <stdio.h>
-#include <string>
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
-
-
-namespace cv
-{
-namespace ocl
-{
-///////////////////////////OpenCL kernel strings///////////////////////////
-extern const char *haarobjectdetect;
-extern const char *haarobjectdetectbackup;
-extern const char *haarobjectdetect_scaled2;
-}
-}

 /* these settings affect the quality of detection: change with care */
 #define CV_ADJUST_FEATURES 1
@ -745,7 +731,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
    if( gimg.cols < minSize.width || gimg.rows < minSize.height )
        CV_Error(CV_StsError, "Image too small");

-    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    cl_command_queue qu = getClCommandQueue(Context::getContext());
    if( (flags & CV_HAAR_SCALE_IMAGE) )
    {
        CvSize winSize0 = cascade->orig_window_size;
@ -788,7 +774,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS

        size_t blocksize = 8;
        size_t localThreads[3] = { blocksize, blocksize , 1 };
-        size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->computeUnits()) *localThreads[0],
+        size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->getDeviceInfo().maxComputeUnits) *localThreads[0],
                                    localThreads[1], 1
                                  };
        int outputsz = 256 * globalThreads[0] / localThreads[0];
@ -949,7 +935,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        int grp_per_CU = 12;
        size_t blocksize = 8;
        size_t localThreads[3] = { blocksize, blocksize , 1 };
-        size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0],
+        size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->getDeviceInfo().maxComputeUnits *localThreads[0],
                                    localThreads[1], 1 };
        int outputsz = 256 * globalThreads[0] / localThreads[0];
        int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
@ -1120,7 +1106,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
    int blocksize = 8;
    int grp_per_CU = 12;
    size_t localThreads[3] = { blocksize, blocksize, 1 };
-    size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->computeUnits() *localThreads[0],
+    size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->getDeviceInfo().maxComputeUnits *localThreads[0],
        localThreads[1],
        1 };
    int outputsz = 256 * globalThreads[0] / localThreads[0];
@ -1148,7 +1134,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
    }

    int *candidate;
-    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    cl_command_queue qu = getClCommandQueue(Context::getContext());
    if( (flags & CV_HAAR_SCALE_IMAGE) )
    {
        int indexy = 0;
@ -1340,7 +1326,7 @@ void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
    GpuHidHaarStageClassifier    *stage;
    GpuHidHaarClassifier         *classifier;
    GpuHidHaarTreeNode           *node;
-    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    cl_command_queue qu = getClCommandQueue(Context::getContext());
    if( (flags & CV_HAAR_SCALE_IMAGE) )
    {
        gcascade   = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
@ -1505,7 +1491,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
    CvSize sz;
    CvSize winSize0 = oldCascade->orig_window_size;
    detect_piramid_info *scaleinfo;
-    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    cl_command_queue qu = getClCommandQueue(Context::getContext());
    if (flags & CV_HAAR_SCALE_IMAGE)
    {
        for(factor = 1.f;; factor *= scaleFactor)
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@ -44,9 +44,10 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 using namespace cv;
 using namespace cv::ocl;
-using namespace std;

 #define CELL_WIDTH 8
 #define CELL_HEIGHT 8
@ -57,15 +58,6 @@ using namespace std;
 static oclMat gauss_w_lut;
 static bool hog_device_cpu;

-namespace cv
-{
-    namespace ocl
-    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *objdetect_hog;
-    }
-}
-
 namespace cv
 {
    namespace ocl
@ -157,7 +149,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo

    effect_size = Size(0, 0);

-    if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
+    if (isCpuDevice())
        hog_device_cpu = true;
    else
        hog_device_cpu = false;
@ -1670,9 +1662,9 @@ void cv::ocl::device::hog::compute_hists(int nbins,
    else
    {
        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
-        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+        size_t wave_size = queryWaveFrontSize(kernel);
        char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
+        sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size);
        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
            localThreads, args, -1, -1, opt);
    }
@ -1734,9 +1726,9 @@ void cv::ocl::device::hog::normalize_hists(int nbins,
    else
    {
        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
-        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+        size_t wave_size = queryWaveFrontSize(kernel);
        char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
+        sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size);
        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
                             localThreads, args, -1, -1, opt);
    }
@ -1803,9 +1795,9 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width,
    else
    {
        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
-        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+        size_t wave_size = queryWaveFrontSize(kernel);
        char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
+        sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size);
        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
                             localThreads, args, -1, -1, opt);
    }
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@ -54,34 +54,15 @@
 //M*/

 #include "precomp.hpp"
-#include <iomanip>
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;

 namespace cv
 {
    namespace ocl
    {
-
-        ////////////////////////////////////OpenCL kernel strings//////////////////////////
-        extern const char *meanShift;
-        extern const char *imgproc_copymakeboder;
-        extern const char *imgproc_median;
-        extern const char *imgproc_threshold;
-        extern const char *imgproc_resize;
-        extern const char *imgproc_remap;
-        extern const char *imgproc_warpAffine;
-        extern const char *imgproc_warpPerspective;
-        extern const char *imgproc_integral_sum;
-        extern const char *imgproc_integral;
-        extern const char *imgproc_histogram;
-        extern const char *imgproc_bilateral;
-        extern const char *imgproc_calcHarris;
-        extern const char *imgproc_calcMinEigenVal;
-        extern const char *imgproc_convolve;
-        extern const char *imgproc_clahe;
        ////////////////////////////////////OpenCL call wrappers////////////////////////////

        template <typename T> struct index_and_sizeof;
@ -289,7 +270,7 @@ namespace cv
                args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
                args.push_back( make_pair(sizeof(cl_int), (void *)&cols));

-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                {
                    args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
                }
@ -317,7 +298,7 @@ namespace cv
                args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
                args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
                args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                {
                    args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
                }
@ -380,7 +361,7 @@ namespace cv
                args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
                args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
                args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                {
                    args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
                    args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
@ -802,12 +783,12 @@ namespace cv
                string kernelName = "warpAffine" + s[interpolation];


-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                {
                    cl_int st;
-                    coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
+                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
                    openCLVerifyCall(st);
-                    openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
+                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
                }
                else
                {
@ -817,8 +798,8 @@ namespace cv
                        {
                            float_coeffs[m][n] = coeffs[m][n];
                        }
-                        coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
-                        openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
+                        coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
+                        openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));

                }
                //TODO: improve this kernel
@ -872,12 +853,12 @@ namespace cv
                string s[3] = {"NN", "Linear", "Cubic"};
                string kernelName = "warpPerspective" + s[interpolation];

-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                {
                    cl_int st;
-                    coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
+                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
                    openCLVerifyCall(st);
-                    openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
+                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
                }
                else
                {
@ -886,9 +867,9 @@ namespace cv
                        for(int n = 0; n < 3; n++)
                            float_coeffs[m][n] = coeffs[m][n];

-                    coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
+                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
                    openCLVerifyCall(st);
-                    openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
+                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
                }
                //TODO: improve this kernel
                size_t blkSizeX = 16, blkSizeY = 16;
@ -994,7 +975,7 @@ namespace cv
        void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
        {
            CV_Assert(src.type() == CV_8UC1);
-            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+            if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
            {
                CV_Error(CV_GpuNotSupported, "select device don't support double");
            }
@ -1127,7 +1108,7 @@ namespace cv
            CV_Assert(Dx.offset == 0 && Dy.offset == 0);
        }

-        static void corner_ocl(const char *src_str, string kernelName, int block_size, float k, oclMat &Dx, oclMat &Dy,
+        static void corner_ocl(const cv::ocl::ProgramEntry* source, string kernelName, int block_size, float k, oclMat &Dx, oclMat &Dy,
                        oclMat &dst, int border_type)
        {
            char borderType[30];
@ -1179,7 +1160,7 @@ namespace cv
            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
            args.push_back( make_pair( sizeof(cl_float) , (void *)&k));
-            openCLExecuteKernel(dst.clCxt, &src_str, kernelName, gt, lt, args, -1, -1, build_options);
+            openCLExecuteKernel(dst.clCxt, source, kernelName, gt, lt, args, -1, -1, build_options);
        }

        void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
@ -1192,7 +1173,7 @@ namespace cv
        void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
                          double k, int borderType)
        {
-            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+            if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
            {
                CV_Error(CV_GpuNotSupported, "select device don't support double");
            }
@ -1200,7 +1181,7 @@ namespace cv
            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
            extractCovData(src, dx, dy, blockSize, ksize, borderType);
            dst.create(src.size(), CV_32F);
-            corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
+            corner_ocl(&imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
        }

        void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
@ -1211,7 +1192,7 @@ namespace cv

        void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
        {
-            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+            if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
            {
                CV_Error(CV_GpuNotSupported, "select device don't support double");
            }
@ -1219,7 +1200,7 @@ namespace cv
            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
            extractCovData(src, dx, dy, blockSize, ksize, borderType);
            dst.create(src.size(), CV_32F);
-            corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
+            corner_ocl(&imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
        }
        /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
        static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
@ -1512,17 +1493,17 @@ namespace cv
                String kernelName = "calcLut";
                size_t localThreads[3]  = { 32, 8, 1 };
                size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
-                bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+                bool is_cpu = isCpuDevice();
                if (is_cpu)
                    openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
                else
                {
                    cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
-                    int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+                    size_t wave_size = queryWaveFrontSize(kernel);
                    openCLSafeCall(clReleaseKernel(kernel));

                    static char opt[20] = {0};
-                    sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
+                    sprintf(opt, " -D WAVE_SIZE=%d", (int)wave_size);
                    openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt);
                }
            }
@ -1768,7 +1749,7 @@ namespace cv
 }
 //////////////////////////////////convolve////////////////////////////////////////////////////

-static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, string kernelName, const char **kernelString)
+static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
 {
    CV_Assert(src.depth() == CV_32FC1);
    CV_Assert(temp1.depth() == CV_32F);
@ -1803,7 +1784,7 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, st
    args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.rows ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.cols ));

-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
 }
 void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y)
 {
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
--- a/modules/ocl/src/interpolate_frames.cpp
+++ b/modules/ocl/src/interpolate_frames.cpp
@ -44,8 +44,8 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

-using namespace std;
 using namespace cv;
 using namespace cv::ocl;

@ -53,9 +53,6 @@ namespace cv
 {
    namespace ocl
    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *interpolate_frames;
-
        namespace interpolate
        {
            //The following are ported from NPP_staging.cu
--- a/modules/ocl/src/kalman.cpp
+++ b/modules/ocl/src/kalman.cpp
@ -44,7 +44,6 @@
 //M*/
 #include "precomp.hpp"

-using namespace std;
 using namespace cv;
 using namespace cv::ocl;

@ -132,4 +131,4 @@ CV_EXPORTS const oclMat& KalmanFilter::correct(const oclMat& measurement)
    gemm(gain, temp5, 1, statePre, 1, statePost);
    gemm(gain, temp2, -1, errorCovPre, 1, errorCovPost);
    return statePost;
-}
+}
--- a/modules/ocl/src/kmeans.cpp
+++ b/modules/ocl/src/kmeans.cpp
@ -43,20 +43,11 @@
 //
 //M*/

-#include <iomanip>
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
-using namespace ocl;
-
-namespace cv
-{
-namespace ocl
-{
-////////////////////////////////////OpenCL kernel strings//////////////////////////
-extern const char *kmeans_kernel;
-}
-}
+using namespace cv::ocl;

 static void generateRandomCenter(const vector<Vec2f>& box, float* center, RNG& rng)
 {
--- a/modules/ocl/src/knearest.cpp
+++ b/modules/ocl/src/knearest.cpp
@ -44,17 +44,11 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 using namespace cv;
 using namespace cv::ocl;

-namespace cv
-{
-    namespace ocl
-    {
-        extern const char* knearest;//knearest
-    }
-}
-
 KNearestNeighbour::KNearestNeighbour()
 {
    clear();
@ -112,7 +106,7 @@ void KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lable
    k1 = MIN( k1, k );

    String kernel_name = "knn_find_nearest";
-    cl_ulong local_memory_size = queryLocalMemInfo();
+    cl_ulong local_memory_size = (cl_ulong)Context::getContext()->getDeviceInfo().localMemorySize;
    int nThreads = local_memory_size / (2 * k * 4);
    if(nThreads >= 256)
        nThreads = 256;
@ -122,7 +116,7 @@ void KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lable
    size_t global_thread[] = {1, samples.rows, 1};

    char build_option[50];
-    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
    {
        sprintf(build_option, " ");
    }else
--- a/modules/ocl/src/match_template.cpp
+++ b/modules/ocl/src/match_template.cpp
@ -44,22 +44,11 @@
 //M*/


-#include <iomanip>
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
-
-//helper routines
-namespace cv
-{
-    namespace ocl
-    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *match_template;
-    }
-}

 namespace cv
 {
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@ -46,30 +46,19 @@
 //M*/

 #include "precomp.hpp"
-
-#define ALIGN 32
-#define GPU_MATRIX_MALLOC_STEP(step) (((step) + ALIGN - 1) / ALIGN) * ALIGN
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;

-////////////////////////////////////////////////////////////////////////
-//////////////////////////////// oclMat ////////////////////////////////
-////////////////////////////////////////////////////////////////////////
+#define ALIGN 32
+#define GPU_MATRIX_MALLOC_STEP(step) (((step) + ALIGN - 1) / ALIGN) * ALIGN

 // helper routines
 namespace cv
 {
    namespace ocl
    {
-        /////////////////////////// OpenCL kernel strings ///////////////////////////
-
-        extern const char *operator_copyToM;
-        extern const char *operator_convertTo;
-        extern const char *operator_setTo;
-        extern const char *operator_setToM;
-        extern const char *convertC3C4;
        extern DevMemType gDeviceMemType;
        extern DevMemRW gDeviceMemRW;
    }
@ -134,7 +123,6 @@ void cv::ocl::oclMat::upload(const Mat &m)
    Size wholeSize;
    Point ofs;
    m.locateROI(wholeSize, ofs);
-
    create(wholeSize, m.type());

    if (m.channels() == 3)
@ -142,13 +130,12 @@ void cv::ocl::oclMat::upload(const Mat &m)
        int pitch = wholeSize.width * 3 * m.elemSize1();
        int tail_padding = m.elemSize1() * 3072;
        int err;
-        cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
+        cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
                                     (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
        openCLVerifyCall(err);

        openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
        convert_C3C4(temp, *this);
-
        openCLSafeCall(clReleaseMemObject(temp));
    }
    else
@ -197,13 +184,12 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
        int pitch = wholecols * 3 * m.elemSize1();
        int tail_padding = m.elemSize1() * 3072;
        int err;
-        cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
+        cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
                                     (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
        openCLVerifyCall(err);

        convert_C4C3(*this, temp);
        openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
-
        openCLSafeCall(clReleaseMemObject(temp));
    }
    else
@ -319,7 +305,7 @@ static void convert_run(const oclMat &src, oclMat &dst, double alpha, double bet

 void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
 {
-    if (!clCxt->supportsFeature(Context::CL_DOUBLE) &&
+    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) &&
            (depth() == CV_64F || dst.depth() == CV_64F))
    {
        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
@ -380,7 +366,7 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, stri
 #ifdef CL_VERSION_1_2
    // this enables backwards portability to
    // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
-    if (Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
+    if (Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2) &&
        dst.offset == 0 && dst.cols == dst.wholecols)
    {
        const int sizeofMap[][7] =
@ -392,7 +378,7 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, stri
            };
        int sizeofGeneric = sizeofMap[dst.oclchannels() - 1][dst.depth()];

-        clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
+        clEnqueueFillBuffer(getClCommandQueue(dst.clCxt),
                            (cl_mem)dst.data, (void*)mat.data, sizeofGeneric,
                            0, dst.step * dst.rows, 0, NULL, NULL);
    }
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@ -72,7 +72,7 @@ namespace cv
    namespace ocl
    {
        // provide additional methods for the user to interact with the command queue after a task is fired
-        static void openCLExecuteKernel_2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
+        static void openCLExecuteKernel_2(Context *clCxt, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
                                   size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
                                   int depth, char *build_options, FLUSH_MODE finish_mode)
        {
@ -101,15 +101,15 @@ namespace cv
            for(size_t i = 0; i < args.size(); i ++)
                openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));

-            openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL, globalThreads,
+            openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), kernel, 3, NULL, globalThreads,
                                                  localThreads, 0, NULL, NULL));

            switch(finish_mode)
            {
            case CLFINISH:
-                clFinish((cl_command_queue)clCxt->oclCommandQueue());
+                clFinish(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr());
            case CLFLUSH:
-                clFlush((cl_command_queue)clCxt->oclCommandQueue());
+                clFlush(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr());
                break;
            case DISABLE:
            default:
@ -118,14 +118,14 @@ namespace cv
            openCLSafeCall(clReleaseKernel(kernel));
        }

-        void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName,
+        void openCLExecuteKernel2(Context *clCxt, const cv::ocl::ProgramEntry* source, string kernelName,
                                  size_t globalThreads[3], size_t localThreads[3],
                                  vector< pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode)
        {
            openCLExecuteKernel2(clCxt, source, kernelName, globalThreads, localThreads, args,
                                 channels, depth, NULL, finish_mode);
        }
-        void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName,
+        void openCLExecuteKernel2(Context *clCxt, const cv::ocl::ProgramEntry* source, string kernelName,
                                  size_t globalThreads[3], size_t localThreads[3],
                                  vector< pair<size_t, const void *> > &args, int channels, int depth, char *build_options, FLUSH_MODE finish_mode)

@ -178,7 +178,7 @@ namespace cv
 #ifdef CL_VERSION_1_2
            //this enables backwards portability to
            //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
-            if(Context::getContext()->supportsFeature(Context::CL_VER_1_2))
+            if(Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2))
            {
                cl_image_desc desc;
                desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
@ -191,13 +191,13 @@ namespace cv
                desc.buffer           = NULL;
                desc.num_mip_levels   = 0;
                desc.num_samples      = 0;
-                texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
+                texture = clCreateImage(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
            }
            else
 #endif
            {
                texture = clCreateImage2D(
-                    (cl_context)mat.clCxt->oclContext(),
+                    *(cl_context*)mat.clCxt->getOpenCLContextPtr(),
                    CL_MEM_READ_WRITE,
                    &format,
                    mat.cols,
@ -212,22 +212,22 @@ namespace cv
            cl_mem devData;
            if (mat.cols * mat.elemSize() != mat.step)
            {
-                devData = clCreateBuffer((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_ONLY, mat.cols * mat.rows
+                devData = clCreateBuffer(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_ONLY, mat.cols * mat.rows
                    * mat.elemSize(), NULL, NULL);
                const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
-                clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin,
+                clEnqueueCopyBufferRect(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), (cl_mem)mat.data, devData, origin, origin,
                    regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
-                clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
+                clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
            }
            else
            {
                devData = (cl_mem)mat.data;
            }

-            clEnqueueCopyBufferToImage((cl_command_queue)mat.clCxt->oclCommandQueue(), devData, texture, 0, origin, region, 0, NULL, 0);
+            clEnqueueCopyBufferToImage(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), devData, texture, 0, origin, region, 0, NULL, 0);
            if ((mat.cols * mat.elemSize() != mat.step))
            {
-                clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
+                clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
                clReleaseMemObject(devData);
            }

@ -249,7 +249,7 @@ namespace cv

        bool support_image2d(Context *clCxt)
        {
-            static const char * _kernel_string = "__kernel void test_func(image2d_t img) {}";
+            const cv::ocl::ProgramEntry _kernel = {NULL, "__kernel void test_func(image2d_t img) {}", NULL};
            static bool _isTested = false;
            static bool _support = false;
            if(_isTested)
@ -258,8 +258,8 @@ namespace cv
            }
            try
            {
-                cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
-                finish();
+                cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel, "test_func");
+                cv::ocl::finish();
                _support = true;
            }
            catch (const cv::Exception& e)
--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@ -44,13 +44,12 @@
 //
 //M*/
 #include "precomp.hpp"
-#include <iostream>
+#include "opencl_kernels.hpp"
+
 namespace cv
 {
 namespace ocl
 {
-extern const char *moments;
-
 // The function calculates center of gravity and the central second order moments
 static void icvCompleteMomentState( CvMoments* moments )
 {
@ -106,7 +105,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )

        bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;

-        if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE) && is_float)
+        if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float)
        {
            CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
        }
@ -146,7 +145,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )

        cv::Mat dst(dst_a);
        a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
-        if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+        if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
        {
            for (int i = 0; i < contour->total; ++i)
            {
@ -230,7 +229,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
            CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
    }

-    if( !moments )
+    if( !mom )
        CV_Error( CV_StsNullPtr, "" );

    memset( mom, 0, sizeof(*mom));
--- a/modules/ocl/src/mssegmentation.cpp
+++ b/modules/ocl/src/mssegmentation.cpp
@ -43,8 +43,10 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

-using namespace std;
+using namespace cv;
+using namespace cv::ocl;

 // Auxiliray stuff
 namespace
--- a/modules/ocl/src/optical_flow_farneback.cpp
+++ b/modules/ocl/src/optical_flow_farneback.cpp
@ -45,23 +45,14 @@


 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 #include "opencv2/video/tracking.hpp"

-using namespace std;
 using namespace cv;
 using namespace cv::ocl;

 #define MIN_SIZE 32

-namespace cv
-{
-namespace ocl
-{
-///////////////////////////OpenCL kernel strings///////////////////////////
-extern const char *optical_flow_farneback;
-}
-}
-
 namespace cv {
 namespace ocl {
 namespace optflow_farneback
--- a/modules/ocl/src/pyrdown.cpp
+++ b/modules/ocl/src/pyrdown.cpp
@ -15,8 +15,8 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Dachuan Zhao, dachuan@multicorewareinc.com
-//		Yao Wang, yao@multicorewareinc.com
+//        Dachuan Zhao, dachuan@multicorewareinc.com
+//        Yao Wang, yao@multicorewareinc.com
 //
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -45,23 +45,10 @@
 //
 //M*/
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
-
-using std::cout;
-using std::endl;
-
-namespace cv
-{
-    namespace ocl
-    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *pyr_down;
-
-    }
-}

 //////////////////////////////////////////////////////////////////////////////
 /////////////////////// add subtract multiply divide /////////////////////////
--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@ -45,21 +45,12 @@
 //
 //M*/

-
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

-using namespace std;
 using namespace cv;
 using namespace cv::ocl;

-namespace cv
-{
-namespace ocl
-{
-extern const char *pyrlk;
-extern const char *pyrlk_no_image;
-}
-}
 struct dim3
 {
    unsigned int x, y, z;
@ -125,7 +116,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    args.push_back( make_pair( sizeof(cl_int), (void *)&iters ));
    args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr ));

-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
    if (is_cpu)
    {
        openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU");
@ -139,7 +130,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
            stringstream idxStr;
            idxStr << kernelName << "_C" << I.oclchannels() << "_D" << I.depth();
            cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str());
-            int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+            int wave_size = (int)queryWaveFrontSize(kernel);
            openCLSafeCall(clReleaseKernel(kernel));

            static char opt[32] = {0};
--- a/modules/ocl/src/pyrup.cpp
+++ b/modules/ocl/src/pyrup.cpp
@ -15,8 +15,8 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Zhang Chunpeng chunpeng@multicorewareinc.com
-//		Yao Wang, yao@multicorewareinc.com
+//        Zhang Chunpeng chunpeng@multicorewareinc.com
+//        Yao Wang, yao@multicorewareinc.com
 //
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -45,21 +45,19 @@
 //
 //M*/

-/* Haar features calculation */
-//#define EMU
-
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
+
+/* Haar features calculation */
+//#define EMU

 namespace cv
 {
    namespace ocl
    {
-        extern const char *pyr_up;
-
        void pyrUp(const cv::ocl::oclMat &src, cv::ocl::oclMat &dst)
        {
            int depth = src.depth(), channels = src.channels(), oclChannels = src.oclchannels();
--- a/modules/ocl/src/sort_by_key.cpp
+++ b/modules/ocl/src/sort_by_key.cpp
@ -43,18 +43,16 @@
 //
 //M*/

-#include <iomanip>
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
+using namespace cv;
+using namespace cv::ocl;

 namespace cv
 {
 namespace ocl
 {
-
-extern const char * kernel_sort_by_key;
-extern const char * kernel_stablesort_by_key;
-extern const char * kernel_radix_sort_by_key;
-
 void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan);

 //TODO(pengx17): change this value depending on device other than a constant
--- a/modules/ocl/src/split_merge.cpp
+++ b/modules/ocl/src/split_merge.cpp
@ -44,29 +44,11 @@
 //M*/

 #include "precomp.hpp"
-#include <vector>
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;

-
-using std::cout;
-using std::endl;
-
-////////////////////////////////////////////////////////////////////////
-///////////////// oclMat merge and split ///////////////////////////////
-////////////////////////////////////////////////////////////////////////
-
-namespace cv
-{
-    namespace ocl
-    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *merge_mat;
-        extern const char *split_mat;
-    }
-}
 namespace cv
 {
    namespace ocl
@ -75,7 +57,7 @@ namespace cv
        {
            static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst)
            {
-                if(!mat_dst.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_dst.type() == CV_64F)
+                if(!mat_dst.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_dst.type() == CV_64F)
                {
                    CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
                    return;
@ -170,7 +152,7 @@ namespace cv
            static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
            {

-                if(!mat_src.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_src.type() == CV_64F)
+                if(!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F)
                {
                    CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
                    return;
--- a/modules/ocl/src/stereo_csbp.cpp
+++ b/modules/ocl/src/stereo_csbp.cpp
@ -45,51 +45,11 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;

-#if !defined (HAVE_OPENCL)
-
-namespace cv
-{
-    namespace ocl
-    {
-
-        void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int &, int &, int &, int &)
-        {
-            throw_nogpu();
-        }
-        cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int)
-        {
-            throw_nogpu();
-        }
-        cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float,
-            float, float, int, int)
-        {
-            throw_nogpu();
-        }
-
-        void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &, const oclMat &, oclMat &)
-        {
-            throw_nogpu();
-        }
-    }
-}
-
-#else /* !defined (HAVE_OPENCL) */
-
-namespace cv
-{
-    namespace ocl
-    {
-
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *stereocsbp;
-    }
-
-}
 namespace cv
 {
    namespace ocl
@ -150,10 +110,10 @@ namespace cv
                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th));
                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step));
                openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                    globalThreads, localThreads, 0, NULL, NULL));

-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                openCLSafeCall(clReleaseKernel(kernel));
            }

@ -200,9 +160,9 @@ namespace cv
                openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int),  (void *)&rthis.min_disp_th));
                openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int),  (void *)&cdisp_step1));
                openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int),  (void *)&msg_step));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 3, NULL,
                    globalThreads, localThreads, 0, NULL, NULL));
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                openCLSafeCall(clReleaseKernel(kernel));
            }

@ -235,10 +195,10 @@ namespace cv
                openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
                openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
                openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                    globalThreads, localThreads, 0, NULL, NULL));

-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                openCLSafeCall(clReleaseKernel(kernel));
            }
            static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
@ -270,10 +230,10 @@ namespace cv
                openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
                openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
                openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                    globalThreads, localThreads, 0, NULL, NULL));

-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                openCLSafeCall(clReleaseKernel(kernel));
            }

@ -340,10 +300,10 @@ namespace cv
                openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term));
                openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int),  (void *)&left.step));
                openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int),  (void *)&rthis.min_disp_th));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                    globalThreads, localThreads, 0, NULL, NULL));

-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                openCLSafeCall(clReleaseKernel(kernel));
            }
            static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost,
@ -391,10 +351,10 @@ namespace cv
                openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term));
                openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int),  (void *)&left.step));
                openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int),  (void *)&rthis.min_disp_th));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 3, NULL,
                    globalThreads, localThreads, 0, NULL, NULL));

-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                openCLSafeCall(clReleaseKernel(kernel));
            }
            static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis,
@ -458,10 +418,10 @@ namespace cv
                openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2));
                openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1));
                openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                    globalThreads, localThreads, 0, NULL, NULL));

-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                openCLSafeCall(clReleaseKernel(kernel));
            }
            ////////////////////////////////////////////////////////////////////////////////////////////////
@ -500,10 +460,10 @@ namespace cv
                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int),  (void *)&disp_step));
                openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int),  (void *)&msg_step));
                openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.disc_single_jump));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                    globalThreads, localThreads, 0, NULL, NULL));

-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                openCLSafeCall(clReleaseKernel(kernel));
            }
            static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
@ -552,10 +512,10 @@ namespace cv
                openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane));
                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step));
                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                    globalThreads, localThreads, 0, NULL, NULL));

-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                openCLSafeCall(clReleaseKernel(kernel));
            }
        }
@ -755,5 +715,3 @@ void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &left, const oclMat
    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out,
        left, right, disp);
 }
-
-#endif /* !defined (HAVE_OPENCL) */
--- a/modules/ocl/src/stereobm.cpp
+++ b/modules/ocl/src/stereobm.cpp
@ -46,23 +46,11 @@
 //M*/

 #include "precomp.hpp"
-#include <vector>
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;

-
-namespace cv
-{
-namespace ocl
-{
-
-///////////////////////////OpenCL kernel strings///////////////////////////
-extern const char *stereobm;
-
-}
-}
 namespace cv
 {
 namespace ocl
--- a/modules/ocl/src/stereobp.cpp
+++ b/modules/ocl/src/stereobp.cpp
@ -45,27 +45,11 @@
 //M*/

 #include "precomp.hpp"
-#include <vector>
-#include <cstdio>
+#include "opencl_kernels.hpp"

 using namespace cv;
 using namespace cv::ocl;
-using namespace std;

-////////////////////////////////////////////////////////////////////////
-///////////////// stereoBP /////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////
-
-namespace cv
-{
-    namespace ocl
-    {
-
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char *stereobp;
-    }
-
-}
 namespace cv
 {
    namespace ocl
@ -95,7 +79,10 @@ namespace cv
                con_struct -> cmax_disc_term    = max_disc_term;
                con_struct -> cdisc_single_jump = disc_single_jump;

-                cl_con_struct = load_constant(*((cl_context*)getoclContext()), *((cl_command_queue*)getoclCommandQueue()), (void *)con_struct,
+                Context* clCtx = Context::getContext();
+                cl_context clContext = *(cl_context*)(clCtx->getOpenCLContextPtr());
+                cl_command_queue clCmdQueue = *(cl_command_queue*)(clCtx->getOpenCLCommandQueuePtr());
+                cl_con_struct = load_constant(clContext, clCmdQueue, (void *)con_struct,
                                              sizeof(con_struct_t));

                delete con_struct;
--- a/modules/ocl/src/svm.cpp
+++ b/modules/ocl/src/svm.cpp
@ -43,9 +43,13 @@
 //
 //M*/
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 using namespace cv;
 using namespace ocl;

+namespace cv { namespace ocl {
+
 #if 1
 typedef float Qfloat;
 #define QFLOAT_TYPE CV_32F
@ -54,14 +58,6 @@ typedef double Qfloat;
 #define QFLOAT_TYPE CV_64F
 #endif

-namespace cv
-{
-namespace ocl
-{
-///////////////////////////OpenCL kernel strings///////////////////////////
-extern const char *svm;
-}
-}
 class CvSVMKernel_ocl: public CvSVMKernel
 {
 public:
@ -612,7 +608,7 @@ static void matmul_rbf(oclMat& src, oclMat& src_e, oclMat& dst, int src_rows, in
    args.push_back(make_pair(sizeof(cl_int), (void* )&src2_cols));
    args.push_back(make_pair(sizeof(cl_int), (void* )&width));
    float gamma = 0.0f;
-    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
    {
        gamma = (float)gamma1;
        args.push_back(make_pair(sizeof(cl_float), (void* )&gamma));
@ -748,7 +744,7 @@ float CvSVM_OCL::predict(const CvMat* samples, CV_OUT CvMat* results) const
    if(params.kernel_type == CvSVM::RBF)
    {
        sv_.upload(sv_temp);
-        if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+        if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
        {
            dst = oclMat(sample_count, sv_total, CV_32FC1);
        }
@ -886,7 +882,7 @@ bool CvSVMSolver_ocl::solve_generic( CvSVMSolutionInfo& si )
    if(params->kernel_type == CvSVM::RBF)
    {
        src_e = src;
-        if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+        if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
        {
            dst = oclMat(sample_count, sample_count, CV_32FC1);
        }
@ -1053,7 +1049,7 @@ void CvSVMKernel_ocl::calc( int vcount, const int row_idx, Qfloat* results, Mat&
    //int j;
    (this->*calc_func_ocl)( vcount, row_idx, results, src);

-#if defined HAVE_CLAMDBLAS
+// FIXIT #if defined HAVE_CLAMDBLAS
    const Qfloat max_val = (Qfloat)(FLT_MAX * 1e-3);
    int j;
    for( j = 0; j < vcount; j++ )
@ -1063,7 +1059,7 @@ void CvSVMKernel_ocl::calc( int vcount, const int row_idx, Qfloat* results, Mat&
            results[j] = max_val;
        }
    }
-#endif
+// FIXIT #endif
 }
 bool CvSVMKernel_ocl::create( const CvSVMParams* _params, Calc_ocl _calc_func, Calc _calc_func1 )
 {
@ -1115,7 +1111,7 @@ void CvSVMKernel_ocl::calc_non_rbf_base( int vcount, const int row_idx, Qfloat*
 }
 void CvSVMKernel_ocl::calc_rbf( int vcount, const int row_idx, Qfloat* results, Mat& src)
 {
-    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
    {
        for(int m = 0; m < vcount; m++)
        {
@ -1140,14 +1136,14 @@ void CvSVMKernel_ocl::calc_poly( int vcount, const int row_idx, Qfloat* results,

    calc_non_rbf_base( vcount, row_idx, results, src);

-#if defined HAVE_CLAMDBLAS
+//FIXIT #if defined HAVE_CLAMDBLAS

    CvMat R = cvMat( 1, vcount, QFLOAT_TYPE, results );
    if( vcount > 0 )
    {
        cvPow( &R, &R, params->degree );
    }
-#endif
+//FIXIT #endif
 }


@ -1155,11 +1151,11 @@ void CvSVMKernel_ocl::calc_sigmoid( int vcount, const int row_idx, Qfloat* resul
 {
    calc_non_rbf_base( vcount, row_idx, results, src);
    // TODO: speedup this
-#if defined HAVE_CLAMDBLAS
+//FIXIT #if defined HAVE_CLAMDBLAS
    for(int j = 0; j < vcount; j++ )
    {
        Qfloat t = results[j];
-        double e = exp(-fabs(t));
+        double e = ::exp(-fabs(t));
        if( t > 0 )
        {
            results[j] = (Qfloat)((1. - e) / (1. + e));
@ -1169,7 +1165,7 @@ void CvSVMKernel_ocl::calc_sigmoid( int vcount, const int row_idx, Qfloat* resul
            results[j] = (Qfloat)((e - 1.) / (e + 1.));
        }
    }
-#endif
+//FIXIT #endif
 }
 CvSVM_OCL::CvSVM_OCL()
 {
@ -1199,3 +1195,5 @@ void CvSVM_OCL::create_solver( )
 {
    solver = new CvSVMSolver_ocl(&params);
 }
+
+} }
--- a/modules/ocl/src/tvl1flow.cpp
+++ b/modules/ocl/src/tvl1flow.cpp
@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Jin Ma, jin@multicorewareinc.com
+//        Jin Ma, jin@multicorewareinc.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@ -42,21 +42,12 @@
 //
 //M*/

-
 #include "precomp.hpp"
-using namespace std;
+#include "opencl_kernels.hpp"
+
 using namespace cv;
 using namespace cv::ocl;

-namespace cv
-{
-    namespace ocl
-    {
-        ///////////////////////////OpenCL kernel strings///////////////////////////
-        extern const char* tvl1flow;
-    }
-}
-
 cv::ocl::OpticalFlowDual_TVL1_OCL::OpticalFlowDual_TVL1_OCL()
 {
    tau            = 0.25;
--- a/modules/ocl/test/main.cpp
+++ b/modules/ocl/test/main.cpp
@ -80,44 +80,55 @@ int main(int argc, char **argv)
    const char *keys =
        "{ h | help     | false              | print help message }"
        "{ t | type     | gpu                | set device type:cpu or gpu}"
-        "{ p | platform | 0                  | set platform id }"
+        "{ p | platform | -1                 | set platform id }"
        "{ d | device   | 0                  | set device id }";

-    CommandLineParser cmd(argc, argv, keys);
-    if (cmd.get<bool>("help"))
+    if (getenv("OPENCV_OPENCL_DEVICE") == NULL) // TODO Remove this after buildbot updates
    {
-        cout << "Avaible options besides goole test option:" << endl;
-        cmd.printParams();
-        return 0;
-    }
-    string type = cmd.get<string>("type");
-    unsigned int pid = cmd.get<unsigned int>("platform");
-    int device = cmd.get<int>("device");
+        CommandLineParser cmd(argc, argv, keys);
+        if (cmd.get<bool>("help"))
+        {
+            cout << "Available options besides google test option:" << endl;
+            cmd.printParams();
+            return 0;
+        }
+        string type = cmd.get<string>("type");
+        int pid = cmd.get<int>("platform");
+        int device = cmd.get<int>("device");

-    print_info();
-    int flag = CVCL_DEVICE_TYPE_GPU;
-    if(type == "cpu")
-    {
-        flag = CVCL_DEVICE_TYPE_CPU;
-    }
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = getDevice(oclinfo, flag);
-    if(devnums <= device || device < 0)
-    {
-        std::cout << "device invalid\n";
-        return -1;
-    }
-    if(pid >= oclinfo.size())
-    {
-        std::cout << "platform invalid\n";
-        return -1;
+        print_info();
+        int flag = CVCL_DEVICE_TYPE_GPU;
+        if(type == "cpu")
+        {
+            flag = CVCL_DEVICE_TYPE_CPU;
+        }
+
+        cv::ocl::PlatformsInfo platformsInfo;
+        cv::ocl::getOpenCLPlatforms(platformsInfo);
+        if (pid >= (int)platformsInfo.size())
+        {
+            std::cout << "platform is invalid\n";
+            return 1;
+        }
+
+        cv::ocl::DevicesInfo devicesInfo;
+        int devnums = cv::ocl::getOpenCLDevices(devicesInfo, flag, (pid < 0) ? NULL : platformsInfo[pid]);
+        if (device < 0 || device >= devnums)
+        {
+            std::cout << "device/platform invalid\n";
+            return 1;
+        }
+
+        cv::ocl::setDevice(devicesInfo[device]);
    }

-    setDevice(oclinfo[pid], device);
+    const DeviceInfo& deviceInfo = cv::ocl::Context::getContext()->getDeviceInfo();

-    setBinaryDiskCache(CACHE_UPDATE);
-
-    cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
+    cout << "Device type: " << (deviceInfo.deviceType == CVCL_DEVICE_TYPE_CPU ?
+                "CPU" :
+                (deviceInfo.deviceType == CVCL_DEVICE_TYPE_GPU ? "GPU" : "unknown")) << endl
+         << "Platform name: " << deviceInfo.platform->platformName << endl
+         << "Device name: " << deviceInfo.deviceName << endl;
    return RUN_ALL_TESTS();
 }

--- a/modules/ocl/test/test_matrix_operation.cpp
+++ b/modules/ocl/test/test_matrix_operation.cpp
@ -132,7 +132,7 @@ typedef ConvertToTestBase ConvertTo;
 TEST_P(ConvertTo, Accuracy)
 {
    if((src_depth == CV_64F || dst_depth == CV_64F) &&
-            !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::Context::CL_DOUBLE))
+            !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::FEATURE_CL_DOUBLE))
    {
        return; // returns silently
    }
@ -228,7 +228,7 @@ typedef CopyToTestBase CopyTo;
 TEST_P(CopyTo, Without_mask)
 {
    if((src.depth() == CV_64F) &&
-            !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::Context::CL_DOUBLE))
+            !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::FEATURE_CL_DOUBLE))
    {
        return; // returns silently
    }
@ -246,7 +246,7 @@ TEST_P(CopyTo, Without_mask)
 TEST_P(CopyTo, With_mask)
 {
    if(src.depth() == CV_64F &&
-        !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::Context::CL_DOUBLE))
+        !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::FEATURE_CL_DOUBLE))
    {
        return; // returns silently
    }
@ -342,7 +342,7 @@ typedef SetToTestBase SetTo;
 TEST_P(SetTo, Without_mask)
 {
    if(depth == CV_64F &&
-            !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::Context::CL_DOUBLE))
+            !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::FEATURE_CL_DOUBLE))
    {
        return; // returns silently
    }
@ -360,7 +360,7 @@ TEST_P(SetTo, Without_mask)
 TEST_P(SetTo, With_mask)
 {
    if(depth == CV_64F &&
-            !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::Context::CL_DOUBLE))
+            !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::FEATURE_CL_DOUBLE))
    {
        return; // returns silently
    }
@ -430,7 +430,7 @@ PARAM_TEST_CASE(convertC3C4, MatType, bool)
 TEST_P(convertC3C4, Accuracy)
 {
    if(depth == CV_64F &&
-        !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::Context::CL_DOUBLE))
+        !cv::ocl::Context::getContext()->supportsFeature(cv::ocl::FEATURE_CL_DOUBLE))
    {
        return; // returns silently
    }
--- a/modules/superres/perf/perf_superres_ocl.cpp
+++ b/modules/superres/perf/perf_superres_ocl.cpp
@ -107,9 +107,6 @@ PERF_TEST_P(Size_MatType, SuperResolution_BTVL1_OCL,
    Combine(Values(szSmall64, szSmall128),
    Values(MatType(CV_8UC1), MatType(CV_8UC3))))
 {
-    std::vector<cv::ocl::Info>info;
-    cv::ocl::getDevice(info);
-
    declare.time(5 * 60);

    const Size size = std::tr1::get<0>(GetParam());
--- a/modules/superres/src/btv_l1_ocl.cpp
+++ b/modules/superres/src/btv_l1_ocl.cpp
@ -56,6 +56,7 @@ cv::Ptr<cv::superres::SuperResolution> cv::superres::createSuperResolution_BTVL1
 }

 #else
+#include "opencl_kernels.hpp"

 using namespace std;
 using namespace cv;
@ -67,8 +68,6 @@ namespace cv
 {
    namespace ocl
    {
-        extern const char* superres_btvl1;
-
        float* btvWeights_ = NULL;
        size_t btvWeights_size = 0;
    }
@ -232,7 +231,7 @@ void btv_l1_device_ocl::calcBtvRegularization(const oclMat& src, oclMat& dst, in
    cl_mem c_btvRegWeights;
    size_t count = btvWeights_size * sizeof(float);
    c_btvRegWeights = openCLCreateBuffer(clCxt, CL_MEM_READ_ONLY, count);
-    int cl_safe_check = clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), c_btvRegWeights, 1, 0, count, btvWeights_, 0, NULL, NULL);
+    int cl_safe_check = clEnqueueWriteBuffer(getClCommandQueue(clCxt), c_btvRegWeights, 1, 0, count, btvWeights_, 0, NULL, NULL);
    CV_Assert(cl_safe_check == CL_SUCCESS);

    args.push_back(make_pair(sizeof(cl_mem), (void*)&src_.data));
--- a/modules/superres/test/test_superres.cpp
+++ b/modules/superres/test/test_superres.cpp
@ -278,8 +278,6 @@ TEST_F(SuperResolution, BTVL1_GPU)
 #if defined(HAVE_OPENCV_OCL) && defined(HAVE_OPENCL)
 TEST_F(SuperResolution, BTVL1_OCL)
 {
-    std::vector<cv::ocl::Info> infos;
-    cv::ocl::getDevice(infos);
    RunTest(cv::superres::createSuperResolution_BTVL1_OCL());
 }
 #endif
--- a/samples/gpu/super_resolution.cpp
+++ b/samples/gpu/super_resolution.cpp
@ -132,17 +132,9 @@ int main(int argc, const char* argv[])
    }
 #endif
 #if defined(HAVE_OPENCV_OCL)
-    std::vector<cv::ocl::Info>info;
    if(useCuda)
    {
        CV_Assert(!useOcl);
-        info.clear();
-    }
-
-    if(useOcl)
-    {
-        CV_Assert(!useCuda);
-        cv::ocl::getDevice(info);
    }
 #endif
    Ptr<SuperResolution> superRes;
--- a/samples/ocl/adaptive_bilateral_filter.cpp
+++ b/samples/ocl/adaptive_bilateral_filter.cpp
@ -25,9 +25,6 @@ int main( int argc, const char** argv )
        return -1;
    }

-    std::vector<ocl::Info> infos;
-    ocl::getDevice(infos);
-
    ocl::oclMat dsrc(src), dABFilter, dBFilter;

    Size ksize(ks, ks);
@ -48,4 +45,4 @@ int main( int argc, const char** argv )
    waitKey();
    return 0;

-}
+}
--- a/samples/ocl/bgfg_segm.cpp
+++ b/samples/ocl/bgfg_segm.cpp
@ -24,7 +24,7 @@ int main(int argc, const char** argv)
    if (cmd.get<bool>("help"))
    {
        cout << "Usage : bgfg_segm [options]" << endl;
-        cout << "Avaible options:" << endl;
+        cout << "Available options:" << endl;
        cmd.printParams();
        return 0;
    }
@ -54,9 +54,6 @@ int main(int argc, const char** argv)
        return -1;
    }

-    std::vector<cv::ocl::Info>info;
-    cv::ocl::getDevice(info);
-
    Mat frame;
    cap >> frame;

--- a/samples/ocl/clahe.cpp
+++ b/samples/ocl/clahe.cpp
@ -45,9 +45,6 @@ int main(int argc, char** argv)
    createTrackbar("Tile Size", "CLAHE", &tilesize, 32, (TrackbarCallback)TSize_Callback);
    createTrackbar("Clip Limit", "CLAHE", &cliplimit, 20, (TrackbarCallback)Clip_Callback);

-    vector<ocl::Info> info;
-    CV_Assert(ocl::getDevice(info));
-
    Mat frame, outframe;
    ocl::oclMat d_outframe;

--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@ -72,7 +72,7 @@ int main( int argc, const char** argv )
    CommandLineParser cmd(argc, argv, keys);
    if (cmd.get<bool>("help"))
    {
-        cout << "Avaible options:" << endl;
+        cout << "Available options:" << endl;
        cmd.printParams();
        return 0;
    }
@ -120,16 +120,6 @@ int main( int argc, const char** argv )


    cvNamedWindow( "result", 1 );
-    vector<ocl::Info> oclinfo;
-    int devnums = ocl::getDevice(oclinfo);
-    if( devnums < 1 )
-    {
-        std::cout << "no device found\n";
-        return -1;
-    }
-    //if you want to use undefault device, set it here
-    //setDevice(oclinfo[0]);
-    ocl::setBinpath("./");
    if( capture )
    {
        cout << "In capture ..." << endl;
--- a/samples/ocl/hog.cpp
+++ b/samples/ocl/hog.cpp
@ -135,8 +135,6 @@ App::App(CommandLineParser& cmd)

 void App::run()
 {
-    vector<ocl::Info> oclinfo;
-    ocl::getDevice(oclinfo);
    running = true;
    VideoWriter video_writer;

--- a/samples/ocl/pyrlk_optical_flow.cpp
+++ b/samples/ocl/pyrlk_optical_flow.cpp
@ -86,13 +86,6 @@ static void drawArrows(Mat& frame, const vector<Point2f>& prevPts, const vector<

 int main(int argc, const char* argv[])
 {
-    static std::vector<Info> ocl_info;
-    ocl::getDevice(ocl_info);
-    //if you want to use undefault device, set it here
-    setDevice(ocl_info[0]);
-
-    //set this to save kernel compile time from second time you run
-    ocl::setBinpath("./");
    const char* keys =
        "{ h   | help     | false           | print help message }"
        "{ l   | left     |                 | specify left image }"
@ -109,7 +102,7 @@ int main(int argc, const char* argv[])
    if (cmd.get<bool>("help"))
    {
        cout << "Usage: pyrlk_optical_flow [options]" << endl;
-        cout << "Avaible options:" << endl;
+        cout << "Available options:" << endl;
        cmd.printParams();
        return 0;
    }
--- a/samples/ocl/squares.cpp
+++ b/samples/ocl/squares.cpp
@ -284,13 +284,11 @@ int main(int argc, char** argv)
    string outfile = cmd.get<string>("o");
    if(inputName.empty())
    {
-        cout << "Avaible options:" << endl;
+        cout << "Available options:" << endl;
        cmd.printParams();
        return 0;
    }

-    vector<ocl::Info> info;
-    CV_Assert(ocl::getDevice(info));
    int iterations = 10;
    namedWindow( wndname, 1 );
    vector<vector<Point> > squares_cpu, squares_ocl;
--- a/samples/ocl/stereo_match.cpp
+++ b/samples/ocl/stereo_match.cpp
@ -77,28 +77,18 @@ int main(int argc, char** argv)
        "{ r | right    |                           | specify right image }"
        "{ m | method   | BM                        | specify match method(BM/BP/CSBP) }"
        "{ n | ndisp    | 64                        |  specify number of disparity levels }"
-        "{ s | cpu_ocl  | false                     | use cpu or gpu as ocl device to process the image }"
        "{ o | output   | stereo_match_output.jpg   | specify output path when input is images}";
    CommandLineParser cmd(argc, argv, keys);
    if (cmd.get<bool>("help"))
    {
-        cout << "Avaible options:" << endl;
+        cout << "Available options:" << endl;
        cmd.printParams();
        return 0;
    }
    try
    {
        App app(cmd);
-        int flag = CVCL_DEVICE_TYPE_GPU;
-        if(cmd.get<bool>("s") == true)
-            flag = CVCL_DEVICE_TYPE_CPU;
-
-        vector<Info> info;
-        if(getDevice(info, flag) == 0)
-        {
-            throw runtime_error("Error: Did not find a valid OpenCL device!");
-        }
-        cout << "Device name:" << info[0].DeviceName[0] << endl;
+        cout << "Device name:" << cv::ocl::Context::getContext()->getDeviceInfo().deviceName << endl;

        app.run();
    }
--- a/samples/ocl/surf_matcher.cpp
+++ b/samples/ocl/surf_matcher.cpp
@ -145,19 +145,11 @@ int main(int argc, char* argv[])
    CommandLineParser cmd(argc, argv, keys);
    if (cmd.get<bool>("help"))
    {
-        std::cout << "Avaible options:" << std::endl;
+        std::cout << "Available options:" << std::endl;
        cmd.printParams();
        return 0;
    }

-    vector<cv::ocl::Info> info;
-    if(cv::ocl::getDevice(info) == 0)
-    {
-        std::cout << "Error: Did not find a valid OpenCL device!" << std::endl;
-        return -1;
-    }
-    ocl::setDevice(info[0]);
-
    Mat cpu_img1, cpu_img2, cpu_img1_grey, cpu_img2_grey;
    oclMat img1, img2;
    bool useCPU = cmd.get<bool>("c");
@ -190,7 +182,7 @@ int main(int argc, char* argv[])
    {
        std::cout
                << "Device name:"
-                << info[0].DeviceName[0]
+                << cv::ocl::Context::getContext()->getDeviceInfo().deviceName
                << std::endl;
    }
    double surf_time = 0.;
--- a/samples/ocl/tvl1_optical_flow.cpp
+++ b/samples/ocl/tvl1_optical_flow.cpp
@ -80,13 +80,6 @@ static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)

 int main(int argc, const char* argv[])
 {
-    static std::vector<Info> ocl_info;
-    ocl::getDevice(ocl_info);
-    //if you want to use undefault device, set it here
-    setDevice(ocl_info[0]);
-
-    //set this to save kernel compile time from second time you run
-    ocl::setBinpath("./");
    const char* keys =
        "{ h   | help       | false           | print help message }"
        "{ l   | left       |                 | specify left image }"
@ -101,7 +94,7 @@ int main(int argc, const char* argv[])
    if (cmd.get<bool>("help"))
    {
        cout << "Usage: pyrlk_optical_flow [options]" << endl;
-        cout << "Avaible options:" << endl;
+        cout << "Available options:" << endl;
        cmd.printParams();
        return 0;
    }