2013-10-22 18:04:49 +08:00
|
|
|
/*M///////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
//
|
|
|
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
|
|
|
//
|
|
|
|
// By downloading, copying, installing or using the software you agree to this license.
|
|
|
|
// If you do not agree to this license, do not download, install,
|
|
|
|
// copy or use the software.
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// License Agreement
|
|
|
|
// For Open Source Computer Vision Library
|
|
|
|
//
|
|
|
|
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
|
|
|
// Third party copyrights are property of their respective owners.
|
|
|
|
//
|
|
|
|
// Redistribution and use in source and binary forms, with or without modification,
|
|
|
|
// are permitted provided that the following conditions are met:
|
|
|
|
//
|
|
|
|
// * Redistribution's of source code must retain the above copyright notice,
|
|
|
|
// this list of conditions and the following disclaimer.
|
|
|
|
//
|
|
|
|
// * Redistribution's in binary form must reproduce the above copyright notice,
|
|
|
|
// this list of conditions and the following disclaimer in the documentation
|
|
|
|
// and/or other materials provided with the distribution.
|
|
|
|
//
|
|
|
|
// * The name of the copyright holders may not be used to endorse or promote products
|
|
|
|
// derived from this software without specific prior written permission.
|
|
|
|
//
|
|
|
|
// This software is provided by the copyright holders and contributors "as is" and
|
|
|
|
// any express or implied warranties, including, but not limited to, the implied
|
|
|
|
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
|
|
|
// In no event shall the OpenCV Foundation or contributors be liable for any direct,
|
|
|
|
// indirect, incidental, special, exemplary, or consequential damages
|
|
|
|
// (including, but not limited to, procurement of substitute goods or services;
|
|
|
|
// loss of use, data, or profits; or business interruption) however caused
|
|
|
|
// and on any theory of liability, whether in contract, strict liability,
|
|
|
|
// or tort (including negligence or otherwise) arising in any way out of
|
|
|
|
// the use of this software, even if advised of the possibility of such damage.
|
|
|
|
//
|
|
|
|
//M*/
|
|
|
|
|
|
|
|
#include "precomp.hpp"
|
2014-01-16 22:30:39 +08:00
|
|
|
#include <list>
|
2013-10-22 18:04:49 +08:00
|
|
|
#include <map>
|
2017-07-06 22:57:05 +08:00
|
|
|
#include <deque>
|
2017-09-07 04:15:47 +08:00
|
|
|
#include <set>
|
2013-12-25 18:39:21 +08:00
|
|
|
#include <string>
|
|
|
|
#include <sstream>
|
|
|
|
#include <iostream> // std::cerr
|
2016-09-16 15:03:42 +08:00
|
|
|
#if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
|
2016-08-25 22:26:46 +08:00
|
|
|
#include <inttypes.h>
|
2016-09-16 15:03:42 +08:00
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
|
Merge pull request #9114 from pengli:dnn_rebase
add libdnn acceleration to dnn module (#9114)
* import libdnn code
Signed-off-by: Li Peng <peng.li@intel.com>
* add convolution layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add pooling layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add softmax layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add lrn layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add innerproduct layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add HAVE_OPENCL macro
Signed-off-by: Li Peng <peng.li@intel.com>
* fix for convolution ocl
Signed-off-by: Li Peng <peng.li@intel.com>
* enable getUMat() for multi-dimension Mat
Signed-off-by: Li Peng <peng.li@intel.com>
* use getUMat for ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* use CV_OCL_RUN macro
Signed-off-by: Li Peng <peng.li@intel.com>
* set OPENCL target when it is available
and disable fuseLayer for OCL target for the time being
Signed-off-by: Li Peng <peng.li@intel.com>
* fix innerproduct accuracy test
Signed-off-by: Li Peng <peng.li@intel.com>
* remove trailing space
Signed-off-by: Li Peng <peng.li@intel.com>
* Fixed tensorflow demo bug.
Root cause is that tensorflow has different algorithm with libdnn
to calculate convolution output dimension.
libdnn don't calculate output dimension anymore and just use one
passed in by config.
* split gemm ocl file
split it into gemm_buffer.cl and gemm_image.cl
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix compile failure
Signed-off-by: Li Peng <peng.li@intel.com>
* check env flag for auto tuning
Signed-off-by: Li Peng <peng.li@intel.com>
* switch to new ocl kernels for softmax layer
Signed-off-by: Li Peng <peng.li@intel.com>
* update softmax layer
on some platform subgroup extension may not work well,
fallback to non subgroup ocl acceleration.
Signed-off-by: Li Peng <peng.li@intel.com>
* fallback to cpu path for fc layer with multi output
Signed-off-by: Li Peng <peng.li@intel.com>
* update output message
Signed-off-by: Li Peng <peng.li@intel.com>
* update fully connected layer
fallback to gemm API if libdnn return false
Signed-off-by: Li Peng <peng.li@intel.com>
* Add ReLU OCL implementation
* disable layer fusion for now
Signed-off-by: Li Peng <peng.li@intel.com>
* Add OCL implementation for concat layer
Signed-off-by: Wu Zhiwen <zhiwen.wu@intel.com>
* libdnn: update license and copyrights
Also refine libdnn coding style
Signed-off-by: Wu Zhiwen <zhiwen.wu@intel.com>
Signed-off-by: Li Peng <peng.li@intel.com>
* DNN: Don't link OpenCL library explicitly
* DNN: Make default preferableTarget to DNN_TARGET_CPU
User should set it to DNN_TARGET_OPENCL explicitly if want to
use OpenCL acceleration.
Also don't fusion when using DNN_TARGET_OPENCL
* DNN: refine coding style
* Add getOpenCLErrorString
* DNN: Use int32_t/uint32_t instread of alias
* Use namespace ocl4dnn to include libdnn things
* remove extra copyTo in softmax ocl path
Signed-off-by: Li Peng <peng.li@intel.com>
* update ReLU layer ocl path
Signed-off-by: Li Peng <peng.li@intel.com>
* Add prefer target property for layer class
It is used to indicate the target for layer forwarding,
either the default CPU target or OCL target.
Signed-off-by: Li Peng <peng.li@intel.com>
* Add cl_event based timer for cv::ocl
* Rename libdnn to ocl4dnn
Signed-off-by: Li Peng <peng.li@intel.com>
Signed-off-by: wzw <zhiwen.wu@intel.com>
* use UMat for ocl4dnn internal buffer
Remove allocateMemory which use clCreateBuffer directly
Signed-off-by: Li Peng <peng.li@intel.com>
Signed-off-by: wzw <zhiwen.wu@intel.com>
* enable buffer gemm in ocl4dnn innerproduct
Signed-off-by: Li Peng <peng.li@intel.com>
* replace int_tp globally for ocl4dnn kernels.
Signed-off-by: wzw <zhiwen.wu@intel.com>
Signed-off-by: Li Peng <peng.li@intel.com>
* create UMat for layer params
Signed-off-by: Li Peng <peng.li@intel.com>
* update sign ocl kernel
Signed-off-by: Li Peng <peng.li@intel.com>
* update image based gemm of inner product layer
Signed-off-by: Li Peng <peng.li@intel.com>
* remove buffer gemm of inner product layer
call cv::gemm API instead
Signed-off-by: Li Peng <peng.li@intel.com>
* change ocl4dnn forward parameter to UMat
Signed-off-by: Li Peng <peng.li@intel.com>
* Refine auto-tuning mechanism.
- Use OPENCV_OCL4DNN_KERNEL_CONFIG_PATH to set cache directory
for fine-tuned kernel configuration.
e.g. export OPENCV_OCL4DNN_KERNEL_CONFIG_PATH=/home/tmp,
the cache directory will be /home/tmp/spatialkernels/ on Linux.
- Define environment OPENCV_OCL4DNN_ENABLE_AUTO_TUNING to enable
auto-tuning.
- OPENCV_OPENCL_ENABLE_PROFILING is only used to enable profiling
for OpenCL command queue. This fix basic kernel get wrong running
time, i.e. 0ms.
- If creating cache directory failed, disable auto-tuning.
* Detect and create cache dir on windows
Signed-off-by: Li Peng <peng.li@intel.com>
* Refine gemm like convolution kernel.
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix redundant swizzleWeights calling when use cached kernel config.
* Fix "out of resource" bug when auto-tuning too many kernels.
* replace cl_mem with UMat in ocl4dnnConvSpatial class
* OCL4DNN: reduce the tuning kernel candidate.
This patch could reduce 75% of the tuning candidates with less
than 2% performance impact for the final result.
Signed-off-by: Zhigang Gong <zhigang.gong@intel.com>
* replace cl_mem with umat in ocl4dnn convolution
Signed-off-by: Li Peng <peng.li@intel.com>
* remove weight_image_ of ocl4dnn inner product
Actually it is unused in the computation
Signed-off-by: Li Peng <peng.li@intel.com>
* Various fixes for ocl4dnn
1. OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel())
2. Ptr<OCL4DNNInnerProduct<float> > innerProductOp
3. Code comments cleanup
4. ignore check on OCL cpu device
Signed-off-by: Li Peng <peng.li@intel.com>
* add build option for log softmax
Signed-off-by: Li Peng <peng.li@intel.com>
* remove unused ocl kernels in ocl4dnn
Signed-off-by: Li Peng <peng.li@intel.com>
* replace ocl4dnnSet with opencv setTo
Signed-off-by: Li Peng <peng.li@intel.com>
* replace ALIGN with cv::alignSize
Signed-off-by: Li Peng <peng.li@intel.com>
* check kernel build options
Signed-off-by: Li Peng <peng.li@intel.com>
* Handle program compilation fail properly.
* Use std::numeric_limits<float>::infinity() for large float number
* check ocl4dnn kernel compilation result
Signed-off-by: Li Peng <peng.li@intel.com>
* remove unused ctx_id
Signed-off-by: Li Peng <peng.li@intel.com>
* change clEnqueueNDRangeKernel to kernel.run()
Signed-off-by: Li Peng <peng.li@intel.com>
* change cl_mem to UMat in image based gemm
Signed-off-by: Li Peng <peng.li@intel.com>
* check intel subgroup support for lrn and pooling layer
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix convolution bug if group is greater than 1
Signed-off-by: Li Peng <peng.li@intel.com>
* Set default layer preferableTarget to be DNN_TARGET_CPU
Signed-off-by: Li Peng <peng.li@intel.com>
* Add ocl perf test for convolution
Signed-off-by: Li Peng <peng.li@intel.com>
* Add more ocl accuracy test
Signed-off-by: Li Peng <peng.li@intel.com>
* replace cl_image with ocl::Image2D
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix build failure in elementwise layer
Signed-off-by: Li Peng <peng.li@intel.com>
* use getUMat() to get blob data
Signed-off-by: Li Peng <peng.li@intel.com>
* replace cl_mem handle with ocl::KernelArg
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(build): don't use C++11, OPENCL_LIBRARIES fix
* dnn(ocl4dnn): remove unused OpenCL kernels
* dnn(ocl4dnn): extract OpenCL code into .cl files
* dnn(ocl4dnn): refine auto-tuning
Defaultly disable auto-tuning, set OPENCV_OCL4DNN_ENABLE_AUTO_TUNING
environment variable to enable it.
Use a set of pre-tuned configs as default config if auto-tuning is disabled.
These configs are tuned for Intel GPU with 48/72 EUs, and for googlenet,
AlexNet, ResNet-50
If default config is not suitable, use the first available kernel config
from the candidates. Candidate priority from high to low is gemm like kernel,
IDLF kernel, basick kernel.
* dnn(ocl4dnn): pooling doesn't use OpenCL subgroups
* dnn(ocl4dnn): fix perf test
OpenCV has default 3sec time limit for each performance test.
Warmup OpenCL backend outside of perf measurement loop.
* use ocl::KernelArg as much as possible
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): fix bias bug for gemm like kernel
* dnn(ocl4dnn): wrap cl_mem into UMat
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): Refine signature of kernel config
- Use more readable string as signture of kernel config
- Don't count device name and vendor in signature string
- Default kernel configurations are tuned for Intel GPU with
24/48/72 EUs, and for googlenet, AlexNet, ResNet-50 net model.
* dnn(ocl4dnn): swap width/height in configuration
* dnn(ocl4dnn): enable configs for Intel OpenCL runtime only
* core: make configuration helper functions accessible from non-core modules
* dnn(ocl4dnn): update kernel auto-tuning behavior
Avoid unwanted creation of directories
* dnn(ocl4dnn): simplify kernel to workaround OpenCL compiler crash
* dnn(ocl4dnn): remove redundant code
* dnn(ocl4dnn): Add more clear message for simd size dismatch.
* dnn(ocl4dnn): add const to const argument
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): force compiler use a specific SIMD size for IDLF kernel
* dnn(ocl4dnn): drop unused tuneLocalSize()
* dnn(ocl4dnn): specify OpenCL queue for Timer and convolve() method
* dnn(ocl4dnn): sanitize file names used for cache
* dnn(perf): enable Network tests with OpenCL
* dnn(ocl4dnn/conv): drop computeGlobalSize()
* dnn(ocl4dnn/conv): drop unused fields
* dnn(ocl4dnn/conv): simplify ctor
* dnn(ocl4dnn/conv): refactor kernelConfig localSize=NULL
* dnn(ocl4dnn/conv): drop unsupported double / untested half types
* dnn(ocl4dnn/conv): drop unused variable
* dnn(ocl4dnn/conv): alignSize/divUp
* dnn(ocl4dnn/conv): use enum values
* dnn(ocl4dnn): drop unused innerproduct variable
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): add an generic function to check cl option support
* dnn(ocl4dnn): run softmax subgroup version kernel first
Signed-off-by: Li Peng <peng.li@intel.com>
2017-10-02 20:38:00 +08:00
|
|
|
#include <opencv2/core/utils/configuration.private.hpp>
|
|
|
|
|
2016-12-19 05:38:33 +08:00
|
|
|
#include "opencv2/core/ocl_genbase.hpp"
|
Merge pull request #9114 from pengli:dnn_rebase
add libdnn acceleration to dnn module (#9114)
* import libdnn code
Signed-off-by: Li Peng <peng.li@intel.com>
* add convolution layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add pooling layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add softmax layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add lrn layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add innerproduct layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add HAVE_OPENCL macro
Signed-off-by: Li Peng <peng.li@intel.com>
* fix for convolution ocl
Signed-off-by: Li Peng <peng.li@intel.com>
* enable getUMat() for multi-dimension Mat
Signed-off-by: Li Peng <peng.li@intel.com>
* use getUMat for ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* use CV_OCL_RUN macro
Signed-off-by: Li Peng <peng.li@intel.com>
* set OPENCL target when it is available
and disable fuseLayer for OCL target for the time being
Signed-off-by: Li Peng <peng.li@intel.com>
* fix innerproduct accuracy test
Signed-off-by: Li Peng <peng.li@intel.com>
* remove trailing space
Signed-off-by: Li Peng <peng.li@intel.com>
* Fixed tensorflow demo bug.
Root cause is that tensorflow has different algorithm with libdnn
to calculate convolution output dimension.
libdnn don't calculate output dimension anymore and just use one
passed in by config.
* split gemm ocl file
split it into gemm_buffer.cl and gemm_image.cl
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix compile failure
Signed-off-by: Li Peng <peng.li@intel.com>
* check env flag for auto tuning
Signed-off-by: Li Peng <peng.li@intel.com>
* switch to new ocl kernels for softmax layer
Signed-off-by: Li Peng <peng.li@intel.com>
* update softmax layer
on some platform subgroup extension may not work well,
fallback to non subgroup ocl acceleration.
Signed-off-by: Li Peng <peng.li@intel.com>
* fallback to cpu path for fc layer with multi output
Signed-off-by: Li Peng <peng.li@intel.com>
* update output message
Signed-off-by: Li Peng <peng.li@intel.com>
* update fully connected layer
fallback to gemm API if libdnn return false
Signed-off-by: Li Peng <peng.li@intel.com>
* Add ReLU OCL implementation
* disable layer fusion for now
Signed-off-by: Li Peng <peng.li@intel.com>
* Add OCL implementation for concat layer
Signed-off-by: Wu Zhiwen <zhiwen.wu@intel.com>
* libdnn: update license and copyrights
Also refine libdnn coding style
Signed-off-by: Wu Zhiwen <zhiwen.wu@intel.com>
Signed-off-by: Li Peng <peng.li@intel.com>
* DNN: Don't link OpenCL library explicitly
* DNN: Make default preferableTarget to DNN_TARGET_CPU
User should set it to DNN_TARGET_OPENCL explicitly if want to
use OpenCL acceleration.
Also don't fusion when using DNN_TARGET_OPENCL
* DNN: refine coding style
* Add getOpenCLErrorString
* DNN: Use int32_t/uint32_t instread of alias
* Use namespace ocl4dnn to include libdnn things
* remove extra copyTo in softmax ocl path
Signed-off-by: Li Peng <peng.li@intel.com>
* update ReLU layer ocl path
Signed-off-by: Li Peng <peng.li@intel.com>
* Add prefer target property for layer class
It is used to indicate the target for layer forwarding,
either the default CPU target or OCL target.
Signed-off-by: Li Peng <peng.li@intel.com>
* Add cl_event based timer for cv::ocl
* Rename libdnn to ocl4dnn
Signed-off-by: Li Peng <peng.li@intel.com>
Signed-off-by: wzw <zhiwen.wu@intel.com>
* use UMat for ocl4dnn internal buffer
Remove allocateMemory which use clCreateBuffer directly
Signed-off-by: Li Peng <peng.li@intel.com>
Signed-off-by: wzw <zhiwen.wu@intel.com>
* enable buffer gemm in ocl4dnn innerproduct
Signed-off-by: Li Peng <peng.li@intel.com>
* replace int_tp globally for ocl4dnn kernels.
Signed-off-by: wzw <zhiwen.wu@intel.com>
Signed-off-by: Li Peng <peng.li@intel.com>
* create UMat for layer params
Signed-off-by: Li Peng <peng.li@intel.com>
* update sign ocl kernel
Signed-off-by: Li Peng <peng.li@intel.com>
* update image based gemm of inner product layer
Signed-off-by: Li Peng <peng.li@intel.com>
* remove buffer gemm of inner product layer
call cv::gemm API instead
Signed-off-by: Li Peng <peng.li@intel.com>
* change ocl4dnn forward parameter to UMat
Signed-off-by: Li Peng <peng.li@intel.com>
* Refine auto-tuning mechanism.
- Use OPENCV_OCL4DNN_KERNEL_CONFIG_PATH to set cache directory
for fine-tuned kernel configuration.
e.g. export OPENCV_OCL4DNN_KERNEL_CONFIG_PATH=/home/tmp,
the cache directory will be /home/tmp/spatialkernels/ on Linux.
- Define environment OPENCV_OCL4DNN_ENABLE_AUTO_TUNING to enable
auto-tuning.
- OPENCV_OPENCL_ENABLE_PROFILING is only used to enable profiling
for OpenCL command queue. This fix basic kernel get wrong running
time, i.e. 0ms.
- If creating cache directory failed, disable auto-tuning.
* Detect and create cache dir on windows
Signed-off-by: Li Peng <peng.li@intel.com>
* Refine gemm like convolution kernel.
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix redundant swizzleWeights calling when use cached kernel config.
* Fix "out of resource" bug when auto-tuning too many kernels.
* replace cl_mem with UMat in ocl4dnnConvSpatial class
* OCL4DNN: reduce the tuning kernel candidate.
This patch could reduce 75% of the tuning candidates with less
than 2% performance impact for the final result.
Signed-off-by: Zhigang Gong <zhigang.gong@intel.com>
* replace cl_mem with umat in ocl4dnn convolution
Signed-off-by: Li Peng <peng.li@intel.com>
* remove weight_image_ of ocl4dnn inner product
Actually it is unused in the computation
Signed-off-by: Li Peng <peng.li@intel.com>
* Various fixes for ocl4dnn
1. OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel())
2. Ptr<OCL4DNNInnerProduct<float> > innerProductOp
3. Code comments cleanup
4. ignore check on OCL cpu device
Signed-off-by: Li Peng <peng.li@intel.com>
* add build option for log softmax
Signed-off-by: Li Peng <peng.li@intel.com>
* remove unused ocl kernels in ocl4dnn
Signed-off-by: Li Peng <peng.li@intel.com>
* replace ocl4dnnSet with opencv setTo
Signed-off-by: Li Peng <peng.li@intel.com>
* replace ALIGN with cv::alignSize
Signed-off-by: Li Peng <peng.li@intel.com>
* check kernel build options
Signed-off-by: Li Peng <peng.li@intel.com>
* Handle program compilation fail properly.
* Use std::numeric_limits<float>::infinity() for large float number
* check ocl4dnn kernel compilation result
Signed-off-by: Li Peng <peng.li@intel.com>
* remove unused ctx_id
Signed-off-by: Li Peng <peng.li@intel.com>
* change clEnqueueNDRangeKernel to kernel.run()
Signed-off-by: Li Peng <peng.li@intel.com>
* change cl_mem to UMat in image based gemm
Signed-off-by: Li Peng <peng.li@intel.com>
* check intel subgroup support for lrn and pooling layer
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix convolution bug if group is greater than 1
Signed-off-by: Li Peng <peng.li@intel.com>
* Set default layer preferableTarget to be DNN_TARGET_CPU
Signed-off-by: Li Peng <peng.li@intel.com>
* Add ocl perf test for convolution
Signed-off-by: Li Peng <peng.li@intel.com>
* Add more ocl accuracy test
Signed-off-by: Li Peng <peng.li@intel.com>
* replace cl_image with ocl::Image2D
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix build failure in elementwise layer
Signed-off-by: Li Peng <peng.li@intel.com>
* use getUMat() to get blob data
Signed-off-by: Li Peng <peng.li@intel.com>
* replace cl_mem handle with ocl::KernelArg
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(build): don't use C++11, OPENCL_LIBRARIES fix
* dnn(ocl4dnn): remove unused OpenCL kernels
* dnn(ocl4dnn): extract OpenCL code into .cl files
* dnn(ocl4dnn): refine auto-tuning
Defaultly disable auto-tuning, set OPENCV_OCL4DNN_ENABLE_AUTO_TUNING
environment variable to enable it.
Use a set of pre-tuned configs as default config if auto-tuning is disabled.
These configs are tuned for Intel GPU with 48/72 EUs, and for googlenet,
AlexNet, ResNet-50
If default config is not suitable, use the first available kernel config
from the candidates. Candidate priority from high to low is gemm like kernel,
IDLF kernel, basick kernel.
* dnn(ocl4dnn): pooling doesn't use OpenCL subgroups
* dnn(ocl4dnn): fix perf test
OpenCV has default 3sec time limit for each performance test.
Warmup OpenCL backend outside of perf measurement loop.
* use ocl::KernelArg as much as possible
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): fix bias bug for gemm like kernel
* dnn(ocl4dnn): wrap cl_mem into UMat
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): Refine signature of kernel config
- Use more readable string as signture of kernel config
- Don't count device name and vendor in signature string
- Default kernel configurations are tuned for Intel GPU with
24/48/72 EUs, and for googlenet, AlexNet, ResNet-50 net model.
* dnn(ocl4dnn): swap width/height in configuration
* dnn(ocl4dnn): enable configs for Intel OpenCL runtime only
* core: make configuration helper functions accessible from non-core modules
* dnn(ocl4dnn): update kernel auto-tuning behavior
Avoid unwanted creation of directories
* dnn(ocl4dnn): simplify kernel to workaround OpenCL compiler crash
* dnn(ocl4dnn): remove redundant code
* dnn(ocl4dnn): Add more clear message for simd size dismatch.
* dnn(ocl4dnn): add const to const argument
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): force compiler use a specific SIMD size for IDLF kernel
* dnn(ocl4dnn): drop unused tuneLocalSize()
* dnn(ocl4dnn): specify OpenCL queue for Timer and convolve() method
* dnn(ocl4dnn): sanitize file names used for cache
* dnn(perf): enable Network tests with OpenCL
* dnn(ocl4dnn/conv): drop computeGlobalSize()
* dnn(ocl4dnn/conv): drop unused fields
* dnn(ocl4dnn/conv): simplify ctor
* dnn(ocl4dnn/conv): refactor kernelConfig localSize=NULL
* dnn(ocl4dnn/conv): drop unsupported double / untested half types
* dnn(ocl4dnn/conv): drop unused variable
* dnn(ocl4dnn/conv): alignSize/divUp
* dnn(ocl4dnn/conv): use enum values
* dnn(ocl4dnn): drop unused innerproduct variable
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): add an generic function to check cl option support
* dnn(ocl4dnn): run softmax subgroup version kernel first
Signed-off-by: Li Peng <peng.li@intel.com>
2017-10-02 20:38:00 +08:00
|
|
|
#include "opencl_kernels_core.hpp"
|
2016-12-19 05:38:33 +08:00
|
|
|
|
2014-02-27 16:51:40 +08:00
|
|
|
#define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
|
2014-10-09 17:50:57 +08:00
|
|
|
#define CV_OPENCL_SHOW_RUN_ERRORS 0
|
2015-01-02 08:33:40 +08:00
|
|
|
#define CV_OPENCL_SHOW_SVM_ERROR_LOG 1
|
|
|
|
#define CV_OPENCL_SHOW_SVM_LOG 0
|
2014-02-27 16:51:40 +08:00
|
|
|
|
2014-01-16 22:30:39 +08:00
|
|
|
#include "opencv2/core/bufferpool.hpp"
|
|
|
|
#ifndef LOG_BUFFER_POOL
|
|
|
|
# if 0
|
|
|
|
# define LOG_BUFFER_POOL printf
|
|
|
|
# else
|
|
|
|
# define LOG_BUFFER_POOL(...)
|
|
|
|
# endif
|
|
|
|
#endif
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
#if CV_OPENCL_SHOW_SVM_LOG
|
|
|
|
// TODO add timestamp logging
|
|
|
|
#define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
|
|
|
|
#else
|
|
|
|
#define CV_OPENCL_SVM_TRACE_P(...)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if CV_OPENCL_SHOW_SVM_ERROR_LOG
|
|
|
|
// TODO add timestamp logging
|
|
|
|
#define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
|
|
|
|
#else
|
|
|
|
#define CV_OPENCL_SVM_TRACE_ERROR_P(...)
|
|
|
|
#endif
|
|
|
|
|
2013-12-15 03:16:53 +08:00
|
|
|
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
|
2013-12-17 21:36:26 +08:00
|
|
|
#include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
|
2013-12-15 03:16:53 +08:00
|
|
|
|
2013-11-21 17:05:32 +08:00
|
|
|
#ifdef HAVE_OPENCL
|
|
|
|
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
|
|
|
|
#else
|
|
|
|
// TODO FIXIT: This file can't be build without OPENCL
|
2016-12-11 00:41:34 +08:00
|
|
|
#include "ocl_deprecated.hpp"
|
OpenCV-OpenCL interop (PR #4072):
Commits:
added new function, cv::ocl::attachContext(String& platformName, void* platformID, void* context, void* deviceID) which allow to attach externally created OpenCL context to OpenCV.
add definitions of clRetainDevice, clRetainContext funcs
removed definitions for clRetainContext, clRetainDevice
fixed build issue under Linux
fixed uninitialized vars, replace dbgassert in error handling
remove function which is not ready yet
add new function, cv::ocl::convertFromBuffer(int rows, int cols, int type, void* cl_mem_obj, UMat& dst, UMatUsageFlags usageFlags = cv::USAGE_DEFAULT) which attaches user allocated OpenCL clBuffer to UMat
uncommented clGetMemObjectInfo definition (otherwise prevent opencv build)
fixed build issue on linux and android
add step parameter to cv::ocl::convertFromBuffer func
suppress compile-time warning
added sample opencl-opencv interoperability (showcase for cv::ocl::convertFromBuffer func)
CMakeLists.txt modified to not create sample build script if OpenCL SDK not found in system
fixed build issue (apple opencl include dir and spaces in CMake file)
added call to clRetainContext for attachContext func and call to clRetainMemObject for convertFromBuffer func
uncommented clRetainMemObject definition
added comments and cleanup
add local path to cmake modules search dirs (instead of replacing)
remove REQUIRED for find_package call (sample build together with opencv). need to try standalone sample build
opencl-interop sample moved to standalone build
set minimum version requirement for sample's cmake to 3.1
put cmake_minimum_required under condition, so do not check if samples not builded
remove code dups for setSize, updateContinuityFlag, and finalizeHdr
commented out cmake_minimum_required(VERSION 3.1)
add safety check for cmake version
add convertFromImage func and update opencl-interop sample
uncommented clGetImageInfo defs
uncommented clEnqueueCopyImageToBuffer defs
fixed clEnqueueCopyImageToBuffer defs
add doxygen comments
remove doxygen @fn tag
try to restart buildbot
add doxygen comments to directx interop funcs
remove internal header, use fwd declarations in affected compile units instead
2015-05-28 04:22:33 +08:00
|
|
|
#endif // HAVE_OPENCL
|
2013-11-21 17:05:32 +08:00
|
|
|
|
2014-08-10 04:10:05 +08:00
|
|
|
#ifdef _DEBUG
|
|
|
|
#define CV_OclDbgAssert CV_DbgAssert
|
|
|
|
#else
|
2014-06-19 19:18:52 +08:00
|
|
|
static bool isRaiseError()
|
|
|
|
{
|
|
|
|
static bool initialized = false;
|
|
|
|
static bool value = false;
|
|
|
|
if (!initialized)
|
|
|
|
{
|
2017-05-25 23:59:01 +08:00
|
|
|
value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
|
2014-06-19 19:18:52 +08:00
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
return value;
|
|
|
|
}
|
|
|
|
#define CV_OclDbgAssert(expr) do { if (isRaiseError()) { CV_Assert(expr); } else { (void)(expr); } } while ((void)0, 0)
|
2014-02-01 19:07:03 +08:00
|
|
|
#endif
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
#include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
|
|
|
|
#include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
|
|
|
|
#include "opencv2/core/opencl/opencl_svm.hpp"
|
|
|
|
#endif
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
namespace cv { namespace ocl {
|
|
|
|
|
2013-10-25 20:46:03 +08:00
|
|
|
struct UMat2D
|
|
|
|
{
|
2013-11-19 00:48:00 +08:00
|
|
|
UMat2D(const UMat& m)
|
2013-10-25 20:46:03 +08:00
|
|
|
{
|
2013-11-22 22:56:03 +08:00
|
|
|
offset = (int)m.offset;
|
|
|
|
step = (int)m.step;
|
2013-10-25 20:46:03 +08:00
|
|
|
rows = m.rows;
|
|
|
|
cols = m.cols;
|
|
|
|
}
|
2013-11-22 22:56:03 +08:00
|
|
|
int offset;
|
|
|
|
int step;
|
2013-10-25 20:46:03 +08:00
|
|
|
int rows;
|
|
|
|
int cols;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct UMat3D
|
|
|
|
{
|
2013-11-19 00:48:00 +08:00
|
|
|
UMat3D(const UMat& m)
|
2013-10-25 20:46:03 +08:00
|
|
|
{
|
2013-11-22 22:56:03 +08:00
|
|
|
offset = (int)m.offset;
|
|
|
|
step = (int)m.step.p[1];
|
|
|
|
slicestep = (int)m.step.p[0];
|
|
|
|
slices = (int)m.size.p[0];
|
2013-10-25 20:46:03 +08:00
|
|
|
rows = m.size.p[1];
|
|
|
|
cols = m.size.p[2];
|
|
|
|
}
|
2013-11-22 22:56:03 +08:00
|
|
|
int offset;
|
|
|
|
int slicestep;
|
|
|
|
int step;
|
2013-10-25 20:46:03 +08:00
|
|
|
int slices;
|
|
|
|
int rows;
|
|
|
|
int cols;
|
|
|
|
};
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
// Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
|
|
|
|
static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
|
|
|
|
{
|
|
|
|
static uint64 table[256];
|
|
|
|
static bool initialized = false;
|
|
|
|
|
|
|
|
if( !initialized )
|
|
|
|
{
|
|
|
|
for( int i = 0; i < 256; i++ )
|
|
|
|
{
|
|
|
|
uint64 c = i;
|
|
|
|
for( int j = 0; j < 8; j++ )
|
|
|
|
c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
|
|
|
|
table[i] = c;
|
|
|
|
}
|
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64 crc = ~crc0;
|
|
|
|
for( size_t idx = 0; idx < size; idx++ )
|
|
|
|
crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
|
|
|
|
|
|
|
|
return ~crc;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool haveOpenCL()
|
|
|
|
{
|
2014-02-03 19:12:27 +08:00
|
|
|
#ifdef HAVE_OPENCL
|
|
|
|
static bool g_isOpenCLInitialized = false;
|
|
|
|
static bool g_isOpenCLAvailable = false;
|
|
|
|
|
2013-12-15 03:16:53 +08:00
|
|
|
if (!g_isOpenCLInitialized)
|
2013-11-21 17:05:32 +08:00
|
|
|
{
|
2013-12-15 03:16:53 +08:00
|
|
|
try
|
2013-11-21 17:05:32 +08:00
|
|
|
{
|
2013-12-15 03:16:53 +08:00
|
|
|
cl_uint n = 0;
|
|
|
|
g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
|
2013-11-21 17:05:32 +08:00
|
|
|
}
|
2013-12-15 03:16:53 +08:00
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
g_isOpenCLAvailable = false;
|
2013-11-21 17:05:32 +08:00
|
|
|
}
|
2013-12-15 03:16:53 +08:00
|
|
|
g_isOpenCLInitialized = true;
|
2013-11-21 17:05:32 +08:00
|
|
|
}
|
|
|
|
return g_isOpenCLAvailable;
|
2014-02-03 19:12:27 +08:00
|
|
|
#else
|
|
|
|
return false;
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool useOpenCL()
|
|
|
|
{
|
2015-02-04 18:03:27 +08:00
|
|
|
CoreTLSData* data = getCoreTlsData().get();
|
2013-10-22 18:04:49 +08:00
|
|
|
if( data->useOpenCL < 0 )
|
2014-07-03 16:18:19 +08:00
|
|
|
{
|
|
|
|
try
|
|
|
|
{
|
2014-11-05 23:41:39 +08:00
|
|
|
data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available();
|
2014-07-03 16:18:19 +08:00
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
data->useOpenCL = 0;
|
|
|
|
}
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
return data->useOpenCL > 0;
|
|
|
|
}
|
|
|
|
|
2013-10-25 20:46:03 +08:00
|
|
|
void setUseOpenCL(bool flag)
|
|
|
|
{
|
|
|
|
if( haveOpenCL() )
|
|
|
|
{
|
2015-02-04 18:03:27 +08:00
|
|
|
CoreTLSData* data = getCoreTlsData().get();
|
2014-03-18 17:19:41 +08:00
|
|
|
data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
|
2013-10-25 20:46:03 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-12-15 03:16:53 +08:00
|
|
|
#ifdef HAVE_CLAMDBLAS
|
|
|
|
|
|
|
|
class AmdBlasHelper
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
static AmdBlasHelper & getInstance()
|
|
|
|
{
|
2015-06-23 19:31:01 +08:00
|
|
|
CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
|
2013-12-15 03:16:53 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool isAvailable() const
|
|
|
|
{
|
|
|
|
return g_isAmdBlasAvailable;
|
|
|
|
}
|
|
|
|
|
|
|
|
~AmdBlasHelper()
|
|
|
|
{
|
|
|
|
try
|
|
|
|
{
|
|
|
|
clAmdBlasTeardown();
|
|
|
|
}
|
|
|
|
catch (...) { }
|
|
|
|
}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
AmdBlasHelper()
|
|
|
|
{
|
|
|
|
if (!g_isAmdBlasInitialized)
|
|
|
|
{
|
2015-06-23 19:31:01 +08:00
|
|
|
AutoLock lock(getInitializationMutex());
|
2013-12-15 03:16:53 +08:00
|
|
|
|
2015-06-23 19:31:01 +08:00
|
|
|
if (!g_isAmdBlasInitialized)
|
2013-12-15 03:16:53 +08:00
|
|
|
{
|
2015-06-23 19:31:01 +08:00
|
|
|
if (haveOpenCL())
|
2013-12-15 03:16:53 +08:00
|
|
|
{
|
2015-06-23 19:31:01 +08:00
|
|
|
try
|
|
|
|
{
|
|
|
|
g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
|
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
g_isAmdBlasAvailable = false;
|
|
|
|
}
|
2013-12-15 03:16:53 +08:00
|
|
|
}
|
2015-06-23 19:31:01 +08:00
|
|
|
else
|
2013-12-15 03:16:53 +08:00
|
|
|
g_isAmdBlasAvailable = false;
|
|
|
|
|
2015-06-23 19:31:01 +08:00
|
|
|
g_isAmdBlasInitialized = true;
|
|
|
|
}
|
2013-12-15 03:16:53 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
static bool g_isAmdBlasInitialized;
|
|
|
|
static bool g_isAmdBlasAvailable;
|
|
|
|
};
|
|
|
|
|
|
|
|
bool AmdBlasHelper::g_isAmdBlasAvailable = false;
|
|
|
|
bool AmdBlasHelper::g_isAmdBlasInitialized = false;
|
|
|
|
|
|
|
|
bool haveAmdBlas()
|
|
|
|
{
|
|
|
|
return AmdBlasHelper::getInstance().isAvailable();
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
bool haveAmdBlas()
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2013-12-17 21:36:26 +08:00
|
|
|
#ifdef HAVE_CLAMDFFT
|
|
|
|
|
|
|
|
class AmdFftHelper
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
static AmdFftHelper & getInstance()
|
|
|
|
{
|
2015-06-23 19:31:01 +08:00
|
|
|
CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
|
2013-12-17 21:36:26 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool isAvailable() const
|
|
|
|
{
|
|
|
|
return g_isAmdFftAvailable;
|
|
|
|
}
|
|
|
|
|
|
|
|
~AmdFftHelper()
|
|
|
|
{
|
|
|
|
try
|
|
|
|
{
|
|
|
|
// clAmdFftTeardown();
|
|
|
|
}
|
|
|
|
catch (...) { }
|
|
|
|
}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
AmdFftHelper()
|
|
|
|
{
|
|
|
|
if (!g_isAmdFftInitialized)
|
|
|
|
{
|
2015-06-23 19:31:01 +08:00
|
|
|
AutoLock lock(getInitializationMutex());
|
2013-12-17 21:36:26 +08:00
|
|
|
|
2015-06-23 19:31:01 +08:00
|
|
|
if (!g_isAmdFftInitialized)
|
2013-12-17 21:36:26 +08:00
|
|
|
{
|
2015-06-23 19:31:01 +08:00
|
|
|
if (haveOpenCL())
|
2013-12-17 21:36:26 +08:00
|
|
|
{
|
2015-06-23 19:31:01 +08:00
|
|
|
try
|
|
|
|
{
|
|
|
|
cl_uint major, minor, patch;
|
|
|
|
CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
|
2014-08-29 18:23:18 +08:00
|
|
|
|
2015-06-23 19:31:01 +08:00
|
|
|
// it throws exception in case AmdFft binaries are not found
|
|
|
|
CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
|
|
|
|
g_isAmdFftAvailable = true;
|
|
|
|
}
|
|
|
|
catch (const Exception &)
|
|
|
|
{
|
|
|
|
g_isAmdFftAvailable = false;
|
|
|
|
}
|
2013-12-17 21:36:26 +08:00
|
|
|
}
|
2015-06-23 19:31:01 +08:00
|
|
|
else
|
2013-12-17 21:36:26 +08:00
|
|
|
g_isAmdFftAvailable = false;
|
|
|
|
|
2015-06-23 19:31:01 +08:00
|
|
|
g_isAmdFftInitialized = true;
|
|
|
|
}
|
2013-12-17 21:36:26 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
static clAmdFftSetupData setupData;
|
|
|
|
static bool g_isAmdFftInitialized;
|
|
|
|
static bool g_isAmdFftAvailable;
|
|
|
|
};
|
|
|
|
|
|
|
|
clAmdFftSetupData AmdFftHelper::setupData;
|
|
|
|
bool AmdFftHelper::g_isAmdFftAvailable = false;
|
|
|
|
bool AmdFftHelper::g_isAmdFftInitialized = false;
|
|
|
|
|
|
|
|
bool haveAmdFft()
|
|
|
|
{
|
|
|
|
return AmdFftHelper::getInstance().isAvailable();
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
bool haveAmdFft()
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
bool haveSVM()
|
|
|
|
{
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
return true;
|
|
|
|
#else
|
|
|
|
return false;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
void finish()
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
Queue::getDefault().finish();
|
|
|
|
}
|
|
|
|
|
|
|
|
#define IMPLEMENT_REFCOUNTABLE() \
|
|
|
|
void addref() { CV_XADD(&refcount, 1); } \
|
2014-03-25 04:38:57 +08:00
|
|
|
void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
|
2013-10-22 18:04:49 +08:00
|
|
|
int refcount
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
/////////////////////////////////////////// Platform /////////////////////////////////////////////
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
struct Platform::Impl
|
|
|
|
{
|
|
|
|
Impl()
|
|
|
|
{
|
|
|
|
refcount = 1;
|
|
|
|
handle = 0;
|
|
|
|
initialized = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
~Impl() {}
|
|
|
|
|
|
|
|
void init()
|
|
|
|
{
|
|
|
|
if( !initialized )
|
|
|
|
{
|
|
|
|
//cl_uint num_entries
|
|
|
|
cl_uint n = 0;
|
2014-02-01 19:07:03 +08:00
|
|
|
if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
|
2013-10-22 18:04:49 +08:00
|
|
|
handle = 0;
|
|
|
|
if( handle != 0 )
|
|
|
|
{
|
|
|
|
char buf[1000];
|
|
|
|
size_t len = 0;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
|
2013-10-22 18:04:49 +08:00
|
|
|
buf[len] = '\0';
|
|
|
|
vendor = String(buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
IMPLEMENT_REFCOUNTABLE();
|
|
|
|
|
|
|
|
cl_platform_id handle;
|
|
|
|
String vendor;
|
|
|
|
bool initialized;
|
|
|
|
};
|
|
|
|
|
|
|
|
Platform::Platform()
|
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
Platform::~Platform()
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
}
|
|
|
|
|
|
|
|
Platform::Platform(const Platform& pl)
|
|
|
|
{
|
|
|
|
p = (Impl*)pl.p;
|
|
|
|
if(p)
|
|
|
|
p->addref();
|
|
|
|
}
|
|
|
|
|
|
|
|
Platform& Platform::operator = (const Platform& pl)
|
|
|
|
{
|
|
|
|
Impl* newp = (Impl*)pl.p;
|
|
|
|
if(newp)
|
|
|
|
newp->addref();
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = newp;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
void* Platform::ptr() const
|
|
|
|
{
|
|
|
|
return p ? p->handle : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
Platform& Platform::getDefault()
|
|
|
|
{
|
|
|
|
static Platform p;
|
|
|
|
if( !p.p )
|
|
|
|
{
|
|
|
|
p.p = new Impl;
|
|
|
|
p.p->init();
|
|
|
|
}
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
/////////////////////////////////////// Device ////////////////////////////////////////////
|
|
|
|
|
|
|
|
// deviceVersion has format
|
|
|
|
// OpenCL<space><major_version.minor_version><space><vendor-specific information>
|
|
|
|
// by specification
|
|
|
|
// http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
|
|
|
|
// http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
|
|
|
|
static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
|
|
|
|
{
|
|
|
|
major = minor = 0;
|
|
|
|
if (10 >= deviceVersion.length())
|
|
|
|
return;
|
|
|
|
const char *pstr = deviceVersion.c_str();
|
|
|
|
if (0 != strncmp(pstr, "OpenCL ", 7))
|
|
|
|
return;
|
|
|
|
size_t ppos = deviceVersion.find('.', 7);
|
|
|
|
if (String::npos == ppos)
|
|
|
|
return;
|
|
|
|
String temp = deviceVersion.substr(7, ppos - 7);
|
|
|
|
major = atoi(temp.c_str());
|
|
|
|
temp = deviceVersion.substr(ppos + 1);
|
|
|
|
minor = atoi(temp.c_str());
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
struct Device::Impl
|
|
|
|
{
|
|
|
|
Impl(void* d)
|
|
|
|
{
|
|
|
|
handle = (cl_device_id)d;
|
2013-12-16 20:46:36 +08:00
|
|
|
refcount = 1;
|
2014-01-31 18:00:05 +08:00
|
|
|
|
|
|
|
name_ = getStrProp(CL_DEVICE_NAME);
|
|
|
|
version_ = getStrProp(CL_DEVICE_VERSION);
|
2017-09-07 04:15:47 +08:00
|
|
|
extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
|
2014-01-31 18:00:05 +08:00
|
|
|
doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
|
|
|
|
hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
|
|
|
|
maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
|
|
|
|
maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
|
|
|
|
type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
|
|
|
|
driverVersion_ = getStrProp(CL_DRIVER_VERSION);
|
2014-02-01 19:07:03 +08:00
|
|
|
|
|
|
|
String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
|
|
|
|
parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
|
2014-03-05 15:25:37 +08:00
|
|
|
|
2017-09-07 04:15:47 +08:00
|
|
|
size_t pos = 0;
|
|
|
|
while (pos < extensions_.size())
|
|
|
|
{
|
|
|
|
size_t pos2 = extensions_.find(' ', pos);
|
|
|
|
if (pos2 == String::npos)
|
|
|
|
pos2 = extensions_.size();
|
|
|
|
if (pos2 > pos)
|
|
|
|
{
|
|
|
|
std::string extensionName = extensions_.substr(pos, pos2 - pos);
|
|
|
|
extensions_set_.insert(extensionName);
|
|
|
|
}
|
|
|
|
pos = pos2 + 1;
|
|
|
|
}
|
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
|
|
|
|
|
2014-03-05 19:04:44 +08:00
|
|
|
vendorName_ = getStrProp(CL_DEVICE_VENDOR);
|
|
|
|
if (vendorName_ == "Advanced Micro Devices, Inc." ||
|
|
|
|
vendorName_ == "AMD")
|
|
|
|
vendorID_ = VENDOR_AMD;
|
2014-08-16 04:29:10 +08:00
|
|
|
else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
|
2014-03-05 19:04:44 +08:00
|
|
|
vendorID_ = VENDOR_INTEL;
|
|
|
|
else if (vendorName_ == "NVIDIA Corporation")
|
|
|
|
vendorID_ = VENDOR_NVIDIA;
|
2014-03-05 15:25:37 +08:00
|
|
|
else
|
2014-03-05 19:04:44 +08:00
|
|
|
vendorID_ = UNKNOWN_VENDOR;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
template<typename _TpCL, typename _TpOut>
|
|
|
|
_TpOut getProp(cl_device_info prop) const
|
|
|
|
{
|
|
|
|
_TpCL temp=_TpCL();
|
|
|
|
size_t sz = 0;
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
|
2013-10-22 18:04:49 +08:00
|
|
|
sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
|
|
|
|
}
|
|
|
|
|
2013-10-23 03:34:16 +08:00
|
|
|
bool getBoolProp(cl_device_info prop) const
|
|
|
|
{
|
|
|
|
cl_bool temp = CL_FALSE;
|
|
|
|
size_t sz = 0;
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
|
2013-10-23 03:34:16 +08:00
|
|
|
sz == sizeof(temp) ? temp != 0 : false;
|
|
|
|
}
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
String getStrProp(cl_device_info prop) const
|
|
|
|
{
|
|
|
|
char buf[1024];
|
|
|
|
size_t sz=0;
|
2014-02-01 19:07:03 +08:00
|
|
|
return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
|
2013-10-22 18:04:49 +08:00
|
|
|
sz < sizeof(buf) ? String(buf) : String();
|
|
|
|
}
|
|
|
|
|
2017-09-07 04:15:47 +08:00
|
|
|
bool isExtensionSupported(const std::string& extensionName) const
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
{
|
2017-09-07 04:15:47 +08:00
|
|
|
return extensions_set_.count(extensionName) > 0;
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
IMPLEMENT_REFCOUNTABLE();
|
2017-09-07 04:15:47 +08:00
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
cl_device_id handle;
|
2014-01-31 18:00:05 +08:00
|
|
|
|
|
|
|
String name_;
|
|
|
|
String version_;
|
2017-09-07 04:15:47 +08:00
|
|
|
std::string extensions_;
|
2014-01-31 18:00:05 +08:00
|
|
|
int doubleFPConfig_;
|
|
|
|
bool hostUnifiedMemory_;
|
|
|
|
int maxComputeUnits_;
|
|
|
|
size_t maxWorkGroupSize_;
|
|
|
|
int type_;
|
2014-02-01 19:07:03 +08:00
|
|
|
int deviceVersionMajor_;
|
|
|
|
int deviceVersionMinor_;
|
2014-01-31 18:00:05 +08:00
|
|
|
String driverVersion_;
|
2014-03-05 19:04:44 +08:00
|
|
|
String vendorName_;
|
|
|
|
int vendorID_;
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
bool intelSubgroupsSupport_;
|
2017-09-07 04:15:47 +08:00
|
|
|
|
|
|
|
std::set<std::string> extensions_set_;
|
2013-10-22 18:04:49 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
Device::Device()
|
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
Device::Device(void* d)
|
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
set(d);
|
|
|
|
}
|
|
|
|
|
|
|
|
Device::Device(const Device& d)
|
|
|
|
{
|
|
|
|
p = d.p;
|
|
|
|
if(p)
|
|
|
|
p->addref();
|
|
|
|
}
|
|
|
|
|
|
|
|
Device& Device::operator = (const Device& d)
|
|
|
|
{
|
|
|
|
Impl* newp = (Impl*)d.p;
|
|
|
|
if(newp)
|
|
|
|
newp->addref();
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = newp;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
Device::~Device()
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
}
|
|
|
|
|
|
|
|
void Device::set(void* d)
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = new Impl(d);
|
|
|
|
}
|
|
|
|
|
|
|
|
void* Device::ptr() const
|
|
|
|
{
|
|
|
|
return p ? p->handle : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
String Device::name() const
|
2014-01-31 18:00:05 +08:00
|
|
|
{ return p ? p->name_ : String(); }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
String Device::extensions() const
|
2017-09-07 04:15:47 +08:00
|
|
|
{ return p ? String(p->extensions_) : String(); }
|
|
|
|
|
|
|
|
bool Device::isExtensionSupported(const String& extensionName) const
|
|
|
|
{ return p ? p->isExtensionSupported(extensionName) : false; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2014-01-22 14:08:42 +08:00
|
|
|
String Device::version() const
|
2014-01-31 18:00:05 +08:00
|
|
|
{ return p ? p->version_ : String(); }
|
2014-01-22 14:08:42 +08:00
|
|
|
|
2014-03-05 19:04:44 +08:00
|
|
|
String Device::vendorName() const
|
|
|
|
{ return p ? p->vendorName_ : String(); }
|
|
|
|
|
|
|
|
int Device::vendorID() const
|
|
|
|
{ return p ? p->vendorID_ : 0; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
String Device::OpenCL_C_Version() const
|
|
|
|
{ return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
|
|
|
|
|
|
|
|
String Device::OpenCLVersion() const
|
2017-02-16 17:50:57 +08:00
|
|
|
{ return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
int Device::deviceVersionMajor() const
|
|
|
|
{ return p ? p->deviceVersionMajor_ : 0; }
|
|
|
|
|
|
|
|
int Device::deviceVersionMinor() const
|
|
|
|
{ return p ? p->deviceVersionMinor_ : 0; }
|
2014-01-16 15:57:57 +08:00
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
String Device::driverVersion() const
|
2014-01-31 18:00:05 +08:00
|
|
|
{ return p ? p->driverVersion_ : String(); }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
int Device::type() const
|
2014-01-31 18:00:05 +08:00
|
|
|
{ return p ? p->type_ : 0; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
int Device::addressBits() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
|
|
|
|
|
|
|
|
bool Device::available() const
|
2013-10-23 03:34:16 +08:00
|
|
|
{ return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
bool Device::compilerAvailable() const
|
2013-10-23 03:34:16 +08:00
|
|
|
{ return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
bool Device::linkerAvailable() const
|
2013-11-21 17:05:32 +08:00
|
|
|
#ifdef CL_VERSION_1_2
|
2013-10-23 03:34:16 +08:00
|
|
|
{ return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
|
2013-11-21 17:05:32 +08:00
|
|
|
#else
|
|
|
|
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
int Device::doubleFPConfig() const
|
2014-01-31 18:00:05 +08:00
|
|
|
{ return p ? p->doubleFPConfig_ : 0; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
int Device::singleFPConfig() const
|
|
|
|
{ return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
|
|
|
|
|
|
|
|
int Device::halfFPConfig() const
|
2013-11-21 17:05:32 +08:00
|
|
|
#ifdef CL_VERSION_1_2
|
2013-10-22 18:04:49 +08:00
|
|
|
{ return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
|
2013-11-21 17:05:32 +08:00
|
|
|
#else
|
|
|
|
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
bool Device::endianLittle() const
|
2013-10-23 03:34:16 +08:00
|
|
|
{ return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
bool Device::errorCorrectionSupport() const
|
2013-10-23 03:34:16 +08:00
|
|
|
{ return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
int Device::executionCapabilities() const
|
|
|
|
{ return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
|
|
|
|
|
|
|
|
size_t Device::globalMemCacheSize() const
|
|
|
|
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
|
|
|
|
|
|
|
|
int Device::globalMemCacheType() const
|
|
|
|
{ return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
|
|
|
|
|
|
|
|
int Device::globalMemCacheLineSize() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
|
|
|
|
|
|
|
|
size_t Device::globalMemSize() const
|
|
|
|
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
|
|
|
|
|
|
|
|
size_t Device::localMemSize() const
|
|
|
|
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
|
|
|
|
|
|
|
|
int Device::localMemType() const
|
|
|
|
{ return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
|
|
|
|
|
|
|
|
bool Device::hostUnifiedMemory() const
|
2014-01-31 18:00:05 +08:00
|
|
|
{ return p ? p->hostUnifiedMemory_ : false; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
bool Device::imageSupport() const
|
2013-10-23 03:34:16 +08:00
|
|
|
{ return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2014-04-15 07:09:17 +08:00
|
|
|
bool Device::imageFromBufferSupport() const
|
|
|
|
{
|
2017-09-07 04:15:47 +08:00
|
|
|
return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
|
2014-04-15 07:09:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
uint Device::imagePitchAlignment() const
|
|
|
|
{
|
|
|
|
#ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
|
|
|
|
return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
|
|
|
|
#else
|
|
|
|
return 0;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
uint Device::imageBaseAddressAlignment() const
|
|
|
|
{
|
|
|
|
#ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
|
|
|
|
return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
|
|
|
|
#else
|
|
|
|
return 0;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
size_t Device::image2DMaxWidth() const
|
|
|
|
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
|
|
|
|
|
|
|
|
size_t Device::image2DMaxHeight() const
|
|
|
|
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
|
|
|
|
|
|
|
|
size_t Device::image3DMaxWidth() const
|
|
|
|
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
|
|
|
|
|
|
|
|
size_t Device::image3DMaxHeight() const
|
|
|
|
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
|
|
|
|
|
|
|
|
size_t Device::image3DMaxDepth() const
|
|
|
|
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
|
|
|
|
|
|
|
|
size_t Device::imageMaxBufferSize() const
|
2013-11-21 17:05:32 +08:00
|
|
|
#ifdef CL_VERSION_1_2
|
2013-10-22 18:04:49 +08:00
|
|
|
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
|
2013-11-21 17:05:32 +08:00
|
|
|
#else
|
|
|
|
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
size_t Device::imageMaxArraySize() const
|
2013-11-21 17:05:32 +08:00
|
|
|
#ifdef CL_VERSION_1_2
|
2013-10-22 18:04:49 +08:00
|
|
|
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
|
2013-11-21 17:05:32 +08:00
|
|
|
#else
|
|
|
|
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
|
Merge pull request #8104 from insoow:master
Gemm kernels for Intel GPU (#8104)
* Fix an issue with Kernel object reset release when consecutive Kernel::run calls
Kernel::run launch OCL gpu kernels and set a event callback function
to decreate the ref count of UMat or remove UMat when the lauched workloads
are completed. However, for some OCL kernels requires multiple call of
Kernel::run function with some kernel parameter changes (e.g., input
and output buffer offset) to get the final computation result.
In the case, the current implementation requires unnecessary
synchronization and cleanupMat.
This fix requires the user to specify whether there will be more work or not.
If there is no remaining computation, the Kernel::run will reset the
kernel object
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* GEMM kernel optimization for Intel GEN
The optimized kernels uses cl_intel_subgroups extension for better
performance.
Note: This optimized kernels will be part of ISAAC in a code generation
way under MIT license.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Fix API compatibility error
This patch fixes a OCV API compatibility error. The error was reported
due to the interface changes of Kernel::run. To resolve the issue,
An overloaded function of Kernel::run is added. It take a flag indicating
whether there are more work to be done with the kernel object without
releasing resources related to it.
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Renaming intel_gpu_gemm.cpp to intel_gpu_gemm.inl.hpp
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* Revert "Fix API compatibility error"
This reverts commit 2ef427db91b6c4aec170f691c5d2e6c47d6520d7.
Conflicts:
modules/core/src/intel_gpu_gemm.inl.hpp
* Revert "Fix an issue with Kernel object reset release when consecutive Kernel::run calls"
This reverts commit cc7f9f54695dc293598addce9b9d7e345225bede.
* Fix the case of uninitialization D
When C is null and beta is non-zero, D is used without initialization.
This resloves the issue
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* fix potential output error due to 0 * nan
Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
* whitespace fix, eliminate non-ASCII symbols
* fix build warning
2017-04-19 17:57:54 +08:00
|
|
|
bool Device::intelSubgroupsSupport() const
|
|
|
|
{ return p ? p->intelSubgroupsSupport_ : false; }
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
int Device::maxClockFrequency() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
|
|
|
|
|
|
|
|
int Device::maxComputeUnits() const
|
2014-01-31 18:00:05 +08:00
|
|
|
{ return p ? p->maxComputeUnits_ : 0; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
int Device::maxConstantArgs() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
|
|
|
|
|
|
|
|
size_t Device::maxConstantBufferSize() const
|
|
|
|
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
|
|
|
|
|
|
|
|
size_t Device::maxMemAllocSize() const
|
|
|
|
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
|
|
|
|
|
|
|
|
size_t Device::maxParameterSize() const
|
|
|
|
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
|
|
|
|
|
|
|
|
int Device::maxReadImageArgs() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
|
|
|
|
|
|
|
|
int Device::maxWriteImageArgs() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
|
|
|
|
|
|
|
|
int Device::maxSamplers() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
|
|
|
|
|
|
|
|
size_t Device::maxWorkGroupSize() const
|
2014-01-31 18:00:05 +08:00
|
|
|
{ return p ? p->maxWorkGroupSize_ : 0; }
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
int Device::maxWorkItemDims() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
|
|
|
|
|
|
|
|
void Device::maxWorkItemSizes(size_t* sizes) const
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
{
|
|
|
|
const int MAX_DIMS = 32;
|
|
|
|
size_t retsz = 0;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
|
|
|
|
MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int Device::memBaseAddrAlign() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
|
|
|
|
|
|
|
|
int Device::nativeVectorWidthChar() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
|
|
|
|
|
|
|
|
int Device::nativeVectorWidthShort() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
|
|
|
|
|
|
|
|
int Device::nativeVectorWidthInt() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
|
|
|
|
|
|
|
|
int Device::nativeVectorWidthLong() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
|
|
|
|
|
|
|
|
int Device::nativeVectorWidthFloat() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
|
|
|
|
|
|
|
|
int Device::nativeVectorWidthDouble() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
|
|
|
|
|
|
|
|
int Device::nativeVectorWidthHalf() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
|
|
|
|
|
|
|
|
int Device::preferredVectorWidthChar() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
|
|
|
|
|
|
|
|
int Device::preferredVectorWidthShort() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
|
|
|
|
|
|
|
|
int Device::preferredVectorWidthInt() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
|
|
|
|
|
|
|
|
int Device::preferredVectorWidthLong() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
|
|
|
|
|
|
|
|
int Device::preferredVectorWidthFloat() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
|
|
|
|
|
|
|
|
int Device::preferredVectorWidthDouble() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
|
|
|
|
|
|
|
|
int Device::preferredVectorWidthHalf() const
|
|
|
|
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
|
|
|
|
|
|
|
|
size_t Device::printfBufferSize() const
|
2013-11-21 17:05:32 +08:00
|
|
|
#ifdef CL_VERSION_1_2
|
2013-10-22 18:04:49 +08:00
|
|
|
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
|
2013-11-21 17:05:32 +08:00
|
|
|
#else
|
|
|
|
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
|
|
|
|
#endif
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
size_t Device::profilingTimerResolution() const
|
|
|
|
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
|
|
|
|
|
|
|
|
const Device& Device::getDefault()
|
|
|
|
{
|
2014-02-01 00:23:01 +08:00
|
|
|
const Context& ctx = Context::getDefault();
|
2015-02-04 18:03:27 +08:00
|
|
|
int idx = getCoreTlsData().get()->device;
|
2014-11-05 23:41:39 +08:00
|
|
|
const Device& device = ctx.device(idx);
|
|
|
|
return device;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
////////////////////////////////////// Context ///////////////////////////////////////////////////
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2013-12-25 18:39:21 +08:00
|
|
|
template <typename Functor, typename ObjectType>
|
|
|
|
inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
|
|
|
|
{
|
|
|
|
::size_t required;
|
|
|
|
cl_int err = f(obj, name, 0, NULL, &required);
|
|
|
|
if (err != CL_SUCCESS)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
param.clear();
|
|
|
|
if (required > 0)
|
|
|
|
{
|
2013-12-25 22:41:24 +08:00
|
|
|
AutoBuffer<char> buf(required + 1);
|
|
|
|
char* ptr = (char*)buf; // cleanup is not needed
|
|
|
|
err = f(obj, name, required, ptr, NULL);
|
2013-12-25 18:39:21 +08:00
|
|
|
if (err != CL_SUCCESS)
|
|
|
|
return err;
|
2013-12-25 22:41:24 +08:00
|
|
|
param = ptr;
|
2013-12-25 18:39:21 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return CL_SUCCESS;
|
2014-01-18 05:30:29 +08:00
|
|
|
}
|
2013-12-25 18:39:21 +08:00
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
static void split(const std::string &s, char delim, std::vector<std::string> &elems)
|
|
|
|
{
|
2013-12-25 22:41:24 +08:00
|
|
|
elems.clear();
|
|
|
|
if (s.size() == 0)
|
|
|
|
return;
|
|
|
|
std::istringstream ss(s);
|
2013-12-25 18:39:21 +08:00
|
|
|
std::string item;
|
2013-12-25 22:41:24 +08:00
|
|
|
while (!ss.eof())
|
|
|
|
{
|
|
|
|
std::getline(ss, item, delim);
|
2013-12-25 18:39:21 +08:00
|
|
|
elems.push_back(item);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
|
|
|
|
// Sample: AMD:GPU:
|
|
|
|
// Sample: AMD:GPU:Tahiti
|
|
|
|
// Sample: :GPU|CPU: = '' = ':' = '::'
|
|
|
|
static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
|
|
|
|
std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
|
|
|
|
{
|
2013-12-25 22:41:24 +08:00
|
|
|
std::vector<std::string> parts;
|
|
|
|
split(configurationStr, ':', parts);
|
|
|
|
if (parts.size() > 3)
|
2013-12-25 18:39:21 +08:00
|
|
|
{
|
2013-12-25 22:41:24 +08:00
|
|
|
std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (parts.size() > 2)
|
|
|
|
deviceNameOrID = parts[2];
|
|
|
|
if (parts.size() > 1)
|
|
|
|
{
|
|
|
|
split(parts[1], '|', deviceTypes);
|
2013-12-25 18:39:21 +08:00
|
|
|
}
|
2013-12-25 22:41:24 +08:00
|
|
|
if (parts.size() > 0)
|
2013-12-25 18:39:21 +08:00
|
|
|
{
|
2013-12-25 22:41:24 +08:00
|
|
|
platform = parts[0];
|
2013-12-25 18:39:21 +08:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-02-21 00:47:45 +08:00
|
|
|
#ifdef WINRT
|
2014-05-06 04:59:39 +08:00
|
|
|
static cl_device_id selectOpenCLDevice()
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
#else
|
2017-09-29 23:16:55 +08:00
|
|
|
// std::tolower is int->int
|
|
|
|
static char char_tolower(char ch)
|
|
|
|
{
|
|
|
|
return (char)std::tolower((int)ch);
|
|
|
|
}
|
2013-12-25 18:39:21 +08:00
|
|
|
static cl_device_id selectOpenCLDevice()
|
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
std::string platform, deviceName;
|
2013-12-25 18:39:21 +08:00
|
|
|
std::vector<std::string> deviceTypes;
|
2014-02-01 19:07:03 +08:00
|
|
|
|
2013-12-25 18:39:21 +08:00
|
|
|
const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
|
2014-11-05 23:41:39 +08:00
|
|
|
if (configuration &&
|
|
|
|
(strcmp(configuration, "disabled") == 0 ||
|
|
|
|
!parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
|
|
|
|
))
|
2014-02-01 19:07:03 +08:00
|
|
|
return NULL;
|
2013-12-25 18:39:21 +08:00
|
|
|
|
|
|
|
bool isID = false;
|
|
|
|
int deviceID = -1;
|
|
|
|
if (deviceName.length() == 1)
|
|
|
|
// We limit ID range to 0..9, because we want to write:
|
|
|
|
// - '2500' to mean i5-2500
|
|
|
|
// - '8350' to mean AMD FX-8350
|
|
|
|
// - '650' to mean GeForce 650
|
|
|
|
// To extend ID range change condition to '> 0'
|
|
|
|
{
|
|
|
|
isID = true;
|
|
|
|
for (size_t i = 0; i < deviceName.length(); i++)
|
|
|
|
{
|
|
|
|
if (!isdigit(deviceName[i]))
|
|
|
|
{
|
|
|
|
isID = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (isID)
|
|
|
|
{
|
|
|
|
deviceID = atoi(deviceName.c_str());
|
2014-02-01 19:07:03 +08:00
|
|
|
if (deviceID < 0)
|
|
|
|
return NULL;
|
2013-12-25 18:39:21 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<cl_platform_id> platforms;
|
2013-12-25 22:41:24 +08:00
|
|
|
{
|
|
|
|
cl_uint numPlatforms = 0;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
|
|
|
|
|
2013-12-25 22:41:24 +08:00
|
|
|
if (numPlatforms == 0)
|
|
|
|
return NULL;
|
|
|
|
platforms.resize((size_t)numPlatforms);
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
|
2013-12-25 22:41:24 +08:00
|
|
|
platforms.resize(numPlatforms);
|
|
|
|
}
|
2013-12-25 18:39:21 +08:00
|
|
|
|
|
|
|
int selectedPlatform = -1;
|
|
|
|
if (platform.length() > 0)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < platforms.size(); i++)
|
|
|
|
{
|
|
|
|
std::string name;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
|
2013-12-25 18:39:21 +08:00
|
|
|
if (name.find(platform) != std::string::npos)
|
|
|
|
{
|
|
|
|
selectedPlatform = (int)i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (selectedPlatform == -1)
|
|
|
|
{
|
|
|
|
std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
|
|
|
|
goto not_found;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (deviceTypes.size() == 0)
|
|
|
|
{
|
|
|
|
if (!isID)
|
|
|
|
{
|
|
|
|
deviceTypes.push_back("GPU");
|
2014-07-01 18:24:14 +08:00
|
|
|
if (configuration)
|
|
|
|
deviceTypes.push_back("CPU");
|
2013-12-25 18:39:21 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
deviceTypes.push_back("ALL");
|
|
|
|
}
|
|
|
|
for (size_t t = 0; t < deviceTypes.size(); t++)
|
|
|
|
{
|
|
|
|
int deviceType = 0;
|
2014-03-24 19:20:00 +08:00
|
|
|
std::string tempStrDeviceType = deviceTypes[t];
|
2017-09-29 23:16:55 +08:00
|
|
|
std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), char_tolower);
|
2014-03-24 19:20:00 +08:00
|
|
|
|
|
|
|
if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
|
2013-12-25 18:39:21 +08:00
|
|
|
deviceType = Device::TYPE_GPU;
|
2014-03-24 19:20:00 +08:00
|
|
|
else if (tempStrDeviceType == "cpu")
|
2013-12-25 18:39:21 +08:00
|
|
|
deviceType = Device::TYPE_CPU;
|
2014-03-24 19:20:00 +08:00
|
|
|
else if (tempStrDeviceType == "accelerator")
|
2013-12-25 18:39:21 +08:00
|
|
|
deviceType = Device::TYPE_ACCELERATOR;
|
2014-03-24 19:20:00 +08:00
|
|
|
else if (tempStrDeviceType == "all")
|
2013-12-25 18:39:21 +08:00
|
|
|
deviceType = Device::TYPE_ALL;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
|
|
|
|
goto not_found;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
|
|
|
|
for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
|
|
|
|
(selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
|
|
|
|
i++)
|
|
|
|
{
|
|
|
|
cl_uint count = 0;
|
2014-02-01 19:07:03 +08:00
|
|
|
cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
|
|
|
|
CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
|
2013-12-25 18:39:21 +08:00
|
|
|
if (count == 0)
|
|
|
|
continue;
|
|
|
|
size_t base = devices.size();
|
|
|
|
devices.resize(base + count);
|
|
|
|
status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
|
2013-12-25 18:39:21 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = (isID ? deviceID : 0);
|
|
|
|
(isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
|
|
|
|
i++)
|
|
|
|
{
|
|
|
|
std::string name;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
|
2014-03-21 20:56:12 +08:00
|
|
|
cl_bool useGPU = true;
|
2014-03-24 19:20:00 +08:00
|
|
|
if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
|
2014-03-21 20:56:12 +08:00
|
|
|
{
|
|
|
|
cl_bool isIGPU = CL_FALSE;
|
|
|
|
clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
|
2014-03-24 19:20:00 +08:00
|
|
|
useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
|
2014-03-21 20:56:12 +08:00
|
|
|
}
|
|
|
|
if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
|
2013-12-25 18:39:21 +08:00
|
|
|
{
|
|
|
|
// TODO check for OpenCL 1.1
|
|
|
|
return devices[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2014-02-01 19:07:03 +08:00
|
|
|
|
2013-12-25 18:39:21 +08:00
|
|
|
not_found:
|
2014-11-05 23:41:39 +08:00
|
|
|
if (!configuration)
|
|
|
|
return NULL; // suppress messages on stderr
|
|
|
|
|
2017-05-17 22:36:48 +08:00
|
|
|
std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << configuration << std::endl
|
2013-12-25 18:39:21 +08:00
|
|
|
<< " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
|
|
|
|
<< " Device types: ";
|
|
|
|
for (size_t t = 0; t < deviceTypes.size(); t++)
|
|
|
|
std::cerr << deviceTypes[t] << " ";
|
2014-02-01 19:07:03 +08:00
|
|
|
|
2013-12-25 18:39:21 +08:00
|
|
|
std::cerr << std::endl << " Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
|
|
|
|
return NULL;
|
|
|
|
}
|
2014-05-06 04:59:39 +08:00
|
|
|
#endif
|
2013-12-25 18:39:21 +08:00
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
namespace svm {
|
|
|
|
|
|
|
|
enum AllocatorFlags { // don't use first 16 bits
|
|
|
|
OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
|
|
|
|
OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
|
|
|
|
OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
|
|
|
|
OPENCL_SVM_BUFFER_MASK = 3 << 16,
|
|
|
|
OPENCL_SVM_BUFFER_MAP = 4 << 16
|
|
|
|
};
|
|
|
|
|
|
|
|
static bool checkForceSVMUmatUsage()
|
|
|
|
{
|
|
|
|
static bool initialized = false;
|
|
|
|
static bool force = false;
|
|
|
|
if (!initialized)
|
|
|
|
{
|
2017-05-25 23:59:01 +08:00
|
|
|
force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
|
2015-01-02 08:33:40 +08:00
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
return force;
|
|
|
|
}
|
|
|
|
static bool checkDisableSVMUMatUsage()
|
|
|
|
{
|
|
|
|
static bool initialized = false;
|
|
|
|
static bool force = false;
|
|
|
|
if (!initialized)
|
|
|
|
{
|
2017-05-25 23:59:01 +08:00
|
|
|
force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
|
2015-01-02 08:33:40 +08:00
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
return force;
|
|
|
|
}
|
|
|
|
static bool checkDisableSVM()
|
|
|
|
{
|
|
|
|
static bool initialized = false;
|
|
|
|
static bool force = false;
|
|
|
|
if (!initialized)
|
|
|
|
{
|
2017-05-25 23:59:01 +08:00
|
|
|
force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
|
2015-01-02 08:33:40 +08:00
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
return force;
|
|
|
|
}
|
|
|
|
// see SVMCapabilities
|
|
|
|
static unsigned int getSVMCapabilitiesMask()
|
|
|
|
{
|
|
|
|
static bool initialized = false;
|
|
|
|
static unsigned int mask = 0;
|
|
|
|
if (!initialized)
|
|
|
|
{
|
|
|
|
const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
|
|
|
|
if (envValue == NULL)
|
|
|
|
{
|
|
|
|
return ~0U; // all bits 1
|
|
|
|
}
|
|
|
|
mask = atoi(envValue);
|
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
return mask;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#endif
|
|
|
|
|
2016-12-19 05:38:33 +08:00
|
|
|
static size_t getProgramCountLimit()
|
|
|
|
{
|
|
|
|
static bool initialized = false;
|
|
|
|
static size_t count = 0;
|
|
|
|
if (!initialized)
|
|
|
|
{
|
2017-05-25 23:59:01 +08:00
|
|
|
count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
|
2016-12-19 05:38:33 +08:00
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
struct Context::Impl
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
static Context::Impl* get(Context& context) { return context.p; }
|
|
|
|
|
|
|
|
void __init()
|
2013-12-02 05:50:24 +08:00
|
|
|
{
|
|
|
|
refcount = 1;
|
|
|
|
handle = 0;
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
svmInitialized = false;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
Impl()
|
|
|
|
{
|
|
|
|
__init();
|
2013-12-02 05:50:24 +08:00
|
|
|
}
|
|
|
|
|
2013-12-25 18:39:21 +08:00
|
|
|
void setDefault()
|
|
|
|
{
|
|
|
|
CV_Assert(handle == NULL);
|
|
|
|
|
|
|
|
cl_device_id d = selectOpenCLDevice();
|
|
|
|
|
|
|
|
if (d == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
cl_platform_id pl = NULL;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
|
2013-12-25 18:39:21 +08:00
|
|
|
|
|
|
|
cl_context_properties prop[] =
|
|
|
|
{
|
|
|
|
CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
|
|
|
|
0
|
|
|
|
};
|
|
|
|
|
|
|
|
// !!! in the current implementation force the number of devices to 1 !!!
|
2014-02-01 19:07:03 +08:00
|
|
|
cl_uint nd = 1;
|
|
|
|
cl_int status;
|
2013-12-25 18:39:21 +08:00
|
|
|
|
|
|
|
handle = clCreateContext(prop, nd, &d, 0, 0, &status);
|
2014-02-01 19:07:03 +08:00
|
|
|
|
|
|
|
bool ok = handle != 0 && status == CL_SUCCESS;
|
2013-12-25 18:39:21 +08:00
|
|
|
if( ok )
|
|
|
|
{
|
|
|
|
devices.resize(nd);
|
|
|
|
devices[0].set(d);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
handle = NULL;
|
|
|
|
}
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
Impl(int dtype0)
|
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
__init();
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
cl_int retval = 0;
|
|
|
|
cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
|
|
|
|
cl_context_properties prop[] =
|
|
|
|
{
|
|
|
|
CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
|
|
|
|
0
|
|
|
|
};
|
|
|
|
|
|
|
|
cl_uint i, nd0 = 0, nd = 0;
|
|
|
|
int dtype = dtype0 & 15;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
AutoBuffer<void*> dlistbuf(nd0*2+1);
|
|
|
|
cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
|
|
|
|
cl_device_id* dlist_new = dlist + nd0;
|
2015-01-02 08:33:40 +08:00
|
|
|
CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
|
2013-10-22 18:04:49 +08:00
|
|
|
String name0;
|
|
|
|
|
|
|
|
for(i = 0; i < nd0; i++)
|
|
|
|
{
|
|
|
|
Device d(dlist[i]);
|
|
|
|
if( !d.available() || !d.compilerAvailable() )
|
|
|
|
continue;
|
|
|
|
if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
|
|
|
|
continue;
|
|
|
|
if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
|
|
|
|
continue;
|
|
|
|
String name = d.name();
|
|
|
|
if( nd != 0 && name != name0 )
|
|
|
|
continue;
|
|
|
|
name0 = name;
|
|
|
|
dlist_new[nd++] = dlist[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
if(nd == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
// !!! in the current implementation force the number of devices to 1 !!!
|
|
|
|
nd = 1;
|
|
|
|
|
|
|
|
handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
|
2014-02-01 19:07:03 +08:00
|
|
|
bool ok = handle != 0 && retval == CL_SUCCESS;
|
2013-10-22 18:04:49 +08:00
|
|
|
if( ok )
|
|
|
|
{
|
|
|
|
devices.resize(nd);
|
|
|
|
for( i = 0; i < nd; i++ )
|
|
|
|
devices[i].set(dlist_new[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
~Impl()
|
|
|
|
{
|
|
|
|
if(handle)
|
2014-02-01 19:07:03 +08:00
|
|
|
{
|
2013-10-22 18:04:49 +08:00
|
|
|
clReleaseContext(handle);
|
2014-02-01 19:07:03 +08:00
|
|
|
handle = NULL;
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
devices.clear();
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Program getProg(const ProgramSource& src,
|
2013-10-22 18:04:49 +08:00
|
|
|
const String& buildflags, String& errmsg)
|
|
|
|
{
|
2016-12-19 05:38:33 +08:00
|
|
|
size_t limit = getProgramCountLimit();
|
2017-07-31 22:24:08 +08:00
|
|
|
String key = cv::format("codehash=%08llx ", src.hash()) + Program::getPrefix(buildflags);
|
2016-12-19 05:38:33 +08:00
|
|
|
{
|
|
|
|
cv::AutoLock lock(program_cache_mutex);
|
|
|
|
phash_t::iterator it = phash.find(key);
|
|
|
|
if (it != phash.end())
|
|
|
|
{
|
|
|
|
// TODO LRU cache
|
|
|
|
CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
|
|
|
|
if (i != cacheList.end() && i != cacheList.begin())
|
|
|
|
{
|
|
|
|
cacheList.erase(i);
|
|
|
|
cacheList.push_front(key);
|
|
|
|
}
|
|
|
|
return it->second;
|
|
|
|
}
|
|
|
|
{ // cleanup program cache
|
|
|
|
size_t sz = phash.size();
|
|
|
|
if (limit > 0 && sz >= limit)
|
|
|
|
{
|
2016-12-22 09:52:43 +08:00
|
|
|
static bool warningFlag = false;
|
|
|
|
if (!warningFlag)
|
|
|
|
{
|
|
|
|
printf("\nWARNING: OpenCV-OpenCL:\n"
|
|
|
|
" In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
|
|
|
|
" You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
|
|
|
|
warningFlag = true;
|
|
|
|
}
|
2016-12-19 05:38:33 +08:00
|
|
|
while (!cacheList.empty())
|
|
|
|
{
|
|
|
|
size_t c = phash.erase(cacheList.back());
|
|
|
|
cacheList.pop_back();
|
|
|
|
if (c != 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
Program prog(src, buildflags, errmsg);
|
2013-11-19 00:48:00 +08:00
|
|
|
if(prog.ptr())
|
2016-12-19 05:38:33 +08:00
|
|
|
{
|
|
|
|
cv::AutoLock lock(program_cache_mutex);
|
|
|
|
phash.insert(std::pair<std::string, Program>(key, prog));
|
|
|
|
cacheList.push_front(key);
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
return prog;
|
|
|
|
}
|
|
|
|
|
2017-08-25 08:42:11 +08:00
|
|
|
void unloadProg(Program& prog)
|
|
|
|
{
|
|
|
|
cv::AutoLock lock(program_cache_mutex);
|
|
|
|
for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
|
|
|
|
{
|
|
|
|
phash_t::iterator it = phash.find(*i);
|
|
|
|
if (it != phash.end())
|
|
|
|
{
|
|
|
|
if (it->second.ptr() == prog.ptr())
|
|
|
|
{
|
|
|
|
phash.erase(*i);
|
|
|
|
cacheList.erase(i);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-12-19 05:38:33 +08:00
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
IMPLEMENT_REFCOUNTABLE();
|
|
|
|
|
|
|
|
cl_context handle;
|
|
|
|
std::vector<Device> devices;
|
|
|
|
|
2016-12-19 05:38:33 +08:00
|
|
|
cv::Mutex program_cache_mutex;
|
|
|
|
typedef std::map<std::string, Program> phash_t;
|
2013-10-22 18:04:49 +08:00
|
|
|
phash_t phash;
|
2016-12-19 05:38:33 +08:00
|
|
|
typedef std::list<cv::String> CacheList;
|
|
|
|
CacheList cacheList;
|
2015-01-02 08:33:40 +08:00
|
|
|
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
bool svmInitialized;
|
|
|
|
bool svmAvailable;
|
|
|
|
bool svmEnabled;
|
|
|
|
svm::SVMCapabilities svmCapabilities;
|
|
|
|
svm::SVMFunctions svmFunctions;
|
|
|
|
|
|
|
|
void svmInit()
|
|
|
|
{
|
|
|
|
CV_Assert(handle != NULL);
|
|
|
|
const Device& device = devices[0];
|
|
|
|
cl_device_svm_capabilities deviceCaps = 0;
|
|
|
|
CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
|
|
|
|
cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
|
|
|
|
if (status != CL_SUCCESS)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
|
|
|
|
goto noSVM;
|
|
|
|
}
|
|
|
|
CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
|
|
|
|
CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
|
|
|
|
svmCapabilities.value_ =
|
|
|
|
((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
|
|
|
|
((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
|
|
|
|
((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
|
|
|
|
((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
|
|
|
|
svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
|
|
|
|
if (svmCapabilities.value_ == 0)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
|
|
|
|
goto noSVM;
|
|
|
|
}
|
|
|
|
try
|
|
|
|
{
|
|
|
|
// Try OpenCL 2.0
|
|
|
|
CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
|
|
|
|
void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
|
|
|
|
if (!ptr)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
|
|
|
|
CV_ErrorNoReturn(Error::StsBadArg, "clSVMAlloc returned NULL");
|
|
|
|
}
|
|
|
|
try
|
|
|
|
{
|
|
|
|
bool error = false;
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
|
|
|
|
CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMMap FAILED");
|
|
|
|
}
|
|
|
|
clFinish(q);
|
|
|
|
try
|
|
|
|
{
|
|
|
|
((int*)ptr)[0] = 100;
|
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
|
|
|
|
error = true;
|
|
|
|
}
|
|
|
|
if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
|
|
|
|
CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
|
|
|
|
}
|
|
|
|
clFinish(q);
|
|
|
|
if (error)
|
|
|
|
{
|
|
|
|
CV_ErrorNoReturn(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
|
|
|
|
clSVMFree(handle, ptr);
|
|
|
|
throw;
|
|
|
|
}
|
|
|
|
clSVMFree(handle, ptr);
|
|
|
|
svmFunctions.fn_clSVMAlloc = clSVMAlloc;
|
|
|
|
svmFunctions.fn_clSVMFree = clSVMFree;
|
|
|
|
svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
|
|
|
|
//svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
|
|
|
|
//svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
|
|
|
|
svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
|
|
|
|
svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
|
|
|
|
svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
|
|
|
|
svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
|
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
|
|
|
|
try
|
|
|
|
{
|
|
|
|
// Try HSA extension
|
|
|
|
String extensions = device.extensions();
|
|
|
|
if (extensions.find("cl_amd_svm") == String::npos)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
|
|
|
|
goto noSVM;
|
|
|
|
}
|
|
|
|
cl_platform_id p = NULL;
|
|
|
|
status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
|
|
|
|
svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
|
|
|
|
svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
|
|
|
|
//svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
|
|
|
|
//svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
|
|
|
|
svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
|
|
|
|
svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
|
|
|
|
svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
|
|
|
|
svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
|
|
|
|
CV_Assert(svmFunctions.isValid());
|
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
|
|
|
|
goto noSVM;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
svmAvailable = true;
|
|
|
|
svmEnabled = !svm::checkDisableSVM();
|
|
|
|
svmInitialized = true;
|
|
|
|
CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
|
|
|
|
return;
|
|
|
|
noSVM:
|
|
|
|
CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
|
|
|
|
svmAvailable = false;
|
|
|
|
svmEnabled = false;
|
|
|
|
svmCapabilities.value_ = 0;
|
|
|
|
svmInitialized = true;
|
|
|
|
svmFunctions.fn_clSVMAlloc = NULL;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Context::Context()
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Context::Context(int dtype)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
create(dtype);
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
bool Context::create()
|
2013-12-25 18:39:21 +08:00
|
|
|
{
|
|
|
|
if( !haveOpenCL() )
|
|
|
|
return false;
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = new Impl();
|
|
|
|
if(!p->handle)
|
|
|
|
{
|
|
|
|
delete p;
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
return p != 0;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
bool Context::create(int dtype0)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
if( !haveOpenCL() )
|
|
|
|
return false;
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = new Impl(dtype0);
|
|
|
|
if(!p->handle)
|
|
|
|
{
|
|
|
|
delete p;
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
return p != 0;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Context::~Context()
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2013-12-27 18:02:03 +08:00
|
|
|
if (p)
|
|
|
|
{
|
|
|
|
p->release();
|
|
|
|
p = NULL;
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Context::Context(const Context& c)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
p = (Impl*)c.p;
|
|
|
|
if(p)
|
|
|
|
p->addref();
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Context& Context::operator = (const Context& c)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
Impl* newp = (Impl*)c.p;
|
|
|
|
if(newp)
|
|
|
|
newp->addref();
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = newp;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
void* Context::ptr() const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2013-12-27 18:02:03 +08:00
|
|
|
return p == NULL ? NULL : p->handle;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
size_t Context::ndevices() const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
return p ? p->devices.size() : 0;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
const Device& Context::device(size_t idx) const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
static Device dummy;
|
|
|
|
return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Context& Context::getDefault(bool initialize)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2014-03-13 19:45:23 +08:00
|
|
|
static Context* ctx = new Context();
|
|
|
|
if(!ctx->p && haveOpenCL())
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2014-03-13 19:45:23 +08:00
|
|
|
if (!ctx->p)
|
|
|
|
ctx->p = new Impl();
|
2013-12-02 05:50:24 +08:00
|
|
|
if (initialize)
|
|
|
|
{
|
2014-02-01 00:23:01 +08:00
|
|
|
// do not create new Context right away.
|
2013-12-02 05:50:24 +08:00
|
|
|
// First, try to retrieve existing context of the same type.
|
2014-02-01 00:23:01 +08:00
|
|
|
// In its turn, Platform::getContext() may call Context::create()
|
2013-12-02 05:50:24 +08:00
|
|
|
// if there is no such context.
|
2014-03-13 19:45:23 +08:00
|
|
|
if (ctx->p->handle == NULL)
|
|
|
|
ctx->p->setDefault();
|
2013-12-02 05:50:24 +08:00
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2014-03-13 19:45:23 +08:00
|
|
|
return *ctx;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Program Context::getProg(const ProgramSource& prog,
|
2013-10-22 18:04:49 +08:00
|
|
|
const String& buildopts, String& errmsg)
|
|
|
|
{
|
|
|
|
return p ? p->getProg(prog, buildopts, errmsg) : Program();
|
|
|
|
}
|
|
|
|
|
2017-08-25 08:42:11 +08:00
|
|
|
void Context::unloadProg(Program& prog)
|
|
|
|
{
|
|
|
|
if (p)
|
|
|
|
p->unloadProg(prog);
|
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
bool Context::useSVM() const
|
|
|
|
{
|
|
|
|
Context::Impl* i = p;
|
|
|
|
CV_Assert(i);
|
|
|
|
if (!i->svmInitialized)
|
|
|
|
i->svmInit();
|
|
|
|
return i->svmEnabled;
|
|
|
|
}
|
|
|
|
void Context::setUseSVM(bool enabled)
|
|
|
|
{
|
|
|
|
Context::Impl* i = p;
|
|
|
|
CV_Assert(i);
|
|
|
|
if (!i->svmInitialized)
|
|
|
|
i->svmInit();
|
|
|
|
if (enabled && !i->svmAvailable)
|
|
|
|
{
|
|
|
|
CV_ErrorNoReturn(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
|
|
|
|
}
|
|
|
|
i->svmEnabled = enabled;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
bool Context::useSVM() const { return false; }
|
|
|
|
void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
namespace svm {
|
|
|
|
|
|
|
|
const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
|
|
|
|
{
|
|
|
|
Context::Impl* i = context.p;
|
|
|
|
CV_Assert(i);
|
|
|
|
if (!i->svmInitialized)
|
|
|
|
i->svmInit();
|
|
|
|
return i->svmCapabilities;
|
|
|
|
}
|
|
|
|
|
|
|
|
CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
|
|
|
|
{
|
|
|
|
Context::Impl* i = context.p;
|
|
|
|
CV_Assert(i);
|
|
|
|
CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
|
|
|
|
CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
|
|
|
|
return &i->svmFunctions;
|
|
|
|
}
|
|
|
|
|
|
|
|
CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
|
|
|
|
{
|
|
|
|
if (checkForceSVMUmatUsage())
|
|
|
|
return true;
|
|
|
|
if (checkDisableSVMUMatUsage())
|
|
|
|
return false;
|
|
|
|
if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
|
|
|
|
return true;
|
|
|
|
return false; // don't use SVM by default
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace cv::ocl::svm
|
|
|
|
#endif // HAVE_OPENCL_SVM
|
|
|
|
|
|
|
|
|
OpenCV-OpenCL interop (PR #4072):
Commits:
added new function, cv::ocl::attachContext(String& platformName, void* platformID, void* context, void* deviceID) which allow to attach externally created OpenCL context to OpenCV.
add definitions of clRetainDevice, clRetainContext funcs
removed definitions for clRetainContext, clRetainDevice
fixed build issue under Linux
fixed uninitialized vars, replace dbgassert in error handling
remove function which is not ready yet
add new function, cv::ocl::convertFromBuffer(int rows, int cols, int type, void* cl_mem_obj, UMat& dst, UMatUsageFlags usageFlags = cv::USAGE_DEFAULT) which attaches user allocated OpenCL clBuffer to UMat
uncommented clGetMemObjectInfo definition (otherwise prevent opencv build)
fixed build issue on linux and android
add step parameter to cv::ocl::convertFromBuffer func
suppress compile-time warning
added sample opencl-opencv interoperability (showcase for cv::ocl::convertFromBuffer func)
CMakeLists.txt modified to not create sample build script if OpenCL SDK not found in system
fixed build issue (apple opencl include dir and spaces in CMake file)
added call to clRetainContext for attachContext func and call to clRetainMemObject for convertFromBuffer func
uncommented clRetainMemObject definition
added comments and cleanup
add local path to cmake modules search dirs (instead of replacing)
remove REQUIRED for find_package call (sample build together with opencv). need to try standalone sample build
opencl-interop sample moved to standalone build
set minimum version requirement for sample's cmake to 3.1
put cmake_minimum_required under condition, so do not check if samples not builded
remove code dups for setSize, updateContinuityFlag, and finalizeHdr
commented out cmake_minimum_required(VERSION 3.1)
add safety check for cmake version
add convertFromImage func and update opencl-interop sample
uncommented clGetImageInfo defs
uncommented clEnqueueCopyImageToBuffer defs
fixed clEnqueueCopyImageToBuffer defs
add doxygen comments
remove doxygen @fn tag
try to restart buildbot
add doxygen comments to directx interop funcs
remove internal header, use fwd declarations in affected compile units instead
2015-05-28 04:22:33 +08:00
|
|
|
static void get_platform_name(cl_platform_id id, String& name)
|
|
|
|
{
|
|
|
|
// get platform name string length
|
|
|
|
size_t sz = 0;
|
|
|
|
if (CL_SUCCESS != clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz))
|
|
|
|
CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "clGetPlatformInfo failed!");
|
|
|
|
|
|
|
|
// get platform name string
|
|
|
|
AutoBuffer<char> buf(sz + 1);
|
|
|
|
if (CL_SUCCESS != clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf, 0))
|
|
|
|
CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "clGetPlatformInfo failed!");
|
|
|
|
|
|
|
|
// just in case, ensure trailing zero for ASCIIZ string
|
|
|
|
buf[sz] = 0;
|
|
|
|
|
|
|
|
name = (const char*)buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
// Attaches OpenCL context to OpenCV
|
|
|
|
*/
|
|
|
|
void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
|
|
|
|
{
|
|
|
|
cl_uint cnt = 0;
|
|
|
|
|
|
|
|
if(CL_SUCCESS != clGetPlatformIDs(0, 0, &cnt))
|
|
|
|
CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "clGetPlatformIDs failed!");
|
|
|
|
|
|
|
|
if (cnt == 0)
|
|
|
|
CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "no OpenCL platform available!");
|
|
|
|
|
|
|
|
std::vector<cl_platform_id> platforms(cnt);
|
|
|
|
|
|
|
|
if(CL_SUCCESS != clGetPlatformIDs(cnt, &platforms[0], 0))
|
|
|
|
CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "clGetPlatformIDs failed!");
|
|
|
|
|
|
|
|
bool platformAvailable = false;
|
|
|
|
|
|
|
|
// check if external platformName contained in list of available platforms in OpenCV
|
|
|
|
for (unsigned int i = 0; i < cnt; i++)
|
|
|
|
{
|
|
|
|
String availablePlatformName;
|
|
|
|
get_platform_name(platforms[i], availablePlatformName);
|
|
|
|
// external platform is found in the list of available platforms
|
|
|
|
if (platformName == availablePlatformName)
|
|
|
|
{
|
|
|
|
platformAvailable = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!platformAvailable)
|
|
|
|
CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "No matched platforms available!");
|
|
|
|
|
|
|
|
// check if platformID corresponds to platformName
|
|
|
|
String actualPlatformName;
|
|
|
|
get_platform_name((cl_platform_id)platformID, actualPlatformName);
|
|
|
|
if (platformName != actualPlatformName)
|
|
|
|
CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "No matched platforms available!");
|
|
|
|
|
|
|
|
// do not initialize OpenCL context
|
|
|
|
Context ctx = Context::getDefault(false);
|
|
|
|
|
|
|
|
// attach supplied context to OpenCV
|
|
|
|
initializeContextFromHandle(ctx, platformID, context, deviceID);
|
|
|
|
|
|
|
|
if(CL_SUCCESS != clRetainContext((cl_context)context))
|
|
|
|
CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "clRetainContext failed!");
|
|
|
|
|
|
|
|
// clear command queue, if any
|
|
|
|
getCoreTlsData().get()->oclQueue.finish();
|
|
|
|
Queue q;
|
|
|
|
getCoreTlsData().get()->oclQueue = q;
|
|
|
|
|
|
|
|
return;
|
|
|
|
} // attachContext()
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
|
2013-12-02 05:50:24 +08:00
|
|
|
{
|
|
|
|
cl_context context = (cl_context)_context;
|
|
|
|
cl_device_id device = (cl_device_id)_device;
|
|
|
|
|
|
|
|
// cleanup old context
|
2014-02-01 00:23:01 +08:00
|
|
|
Context::Impl * impl = ctx.p;
|
2013-12-02 05:50:24 +08:00
|
|
|
if (impl->handle)
|
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
|
2013-12-02 05:50:24 +08:00
|
|
|
}
|
|
|
|
impl->devices.clear();
|
|
|
|
|
|
|
|
impl->handle = context;
|
|
|
|
impl->devices.resize(1);
|
|
|
|
impl->devices[0].set(device);
|
|
|
|
|
|
|
|
Platform& p = Platform::getDefault();
|
2014-02-01 19:07:03 +08:00
|
|
|
Platform::Impl* pImpl = p.p;
|
2013-12-02 05:50:24 +08:00
|
|
|
pImpl->handle = (cl_platform_id)platform;
|
|
|
|
}
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
/////////////////////////////////////////// Queue /////////////////////////////////////////////
|
2013-12-02 05:50:24 +08:00
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
struct Queue::Impl
|
|
|
|
{
|
2017-09-08 23:57:26 +08:00
|
|
|
inline void __init()
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
refcount = 1;
|
2017-09-08 23:57:26 +08:00
|
|
|
handle = 0;
|
|
|
|
isProfilingQueue_ = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
Impl(cl_command_queue q)
|
|
|
|
{
|
|
|
|
__init();
|
|
|
|
handle = q;
|
|
|
|
|
|
|
|
cl_command_queue_properties props = 0;
|
|
|
|
cl_int result = clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL);
|
|
|
|
CV_Assert(result && "clGetCommandQueueInfo(CL_QUEUE_PROPERTIES)");
|
|
|
|
isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
|
|
|
|
}
|
|
|
|
|
|
|
|
Impl(cl_command_queue q, bool isProfilingQueue)
|
|
|
|
{
|
|
|
|
__init();
|
|
|
|
handle = q;
|
|
|
|
isProfilingQueue_ = isProfilingQueue;
|
|
|
|
}
|
|
|
|
|
|
|
|
Impl(const Context& c, const Device& d, bool withProfiling = false)
|
|
|
|
{
|
|
|
|
__init();
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
const Context* pc = &c;
|
2013-10-22 18:04:49 +08:00
|
|
|
cl_context ch = (cl_context)pc->ptr();
|
|
|
|
if( !ch )
|
|
|
|
{
|
2014-02-01 00:23:01 +08:00
|
|
|
pc = &Context::getDefault();
|
2013-10-22 18:04:49 +08:00
|
|
|
ch = (cl_context)pc->ptr();
|
|
|
|
}
|
|
|
|
cl_device_id dh = (cl_device_id)d.ptr();
|
|
|
|
if( !dh )
|
|
|
|
dh = (cl_device_id)pc->device(0).ptr();
|
|
|
|
cl_int retval = 0;
|
2017-09-08 23:57:26 +08:00
|
|
|
cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
|
|
|
|
handle = clCreateCommandQueue(ch, dh, props, &retval);
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(retval == CL_SUCCESS);
|
2017-09-08 23:57:26 +08:00
|
|
|
isProfilingQueue_ = withProfiling;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
~Impl()
|
|
|
|
{
|
2013-12-11 04:31:34 +08:00
|
|
|
#ifdef _WIN32
|
|
|
|
if (!cv::__termination)
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2013-12-11 04:31:34 +08:00
|
|
|
if(handle)
|
|
|
|
{
|
|
|
|
clFinish(handle);
|
|
|
|
clReleaseCommandQueue(handle);
|
2014-02-01 19:07:03 +08:00
|
|
|
handle = NULL;
|
2013-12-11 04:31:34 +08:00
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-09-08 23:57:26 +08:00
|
|
|
const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
|
|
|
|
{
|
|
|
|
if (isProfilingQueue_)
|
|
|
|
return self;
|
|
|
|
|
|
|
|
if (profiling_queue_.ptr())
|
|
|
|
return profiling_queue_;
|
|
|
|
|
|
|
|
cl_context ctx = 0;
|
|
|
|
CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
|
|
|
|
|
|
|
|
cl_device_id device = 0;
|
|
|
|
CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
|
|
|
|
|
|
|
|
cl_int result = CL_SUCCESS;
|
|
|
|
cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
|
|
|
|
cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
|
|
|
|
CV_Assert(result == CL_SUCCESS && "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
|
|
|
|
|
|
|
|
Queue queue;
|
|
|
|
queue.p = new Impl(q, true);
|
|
|
|
profiling_queue_ = queue;
|
|
|
|
|
|
|
|
return profiling_queue_;
|
|
|
|
}
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
IMPLEMENT_REFCOUNTABLE();
|
|
|
|
|
|
|
|
cl_command_queue handle;
|
2017-09-08 23:57:26 +08:00
|
|
|
bool isProfilingQueue_;
|
|
|
|
cv::ocl::Queue profiling_queue_;
|
2013-10-22 18:04:49 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
Queue::Queue()
|
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Queue::Queue(const Context& c, const Device& d)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
create(c, d);
|
|
|
|
}
|
|
|
|
|
|
|
|
Queue::Queue(const Queue& q)
|
|
|
|
{
|
|
|
|
p = q.p;
|
|
|
|
if(p)
|
|
|
|
p->addref();
|
|
|
|
}
|
|
|
|
|
|
|
|
Queue& Queue::operator = (const Queue& q)
|
|
|
|
{
|
|
|
|
Impl* newp = (Impl*)q.p;
|
|
|
|
if(newp)
|
|
|
|
newp->addref();
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = newp;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
Queue::~Queue()
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
bool Queue::create(const Context& c, const Device& d)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = new Impl(c, d);
|
|
|
|
return p->handle != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Queue::finish()
|
|
|
|
{
|
|
|
|
if(p && p->handle)
|
2014-02-01 19:07:03 +08:00
|
|
|
{
|
|
|
|
CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2017-09-08 23:57:26 +08:00
|
|
|
const Queue& Queue::getProfilingQueue() const
|
|
|
|
{
|
|
|
|
CV_Assert(p);
|
|
|
|
return p->getProfilingQueue(*this);
|
|
|
|
}
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
void* Queue::ptr() const
|
|
|
|
{
|
|
|
|
return p ? p->handle : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
Queue& Queue::getDefault()
|
|
|
|
{
|
2015-02-04 18:03:27 +08:00
|
|
|
Queue& q = getCoreTlsData().get()->oclQueue;
|
2013-12-01 07:12:19 +08:00
|
|
|
if( !q.p && haveOpenCL() )
|
2014-02-01 00:23:01 +08:00
|
|
|
q.create(Context::getDefault());
|
2013-10-22 18:04:49 +08:00
|
|
|
return q;
|
|
|
|
}
|
|
|
|
|
|
|
|
static cl_command_queue getQueue(const Queue& q)
|
|
|
|
{
|
|
|
|
cl_command_queue qq = (cl_command_queue)q.ptr();
|
|
|
|
if(!qq)
|
|
|
|
qq = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
return qq;
|
|
|
|
}
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
/////////////////////////////////////////// KernelArg /////////////////////////////////////////////
|
|
|
|
|
2013-11-19 00:48:00 +08:00
|
|
|
KernelArg::KernelArg()
|
2014-03-08 05:29:27 +08:00
|
|
|
: flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
|
2013-11-19 00:48:00 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2014-03-08 05:29:27 +08:00
|
|
|
KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
|
|
|
|
: flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2017-02-14 19:58:52 +08:00
|
|
|
CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
KernelArg KernelArg::Constant(const Mat& m)
|
|
|
|
{
|
|
|
|
CV_Assert(m.isContinuous());
|
2014-08-13 19:08:27 +08:00
|
|
|
return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
/////////////////////////////////////////// Kernel /////////////////////////////////////////////
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
struct Kernel::Impl
|
|
|
|
{
|
2014-02-23 18:59:06 +08:00
|
|
|
Impl(const char* kname, const Program& prog) :
|
2017-07-06 18:25:32 +08:00
|
|
|
refcount(1), isInProgress(false), nu(0)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
cl_program ph = (cl_program)prog.ptr();
|
|
|
|
cl_int retval = 0;
|
2016-08-25 22:26:46 +08:00
|
|
|
#ifdef ENABLE_INSTRUMENTATION
|
|
|
|
name = kname;
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
handle = ph != 0 ?
|
|
|
|
clCreateKernel(ph, kname, &retval) : 0;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(retval == CL_SUCCESS);
|
2013-10-25 20:46:03 +08:00
|
|
|
for( int i = 0; i < MAX_ARRS; i++ )
|
|
|
|
u[i] = 0;
|
2013-12-02 03:14:45 +08:00
|
|
|
haveTempDstUMats = false;
|
2013-10-25 20:46:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void cleanupUMats()
|
|
|
|
{
|
|
|
|
for( int i = 0; i < MAX_ARRS; i++ )
|
|
|
|
if( u[i] )
|
|
|
|
{
|
|
|
|
if( CV_XADD(&u[i]->urefcount, -1) == 1 )
|
2017-07-06 22:57:05 +08:00
|
|
|
{
|
|
|
|
u[i]->flags |= UMatData::ASYNC_CLEANUP;
|
2013-10-25 20:46:03 +08:00
|
|
|
u[i]->currAllocator->deallocate(u[i]);
|
2017-07-06 22:57:05 +08:00
|
|
|
}
|
2013-10-25 20:46:03 +08:00
|
|
|
u[i] = 0;
|
|
|
|
}
|
|
|
|
nu = 0;
|
2013-12-02 03:14:45 +08:00
|
|
|
haveTempDstUMats = false;
|
2013-10-25 20:46:03 +08:00
|
|
|
}
|
|
|
|
|
2013-12-02 03:14:45 +08:00
|
|
|
void addUMat(const UMat& m, bool dst)
|
2013-10-25 20:46:03 +08:00
|
|
|
{
|
|
|
|
CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
|
|
|
|
u[nu] = m.u;
|
|
|
|
CV_XADD(&m.u->urefcount, 1);
|
|
|
|
nu++;
|
2013-12-02 03:14:45 +08:00
|
|
|
if(dst && m.u->tempUMat())
|
|
|
|
haveTempDstUMats = true;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2013-10-25 20:46:03 +08:00
|
|
|
|
2014-04-15 07:09:17 +08:00
|
|
|
void addImage(const Image2D& image)
|
|
|
|
{
|
|
|
|
images.push_back(image);
|
|
|
|
}
|
|
|
|
|
2017-07-06 18:25:32 +08:00
|
|
|
void finit(cl_event e)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2017-07-06 18:25:32 +08:00
|
|
|
CV_UNUSED(e);
|
|
|
|
#if 0
|
|
|
|
printf("event::callback(%p)\n", e); fflush(stdout);
|
|
|
|
#endif
|
2013-10-25 20:46:03 +08:00
|
|
|
cleanupUMats();
|
2014-04-15 07:09:17 +08:00
|
|
|
images.clear();
|
2017-07-06 18:25:32 +08:00
|
|
|
isInProgress = false;
|
2013-10-22 18:04:49 +08:00
|
|
|
release();
|
|
|
|
}
|
|
|
|
|
2017-09-09 00:42:59 +08:00
|
|
|
bool run(int dims, size_t _globalsize[], size_t _localsize[],
|
|
|
|
bool sync, int64* timeNS, const Queue& q);
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
~Impl()
|
|
|
|
{
|
|
|
|
if(handle)
|
|
|
|
clReleaseKernel(handle);
|
|
|
|
}
|
|
|
|
|
|
|
|
IMPLEMENT_REFCOUNTABLE();
|
|
|
|
|
2016-08-25 22:26:46 +08:00
|
|
|
#ifdef ENABLE_INSTRUMENTATION
|
|
|
|
cv::String name;
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
cl_kernel handle;
|
2013-10-25 20:46:03 +08:00
|
|
|
enum { MAX_ARRS = 16 };
|
|
|
|
UMatData* u[MAX_ARRS];
|
2017-07-06 18:25:32 +08:00
|
|
|
bool isInProgress;
|
2013-10-25 20:46:03 +08:00
|
|
|
int nu;
|
2014-04-15 07:09:17 +08:00
|
|
|
std::list<Image2D> images;
|
2013-12-02 03:14:45 +08:00
|
|
|
bool haveTempDstUMats;
|
2013-10-22 18:04:49 +08:00
|
|
|
};
|
|
|
|
|
OpenCV-OpenCL interop (PR #4072):
Commits:
added new function, cv::ocl::attachContext(String& platformName, void* platformID, void* context, void* deviceID) which allow to attach externally created OpenCL context to OpenCV.
add definitions of clRetainDevice, clRetainContext funcs
removed definitions for clRetainContext, clRetainDevice
fixed build issue under Linux
fixed uninitialized vars, replace dbgassert in error handling
remove function which is not ready yet
add new function, cv::ocl::convertFromBuffer(int rows, int cols, int type, void* cl_mem_obj, UMat& dst, UMatUsageFlags usageFlags = cv::USAGE_DEFAULT) which attaches user allocated OpenCL clBuffer to UMat
uncommented clGetMemObjectInfo definition (otherwise prevent opencv build)
fixed build issue on linux and android
add step parameter to cv::ocl::convertFromBuffer func
suppress compile-time warning
added sample opencl-opencv interoperability (showcase for cv::ocl::convertFromBuffer func)
CMakeLists.txt modified to not create sample build script if OpenCL SDK not found in system
fixed build issue (apple opencl include dir and spaces in CMake file)
added call to clRetainContext for attachContext func and call to clRetainMemObject for convertFromBuffer func
uncommented clRetainMemObject definition
added comments and cleanup
add local path to cmake modules search dirs (instead of replacing)
remove REQUIRED for find_package call (sample build together with opencv). need to try standalone sample build
opencl-interop sample moved to standalone build
set minimum version requirement for sample's cmake to 3.1
put cmake_minimum_required under condition, so do not check if samples not builded
remove code dups for setSize, updateContinuityFlag, and finalizeHdr
commented out cmake_minimum_required(VERSION 3.1)
add safety check for cmake version
add convertFromImage func and update opencl-interop sample
uncommented clGetImageInfo defs
uncommented clEnqueueCopyImageToBuffer defs
fixed clEnqueueCopyImageToBuffer defs
add doxygen comments
remove doxygen @fn tag
try to restart buildbot
add doxygen comments to directx interop funcs
remove internal header, use fwd declarations in affected compile units instead
2015-05-28 04:22:33 +08:00
|
|
|
}} // namespace cv::ocl
|
|
|
|
|
|
|
|
extern "C" {
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2017-07-06 18:25:32 +08:00
|
|
|
static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2017-07-06 18:25:32 +08:00
|
|
|
((cv::ocl::Kernel::Impl*)p)->finit(e);
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace cv { namespace ocl {
|
|
|
|
|
|
|
|
Kernel::Kernel()
|
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
Kernel::Kernel(const char* kname, const Program& prog)
|
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
create(kname, prog);
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Kernel::Kernel(const char* kname, const ProgramSource& src,
|
2013-11-19 00:48:00 +08:00
|
|
|
const String& buildopts, String* errmsg)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
create(kname, src, buildopts, errmsg);
|
|
|
|
}
|
|
|
|
|
|
|
|
Kernel::Kernel(const Kernel& k)
|
|
|
|
{
|
|
|
|
p = k.p;
|
|
|
|
if(p)
|
|
|
|
p->addref();
|
|
|
|
}
|
|
|
|
|
|
|
|
Kernel& Kernel::operator = (const Kernel& k)
|
|
|
|
{
|
|
|
|
Impl* newp = (Impl*)k.p;
|
|
|
|
if(newp)
|
|
|
|
newp->addref();
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = newp;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
Kernel::~Kernel()
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Kernel::create(const char* kname, const Program& prog)
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = new Impl(kname, prog);
|
|
|
|
if(p->handle == 0)
|
|
|
|
{
|
|
|
|
p->release();
|
|
|
|
p = 0;
|
|
|
|
}
|
2014-10-03 19:17:28 +08:00
|
|
|
#ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
|
|
|
|
CV_Assert(p);
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
return p != 0;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
bool Kernel::create(const char* kname, const ProgramSource& src,
|
2013-11-19 00:48:00 +08:00
|
|
|
const String& buildopts, String* errmsg)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
{
|
|
|
|
p->release();
|
|
|
|
p = 0;
|
|
|
|
}
|
2013-11-19 00:48:00 +08:00
|
|
|
String tempmsg;
|
|
|
|
if( !errmsg ) errmsg = &tempmsg;
|
2014-02-01 00:23:01 +08:00
|
|
|
const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
|
2013-10-22 18:04:49 +08:00
|
|
|
return create(kname, prog);
|
|
|
|
}
|
|
|
|
|
|
|
|
void* Kernel::ptr() const
|
|
|
|
{
|
|
|
|
return p ? p->handle : 0;
|
|
|
|
}
|
|
|
|
|
2013-11-19 00:48:00 +08:00
|
|
|
bool Kernel::empty() const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2013-11-19 00:48:00 +08:00
|
|
|
return ptr() == 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int Kernel::set(int i, const void* value, size_t sz)
|
|
|
|
{
|
2014-01-30 05:47:25 +08:00
|
|
|
if (!p || !p->handle)
|
|
|
|
return -1;
|
2014-03-07 15:15:54 +08:00
|
|
|
if (i < 0)
|
|
|
|
return i;
|
2013-10-25 20:46:03 +08:00
|
|
|
if( i == 0 )
|
|
|
|
p->cleanupUMats();
|
2014-02-05 23:10:02 +08:00
|
|
|
|
|
|
|
cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
|
|
|
|
CV_OclDbgAssert(retval == CL_SUCCESS);
|
|
|
|
if (retval != CL_SUCCESS)
|
2013-11-19 00:48:00 +08:00
|
|
|
return -1;
|
|
|
|
return i+1;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2014-01-16 15:57:57 +08:00
|
|
|
int Kernel::set(int i, const Image2D& image2D)
|
|
|
|
{
|
2014-04-15 07:09:17 +08:00
|
|
|
p->addImage(image2D);
|
2014-01-16 15:57:57 +08:00
|
|
|
cl_mem h = (cl_mem)image2D.ptr();
|
|
|
|
return set(i, &h, sizeof(h));
|
|
|
|
}
|
|
|
|
|
2013-11-19 00:48:00 +08:00
|
|
|
int Kernel::set(int i, const UMat& m)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2017-08-09 16:54:55 +08:00
|
|
|
return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2013-11-19 00:48:00 +08:00
|
|
|
int Kernel::set(int i, const KernelArg& arg)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2013-11-19 00:48:00 +08:00
|
|
|
if( !p || !p->handle )
|
|
|
|
return -1;
|
2014-03-07 15:15:54 +08:00
|
|
|
if (i < 0)
|
|
|
|
return i;
|
2013-11-27 22:30:07 +08:00
|
|
|
if( i == 0 )
|
|
|
|
p->cleanupUMats();
|
2013-10-22 18:04:49 +08:00
|
|
|
if( arg.m )
|
|
|
|
{
|
2013-10-25 20:46:03 +08:00
|
|
|
int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
|
|
|
|
((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
|
2013-11-28 03:30:29 +08:00
|
|
|
bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
|
2013-11-19 00:48:00 +08:00
|
|
|
cl_mem h = (cl_mem)arg.m->handle(accessFlags);
|
|
|
|
|
2014-01-30 05:47:25 +08:00
|
|
|
if (!h)
|
|
|
|
{
|
|
|
|
p->release();
|
|
|
|
p = 0;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
|
|
|
{
|
|
|
|
const Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
|
|
|
|
#if 1 // TODO
|
|
|
|
cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
|
|
|
|
#else
|
|
|
|
cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
|
|
|
|
#endif
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
|
2013-11-28 03:30:29 +08:00
|
|
|
if (ptronly)
|
2015-01-02 08:33:40 +08:00
|
|
|
{
|
|
|
|
i++;
|
|
|
|
}
|
2013-11-28 03:30:29 +08:00
|
|
|
else if( arg.m->dims <= 2 )
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2013-11-19 00:48:00 +08:00
|
|
|
UMat2D u2d(*arg.m);
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
|
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
|
2013-11-19 00:48:00 +08:00
|
|
|
i += 3;
|
|
|
|
|
|
|
|
if( !(arg.flags & KernelArg::NO_SIZE) )
|
|
|
|
{
|
2014-03-08 05:29:27 +08:00
|
|
|
int cols = u2d.cols*arg.wscale/arg.iwscale;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
|
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
|
2013-11-19 00:48:00 +08:00
|
|
|
i += 2;
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2013-11-19 00:48:00 +08:00
|
|
|
UMat3D u3d(*arg.m);
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
|
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
|
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
|
2013-11-19 00:48:00 +08:00
|
|
|
i += 4;
|
|
|
|
if( !(arg.flags & KernelArg::NO_SIZE) )
|
|
|
|
{
|
2014-03-08 05:29:27 +08:00
|
|
|
int cols = u3d.cols*arg.wscale/arg.iwscale;
|
2017-03-13 01:29:49 +08:00
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices) == CL_SUCCESS);
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
|
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
|
2013-11-19 00:48:00 +08:00
|
|
|
i += 3;
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2013-12-02 03:14:45 +08:00
|
|
|
p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
|
2013-11-19 00:48:00 +08:00
|
|
|
return i;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
|
2013-11-19 00:48:00 +08:00
|
|
|
return i+1;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2013-12-01 07:12:19 +08:00
|
|
|
bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
|
2013-10-25 20:46:03 +08:00
|
|
|
bool sync, const Queue& q)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2017-09-09 00:42:59 +08:00
|
|
|
if (!p)
|
2013-11-19 00:48:00 +08:00
|
|
|
return false;
|
2013-11-27 22:30:07 +08:00
|
|
|
|
2017-09-19 00:04:46 +08:00
|
|
|
size_t globalsize[CV_MAX_DIM] = {1,1,1};
|
2013-12-01 07:12:19 +08:00
|
|
|
size_t total = 1;
|
2017-09-09 00:42:59 +08:00
|
|
|
CV_Assert(_globalsize != NULL);
|
2013-12-01 07:12:19 +08:00
|
|
|
for (int i = 0; i < dims; i++)
|
|
|
|
{
|
2013-12-02 22:27:08 +08:00
|
|
|
size_t val = _localsize ? _localsize[i] :
|
2014-03-08 05:29:27 +08:00
|
|
|
dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
|
2013-12-02 00:58:30 +08:00
|
|
|
CV_Assert( val > 0 );
|
2013-12-01 07:12:19 +08:00
|
|
|
total *= _globalsize[i];
|
2017-09-19 00:04:46 +08:00
|
|
|
if (_globalsize[i] == 1)
|
|
|
|
val = 1;
|
|
|
|
globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
|
2013-12-01 07:12:19 +08:00
|
|
|
}
|
2017-09-19 00:04:46 +08:00
|
|
|
CV_Assert(total > 0);
|
2017-09-09 00:42:59 +08:00
|
|
|
|
|
|
|
return p->run(dims, globalsize, _localsize, sync, NULL, q);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
|
|
|
|
bool sync, int64* timeNS, const Queue& q)
|
|
|
|
{
|
|
|
|
CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str());
|
|
|
|
|
|
|
|
if (!handle || isInProgress)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
cl_command_queue qq = getQueue(q);
|
|
|
|
if (haveTempDstUMats)
|
|
|
|
sync = true;
|
|
|
|
if (timeNS)
|
2013-12-02 03:14:45 +08:00
|
|
|
sync = true;
|
2017-07-06 18:25:32 +08:00
|
|
|
cl_event asyncEvent = 0;
|
2017-09-09 00:42:59 +08:00
|
|
|
cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
|
|
|
|
NULL, globalsize, localsize, 0, 0,
|
|
|
|
(sync && !timeNS) ? 0 : &asyncEvent);
|
2014-10-09 17:50:57 +08:00
|
|
|
#if CV_OPENCL_SHOW_RUN_ERRORS
|
|
|
|
if (retval != CL_SUCCESS)
|
|
|
|
{
|
|
|
|
printf("OpenCL program returns error: %d\n", retval);
|
|
|
|
fflush(stdout);
|
|
|
|
}
|
|
|
|
#endif
|
2017-09-09 00:42:59 +08:00
|
|
|
if (sync || retval != CL_SUCCESS)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
|
2017-09-09 00:42:59 +08:00
|
|
|
if (timeNS)
|
|
|
|
{
|
|
|
|
if (retval == CL_SUCCESS)
|
|
|
|
{
|
|
|
|
clWaitForEvents(1, &asyncEvent);
|
|
|
|
cl_ulong startTime, stopTime;
|
|
|
|
CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
|
|
|
|
CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
|
|
|
|
*timeNS = (int64)(stopTime - startTime);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
*timeNS = -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cleanupUMats();
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2017-09-09 00:42:59 +08:00
|
|
|
addref();
|
|
|
|
isInProgress = true;
|
|
|
|
CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this) == CL_SUCCESS);
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2017-07-06 18:25:32 +08:00
|
|
|
if (asyncEvent)
|
|
|
|
clReleaseEvent(asyncEvent);
|
2014-02-01 19:07:03 +08:00
|
|
|
return retval == CL_SUCCESS;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2013-11-19 00:48:00 +08:00
|
|
|
bool Kernel::runTask(bool sync, const Queue& q)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2017-07-06 18:25:32 +08:00
|
|
|
if(!p || !p->handle || p->isInProgress)
|
2013-11-19 00:48:00 +08:00
|
|
|
return false;
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
cl_command_queue qq = getQueue(q);
|
2017-07-06 18:25:32 +08:00
|
|
|
cl_event asyncEvent = 0;
|
|
|
|
cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
|
2014-02-01 19:07:03 +08:00
|
|
|
if( sync || retval != CL_SUCCESS )
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
|
2013-10-25 20:46:03 +08:00
|
|
|
p->cleanupUMats();
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
p->addref();
|
2017-07-06 18:25:32 +08:00
|
|
|
p->isInProgress = true;
|
|
|
|
CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2017-07-06 18:25:32 +08:00
|
|
|
if (asyncEvent)
|
|
|
|
clReleaseEvent(asyncEvent);
|
2014-02-01 19:07:03 +08:00
|
|
|
return retval == CL_SUCCESS;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2017-09-09 00:42:59 +08:00
|
|
|
int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
|
|
|
|
{
|
|
|
|
CV_Assert(p && p->handle && !p->isInProgress);
|
|
|
|
Queue q = q_.ptr() ? q_ : Queue::getDefault();
|
|
|
|
CV_Assert(q.ptr());
|
|
|
|
q.finish(); // call clFinish() on base queue
|
|
|
|
Queue profilingQueue = q.getProfilingQueue();
|
|
|
|
int64 timeNs = -1;
|
|
|
|
bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
|
|
|
|
return res ? timeNs : -1;
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
size_t Kernel::workGroupSize() const
|
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
if(!p || !p->handle)
|
2013-10-22 18:04:49 +08:00
|
|
|
return 0;
|
|
|
|
size_t val = 0, retsz = 0;
|
|
|
|
cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
|
|
|
|
return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
|
2014-02-01 19:07:03 +08:00
|
|
|
sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2013-12-30 17:06:32 +08:00
|
|
|
size_t Kernel::preferedWorkGroupSizeMultiple() const
|
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
if(!p || !p->handle)
|
2013-12-30 17:06:32 +08:00
|
|
|
return 0;
|
|
|
|
size_t val = 0, retsz = 0;
|
|
|
|
cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
|
|
|
|
return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
|
2014-02-01 19:07:03 +08:00
|
|
|
sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
|
2013-12-30 17:06:32 +08:00
|
|
|
}
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
bool Kernel::compileWorkGroupSize(size_t wsz[]) const
|
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
if(!p || !p->handle || !wsz)
|
2013-10-22 18:04:49 +08:00
|
|
|
return 0;
|
|
|
|
size_t retsz = 0;
|
|
|
|
cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
|
|
|
|
return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
|
2014-11-26 17:24:53 +08:00
|
|
|
sizeof(wsz[0])*3, wsz, &retsz) == CL_SUCCESS;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
size_t Kernel::localMemSize() const
|
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
if(!p || !p->handle)
|
2013-10-22 18:04:49 +08:00
|
|
|
return 0;
|
|
|
|
size_t retsz = 0;
|
|
|
|
cl_ulong val = 0;
|
|
|
|
cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
|
|
|
|
return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
|
2014-02-01 19:07:03 +08:00
|
|
|
sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
/////////////////////////////////////////// Program /////////////////////////////////////////////
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
struct Program::Impl
|
|
|
|
{
|
2014-02-01 00:23:01 +08:00
|
|
|
Impl(const ProgramSource& _src,
|
2013-10-22 18:04:49 +08:00
|
|
|
const String& _buildflags, String& errmsg)
|
|
|
|
{
|
2016-11-07 17:15:51 +08:00
|
|
|
CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Compile: %" PRIx64 " options: %s", _src.hash(), _buildflags.c_str()).c_str());
|
2013-10-22 18:04:49 +08:00
|
|
|
refcount = 1;
|
2014-02-01 00:23:01 +08:00
|
|
|
const Context& ctx = Context::getDefault();
|
2013-10-22 18:04:49 +08:00
|
|
|
src = _src;
|
|
|
|
buildflags = _buildflags;
|
|
|
|
const String& srcstr = src.source();
|
|
|
|
const char* srcptr = srcstr.c_str();
|
|
|
|
size_t srclen = srcstr.size();
|
|
|
|
cl_int retval = 0;
|
|
|
|
|
|
|
|
handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
|
2014-02-01 19:07:03 +08:00
|
|
|
if( handle && retval == CL_SUCCESS )
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2013-11-21 22:27:07 +08:00
|
|
|
int i, n = (int)ctx.ndevices();
|
2013-10-22 18:04:49 +08:00
|
|
|
AutoBuffer<void*> deviceListBuf(n+1);
|
|
|
|
void** deviceList = deviceListBuf;
|
|
|
|
for( i = 0; i < n; i++ )
|
|
|
|
deviceList[i] = ctx.device(i).ptr();
|
2013-11-27 22:30:07 +08:00
|
|
|
|
2014-03-05 19:04:44 +08:00
|
|
|
Device device = Device::getDefault();
|
|
|
|
if (device.isAMD())
|
|
|
|
buildflags += " -D AMD_DEVICE";
|
|
|
|
else if (device.isIntel())
|
|
|
|
buildflags += " -D INTEL_DEVICE";
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
retval = clBuildProgram(handle, n,
|
|
|
|
(const cl_device_id*)deviceList,
|
|
|
|
buildflags.c_str(), 0, 0);
|
2014-02-27 16:51:40 +08:00
|
|
|
#if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
|
2014-02-01 19:07:03 +08:00
|
|
|
if( retval != CL_SUCCESS )
|
2014-02-27 16:51:40 +08:00
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
size_t retsz = 0;
|
2014-02-27 16:51:40 +08:00
|
|
|
cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
|
2013-12-01 07:12:19 +08:00
|
|
|
CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
|
2014-02-27 16:51:40 +08:00
|
|
|
if (buildInfo_retval == CL_SUCCESS && retsz > 1)
|
2013-12-01 07:12:19 +08:00
|
|
|
{
|
|
|
|
AutoBuffer<char> bufbuf(retsz + 16);
|
|
|
|
char* buf = bufbuf;
|
2014-02-27 16:51:40 +08:00
|
|
|
buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
|
2013-12-01 07:12:19 +08:00
|
|
|
CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
|
2014-02-27 16:51:40 +08:00
|
|
|
if (buildInfo_retval == CL_SUCCESS)
|
2013-12-01 07:12:19 +08:00
|
|
|
{
|
2014-02-27 16:51:40 +08:00
|
|
|
// TODO It is useful to see kernel name & program file name also
|
2013-12-01 07:12:19 +08:00
|
|
|
errmsg = String(buf);
|
2014-02-27 16:51:40 +08:00
|
|
|
printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
|
2014-02-01 19:07:03 +08:00
|
|
|
fflush(stdout);
|
2013-12-01 07:12:19 +08:00
|
|
|
}
|
|
|
|
}
|
2014-02-27 16:51:40 +08:00
|
|
|
if (retval != CL_SUCCESS && handle)
|
2013-12-23 19:49:45 +08:00
|
|
|
{
|
|
|
|
clReleaseProgram(handle);
|
|
|
|
handle = NULL;
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Impl(const String& _buf, const String& _buildflags)
|
|
|
|
{
|
|
|
|
refcount = 1;
|
|
|
|
handle = 0;
|
|
|
|
buildflags = _buildflags;
|
|
|
|
if(_buf.empty())
|
|
|
|
return;
|
|
|
|
String prefix0 = Program::getPrefix(buildflags);
|
2014-02-01 00:23:01 +08:00
|
|
|
const Context& ctx = Context::getDefault();
|
2013-10-22 18:04:49 +08:00
|
|
|
const Device& dev = Device::getDefault();
|
|
|
|
const char* pos0 = _buf.c_str();
|
2013-10-22 21:41:28 +08:00
|
|
|
const char* pos1 = strchr(pos0, '\n');
|
2013-10-22 18:04:49 +08:00
|
|
|
if(!pos1)
|
|
|
|
return;
|
2013-10-22 21:41:28 +08:00
|
|
|
const char* pos2 = strchr(pos1+1, '\n');
|
2013-10-22 18:04:49 +08:00
|
|
|
if(!pos2)
|
|
|
|
return;
|
2013-10-22 21:41:28 +08:00
|
|
|
const char* pos3 = strchr(pos2+1, '\n');
|
2013-10-22 18:04:49 +08:00
|
|
|
if(!pos3)
|
|
|
|
return;
|
|
|
|
size_t prefixlen = (pos3 - pos0)+1;
|
|
|
|
String prefix(pos0, prefixlen);
|
|
|
|
if( prefix != prefix0 )
|
|
|
|
return;
|
|
|
|
const uchar* bin = (uchar*)(pos3+1);
|
|
|
|
void* devid = dev.ptr();
|
|
|
|
size_t codelen = _buf.length() - prefixlen;
|
|
|
|
cl_int binstatus = 0, retval = 0;
|
|
|
|
handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
|
|
|
|
&codelen, &bin, &binstatus, &retval);
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(retval == CL_SUCCESS);
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
String store()
|
|
|
|
{
|
|
|
|
if(!handle)
|
|
|
|
return String();
|
|
|
|
size_t progsz = 0, retsz = 0;
|
|
|
|
String prefix = Program::getPrefix(buildflags);
|
|
|
|
size_t prefixlen = prefix.length();
|
2014-02-01 19:07:03 +08:00
|
|
|
if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
|
2013-10-22 18:04:49 +08:00
|
|
|
return String();
|
|
|
|
AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
|
|
|
|
uchar* buf = bufbuf;
|
|
|
|
memcpy(buf, prefix.c_str(), prefixlen);
|
|
|
|
buf += prefixlen;
|
2014-02-01 19:07:03 +08:00
|
|
|
if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
|
2013-10-22 18:04:49 +08:00
|
|
|
return String();
|
|
|
|
buf[progsz] = (uchar)'\0';
|
|
|
|
return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
|
|
|
|
}
|
|
|
|
|
|
|
|
~Impl()
|
|
|
|
{
|
|
|
|
if( handle )
|
2014-02-01 19:07:03 +08:00
|
|
|
{
|
2014-03-13 19:45:23 +08:00
|
|
|
#ifdef _WIN32
|
|
|
|
if (!cv::__termination)
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
clReleaseProgram(handle);
|
|
|
|
}
|
2014-02-01 19:07:03 +08:00
|
|
|
handle = NULL;
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
IMPLEMENT_REFCOUNTABLE();
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
ProgramSource src;
|
2013-10-22 18:04:49 +08:00
|
|
|
String buildflags;
|
|
|
|
cl_program handle;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
Program::Program() { p = 0; }
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Program::Program(const ProgramSource& src,
|
2013-10-22 18:04:49 +08:00
|
|
|
const String& buildflags, String& errmsg)
|
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
create(src, buildflags, errmsg);
|
|
|
|
}
|
|
|
|
|
|
|
|
Program::Program(const Program& prog)
|
|
|
|
{
|
|
|
|
p = prog.p;
|
|
|
|
if(p)
|
|
|
|
p->addref();
|
|
|
|
}
|
|
|
|
|
|
|
|
Program& Program::operator = (const Program& prog)
|
|
|
|
{
|
|
|
|
Impl* newp = (Impl*)prog.p;
|
|
|
|
if(newp)
|
|
|
|
newp->addref();
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = newp;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
Program::~Program()
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
bool Program::create(const ProgramSource& src,
|
2013-10-22 18:04:49 +08:00
|
|
|
const String& buildflags, String& errmsg)
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = new Impl(src, buildflags, errmsg);
|
|
|
|
if(!p->handle)
|
|
|
|
{
|
|
|
|
p->release();
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
return p != 0;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
const ProgramSource& Program::source() const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2014-02-01 00:23:01 +08:00
|
|
|
static ProgramSource dummy;
|
2013-10-22 18:04:49 +08:00
|
|
|
return p ? p->src : dummy;
|
|
|
|
}
|
|
|
|
|
|
|
|
void* Program::ptr() const
|
|
|
|
{
|
|
|
|
return p ? p->handle : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Program::read(const String& bin, const String& buildflags)
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = new Impl(bin, buildflags);
|
|
|
|
return p->handle != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Program::write(String& bin) const
|
|
|
|
{
|
|
|
|
if(!p)
|
|
|
|
return false;
|
|
|
|
bin = p->store();
|
|
|
|
return !bin.empty();
|
|
|
|
}
|
|
|
|
|
|
|
|
String Program::getPrefix() const
|
|
|
|
{
|
|
|
|
if(!p)
|
|
|
|
return String();
|
|
|
|
return getPrefix(p->buildflags);
|
|
|
|
}
|
|
|
|
|
|
|
|
String Program::getPrefix(const String& buildflags)
|
|
|
|
{
|
2014-02-01 00:23:01 +08:00
|
|
|
const Context& ctx = Context::getDefault();
|
2013-10-22 18:04:49 +08:00
|
|
|
const Device& dev = ctx.device(0);
|
|
|
|
return format("name=%s\ndriver=%s\nbuildflags=%s\n",
|
|
|
|
dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
struct ProgramSource::Impl
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2016-12-19 05:38:33 +08:00
|
|
|
Impl(const String& src)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2016-12-19 05:38:33 +08:00
|
|
|
init(cv::String(), cv::String(), src, cv::String());
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2016-12-19 05:38:33 +08:00
|
|
|
Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2016-12-19 05:38:33 +08:00
|
|
|
init(module, name, codeStr, codeHash);
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2016-12-19 05:38:33 +08:00
|
|
|
void init(const String& module, const String& name, const String& codeStr, const String& codeHash)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
refcount = 1;
|
2016-12-19 05:38:33 +08:00
|
|
|
module_ = module;
|
|
|
|
name_ = name;
|
|
|
|
codeStr_ = codeStr;
|
|
|
|
codeHash_ = codeHash;
|
|
|
|
|
|
|
|
isHashUpdated = false;
|
|
|
|
if (codeHash_.empty())
|
|
|
|
{
|
|
|
|
updateHash();
|
|
|
|
codeHash_ = cv::format("%08llx", hash_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void updateHash()
|
|
|
|
{
|
|
|
|
hash_ = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
|
|
|
|
isHashUpdated = true;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
IMPLEMENT_REFCOUNTABLE();
|
2016-12-19 05:38:33 +08:00
|
|
|
|
|
|
|
String module_;
|
|
|
|
String name_;
|
|
|
|
String codeStr_;
|
|
|
|
String codeHash_;
|
|
|
|
// TODO std::vector<ProgramSource> includes_;
|
|
|
|
|
|
|
|
bool isHashUpdated;
|
|
|
|
ProgramSource::hash_t hash_;
|
2013-10-22 18:04:49 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
ProgramSource::ProgramSource()
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
|
2016-12-19 05:38:33 +08:00
|
|
|
ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
|
|
|
|
{
|
|
|
|
p = new Impl(module, name, codeStr, codeHash);
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
ProgramSource::ProgramSource(const char* prog)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
p = new Impl(prog);
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
ProgramSource::ProgramSource(const String& prog)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
p = new Impl(prog);
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
ProgramSource::~ProgramSource()
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
ProgramSource::ProgramSource(const ProgramSource& prog)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
p = prog.p;
|
|
|
|
if(p)
|
|
|
|
p->addref();
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
Impl* newp = (Impl*)prog.p;
|
|
|
|
if(newp)
|
|
|
|
newp->addref();
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
p = newp;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
const String& ProgramSource::source() const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2016-12-19 05:38:33 +08:00
|
|
|
CV_Assert(p);
|
|
|
|
return p->codeStr_;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
ProgramSource::hash_t ProgramSource::hash() const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2016-12-19 05:38:33 +08:00
|
|
|
CV_Assert(p);
|
|
|
|
if (!p->isHashUpdated)
|
|
|
|
p->updateHash();
|
|
|
|
return p->hash_;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2016-12-19 05:38:33 +08:00
|
|
|
|
|
|
|
internal::ProgramEntry::operator ProgramSource&() const
|
|
|
|
{
|
|
|
|
if (this->pProgramSource == NULL)
|
|
|
|
{
|
|
|
|
cv::AutoLock lock(cv::getInitializationMutex());
|
|
|
|
if (this->pProgramSource == NULL)
|
|
|
|
{
|
|
|
|
ProgramSource* ps = new ProgramSource(this->module, this->name, this->programCode, this->programHash);
|
|
|
|
const_cast<ProgramEntry*>(this)->pProgramSource = ps;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return *this->pProgramSource;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2016-12-19 05:38:33 +08:00
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
//////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
template<typename T>
|
2014-01-16 22:30:39 +08:00
|
|
|
class OpenCLBufferPool
|
|
|
|
{
|
|
|
|
protected:
|
|
|
|
~OpenCLBufferPool() { }
|
|
|
|
public:
|
2015-01-02 08:33:40 +08:00
|
|
|
virtual T allocate(size_t size) = 0;
|
|
|
|
virtual void release(T buffer) = 0;
|
2014-01-16 22:30:39 +08:00
|
|
|
};
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
template <typename Derived, typename BufferEntry, typename T>
|
|
|
|
class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
|
2014-01-16 22:30:39 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
private:
|
|
|
|
inline Derived& derived() { return *static_cast<Derived*>(this); }
|
2014-01-16 22:30:39 +08:00
|
|
|
protected:
|
|
|
|
Mutex mutex_;
|
|
|
|
|
|
|
|
size_t currentReservedSize;
|
|
|
|
size_t maxReservedSize;
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
|
|
|
|
std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
|
|
|
|
|
|
|
|
// synchronized
|
|
|
|
bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
|
|
|
|
{
|
|
|
|
typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
|
|
|
|
for (; i != allocatedEntries_.end(); ++i)
|
|
|
|
{
|
|
|
|
BufferEntry& e = *i;
|
|
|
|
if (e.clBuffer_ == buffer)
|
|
|
|
{
|
|
|
|
entry = e;
|
|
|
|
allocatedEntries_.erase(i);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
2014-01-16 22:30:39 +08:00
|
|
|
|
|
|
|
// synchronized
|
|
|
|
bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
|
|
|
|
{
|
|
|
|
if (reservedEntries_.empty())
|
|
|
|
return false;
|
2015-01-02 08:33:40 +08:00
|
|
|
typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
|
|
|
|
typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
|
|
|
|
BufferEntry result;
|
2014-01-16 22:30:39 +08:00
|
|
|
size_t minDiff = (size_t)(-1);
|
|
|
|
for (; i != reservedEntries_.end(); ++i)
|
|
|
|
{
|
|
|
|
BufferEntry& e = *i;
|
|
|
|
if (e.capacity_ >= size)
|
|
|
|
{
|
|
|
|
size_t diff = e.capacity_ - size;
|
2015-12-16 18:49:00 +08:00
|
|
|
if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
|
2014-01-16 22:30:39 +08:00
|
|
|
{
|
|
|
|
minDiff = diff;
|
|
|
|
result_pos = i;
|
|
|
|
result = e;
|
|
|
|
if (diff == 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (result_pos != reservedEntries_.end())
|
|
|
|
{
|
|
|
|
//CV_DbgAssert(result == *result_pos);
|
|
|
|
reservedEntries_.erase(result_pos);
|
|
|
|
entry = result;
|
|
|
|
currentReservedSize -= entry.capacity_;
|
2015-01-02 08:33:40 +08:00
|
|
|
allocatedEntries_.push_back(entry);
|
2014-01-16 22:30:39 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// synchronized
|
|
|
|
void _checkSizeOfReservedEntries()
|
|
|
|
{
|
|
|
|
while (currentReservedSize > maxReservedSize)
|
|
|
|
{
|
|
|
|
CV_DbgAssert(!reservedEntries_.empty());
|
|
|
|
const BufferEntry& entry = reservedEntries_.back();
|
|
|
|
CV_DbgAssert(currentReservedSize >= entry.capacity_);
|
|
|
|
currentReservedSize -= entry.capacity_;
|
2015-01-02 08:33:40 +08:00
|
|
|
derived()._releaseBufferEntry(entry);
|
2014-01-16 22:30:39 +08:00
|
|
|
reservedEntries_.pop_back();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
inline size_t _allocationGranularity(size_t size)
|
|
|
|
{
|
|
|
|
// heuristic values
|
2015-12-16 18:49:00 +08:00
|
|
|
if (size < 1024*1024)
|
|
|
|
return 4096; // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
|
2014-01-16 22:30:39 +08:00
|
|
|
else if (size < 16*1024*1024)
|
|
|
|
return 64*1024;
|
|
|
|
else
|
|
|
|
return 1024*1024;
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
2015-01-02 08:33:40 +08:00
|
|
|
OpenCLBufferPoolBaseImpl()
|
|
|
|
: currentReservedSize(0),
|
|
|
|
maxReservedSize(0)
|
2014-01-16 22:30:39 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
// nothing
|
2014-01-16 22:30:39 +08:00
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
virtual ~OpenCLBufferPoolBaseImpl()
|
2014-01-16 22:30:39 +08:00
|
|
|
{
|
|
|
|
freeAllReservedBuffers();
|
|
|
|
CV_Assert(reservedEntries_.empty());
|
|
|
|
}
|
|
|
|
public:
|
2015-01-02 08:33:40 +08:00
|
|
|
virtual T allocate(size_t size)
|
2014-01-16 22:30:39 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
AutoLock locker(mutex_);
|
|
|
|
BufferEntry entry;
|
|
|
|
if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
|
2014-01-16 22:30:39 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
CV_DbgAssert(size <= entry.capacity_);
|
|
|
|
LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
derived()._allocateBufferEntry(entry, size);
|
2014-01-16 22:30:39 +08:00
|
|
|
}
|
|
|
|
return entry.clBuffer_;
|
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
virtual void release(T buffer)
|
2014-01-16 22:30:39 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
AutoLock locker(mutex_);
|
|
|
|
BufferEntry entry;
|
|
|
|
CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
|
2014-01-16 22:30:39 +08:00
|
|
|
if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
|
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
derived()._releaseBufferEntry(entry);
|
2014-01-16 22:30:39 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
reservedEntries_.push_front(entry);
|
|
|
|
currentReservedSize += entry.capacity_;
|
|
|
|
_checkSizeOfReservedEntries();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual size_t getReservedSize() const { return currentReservedSize; }
|
|
|
|
virtual size_t getMaxReservedSize() const { return maxReservedSize; }
|
|
|
|
virtual void setMaxReservedSize(size_t size)
|
|
|
|
{
|
|
|
|
AutoLock locker(mutex_);
|
|
|
|
size_t oldMaxReservedSize = maxReservedSize;
|
|
|
|
maxReservedSize = size;
|
|
|
|
if (maxReservedSize < oldMaxReservedSize)
|
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
|
2014-01-16 22:30:39 +08:00
|
|
|
for (; i != reservedEntries_.end();)
|
|
|
|
{
|
|
|
|
const BufferEntry& entry = *i;
|
|
|
|
if (entry.capacity_ > maxReservedSize / 8)
|
|
|
|
{
|
|
|
|
CV_DbgAssert(currentReservedSize >= entry.capacity_);
|
|
|
|
currentReservedSize -= entry.capacity_;
|
2015-01-02 08:33:40 +08:00
|
|
|
derived()._releaseBufferEntry(entry);
|
2014-01-16 22:30:39 +08:00
|
|
|
i = reservedEntries_.erase(i);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
_checkSizeOfReservedEntries();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
virtual void freeAllReservedBuffers()
|
|
|
|
{
|
|
|
|
AutoLock locker(mutex_);
|
2015-01-02 08:33:40 +08:00
|
|
|
typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
|
2014-01-16 22:30:39 +08:00
|
|
|
for (; i != reservedEntries_.end(); ++i)
|
|
|
|
{
|
|
|
|
const BufferEntry& entry = *i;
|
2015-01-02 08:33:40 +08:00
|
|
|
derived()._releaseBufferEntry(entry);
|
2014-01-16 22:30:39 +08:00
|
|
|
}
|
|
|
|
reservedEntries_.clear();
|
2015-03-20 18:21:10 +08:00
|
|
|
currentReservedSize = 0;
|
2014-01-16 22:30:39 +08:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
struct CLBufferEntry
|
|
|
|
{
|
|
|
|
cl_mem clBuffer_;
|
|
|
|
size_t capacity_;
|
|
|
|
CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
|
|
|
|
};
|
|
|
|
|
|
|
|
class OpenCLBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
typedef struct CLBufferEntry BufferEntry;
|
|
|
|
protected:
|
|
|
|
int createFlags_;
|
|
|
|
public:
|
|
|
|
OpenCLBufferPoolImpl(int createFlags = 0)
|
|
|
|
: createFlags_(createFlags)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
void _allocateBufferEntry(BufferEntry& entry, size_t size)
|
|
|
|
{
|
|
|
|
CV_DbgAssert(entry.clBuffer_ == NULL);
|
|
|
|
entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
|
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
cl_int retval = CL_SUCCESS;
|
|
|
|
entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
|
|
|
|
CV_Assert(retval == CL_SUCCESS);
|
|
|
|
CV_Assert(entry.clBuffer_ != NULL);
|
|
|
|
if(retval == CL_SUCCESS)
|
|
|
|
{
|
|
|
|
CV_IMPL_ADD(CV_IMPL_OCL);
|
|
|
|
}
|
|
|
|
LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
|
|
|
|
(long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
|
|
|
|
allocatedEntries_.push_back(entry);
|
|
|
|
}
|
|
|
|
|
|
|
|
void _releaseBufferEntry(const BufferEntry& entry)
|
|
|
|
{
|
|
|
|
CV_Assert(entry.capacity_ != 0);
|
|
|
|
CV_Assert(entry.clBuffer_ != NULL);
|
|
|
|
LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
|
|
|
|
entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
|
|
|
|
clReleaseMemObject(entry.clBuffer_);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
struct CLSVMBufferEntry
|
|
|
|
{
|
|
|
|
void* clBuffer_;
|
|
|
|
size_t capacity_;
|
|
|
|
CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
|
|
|
|
};
|
|
|
|
class OpenCLSVMBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
typedef struct CLSVMBufferEntry BufferEntry;
|
|
|
|
public:
|
|
|
|
OpenCLSVMBufferPoolImpl()
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
void _allocateBufferEntry(BufferEntry& entry, size_t size)
|
|
|
|
{
|
|
|
|
CV_DbgAssert(entry.clBuffer_ == NULL);
|
|
|
|
entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
|
|
|
|
|
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
|
|
|
|
bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
|
|
|
|
cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
|
|
|
|
(isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
|
|
|
|
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
|
|
|
|
void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
|
|
|
|
CV_Assert(buf);
|
|
|
|
|
|
|
|
entry.clBuffer_ = buf;
|
|
|
|
{
|
|
|
|
CV_IMPL_ADD(CV_IMPL_OCL);
|
|
|
|
}
|
|
|
|
LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
|
|
|
|
(long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
|
|
|
|
allocatedEntries_.push_back(entry);
|
|
|
|
}
|
|
|
|
|
|
|
|
void _releaseBufferEntry(const BufferEntry& entry)
|
|
|
|
{
|
|
|
|
CV_Assert(entry.capacity_ != 0);
|
|
|
|
CV_Assert(entry.clBuffer_ != NULL);
|
|
|
|
LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
|
|
|
|
entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
|
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", entry.clBuffer_);
|
|
|
|
svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
2014-02-06 21:27:29 +08:00
|
|
|
#if defined _MSC_VER
|
|
|
|
#pragma warning(disable:4127) // conditional expression is constant
|
|
|
|
#endif
|
|
|
|
template <bool readAccess, bool writeAccess>
|
|
|
|
class AlignedDataPtr
|
|
|
|
{
|
|
|
|
protected:
|
|
|
|
const size_t size_;
|
|
|
|
uchar* const originPtr_;
|
|
|
|
const size_t alignment_;
|
|
|
|
uchar* ptr_;
|
|
|
|
uchar* allocatedPtr_;
|
|
|
|
|
|
|
|
public:
|
|
|
|
AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
|
|
|
|
: size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
|
|
|
|
{
|
|
|
|
CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
|
|
|
|
if (((size_t)ptr_ & (alignment - 1)) != 0)
|
|
|
|
{
|
|
|
|
allocatedPtr_ = new uchar[size_ + alignment - 1];
|
|
|
|
ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
|
|
|
|
if (readAccess)
|
|
|
|
{
|
|
|
|
memcpy(ptr_, originPtr_, size_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
uchar* getAlignedPtr() const
|
|
|
|
{
|
|
|
|
CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
|
|
|
|
return ptr_;
|
|
|
|
}
|
|
|
|
|
|
|
|
~AlignedDataPtr()
|
|
|
|
{
|
|
|
|
if (allocatedPtr_)
|
|
|
|
{
|
|
|
|
if (writeAccess)
|
|
|
|
{
|
|
|
|
memcpy(originPtr_, ptr_, size_);
|
|
|
|
}
|
|
|
|
delete[] allocatedPtr_;
|
|
|
|
allocatedPtr_ = NULL;
|
|
|
|
}
|
|
|
|
ptr_ = NULL;
|
|
|
|
}
|
|
|
|
private:
|
|
|
|
AlignedDataPtr(const AlignedDataPtr&); // disabled
|
|
|
|
AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
|
|
|
|
};
|
2015-08-11 06:33:46 +08:00
|
|
|
|
|
|
|
template <bool readAccess, bool writeAccess>
|
|
|
|
class AlignedDataPtr2D
|
|
|
|
{
|
|
|
|
protected:
|
|
|
|
const size_t size_;
|
|
|
|
uchar* const originPtr_;
|
|
|
|
const size_t alignment_;
|
|
|
|
uchar* ptr_;
|
|
|
|
uchar* allocatedPtr_;
|
|
|
|
size_t rows_;
|
|
|
|
size_t cols_;
|
|
|
|
size_t step_;
|
|
|
|
|
|
|
|
public:
|
|
|
|
AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment)
|
|
|
|
: size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
|
|
|
|
{
|
|
|
|
CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
|
|
|
|
if (((size_t)ptr_ & (alignment - 1)) != 0)
|
|
|
|
{
|
|
|
|
allocatedPtr_ = new uchar[size_ + alignment - 1];
|
|
|
|
ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
|
|
|
|
if (readAccess)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < rows_; i++)
|
|
|
|
memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
uchar* getAlignedPtr() const
|
|
|
|
{
|
|
|
|
CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
|
|
|
|
return ptr_;
|
|
|
|
}
|
|
|
|
|
|
|
|
~AlignedDataPtr2D()
|
|
|
|
{
|
|
|
|
if (allocatedPtr_)
|
|
|
|
{
|
|
|
|
if (writeAccess)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < rows_; i++)
|
|
|
|
memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
|
|
|
|
}
|
|
|
|
delete[] allocatedPtr_;
|
|
|
|
allocatedPtr_ = NULL;
|
|
|
|
}
|
|
|
|
ptr_ = NULL;
|
|
|
|
}
|
|
|
|
private:
|
|
|
|
AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
|
|
|
|
AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
|
|
|
|
};
|
2014-02-06 21:27:29 +08:00
|
|
|
#if defined _MSC_VER
|
|
|
|
#pragma warning(default:4127) // conditional expression is constant
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
|
|
|
|
#define CV_OPENCL_DATA_PTR_ALIGNMENT 16
|
|
|
|
#endif
|
2014-01-16 22:30:39 +08:00
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
class OpenCLAllocator : public MatAllocator
|
|
|
|
{
|
2014-01-16 22:30:39 +08:00
|
|
|
mutable OpenCLBufferPoolImpl bufferPool;
|
2015-01-02 08:33:40 +08:00
|
|
|
mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
|
|
|
|
#endif
|
|
|
|
|
2014-02-10 20:34:45 +08:00
|
|
|
enum AllocatorFlags
|
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
|
|
|
|
ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
,ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2
|
|
|
|
#endif
|
2014-02-10 20:34:45 +08:00
|
|
|
};
|
2013-10-22 18:04:49 +08:00
|
|
|
public:
|
2015-01-02 08:33:40 +08:00
|
|
|
OpenCLAllocator()
|
|
|
|
: bufferPool(0),
|
|
|
|
bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
|
|
|
|
{
|
|
|
|
size_t defaultPoolSize, poolSize;
|
|
|
|
defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
|
2017-05-25 23:59:01 +08:00
|
|
|
poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
|
2015-01-02 08:33:40 +08:00
|
|
|
bufferPool.setMaxReservedSize(poolSize);
|
2017-05-25 23:59:01 +08:00
|
|
|
poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
|
2015-01-02 08:33:40 +08:00
|
|
|
bufferPoolHostPtr.setMaxReservedSize(poolSize);
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
2017-05-25 23:59:01 +08:00
|
|
|
poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
|
2015-01-02 08:33:40 +08:00
|
|
|
bufferPoolSVM.setMaxReservedSize(poolSize);
|
|
|
|
#endif
|
|
|
|
|
2015-12-01 04:45:48 +08:00
|
|
|
matStdAllocator = Mat::getDefaultAllocator();
|
2015-01-02 08:33:40 +08:00
|
|
|
}
|
2017-07-06 22:57:05 +08:00
|
|
|
~OpenCLAllocator()
|
|
|
|
{
|
|
|
|
flushCleanupQueue();
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2014-02-10 20:34:45 +08:00
|
|
|
UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
|
|
|
|
int flags, UMatUsageFlags usageFlags) const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2014-02-10 20:34:45 +08:00
|
|
|
UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
|
2013-10-22 18:04:49 +08:00
|
|
|
return u;
|
|
|
|
}
|
|
|
|
|
2014-02-10 20:34:45 +08:00
|
|
|
void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
const Device& dev = ctx.device(0);
|
2014-02-10 20:34:45 +08:00
|
|
|
createFlags = 0;
|
|
|
|
if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
|
|
|
|
createFlags |= CL_MEM_ALLOC_HOST_PTR;
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
if( dev.hostUnifiedMemory() )
|
|
|
|
flags0 = 0;
|
|
|
|
else
|
|
|
|
flags0 = UMatData::COPY_ON_MAP;
|
|
|
|
}
|
|
|
|
|
2013-12-01 07:12:19 +08:00
|
|
|
UMatData* allocate(int dims, const int* sizes, int type,
|
2014-02-10 20:34:45 +08:00
|
|
|
void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
if(!useOpenCL())
|
2014-02-10 20:34:45 +08:00
|
|
|
return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
|
2013-12-01 07:12:19 +08:00
|
|
|
CV_Assert(data == 0);
|
2013-10-22 18:04:49 +08:00
|
|
|
size_t total = CV_ELEM_SIZE(type);
|
|
|
|
for( int i = dims-1; i >= 0; i-- )
|
|
|
|
{
|
|
|
|
if( step )
|
|
|
|
step[i] = total;
|
|
|
|
total *= sizes[i];
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
Context& ctx = Context::getDefault();
|
2017-07-06 22:57:05 +08:00
|
|
|
flushCleanupQueue();
|
2015-01-02 08:33:40 +08:00
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
int createFlags = 0, flags0 = 0;
|
2014-02-10 20:34:45 +08:00
|
|
|
getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2014-02-10 20:34:45 +08:00
|
|
|
void* handle = NULL;
|
|
|
|
int allocatorFlags = 0;
|
2015-01-02 08:33:40 +08:00
|
|
|
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
|
|
|
|
if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
|
|
|
|
{
|
|
|
|
allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
|
|
|
|
handle = bufferPoolSVM.allocate(total);
|
|
|
|
|
|
|
|
// this property is constant, so single buffer pool can be used here
|
|
|
|
bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
|
|
|
|
allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
2014-02-10 20:34:45 +08:00
|
|
|
if (createFlags == 0)
|
|
|
|
{
|
|
|
|
allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
|
2015-01-02 08:33:40 +08:00
|
|
|
handle = bufferPool.allocate(total);
|
|
|
|
}
|
|
|
|
else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
|
|
|
|
{
|
|
|
|
allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
|
|
|
|
handle = bufferPoolHostPtr.allocate(total);
|
2014-02-10 20:34:45 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
CV_Assert(handle != NULL); // Unsupported, throw
|
2014-02-10 20:34:45 +08:00
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
|
|
|
|
if (!handle)
|
|
|
|
return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
UMatData* u = new UMatData(this);
|
|
|
|
u->data = 0;
|
|
|
|
u->size = total;
|
|
|
|
u->handle = handle;
|
|
|
|
u->flags = flags0;
|
2014-02-10 20:34:45 +08:00
|
|
|
u->allocatorFlags_ = allocatorFlags;
|
|
|
|
CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
|
2015-07-25 00:10:31 +08:00
|
|
|
u->markHostCopyObsolete(true);
|
2013-10-22 18:04:49 +08:00
|
|
|
return u;
|
|
|
|
}
|
|
|
|
|
2014-02-10 20:34:45 +08:00
|
|
|
bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
if(!u)
|
|
|
|
return false;
|
|
|
|
|
2017-07-06 22:57:05 +08:00
|
|
|
flushCleanupQueue();
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
UMatDataAutoLock lock(u);
|
|
|
|
|
|
|
|
if(u->handle == 0)
|
|
|
|
{
|
|
|
|
CV_Assert(u->origdata != 0);
|
2014-02-01 00:23:01 +08:00
|
|
|
Context& ctx = Context::getDefault();
|
2013-10-22 18:04:49 +08:00
|
|
|
int createFlags = 0, flags0 = 0;
|
2014-02-10 20:34:45 +08:00
|
|
|
getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
cl_context ctx_handle = (cl_context)ctx.ptr();
|
2015-01-02 08:33:40 +08:00
|
|
|
int allocatorFlags = 0;
|
|
|
|
int tempUMatFlags = 0;
|
|
|
|
void* handle = NULL;
|
|
|
|
cl_int retval = CL_SUCCESS;
|
|
|
|
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
|
|
|
|
bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
|
|
|
|
if (useSVM && svmCaps.isSupportFineGrainSystem())
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
|
|
|
|
tempUMatFlags = UMatData::TEMP_UMAT;
|
|
|
|
handle = u->origdata;
|
|
|
|
CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
|
|
|
|
}
|
|
|
|
else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
|
|
|
|
{
|
|
|
|
if (!(accessFlags & ACCESS_FAST)) // memcpy used
|
|
|
|
{
|
|
|
|
bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
|
2014-07-09 16:04:22 +08:00
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
cl_svm_mem_flags memFlags = createFlags |
|
|
|
|
(isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
|
|
|
|
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
|
|
|
|
handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
|
|
|
|
CV_Assert(handle);
|
|
|
|
|
|
|
|
cl_command_queue q = NULL;
|
|
|
|
if (!isFineGrainBuffer)
|
|
|
|
{
|
|
|
|
q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
|
|
|
|
handle, u->size,
|
|
|
|
0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
|
|
|
|
}
|
|
|
|
memcpy(handle, u->origdata, u->size);
|
|
|
|
if (!isFineGrainBuffer)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
|
|
|
|
tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
|
|
|
|
allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
|
|
|
|
: svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
tempUMatFlags = UMatData::TEMP_UMAT;
|
2015-09-08 09:06:04 +08:00
|
|
|
if (u->origdata == cv::alignPtr(u->origdata, 4)) // There are OpenCL runtime issues for less aligned data
|
|
|
|
{
|
|
|
|
handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
|
|
|
|
u->size, u->origdata, &retval);
|
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
|
|
|
|
{
|
|
|
|
handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
|
|
|
|
u->size, u->origdata, &retval);
|
|
|
|
tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
if(!handle || retval != CL_SUCCESS)
|
2013-10-22 18:04:49 +08:00
|
|
|
return false;
|
2015-01-02 08:33:40 +08:00
|
|
|
u->handle = handle;
|
2013-10-22 18:04:49 +08:00
|
|
|
u->prevAllocator = u->currAllocator;
|
|
|
|
u->currAllocator = this;
|
|
|
|
u->flags |= tempUMatFlags;
|
2015-01-02 08:33:40 +08:00
|
|
|
u->allocatorFlags_ = allocatorFlags;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
if(accessFlags & ACCESS_WRITE)
|
|
|
|
u->markHostCopyObsolete(true);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-12-02 22:27:08 +08:00
|
|
|
/*void sync(UMatData* u) const
|
2013-12-01 07:12:19 +08:00
|
|
|
{
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
2013-12-02 02:14:15 +08:00
|
|
|
UMatDataAutoLock lock(u);
|
|
|
|
|
2013-12-02 04:12:20 +08:00
|
|
|
if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
|
2013-12-01 07:12:19 +08:00
|
|
|
{
|
2013-12-02 04:12:20 +08:00
|
|
|
if( u->tempCopiedUMat() )
|
|
|
|
{
|
|
|
|
clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
|
|
|
|
u->size, u->origdata, 0, 0, 0);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
cl_int retval = 0;
|
|
|
|
void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
|
|
|
|
(CL_MAP_READ | CL_MAP_WRITE),
|
|
|
|
0, u->size, 0, 0, 0, &retval);
|
|
|
|
clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
|
|
|
|
clFinish(q);
|
|
|
|
}
|
2013-12-01 07:12:19 +08:00
|
|
|
u->markHostCopyObsolete(false);
|
|
|
|
}
|
|
|
|
else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
|
|
|
|
{
|
|
|
|
clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
|
|
|
|
u->size, u->data, 0, 0, 0);
|
|
|
|
}
|
2013-12-02 22:27:08 +08:00
|
|
|
}*/
|
2013-12-01 07:12:19 +08:00
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
void deallocate(UMatData* u) const
|
|
|
|
{
|
|
|
|
if(!u)
|
|
|
|
return;
|
|
|
|
|
2015-09-08 09:06:04 +08:00
|
|
|
CV_Assert(u->urefcount == 0);
|
|
|
|
CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
|
2013-12-16 20:46:36 +08:00
|
|
|
|
2015-09-08 09:06:04 +08:00
|
|
|
CV_Assert(u->handle != 0);
|
2015-09-03 22:18:59 +08:00
|
|
|
CV_Assert(u->mapcount == 0);
|
2017-07-06 22:57:05 +08:00
|
|
|
|
|
|
|
if (u->flags & UMatData::ASYNC_CLEANUP)
|
|
|
|
addToCleanupQueue(u);
|
|
|
|
else
|
|
|
|
deallocate_(u);
|
|
|
|
}
|
|
|
|
|
|
|
|
void deallocate_(UMatData* u) const
|
|
|
|
{
|
2013-10-22 18:04:49 +08:00
|
|
|
if(u->tempUMat())
|
|
|
|
{
|
2015-07-25 00:10:31 +08:00
|
|
|
CV_Assert(u->origdata);
|
2014-01-30 04:19:18 +08:00
|
|
|
// UMatDataAutoLock lock(u);
|
2015-01-02 08:33:40 +08:00
|
|
|
|
2015-09-08 09:06:04 +08:00
|
|
|
if (u->hostCopyObsolete())
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
2013-12-02 04:12:20 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
|
|
|
|
if( u->tempCopiedUMat() )
|
|
|
|
{
|
|
|
|
CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
|
|
|
|
(u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
|
|
|
|
bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
|
|
|
|
cl_command_queue q = NULL;
|
|
|
|
if (!isFineGrainBuffer)
|
|
|
|
{
|
|
|
|
CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
|
|
|
|
q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
|
|
|
|
u->handle, u->size,
|
|
|
|
0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
clFinish(q);
|
|
|
|
memcpy(u->origdata, u->handle, u->size);
|
|
|
|
if (!isFineGrainBuffer)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
|
|
|
|
// nothing
|
|
|
|
}
|
2013-12-02 04:12:20 +08:00
|
|
|
}
|
|
|
|
else
|
2015-01-02 08:33:40 +08:00
|
|
|
#endif
|
|
|
|
{
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
if( u->tempCopiedUMat() )
|
|
|
|
{
|
|
|
|
AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
|
|
|
|
CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
|
|
|
|
u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2015-08-25 00:21:37 +08:00
|
|
|
cl_int retval = 0;
|
2015-09-03 22:18:59 +08:00
|
|
|
if (u->tempUMat())
|
|
|
|
{
|
|
|
|
CV_Assert(u->mapcount == 0);
|
|
|
|
void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
|
|
|
|
(CL_MAP_READ | CL_MAP_WRITE),
|
|
|
|
0, u->size, 0, 0, 0, &retval);
|
2015-09-08 09:06:04 +08:00
|
|
|
CV_Assert(u->origdata == data);
|
2015-09-03 22:18:59 +08:00
|
|
|
CV_OclDbgAssert(retval == CL_SUCCESS);
|
2015-09-08 09:06:04 +08:00
|
|
|
if (u->originalUMatData)
|
|
|
|
{
|
|
|
|
CV_Assert(u->originalUMatData->data == data);
|
|
|
|
}
|
2015-09-03 22:18:59 +08:00
|
|
|
CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
|
|
|
|
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
|
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
u->markHostCopyObsolete(false);
|
|
|
|
}
|
2015-09-08 09:06:04 +08:00
|
|
|
else
|
|
|
|
{
|
|
|
|
// nothing
|
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
|
|
|
{
|
|
|
|
if( u->tempCopiedUMat() )
|
2013-12-02 04:12:20 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
|
|
|
|
svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
|
2013-12-02 04:12:20 +08:00
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
clReleaseMemObject((cl_mem)u->handle);
|
|
|
|
}
|
2013-11-25 21:16:22 +08:00
|
|
|
u->handle = 0;
|
2015-07-25 00:10:31 +08:00
|
|
|
u->markDeviceCopyObsolete(true);
|
2013-10-22 18:04:49 +08:00
|
|
|
u->currAllocator = u->prevAllocator;
|
2015-07-25 00:10:31 +08:00
|
|
|
u->prevAllocator = NULL;
|
|
|
|
if(u->data && u->copyOnMap() && u->data != u->origdata)
|
2013-10-22 21:41:28 +08:00
|
|
|
fastFree(u->data);
|
|
|
|
u->data = u->origdata;
|
2015-09-08 09:06:04 +08:00
|
|
|
u->currAllocator->deallocate(u);
|
|
|
|
u = NULL;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2015-07-25 00:10:31 +08:00
|
|
|
CV_Assert(u->origdata == NULL);
|
|
|
|
if(u->data && u->copyOnMap() && u->data != u->origdata)
|
2013-12-01 07:12:19 +08:00
|
|
|
{
|
2013-10-22 18:04:49 +08:00
|
|
|
fastFree(u->data);
|
2013-12-01 07:12:19 +08:00
|
|
|
u->data = 0;
|
2015-07-25 00:10:31 +08:00
|
|
|
u->markHostCopyObsolete(true);
|
2013-12-01 07:12:19 +08:00
|
|
|
}
|
2014-02-10 20:34:45 +08:00
|
|
|
if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
|
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
bufferPool.release((cl_mem)u->handle);
|
|
|
|
}
|
|
|
|
else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
|
|
|
|
{
|
|
|
|
bufferPoolHostPtr.release((cl_mem)u->handle);
|
|
|
|
}
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
|
|
|
|
{
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
|
|
|
|
{
|
|
|
|
//nothing
|
|
|
|
}
|
|
|
|
else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
|
|
|
|
(u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
|
|
|
|
{
|
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bufferPoolSVM.release((void*)u->handle);
|
2014-02-10 20:34:45 +08:00
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
#endif
|
2014-02-10 20:34:45 +08:00
|
|
|
else
|
|
|
|
{
|
|
|
|
clReleaseMemObject((cl_mem)u->handle);
|
|
|
|
}
|
2013-11-25 21:16:22 +08:00
|
|
|
u->handle = 0;
|
2015-07-25 00:10:31 +08:00
|
|
|
u->markDeviceCopyObsolete(true);
|
2013-10-22 18:04:49 +08:00
|
|
|
delete u;
|
2015-07-25 00:10:31 +08:00
|
|
|
u = NULL;
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2015-09-08 09:06:04 +08:00
|
|
|
CV_Assert(u == NULL);
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
2015-09-08 09:06:04 +08:00
|
|
|
// synchronized call (external UMatDataAutoLock, see UMat::getMat)
|
2013-10-22 18:04:49 +08:00
|
|
|
void map(UMatData* u, int accessFlags) const
|
|
|
|
{
|
2015-09-08 09:06:04 +08:00
|
|
|
CV_Assert(u && u->handle);
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
if(accessFlags & ACCESS_WRITE)
|
|
|
|
u->markDeviceCopyObsolete(true);
|
|
|
|
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
|
|
|
|
{
|
|
|
|
if( !u->copyOnMap() )
|
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
// TODO
|
|
|
|
// because there can be other map requests for the same UMat with different access flags,
|
|
|
|
// we use the universal (read-write) access mode.
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
|
|
|
{
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
|
|
|
|
{
|
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
|
|
|
|
u->handle, u->size,
|
|
|
|
0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
clFinish(q);
|
|
|
|
u->data = (uchar*)u->handle;
|
|
|
|
u->markHostCopyObsolete(false);
|
|
|
|
u->markDeviceMemMapped(true);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-09-03 22:18:59 +08:00
|
|
|
cl_int retval = CL_SUCCESS;
|
|
|
|
if (!u->deviceMemMapped())
|
|
|
|
{
|
|
|
|
CV_Assert(u->refcount == 1);
|
|
|
|
CV_Assert(u->mapcount++ == 0);
|
|
|
|
u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
|
|
|
|
(CL_MAP_READ | CL_MAP_WRITE),
|
|
|
|
0, u->size, 0, 0, 0, &retval);
|
|
|
|
}
|
|
|
|
if (u->data && retval == CL_SUCCESS)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
u->markHostCopyObsolete(false);
|
2014-07-09 16:04:22 +08:00
|
|
|
u->markDeviceMemMapped(true);
|
2013-10-22 18:04:49 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
// TODO Is it really a good idea and was it tested well?
|
2013-10-22 18:04:49 +08:00
|
|
|
// if map failed, switch to copy-on-map mode for the particular buffer
|
|
|
|
u->flags |= UMatData::COPY_ON_MAP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if(!u->data)
|
|
|
|
{
|
|
|
|
u->data = (uchar*)fastMalloc(u->size);
|
|
|
|
u->markHostCopyObsolete(true);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
|
|
|
|
{
|
2014-02-06 21:27:29 +08:00
|
|
|
AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
|
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
|
2014-02-06 21:27:29 +08:00
|
|
|
u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
|
2013-10-22 18:04:49 +08:00
|
|
|
u->markHostCopyObsolete(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void unmap(UMatData* u) const
|
|
|
|
{
|
|
|
|
if(!u)
|
|
|
|
return;
|
|
|
|
|
2014-07-09 16:04:22 +08:00
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
CV_Assert(u->handle != 0);
|
|
|
|
|
|
|
|
UMatDataAutoLock autolock(u);
|
|
|
|
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
2013-12-01 07:12:19 +08:00
|
|
|
cl_int retval = 0;
|
2014-07-09 16:04:22 +08:00
|
|
|
if( !u->copyOnMap() && u->deviceMemMapped() )
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2014-07-09 16:04:22 +08:00
|
|
|
CV_Assert(u->data != NULL);
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
|
|
|
{
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
|
|
|
|
{
|
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
|
|
|
|
CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
|
|
|
|
0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
clFinish(q);
|
|
|
|
u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
|
|
|
|
}
|
|
|
|
}
|
2015-08-25 22:25:03 +08:00
|
|
|
if (u->refcount == 0)
|
|
|
|
u->data = 0;
|
2015-01-02 08:33:40 +08:00
|
|
|
u->markDeviceCopyObsolete(false);
|
2015-07-25 00:10:31 +08:00
|
|
|
u->markHostCopyObsolete(true);
|
2015-01-02 08:33:40 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
2015-08-25 22:25:03 +08:00
|
|
|
if (u->refcount == 0)
|
2015-09-03 22:18:59 +08:00
|
|
|
{
|
|
|
|
CV_Assert(u->mapcount-- == 1);
|
|
|
|
CV_Assert((retval = clEnqueueUnmapMemObject(q,
|
|
|
|
(cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS);
|
|
|
|
if (Device::getDefault().isAMD())
|
|
|
|
{
|
|
|
|
// required for multithreaded applications (see stitching test)
|
|
|
|
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
u->markDeviceMemMapped(false);
|
2015-08-25 22:25:03 +08:00
|
|
|
u->data = 0;
|
2015-09-03 22:18:59 +08:00
|
|
|
u->markDeviceCopyObsolete(false);
|
|
|
|
u->markHostCopyObsolete(true);
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
else if( u->copyOnMap() && u->deviceCopyObsolete() )
|
|
|
|
{
|
2014-02-06 21:27:29 +08:00
|
|
|
AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
|
|
|
|
#endif
|
2013-12-01 07:12:19 +08:00
|
|
|
CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
|
2014-02-06 21:27:29 +08:00
|
|
|
u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
|
2015-09-03 22:18:59 +08:00
|
|
|
u->markDeviceCopyObsolete(false);
|
|
|
|
u->markHostCopyObsolete(true);
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool checkContinuous(int dims, const size_t sz[],
|
|
|
|
const size_t srcofs[], const size_t srcstep[],
|
|
|
|
const size_t dstofs[], const size_t dststep[],
|
|
|
|
size_t& total, size_t new_sz[],
|
|
|
|
size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
|
|
|
|
size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
|
|
|
|
{
|
|
|
|
bool iscontinuous = true;
|
|
|
|
srcrawofs = srcofs ? srcofs[dims-1] : 0;
|
|
|
|
dstrawofs = dstofs ? dstofs[dims-1] : 0;
|
|
|
|
total = sz[dims-1];
|
|
|
|
for( int i = dims-2; i >= 0; i-- )
|
|
|
|
{
|
2013-10-25 15:19:40 +08:00
|
|
|
if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
|
2013-10-22 18:04:49 +08:00
|
|
|
iscontinuous = false;
|
|
|
|
total *= sz[i];
|
|
|
|
if( srcofs )
|
|
|
|
srcrawofs += srcofs[i]*srcstep[i];
|
|
|
|
if( dstofs )
|
|
|
|
dstrawofs += dstofs[i]*dststep[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
if( !iscontinuous )
|
|
|
|
{
|
|
|
|
// OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
|
|
|
|
if( dims == 2 )
|
|
|
|
{
|
|
|
|
new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
|
|
|
|
// we assume that new_... arrays are initialized by caller
|
|
|
|
// with 0's, so there is no else branch
|
|
|
|
if( srcofs )
|
|
|
|
{
|
|
|
|
new_srcofs[0] = srcofs[1];
|
|
|
|
new_srcofs[1] = srcofs[0];
|
|
|
|
new_srcofs[2] = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( dstofs )
|
|
|
|
{
|
|
|
|
new_dstofs[0] = dstofs[1];
|
|
|
|
new_dstofs[1] = dstofs[0];
|
|
|
|
new_dstofs[2] = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
|
|
|
|
new_dststep[0] = dststep[0]; new_dststep[1] = 0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
// we could check for dims == 3 here,
|
|
|
|
// but from user perspective this one is more informative
|
|
|
|
CV_Assert(dims <= 3);
|
|
|
|
new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
|
|
|
|
if( srcofs )
|
|
|
|
{
|
|
|
|
new_srcofs[0] = srcofs[2];
|
|
|
|
new_srcofs[1] = srcofs[1];
|
|
|
|
new_srcofs[2] = srcofs[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
if( dstofs )
|
|
|
|
{
|
|
|
|
new_dstofs[0] = dstofs[2];
|
|
|
|
new_dstofs[1] = dstofs[1];
|
|
|
|
new_dstofs[2] = dstofs[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
|
|
|
|
new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return iscontinuous;
|
|
|
|
}
|
|
|
|
|
|
|
|
void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
|
|
|
|
const size_t srcofs[], const size_t srcstep[],
|
|
|
|
const size_t dststep[]) const
|
|
|
|
{
|
|
|
|
if(!u)
|
|
|
|
return;
|
|
|
|
UMatDataAutoLock autolock(u);
|
|
|
|
|
|
|
|
if( u->data && !u->hostCopyObsolete() )
|
|
|
|
{
|
2015-12-01 04:45:48 +08:00
|
|
|
Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
|
2013-10-22 18:04:49 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
CV_Assert( u->handle != 0 );
|
|
|
|
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
|
|
|
|
size_t total = 0, new_sz[] = {0, 0, 0};
|
|
|
|
size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
|
|
|
|
size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
|
|
|
|
|
|
|
|
bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
|
|
|
|
total, new_sz,
|
|
|
|
srcrawofs, new_srcofs, new_srcstep,
|
|
|
|
dstrawofs, new_dstofs, new_dststep);
|
2014-02-06 21:27:29 +08:00
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
CV_DbgAssert(u->data == NULL || u->data == u->handle);
|
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
|
|
|
|
CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
|
|
|
|
u->handle, u->size,
|
|
|
|
0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
clFinish(q);
|
|
|
|
if( iscontinuous )
|
|
|
|
{
|
|
|
|
memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
// This code is from MatAllocator::download()
|
|
|
|
int isz[CV_MAX_DIM];
|
|
|
|
uchar* srcptr = (uchar*)u->handle;
|
|
|
|
for( int i = 0; i < dims; i++ )
|
|
|
|
{
|
|
|
|
CV_Assert( sz[i] <= (size_t)INT_MAX );
|
|
|
|
if( sz[i] == 0 )
|
|
|
|
return;
|
|
|
|
if( srcofs )
|
|
|
|
srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
|
|
|
|
isz[i] = (int)sz[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
Mat src(dims, isz, CV_8U, srcptr, srcstep);
|
|
|
|
Mat dst(dims, isz, CV_8U, dstptr, dststep);
|
|
|
|
|
|
|
|
const Mat* arrays[] = { &src, &dst };
|
|
|
|
uchar* ptrs[2];
|
|
|
|
NAryMatIterator it(arrays, ptrs, 2);
|
|
|
|
size_t j, planesz = it.size;
|
|
|
|
|
|
|
|
for( j = 0; j < it.nplanes; j++, ++it )
|
|
|
|
memcpy(ptrs[1], ptrs[0], planesz);
|
|
|
|
}
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
|
|
|
|
0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
clFinish(q);
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
else
|
2015-01-02 08:33:40 +08:00
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
if( iscontinuous )
|
|
|
|
{
|
2015-08-11 06:33:46 +08:00
|
|
|
AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
|
|
|
|
CV_Assert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
|
|
|
|
srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
|
2015-01-02 08:33:40 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2015-08-11 06:33:46 +08:00
|
|
|
AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
|
|
|
|
uchar* ptr = alignedPtr.getAlignedPtr();
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
|
2015-08-11 06:33:46 +08:00
|
|
|
new_srcofs, new_dstofs, new_sz,
|
|
|
|
new_srcstep[0], 0,
|
|
|
|
new_dststep[0], 0,
|
|
|
|
ptr, 0, 0, 0) >= 0 );
|
2015-01-02 08:33:40 +08:00
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
|
|
|
|
const size_t dstofs[], const size_t dststep[],
|
|
|
|
const size_t srcstep[]) const
|
|
|
|
{
|
|
|
|
if(!u)
|
|
|
|
return;
|
|
|
|
|
|
|
|
// there should be no user-visible CPU copies of the UMat which we are going to copy to
|
2013-12-02 00:58:30 +08:00
|
|
|
CV_Assert(u->refcount == 0 || u->tempUMat());
|
2013-10-22 18:04:49 +08:00
|
|
|
|
|
|
|
size_t total = 0, new_sz[] = {0, 0, 0};
|
|
|
|
size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
|
|
|
|
size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
|
|
|
|
|
|
|
|
bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
|
|
|
|
total, new_sz,
|
|
|
|
srcrawofs, new_srcofs, new_srcstep,
|
|
|
|
dstrawofs, new_dstofs, new_dststep);
|
|
|
|
|
|
|
|
UMatDataAutoLock autolock(u);
|
|
|
|
|
|
|
|
// if there is cached CPU copy of the GPU matrix,
|
|
|
|
// we could use it as a destination.
|
|
|
|
// we can do it in 2 cases:
|
|
|
|
// 1. we overwrite the whole content
|
|
|
|
// 2. we overwrite part of the matrix, but the GPU copy is out-of-date
|
2014-01-18 18:27:30 +08:00
|
|
|
if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-12-01 04:45:48 +08:00
|
|
|
Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
|
2013-10-22 18:04:49 +08:00
|
|
|
u->markHostCopyObsolete(false);
|
|
|
|
u->markDeviceCopyObsolete(true);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
CV_Assert( u->handle != 0 );
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
CV_DbgAssert(u->data == NULL || u->data == u->handle);
|
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
|
|
|
|
CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
|
|
|
|
u->handle, u->size,
|
|
|
|
0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
clFinish(q);
|
|
|
|
if( iscontinuous )
|
|
|
|
{
|
|
|
|
memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
// This code is from MatAllocator::upload()
|
|
|
|
int isz[CV_MAX_DIM];
|
|
|
|
uchar* dstptr = (uchar*)u->handle;
|
|
|
|
for( int i = 0; i < dims; i++ )
|
|
|
|
{
|
|
|
|
CV_Assert( sz[i] <= (size_t)INT_MAX );
|
|
|
|
if( sz[i] == 0 )
|
|
|
|
return;
|
|
|
|
if( dstofs )
|
|
|
|
dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
|
|
|
|
isz[i] = (int)sz[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
|
|
|
|
Mat dst(dims, isz, CV_8U, dstptr, dststep);
|
|
|
|
|
|
|
|
const Mat* arrays[] = { &src, &dst };
|
|
|
|
uchar* ptrs[2];
|
|
|
|
NAryMatIterator it(arrays, ptrs, 2);
|
|
|
|
size_t j, planesz = it.size;
|
|
|
|
|
|
|
|
for( j = 0; j < it.nplanes; j++, ++it )
|
|
|
|
memcpy(ptrs[1], ptrs[0], planesz);
|
|
|
|
}
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
|
|
|
|
0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
clFinish(q);
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
else
|
2015-01-02 08:33:40 +08:00
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
if( iscontinuous )
|
|
|
|
{
|
2015-08-11 06:33:46 +08:00
|
|
|
AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
|
|
|
|
CV_Assert(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
|
|
|
|
dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0);
|
2015-01-02 08:33:40 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2015-08-11 21:01:05 +08:00
|
|
|
AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
|
2015-08-11 06:33:46 +08:00
|
|
|
uchar* ptr = alignedPtr.getAlignedPtr();
|
|
|
|
|
|
|
|
CV_Assert(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
|
|
|
|
new_dstofs, new_srcofs, new_sz,
|
|
|
|
new_dststep[0], 0,
|
|
|
|
new_srcstep[0], 0,
|
|
|
|
ptr, 0, 0, 0) >= 0 );
|
2015-01-02 08:33:40 +08:00
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
u->markHostCopyObsolete(true);
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
|
|
|
|
(u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
|
|
|
|
{
|
|
|
|
// nothing
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
u->markHostCopyObsolete(true);
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
u->markDeviceCopyObsolete(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
|
|
|
|
const size_t srcofs[], const size_t srcstep[],
|
2013-12-02 00:58:30 +08:00
|
|
|
const size_t dstofs[], const size_t dststep[], bool _sync) const
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
if(!src || !dst)
|
|
|
|
return;
|
|
|
|
|
|
|
|
size_t total = 0, new_sz[] = {0, 0, 0};
|
|
|
|
size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
|
|
|
|
size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
|
|
|
|
|
|
|
|
bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
|
|
|
|
total, new_sz,
|
|
|
|
srcrawofs, new_srcofs, new_srcstep,
|
|
|
|
dstrawofs, new_dstofs, new_dststep);
|
|
|
|
|
|
|
|
UMatDataAutoLock src_autolock(src);
|
|
|
|
UMatDataAutoLock dst_autolock(dst);
|
|
|
|
|
2014-01-18 18:27:30 +08:00
|
|
|
if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
|
|
|
|
return;
|
|
|
|
}
|
2014-01-18 18:27:30 +08:00
|
|
|
if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
|
|
|
download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
|
|
|
|
dst->markHostCopyObsolete(false);
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
|
|
|
|
(dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
|
|
|
|
{
|
|
|
|
// nothing
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
dst->markDeviceCopyObsolete(true);
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// there should be no user-visible CPU copies of the UMat which we are going to copy to
|
|
|
|
CV_Assert(dst->refcount == 0);
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
cl_int retval = CL_SUCCESS;
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
|
|
|
|
(dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
|
|
|
|
(dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
|
|
|
{
|
|
|
|
Context& ctx = Context::getDefault();
|
|
|
|
const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
|
|
|
|
CV_DbgAssert(svmFns->isValid());
|
|
|
|
|
|
|
|
if( iscontinuous )
|
|
|
|
{
|
|
|
|
CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
|
|
|
|
(uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
|
|
|
|
cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
|
|
|
|
(uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
|
|
|
|
total, 0, NULL, NULL);
|
|
|
|
CV_Assert(status == CL_SUCCESS);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
clFinish(q);
|
|
|
|
// This code is from MatAllocator::download()/upload()
|
|
|
|
int isz[CV_MAX_DIM];
|
|
|
|
uchar* srcptr = (uchar*)src->handle;
|
|
|
|
for( int i = 0; i < dims; i++ )
|
|
|
|
{
|
|
|
|
CV_Assert( sz[i] <= (size_t)INT_MAX );
|
|
|
|
if( sz[i] == 0 )
|
|
|
|
return;
|
|
|
|
if( srcofs )
|
|
|
|
srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
|
|
|
|
isz[i] = (int)sz[i];
|
|
|
|
}
|
|
|
|
Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
|
|
|
|
|
|
|
|
uchar* dstptr = (uchar*)dst->handle;
|
|
|
|
for( int i = 0; i < dims; i++ )
|
|
|
|
{
|
|
|
|
if( dstofs )
|
|
|
|
dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
|
|
|
|
}
|
|
|
|
Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
|
|
|
|
|
|
|
|
const Mat* arrays[] = { &m_src, &m_dst };
|
|
|
|
uchar* ptrs[2];
|
|
|
|
NAryMatIterator it(arrays, ptrs, 2);
|
|
|
|
size_t j, planesz = it.size;
|
|
|
|
|
|
|
|
for( j = 0; j < it.nplanes; j++, ++it )
|
|
|
|
memcpy(ptrs[1], ptrs[0], planesz);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
|
|
|
|
{
|
|
|
|
map(src, ACCESS_READ);
|
|
|
|
upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
|
|
|
|
unmap(src);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
map(dst, ACCESS_WRITE);
|
|
|
|
download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
|
|
|
|
unmap(dst);
|
|
|
|
}
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
else
|
2015-01-02 08:33:40 +08:00
|
|
|
#endif
|
2013-10-22 18:04:49 +08:00
|
|
|
{
|
2015-01-02 08:33:40 +08:00
|
|
|
if( iscontinuous )
|
|
|
|
{
|
|
|
|
CV_Assert( (retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
|
|
|
|
srcrawofs, dstrawofs, total, 0, 0, 0)) == CL_SUCCESS );
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
|
|
|
|
new_srcofs, new_dstofs, new_sz,
|
2015-08-11 06:33:46 +08:00
|
|
|
new_srcstep[0], 0,
|
|
|
|
new_dststep[0], 0,
|
2015-01-02 08:33:40 +08:00
|
|
|
0, 0, 0)) == CL_SUCCESS );
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2015-01-02 08:33:40 +08:00
|
|
|
if (retval == CL_SUCCESS)
|
2014-10-03 19:17:28 +08:00
|
|
|
{
|
|
|
|
CV_IMPL_ADD(CV_IMPL_OCL)
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
|
2015-08-11 06:33:46 +08:00
|
|
|
(dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
|
2015-01-02 08:33:40 +08:00
|
|
|
{
|
|
|
|
// nothing
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
dst->markHostCopyObsolete(true);
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
dst->markDeviceCopyObsolete(false);
|
|
|
|
|
2013-12-02 00:58:30 +08:00
|
|
|
if( _sync )
|
2014-02-01 19:07:03 +08:00
|
|
|
{
|
|
|
|
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
2013-12-01 07:12:19 +08:00
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
BufferPoolController* getBufferPoolController(const char* id) const {
|
|
|
|
#ifdef HAVE_OPENCL_SVM
|
|
|
|
if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
|
|
|
|
{
|
|
|
|
return &bufferPoolSVM;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
|
|
|
|
{
|
|
|
|
return &bufferPoolHostPtr;
|
|
|
|
}
|
|
|
|
if (id != NULL && strcmp(id, "OCL") != 0)
|
|
|
|
{
|
|
|
|
CV_ErrorNoReturn(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
|
|
|
|
}
|
|
|
|
return &bufferPool;
|
|
|
|
}
|
2014-01-16 22:30:39 +08:00
|
|
|
|
2013-12-01 07:12:19 +08:00
|
|
|
MatAllocator* matStdAllocator;
|
2017-07-06 22:57:05 +08:00
|
|
|
|
|
|
|
mutable cv::Mutex cleanupQueueMutex;
|
|
|
|
mutable std::deque<UMatData*> cleanupQueue;
|
|
|
|
|
|
|
|
void flushCleanupQueue() const
|
|
|
|
{
|
|
|
|
if (!cleanupQueue.empty())
|
|
|
|
{
|
|
|
|
std::deque<UMatData*> q;
|
|
|
|
{
|
|
|
|
cv::AutoLock lock(cleanupQueueMutex);
|
|
|
|
q.swap(cleanupQueue);
|
|
|
|
}
|
|
|
|
for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
|
|
|
|
{
|
|
|
|
deallocate_(*i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
void addToCleanupQueue(UMatData* u) const
|
|
|
|
{
|
|
|
|
//TODO: Validation check: CV_Assert(!u->tempUMat());
|
|
|
|
{
|
|
|
|
cv::AutoLock lock(cleanupQueueMutex);
|
|
|
|
cleanupQueue.push_back(u);
|
|
|
|
}
|
|
|
|
}
|
2013-10-22 18:04:49 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
MatAllocator* getOpenCLAllocator()
|
|
|
|
{
|
2015-06-23 19:31:01 +08:00
|
|
|
CV_SINGLETON_LAZY_INIT(MatAllocator, new OpenCLAllocator())
|
2013-10-22 18:04:49 +08:00
|
|
|
}
|
|
|
|
|
OpenCV-OpenCL interop (PR #4072):
Commits:
added new function, cv::ocl::attachContext(String& platformName, void* platformID, void* context, void* deviceID) which allow to attach externally created OpenCL context to OpenCV.
add definitions of clRetainDevice, clRetainContext funcs
removed definitions for clRetainContext, clRetainDevice
fixed build issue under Linux
fixed uninitialized vars, replace dbgassert in error handling
remove function which is not ready yet
add new function, cv::ocl::convertFromBuffer(int rows, int cols, int type, void* cl_mem_obj, UMat& dst, UMatUsageFlags usageFlags = cv::USAGE_DEFAULT) which attaches user allocated OpenCL clBuffer to UMat
uncommented clGetMemObjectInfo definition (otherwise prevent opencv build)
fixed build issue on linux and android
add step parameter to cv::ocl::convertFromBuffer func
suppress compile-time warning
added sample opencl-opencv interoperability (showcase for cv::ocl::convertFromBuffer func)
CMakeLists.txt modified to not create sample build script if OpenCL SDK not found in system
fixed build issue (apple opencl include dir and spaces in CMake file)
added call to clRetainContext for attachContext func and call to clRetainMemObject for convertFromBuffer func
uncommented clRetainMemObject definition
added comments and cleanup
add local path to cmake modules search dirs (instead of replacing)
remove REQUIRED for find_package call (sample build together with opencv). need to try standalone sample build
opencl-interop sample moved to standalone build
set minimum version requirement for sample's cmake to 3.1
put cmake_minimum_required under condition, so do not check if samples not builded
remove code dups for setSize, updateContinuityFlag, and finalizeHdr
commented out cmake_minimum_required(VERSION 3.1)
add safety check for cmake version
add convertFromImage func and update opencl-interop sample
uncommented clGetImageInfo defs
uncommented clEnqueueCopyImageToBuffer defs
fixed clEnqueueCopyImageToBuffer defs
add doxygen comments
remove doxygen @fn tag
try to restart buildbot
add doxygen comments to directx interop funcs
remove internal header, use fwd declarations in affected compile units instead
2015-05-28 04:22:33 +08:00
|
|
|
}} // namespace cv::ocl
|
|
|
|
|
|
|
|
|
|
|
|
namespace cv {
|
|
|
|
|
|
|
|
// three funcs below are implemented in umatrix.cpp
|
|
|
|
void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
|
|
|
|
bool autoSteps = false );
|
|
|
|
|
|
|
|
void updateContinuityFlag(UMat& m);
|
|
|
|
void finalizeHdr(UMat& m);
|
|
|
|
|
|
|
|
} // namespace cv
|
|
|
|
|
|
|
|
|
|
|
|
namespace cv { namespace ocl {
|
|
|
|
|
|
|
|
/*
|
|
|
|
// Convert OpenCL buffer memory to UMat
|
|
|
|
*/
|
|
|
|
void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
|
|
|
|
{
|
|
|
|
int d = 2;
|
|
|
|
int sizes[] = { rows, cols };
|
|
|
|
|
|
|
|
CV_Assert(0 <= d && d <= CV_MAX_DIM);
|
|
|
|
|
|
|
|
dst.release();
|
|
|
|
|
|
|
|
dst.flags = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
|
|
|
|
dst.usageFlags = USAGE_DEFAULT;
|
|
|
|
|
|
|
|
setSize(dst, d, sizes, 0, true);
|
|
|
|
dst.offset = 0;
|
|
|
|
|
|
|
|
cl_mem memobj = (cl_mem)cl_mem_buffer;
|
|
|
|
cl_mem_object_type mem_type = 0;
|
|
|
|
|
|
|
|
CV_Assert(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0) == CL_SUCCESS);
|
|
|
|
|
|
|
|
CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
|
|
|
|
|
|
|
|
size_t total = 0;
|
|
|
|
CV_Assert(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0) == CL_SUCCESS);
|
|
|
|
|
|
|
|
CV_Assert(clRetainMemObject(memobj) == CL_SUCCESS);
|
|
|
|
|
|
|
|
CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
|
|
|
|
CV_Assert(total >= rows * step);
|
|
|
|
|
|
|
|
// attach clBuffer to UMatData
|
|
|
|
dst.u = new UMatData(getOpenCLAllocator());
|
|
|
|
dst.u->data = 0;
|
|
|
|
dst.u->allocatorFlags_ = 0; // not allocated from any OpenCV buffer pool
|
|
|
|
dst.u->flags = 0;
|
|
|
|
dst.u->handle = cl_mem_buffer;
|
|
|
|
dst.u->origdata = 0;
|
|
|
|
dst.u->prevAllocator = 0;
|
|
|
|
dst.u->size = total;
|
|
|
|
|
|
|
|
finalizeHdr(dst);
|
|
|
|
dst.addref();
|
|
|
|
|
|
|
|
return;
|
|
|
|
} // convertFromBuffer()
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
// Convert OpenCL image2d_t memory to UMat
|
|
|
|
*/
|
|
|
|
void convertFromImage(void* cl_mem_image, UMat& dst)
|
|
|
|
{
|
|
|
|
cl_mem clImage = (cl_mem)cl_mem_image;
|
|
|
|
cl_mem_object_type mem_type = 0;
|
|
|
|
|
|
|
|
CV_Assert(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0) == CL_SUCCESS);
|
|
|
|
|
|
|
|
CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
|
|
|
|
|
|
|
|
cl_image_format fmt = { 0, 0 };
|
|
|
|
CV_Assert(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0) == CL_SUCCESS);
|
|
|
|
|
|
|
|
int depth = CV_8U;
|
|
|
|
switch (fmt.image_channel_data_type)
|
|
|
|
{
|
|
|
|
case CL_UNORM_INT8:
|
|
|
|
case CL_UNSIGNED_INT8:
|
|
|
|
depth = CV_8U;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CL_SNORM_INT8:
|
|
|
|
case CL_SIGNED_INT8:
|
|
|
|
depth = CV_8S;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CL_UNORM_INT16:
|
|
|
|
case CL_UNSIGNED_INT16:
|
|
|
|
depth = CV_16U;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CL_SNORM_INT16:
|
|
|
|
case CL_SIGNED_INT16:
|
|
|
|
depth = CV_16S;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CL_SIGNED_INT32:
|
|
|
|
depth = CV_32S;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CL_FLOAT:
|
|
|
|
depth = CV_32F;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
|
|
|
|
}
|
|
|
|
|
|
|
|
int type = CV_8UC1;
|
|
|
|
switch (fmt.image_channel_order)
|
|
|
|
{
|
|
|
|
case CL_R:
|
|
|
|
type = CV_MAKE_TYPE(depth, 1);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CL_RGBA:
|
|
|
|
case CL_BGRA:
|
|
|
|
case CL_ARGB:
|
|
|
|
type = CV_MAKE_TYPE(depth, 4);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t step = 0;
|
|
|
|
CV_Assert(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0) == CL_SUCCESS);
|
|
|
|
|
|
|
|
size_t w = 0;
|
|
|
|
CV_Assert(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0) == CL_SUCCESS);
|
|
|
|
|
|
|
|
size_t h = 0;
|
|
|
|
CV_Assert(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0) == CL_SUCCESS);
|
|
|
|
|
|
|
|
dst.create((int)h, (int)w, type);
|
|
|
|
|
|
|
|
cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
|
|
|
|
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
|
|
|
|
|
|
|
|
size_t offset = 0;
|
|
|
|
size_t src_origin[3] = { 0, 0, 0 };
|
|
|
|
size_t region[3] = { w, h, 1 };
|
|
|
|
CV_Assert(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL) == CL_SUCCESS);
|
|
|
|
|
|
|
|
CV_Assert(clFinish(q) == CL_SUCCESS);
|
|
|
|
|
|
|
|
return;
|
|
|
|
} // convertFromImage()
|
|
|
|
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
|
2014-01-22 14:08:42 +08:00
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
cl_uint numDevices = 0;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
|
|
|
|
0, NULL, &numDevices) == CL_SUCCESS);
|
|
|
|
|
2014-01-22 14:08:42 +08:00
|
|
|
if (numDevices == 0)
|
2014-02-01 19:07:03 +08:00
|
|
|
{
|
|
|
|
devices.clear();
|
2014-01-22 14:08:42 +08:00
|
|
|
return;
|
2014-02-01 19:07:03 +08:00
|
|
|
}
|
|
|
|
|
2014-01-22 14:08:42 +08:00
|
|
|
devices.resize((size_t)numDevices);
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
|
|
|
|
numDevices, &devices[0], &numDevices) == CL_SUCCESS);
|
2014-01-22 14:08:42 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
struct PlatformInfo::Impl
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
Impl(void* id)
|
|
|
|
{
|
2014-01-29 00:22:56 +08:00
|
|
|
refcount = 1;
|
2014-01-22 14:08:42 +08:00
|
|
|
handle = *(cl_platform_id*)id;
|
|
|
|
getDevices(devices, handle);
|
|
|
|
}
|
|
|
|
|
2017-02-15 23:44:40 +08:00
|
|
|
String getStrProp(cl_platform_info prop) const
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
char buf[1024];
|
|
|
|
size_t sz=0;
|
2014-02-01 19:07:03 +08:00
|
|
|
return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
|
2014-01-22 14:08:42 +08:00
|
|
|
sz < sizeof(buf) ? String(buf) : String();
|
|
|
|
}
|
|
|
|
|
|
|
|
IMPLEMENT_REFCOUNTABLE();
|
|
|
|
std::vector<cl_device_id> devices;
|
|
|
|
cl_platform_id handle;
|
|
|
|
};
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
PlatformInfo::PlatformInfo()
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
PlatformInfo::PlatformInfo(void* platform_id)
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
p = new Impl(platform_id);
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
PlatformInfo::~PlatformInfo()
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->release();
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
PlatformInfo::PlatformInfo(const PlatformInfo& i)
|
2014-01-29 21:37:52 +08:00
|
|
|
{
|
|
|
|
if (i.p)
|
|
|
|
i.p->addref();
|
2014-02-01 19:07:03 +08:00
|
|
|
p = i.p;
|
2014-01-29 21:37:52 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
|
2014-01-29 21:37:52 +08:00
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
if (i.p != p)
|
2014-01-29 21:37:52 +08:00
|
|
|
{
|
|
|
|
if (i.p)
|
|
|
|
i.p->addref();
|
2014-02-01 19:07:03 +08:00
|
|
|
if (p)
|
|
|
|
p->release();
|
|
|
|
p = i.p;
|
2014-01-29 21:37:52 +08:00
|
|
|
}
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
int PlatformInfo::deviceNumber() const
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
return p ? (int)p->devices.size() : 0;
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
void PlatformInfo::getDevice(Device& device, int d) const
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
2014-01-29 00:22:56 +08:00
|
|
|
CV_Assert(p && d < (int)p->devices.size() );
|
2014-01-22 14:08:42 +08:00
|
|
|
if(p)
|
|
|
|
device.set(p->devices[d]);
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
String PlatformInfo::name() const
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
String PlatformInfo::vendor() const
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
|
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
String PlatformInfo::version() const
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
|
|
|
|
}
|
|
|
|
|
|
|
|
static void getPlatforms(std::vector<cl_platform_id>& platforms)
|
|
|
|
{
|
|
|
|
cl_uint numPlatforms = 0;
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
|
|
|
|
|
2014-01-22 14:08:42 +08:00
|
|
|
if (numPlatforms == 0)
|
2014-02-01 19:07:03 +08:00
|
|
|
{
|
|
|
|
platforms.clear();
|
2014-01-22 14:08:42 +08:00
|
|
|
return;
|
2014-02-01 19:07:03 +08:00
|
|
|
}
|
|
|
|
|
2014-01-22 14:08:42 +08:00
|
|
|
platforms.resize((size_t)numPlatforms);
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
|
2014-01-22 14:08:42 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
|
2014-01-22 14:08:42 +08:00
|
|
|
{
|
|
|
|
std::vector<cl_platform_id> platforms;
|
|
|
|
getPlatforms(platforms);
|
2014-02-01 19:07:03 +08:00
|
|
|
|
2014-01-22 14:08:42 +08:00
|
|
|
for (size_t i = 0; i < platforms.size(); i++)
|
2014-02-01 00:23:01 +08:00
|
|
|
platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
|
2014-01-22 14:08:42 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
const char* typeToStr(int type)
|
2013-11-19 00:48:00 +08:00
|
|
|
{
|
|
|
|
static const char* tab[]=
|
|
|
|
{
|
2014-03-08 05:29:27 +08:00
|
|
|
"uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
|
|
|
|
"char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
|
|
|
|
"ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
|
|
|
|
"short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
|
|
|
|
"int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
|
|
|
|
"float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
|
|
|
|
"double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
|
|
|
|
"?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
|
2013-11-19 00:48:00 +08:00
|
|
|
};
|
2014-02-01 19:07:03 +08:00
|
|
|
int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
|
2014-03-08 05:29:27 +08:00
|
|
|
return cn > 16 ? "?" : tab[depth*16 + cn-1];
|
2013-11-19 00:48:00 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 19:07:03 +08:00
|
|
|
const char* memopTypeToStr(int type)
|
2014-08-29 18:18:52 +08:00
|
|
|
{
|
|
|
|
static const char* tab[] =
|
|
|
|
{
|
|
|
|
"uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
|
|
|
|
"char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
|
|
|
|
"ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
|
|
|
|
"short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
|
|
|
|
"int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
|
|
|
|
"int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
|
|
|
|
"ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
|
|
|
|
"?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
|
|
|
|
};
|
|
|
|
int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
|
|
|
|
return cn > 16 ? "?" : tab[depth*16 + cn-1];
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* vecopTypeToStr(int type)
|
2013-11-19 00:48:00 +08:00
|
|
|
{
|
2014-02-23 19:12:38 +08:00
|
|
|
static const char* tab[] =
|
2013-11-19 00:48:00 +08:00
|
|
|
{
|
2014-07-01 17:49:03 +08:00
|
|
|
"uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
|
|
|
|
"char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
|
|
|
|
"ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
|
|
|
|
"short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
|
2014-03-08 05:29:27 +08:00
|
|
|
"int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
|
|
|
|
"int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
|
|
|
|
"ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
|
|
|
|
"?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
|
2013-11-19 00:48:00 +08:00
|
|
|
};
|
2014-02-01 19:07:03 +08:00
|
|
|
int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
|
2014-03-08 05:29:27 +08:00
|
|
|
return cn > 16 ? "?" : tab[depth*16 + cn-1];
|
2013-11-19 00:48:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
|
|
|
|
{
|
|
|
|
if( sdepth == ddepth )
|
|
|
|
return "noconvert";
|
|
|
|
const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
|
|
|
|
if( ddepth >= CV_32F ||
|
|
|
|
(ddepth == CV_32S && sdepth < CV_32S) ||
|
|
|
|
(ddepth == CV_16S && sdepth <= CV_8S) ||
|
|
|
|
(ddepth == CV_16U && sdepth == CV_8U))
|
|
|
|
{
|
|
|
|
sprintf(buf, "convert_%s", typestr);
|
|
|
|
}
|
|
|
|
else if( sdepth >= CV_32F )
|
|
|
|
sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
|
|
|
|
else
|
|
|
|
sprintf(buf, "convert_%s_sat", typestr);
|
2014-02-01 19:07:03 +08:00
|
|
|
|
2013-11-19 00:48:00 +08:00
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
Merge pull request #9114 from pengli:dnn_rebase
add libdnn acceleration to dnn module (#9114)
* import libdnn code
Signed-off-by: Li Peng <peng.li@intel.com>
* add convolution layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add pooling layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add softmax layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add lrn layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add innerproduct layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add HAVE_OPENCL macro
Signed-off-by: Li Peng <peng.li@intel.com>
* fix for convolution ocl
Signed-off-by: Li Peng <peng.li@intel.com>
* enable getUMat() for multi-dimension Mat
Signed-off-by: Li Peng <peng.li@intel.com>
* use getUMat for ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* use CV_OCL_RUN macro
Signed-off-by: Li Peng <peng.li@intel.com>
* set OPENCL target when it is available
and disable fuseLayer for OCL target for the time being
Signed-off-by: Li Peng <peng.li@intel.com>
* fix innerproduct accuracy test
Signed-off-by: Li Peng <peng.li@intel.com>
* remove trailing space
Signed-off-by: Li Peng <peng.li@intel.com>
* Fixed tensorflow demo bug.
Root cause is that tensorflow has different algorithm with libdnn
to calculate convolution output dimension.
libdnn don't calculate output dimension anymore and just use one
passed in by config.
* split gemm ocl file
split it into gemm_buffer.cl and gemm_image.cl
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix compile failure
Signed-off-by: Li Peng <peng.li@intel.com>
* check env flag for auto tuning
Signed-off-by: Li Peng <peng.li@intel.com>
* switch to new ocl kernels for softmax layer
Signed-off-by: Li Peng <peng.li@intel.com>
* update softmax layer
on some platform subgroup extension may not work well,
fallback to non subgroup ocl acceleration.
Signed-off-by: Li Peng <peng.li@intel.com>
* fallback to cpu path for fc layer with multi output
Signed-off-by: Li Peng <peng.li@intel.com>
* update output message
Signed-off-by: Li Peng <peng.li@intel.com>
* update fully connected layer
fallback to gemm API if libdnn return false
Signed-off-by: Li Peng <peng.li@intel.com>
* Add ReLU OCL implementation
* disable layer fusion for now
Signed-off-by: Li Peng <peng.li@intel.com>
* Add OCL implementation for concat layer
Signed-off-by: Wu Zhiwen <zhiwen.wu@intel.com>
* libdnn: update license and copyrights
Also refine libdnn coding style
Signed-off-by: Wu Zhiwen <zhiwen.wu@intel.com>
Signed-off-by: Li Peng <peng.li@intel.com>
* DNN: Don't link OpenCL library explicitly
* DNN: Make default preferableTarget to DNN_TARGET_CPU
User should set it to DNN_TARGET_OPENCL explicitly if want to
use OpenCL acceleration.
Also don't fusion when using DNN_TARGET_OPENCL
* DNN: refine coding style
* Add getOpenCLErrorString
* DNN: Use int32_t/uint32_t instread of alias
* Use namespace ocl4dnn to include libdnn things
* remove extra copyTo in softmax ocl path
Signed-off-by: Li Peng <peng.li@intel.com>
* update ReLU layer ocl path
Signed-off-by: Li Peng <peng.li@intel.com>
* Add prefer target property for layer class
It is used to indicate the target for layer forwarding,
either the default CPU target or OCL target.
Signed-off-by: Li Peng <peng.li@intel.com>
* Add cl_event based timer for cv::ocl
* Rename libdnn to ocl4dnn
Signed-off-by: Li Peng <peng.li@intel.com>
Signed-off-by: wzw <zhiwen.wu@intel.com>
* use UMat for ocl4dnn internal buffer
Remove allocateMemory which use clCreateBuffer directly
Signed-off-by: Li Peng <peng.li@intel.com>
Signed-off-by: wzw <zhiwen.wu@intel.com>
* enable buffer gemm in ocl4dnn innerproduct
Signed-off-by: Li Peng <peng.li@intel.com>
* replace int_tp globally for ocl4dnn kernels.
Signed-off-by: wzw <zhiwen.wu@intel.com>
Signed-off-by: Li Peng <peng.li@intel.com>
* create UMat for layer params
Signed-off-by: Li Peng <peng.li@intel.com>
* update sign ocl kernel
Signed-off-by: Li Peng <peng.li@intel.com>
* update image based gemm of inner product layer
Signed-off-by: Li Peng <peng.li@intel.com>
* remove buffer gemm of inner product layer
call cv::gemm API instead
Signed-off-by: Li Peng <peng.li@intel.com>
* change ocl4dnn forward parameter to UMat
Signed-off-by: Li Peng <peng.li@intel.com>
* Refine auto-tuning mechanism.
- Use OPENCV_OCL4DNN_KERNEL_CONFIG_PATH to set cache directory
for fine-tuned kernel configuration.
e.g. export OPENCV_OCL4DNN_KERNEL_CONFIG_PATH=/home/tmp,
the cache directory will be /home/tmp/spatialkernels/ on Linux.
- Define environment OPENCV_OCL4DNN_ENABLE_AUTO_TUNING to enable
auto-tuning.
- OPENCV_OPENCL_ENABLE_PROFILING is only used to enable profiling
for OpenCL command queue. This fix basic kernel get wrong running
time, i.e. 0ms.
- If creating cache directory failed, disable auto-tuning.
* Detect and create cache dir on windows
Signed-off-by: Li Peng <peng.li@intel.com>
* Refine gemm like convolution kernel.
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix redundant swizzleWeights calling when use cached kernel config.
* Fix "out of resource" bug when auto-tuning too many kernels.
* replace cl_mem with UMat in ocl4dnnConvSpatial class
* OCL4DNN: reduce the tuning kernel candidate.
This patch could reduce 75% of the tuning candidates with less
than 2% performance impact for the final result.
Signed-off-by: Zhigang Gong <zhigang.gong@intel.com>
* replace cl_mem with umat in ocl4dnn convolution
Signed-off-by: Li Peng <peng.li@intel.com>
* remove weight_image_ of ocl4dnn inner product
Actually it is unused in the computation
Signed-off-by: Li Peng <peng.li@intel.com>
* Various fixes for ocl4dnn
1. OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel())
2. Ptr<OCL4DNNInnerProduct<float> > innerProductOp
3. Code comments cleanup
4. ignore check on OCL cpu device
Signed-off-by: Li Peng <peng.li@intel.com>
* add build option for log softmax
Signed-off-by: Li Peng <peng.li@intel.com>
* remove unused ocl kernels in ocl4dnn
Signed-off-by: Li Peng <peng.li@intel.com>
* replace ocl4dnnSet with opencv setTo
Signed-off-by: Li Peng <peng.li@intel.com>
* replace ALIGN with cv::alignSize
Signed-off-by: Li Peng <peng.li@intel.com>
* check kernel build options
Signed-off-by: Li Peng <peng.li@intel.com>
* Handle program compilation fail properly.
* Use std::numeric_limits<float>::infinity() for large float number
* check ocl4dnn kernel compilation result
Signed-off-by: Li Peng <peng.li@intel.com>
* remove unused ctx_id
Signed-off-by: Li Peng <peng.li@intel.com>
* change clEnqueueNDRangeKernel to kernel.run()
Signed-off-by: Li Peng <peng.li@intel.com>
* change cl_mem to UMat in image based gemm
Signed-off-by: Li Peng <peng.li@intel.com>
* check intel subgroup support for lrn and pooling layer
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix convolution bug if group is greater than 1
Signed-off-by: Li Peng <peng.li@intel.com>
* Set default layer preferableTarget to be DNN_TARGET_CPU
Signed-off-by: Li Peng <peng.li@intel.com>
* Add ocl perf test for convolution
Signed-off-by: Li Peng <peng.li@intel.com>
* Add more ocl accuracy test
Signed-off-by: Li Peng <peng.li@intel.com>
* replace cl_image with ocl::Image2D
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix build failure in elementwise layer
Signed-off-by: Li Peng <peng.li@intel.com>
* use getUMat() to get blob data
Signed-off-by: Li Peng <peng.li@intel.com>
* replace cl_mem handle with ocl::KernelArg
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(build): don't use C++11, OPENCL_LIBRARIES fix
* dnn(ocl4dnn): remove unused OpenCL kernels
* dnn(ocl4dnn): extract OpenCL code into .cl files
* dnn(ocl4dnn): refine auto-tuning
Defaultly disable auto-tuning, set OPENCV_OCL4DNN_ENABLE_AUTO_TUNING
environment variable to enable it.
Use a set of pre-tuned configs as default config if auto-tuning is disabled.
These configs are tuned for Intel GPU with 48/72 EUs, and for googlenet,
AlexNet, ResNet-50
If default config is not suitable, use the first available kernel config
from the candidates. Candidate priority from high to low is gemm like kernel,
IDLF kernel, basick kernel.
* dnn(ocl4dnn): pooling doesn't use OpenCL subgroups
* dnn(ocl4dnn): fix perf test
OpenCV has default 3sec time limit for each performance test.
Warmup OpenCL backend outside of perf measurement loop.
* use ocl::KernelArg as much as possible
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): fix bias bug for gemm like kernel
* dnn(ocl4dnn): wrap cl_mem into UMat
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): Refine signature of kernel config
- Use more readable string as signture of kernel config
- Don't count device name and vendor in signature string
- Default kernel configurations are tuned for Intel GPU with
24/48/72 EUs, and for googlenet, AlexNet, ResNet-50 net model.
* dnn(ocl4dnn): swap width/height in configuration
* dnn(ocl4dnn): enable configs for Intel OpenCL runtime only
* core: make configuration helper functions accessible from non-core modules
* dnn(ocl4dnn): update kernel auto-tuning behavior
Avoid unwanted creation of directories
* dnn(ocl4dnn): simplify kernel to workaround OpenCL compiler crash
* dnn(ocl4dnn): remove redundant code
* dnn(ocl4dnn): Add more clear message for simd size dismatch.
* dnn(ocl4dnn): add const to const argument
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): force compiler use a specific SIMD size for IDLF kernel
* dnn(ocl4dnn): drop unused tuneLocalSize()
* dnn(ocl4dnn): specify OpenCL queue for Timer and convolve() method
* dnn(ocl4dnn): sanitize file names used for cache
* dnn(perf): enable Network tests with OpenCL
* dnn(ocl4dnn/conv): drop computeGlobalSize()
* dnn(ocl4dnn/conv): drop unused fields
* dnn(ocl4dnn/conv): simplify ctor
* dnn(ocl4dnn/conv): refactor kernelConfig localSize=NULL
* dnn(ocl4dnn/conv): drop unsupported double / untested half types
* dnn(ocl4dnn/conv): drop unused variable
* dnn(ocl4dnn/conv): alignSize/divUp
* dnn(ocl4dnn/conv): use enum values
* dnn(ocl4dnn): drop unused innerproduct variable
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): add an generic function to check cl option support
* dnn(ocl4dnn): run softmax subgroup version kernel first
Signed-off-by: Li Peng <peng.li@intel.com>
2017-10-02 20:38:00 +08:00
|
|
|
const char* getOpenCLErrorString(int errorCode)
|
|
|
|
{
|
|
|
|
switch (errorCode)
|
|
|
|
{
|
|
|
|
case 0: return "CL_SUCCESS";
|
|
|
|
case -1: return "CL_DEVICE_NOT_FOUND";
|
|
|
|
case -2: return "CL_DEVICE_NOT_AVAILABLE";
|
|
|
|
case -3: return "CL_COMPILER_NOT_AVAILABLE";
|
|
|
|
case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
|
|
|
case -5: return "CL_OUT_OF_RESOURCES";
|
|
|
|
case -6: return "CL_OUT_OF_HOST_MEMORY";
|
|
|
|
case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
|
|
|
|
case -8: return "CL_MEM_COPY_OVERLAP";
|
|
|
|
case -9: return "CL_IMAGE_FORMAT_MISMATCH";
|
|
|
|
case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
|
|
|
|
case -11: return "CL_BUILD_PROGRAM_FAILURE";
|
|
|
|
case -12: return "CL_MAP_FAILURE";
|
|
|
|
case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
|
|
|
|
case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
|
|
|
|
case -15: return "CL_COMPILE_PROGRAM_FAILURE";
|
|
|
|
case -16: return "CL_LINKER_NOT_AVAILABLE";
|
|
|
|
case -17: return "CL_LINK_PROGRAM_FAILURE";
|
|
|
|
case -18: return "CL_DEVICE_PARTITION_FAILED";
|
|
|
|
case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
|
|
|
|
case -30: return "CL_INVALID_VALUE";
|
|
|
|
case -31: return "CL_INVALID_DEVICE_TYPE";
|
|
|
|
case -32: return "CL_INVALID_PLATFORM";
|
|
|
|
case -33: return "CL_INVALID_DEVICE";
|
|
|
|
case -34: return "CL_INVALID_CONTEXT";
|
|
|
|
case -35: return "CL_INVALID_QUEUE_PROPERTIES";
|
|
|
|
case -36: return "CL_INVALID_COMMAND_QUEUE";
|
|
|
|
case -37: return "CL_INVALID_HOST_PTR";
|
|
|
|
case -38: return "CL_INVALID_MEM_OBJECT";
|
|
|
|
case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
|
|
|
|
case -40: return "CL_INVALID_IMAGE_SIZE";
|
|
|
|
case -41: return "CL_INVALID_SAMPLER";
|
|
|
|
case -42: return "CL_INVALID_BINARY";
|
|
|
|
case -43: return "CL_INVALID_BUILD_OPTIONS";
|
|
|
|
case -44: return "CL_INVALID_PROGRAM";
|
|
|
|
case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
|
|
|
|
case -46: return "CL_INVALID_KERNEL_NAME";
|
|
|
|
case -47: return "CL_INVALID_KERNEL_DEFINITION";
|
|
|
|
case -48: return "CL_INVALID_KERNEL";
|
|
|
|
case -49: return "CL_INVALID_ARG_INDEX";
|
|
|
|
case -50: return "CL_INVALID_ARG_VALUE";
|
|
|
|
case -51: return "CL_INVALID_ARG_SIZE";
|
|
|
|
case -52: return "CL_INVALID_KERNEL_ARGS";
|
|
|
|
case -53: return "CL_INVALID_WORK_DIMENSION";
|
|
|
|
case -54: return "CL_INVALID_WORK_GROUP_SIZE";
|
|
|
|
case -55: return "CL_INVALID_WORK_ITEM_SIZE";
|
|
|
|
case -56: return "CL_INVALID_GLOBAL_OFFSET";
|
|
|
|
case -57: return "CL_INVALID_EVENT_WAIT_LIST";
|
|
|
|
case -58: return "CL_INVALID_EVENT";
|
|
|
|
case -59: return "CL_INVALID_OPERATION";
|
|
|
|
case -60: return "CL_INVALID_GL_OBJECT";
|
|
|
|
case -61: return "CL_INVALID_BUFFER_SIZE";
|
|
|
|
case -62: return "CL_INVALID_MIP_LEVEL";
|
|
|
|
case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
|
|
|
|
case -64: return "CL_INVALID_PROPERTY";
|
|
|
|
case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
|
|
|
|
case -66: return "CL_INVALID_COMPILER_OPTIONS";
|
|
|
|
case -67: return "CL_INVALID_LINKER_OPTIONS";
|
|
|
|
case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
|
|
|
|
case -69: return "CL_INVALID_PIPE_SIZE";
|
|
|
|
case -70: return "CL_INVALID_DEVICE_QUEUE";
|
|
|
|
case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
|
|
|
|
case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
|
|
|
|
case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
|
|
|
|
case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
|
|
|
|
case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
|
|
|
|
case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
|
|
|
|
case -1024: return "clBLAS: Functionality is not implemented";
|
|
|
|
case -1023: return "clBLAS: Library is not initialized yet";
|
|
|
|
case -1022: return "clBLAS: Matrix A is not a valid memory object";
|
|
|
|
case -1021: return "clBLAS: Matrix B is not a valid memory object";
|
|
|
|
case -1020: return "clBLAS: Matrix C is not a valid memory object";
|
|
|
|
case -1019: return "clBLAS: Vector X is not a valid memory object";
|
|
|
|
case -1018: return "clBLAS: Vector Y is not a valid memory object";
|
|
|
|
case -1017: return "clBLAS: An input dimension (M:N:K) is invalid";
|
|
|
|
case -1016: return "clBLAS: Leading dimension A must not be less than the "
|
|
|
|
"size of the first dimension";
|
|
|
|
case -1015: return "clBLAS: Leading dimension B must not be less than the "
|
|
|
|
"size of the second dimension";
|
|
|
|
case -1014: return "clBLAS: Leading dimension C must not be less than the "
|
|
|
|
"size of the third dimension";
|
|
|
|
case -1013: return "clBLAS: The increment for a vector X must not be 0";
|
|
|
|
case -1012: return "clBLAS: The increment for a vector Y must not be 0";
|
|
|
|
case -1011: return "clBLAS: The memory object for Matrix A is too small";
|
|
|
|
case -1010: return "clBLAS: The memory object for Matrix B is too small";
|
|
|
|
case -1009: return "clBLAS: The memory object for Matrix C is too small";
|
|
|
|
case -1008: return "clBLAS: The memory object for Vector X is too small";
|
|
|
|
case -1007: return "clBLAS: The memory object for Vector Y is too small";
|
|
|
|
default: return "Unknown OpenCL error";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-30 19:23:11 +08:00
|
|
|
template <typename T>
|
|
|
|
static std::string kerToStr(const Mat & k)
|
|
|
|
{
|
|
|
|
int width = k.cols - 1, depth = k.depth();
|
2014-08-13 19:08:27 +08:00
|
|
|
const T * const data = k.ptr<T>();
|
2014-01-30 19:23:11 +08:00
|
|
|
|
|
|
|
std::ostringstream stream;
|
|
|
|
stream.precision(10);
|
|
|
|
|
|
|
|
if (depth <= CV_8S)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < width; ++i)
|
2014-01-30 21:23:56 +08:00
|
|
|
stream << "DIG(" << (int)data[i] << ")";
|
|
|
|
stream << "DIG(" << (int)data[width] << ")";
|
2014-01-30 19:23:11 +08:00
|
|
|
}
|
|
|
|
else if (depth == CV_32F)
|
|
|
|
{
|
|
|
|
stream.setf(std::ios_base::showpoint);
|
|
|
|
for (int i = 0; i < width; ++i)
|
2014-01-30 21:23:56 +08:00
|
|
|
stream << "DIG(" << data[i] << "f)";
|
|
|
|
stream << "DIG(" << data[width] << "f)";
|
2014-01-30 19:23:11 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
for (int i = 0; i < width; ++i)
|
2014-01-30 21:23:56 +08:00
|
|
|
stream << "DIG(" << data[i] << ")";
|
|
|
|
stream << "DIG(" << data[width] << ")";
|
2014-01-30 19:23:11 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return stream.str();
|
|
|
|
}
|
|
|
|
|
2014-03-17 23:59:35 +08:00
|
|
|
String kernelToStr(InputArray _kernel, int ddepth, const char * name)
|
2014-01-30 19:23:11 +08:00
|
|
|
{
|
|
|
|
Mat kernel = _kernel.getMat().reshape(1, 1);
|
|
|
|
|
|
|
|
int depth = kernel.depth();
|
|
|
|
if (ddepth < 0)
|
|
|
|
ddepth = depth;
|
|
|
|
|
|
|
|
if (ddepth != depth)
|
|
|
|
kernel.convertTo(kernel, ddepth);
|
|
|
|
|
2014-03-19 22:49:33 +08:00
|
|
|
typedef std::string (* func_t)(const Mat &);
|
|
|
|
static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
|
2014-01-30 19:23:11 +08:00
|
|
|
kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
|
2014-04-24 01:20:09 +08:00
|
|
|
const func_t func = funcs[ddepth];
|
2014-01-30 19:23:11 +08:00
|
|
|
CV_Assert(func != 0);
|
|
|
|
|
2014-03-17 23:59:35 +08:00
|
|
|
return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
|
2014-01-30 19:23:11 +08:00
|
|
|
}
|
|
|
|
|
2014-03-08 05:29:27 +08:00
|
|
|
#define PROCESS_SRC(src) \
|
|
|
|
do \
|
|
|
|
{ \
|
|
|
|
if (!src.empty()) \
|
|
|
|
{ \
|
|
|
|
CV_Assert(src.isMat() || src.isUMat()); \
|
|
|
|
Size csize = src.size(); \
|
2014-07-09 23:57:27 +08:00
|
|
|
int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
|
|
|
|
ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
|
|
|
|
if (cwidth < ckercn || ckercn <= 0) \
|
|
|
|
return 1; \
|
|
|
|
cols.push_back(cwidth); \
|
|
|
|
if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
|
2014-03-08 05:29:27 +08:00
|
|
|
return 1; \
|
|
|
|
offsets.push_back(src.offset()); \
|
|
|
|
steps.push_back(src.step()); \
|
2014-07-09 23:57:27 +08:00
|
|
|
dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
|
2014-07-10 00:06:54 +08:00
|
|
|
kercns.push_back(ckercn); \
|
2014-03-08 05:29:27 +08:00
|
|
|
} \
|
|
|
|
} \
|
|
|
|
while ((void)0, 0)
|
|
|
|
|
|
|
|
int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
|
|
|
|
InputArray src4, InputArray src5, InputArray src6,
|
2014-07-09 23:57:27 +08:00
|
|
|
InputArray src7, InputArray src8, InputArray src9,
|
|
|
|
OclVectorStrategy strat)
|
2014-03-08 05:29:27 +08:00
|
|
|
{
|
|
|
|
const ocl::Device & d = ocl::Device::getDefault();
|
|
|
|
|
|
|
|
int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
|
|
|
|
d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
|
|
|
|
d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
|
2014-07-09 23:57:27 +08:00
|
|
|
d.preferredVectorWidthDouble(), -1 };
|
2014-07-03 17:45:55 +08:00
|
|
|
|
|
|
|
// if the device says don't use vectors
|
|
|
|
if (vectorWidths[0] == 1)
|
2014-05-12 16:45:52 +08:00
|
|
|
{
|
|
|
|
// it's heuristic
|
2014-09-03 17:04:24 +08:00
|
|
|
vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
|
|
|
|
vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
|
2014-07-10 00:06:54 +08:00
|
|
|
vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
|
2014-05-12 16:45:52 +08:00
|
|
|
}
|
2014-03-08 05:29:27 +08:00
|
|
|
|
2014-10-21 18:13:15 +08:00
|
|
|
return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
|
|
|
|
}
|
|
|
|
|
2014-10-20 21:43:18 +08:00
|
|
|
int checkOptimalVectorWidth(const int *vectorWidths,
|
2014-10-21 18:13:15 +08:00
|
|
|
InputArray src1, InputArray src2, InputArray src3,
|
|
|
|
InputArray src4, InputArray src5, InputArray src6,
|
|
|
|
InputArray src7, InputArray src8, InputArray src9,
|
|
|
|
OclVectorStrategy strat)
|
|
|
|
{
|
2014-10-20 21:43:18 +08:00
|
|
|
CV_Assert(vectorWidths);
|
|
|
|
|
2014-10-21 18:13:15 +08:00
|
|
|
int ref_type = src1.type();
|
|
|
|
|
2014-03-08 05:29:27 +08:00
|
|
|
std::vector<size_t> offsets, steps, cols;
|
2014-07-10 00:06:54 +08:00
|
|
|
std::vector<int> dividers, kercns;
|
2014-03-08 05:29:27 +08:00
|
|
|
PROCESS_SRC(src1);
|
|
|
|
PROCESS_SRC(src2);
|
|
|
|
PROCESS_SRC(src3);
|
|
|
|
PROCESS_SRC(src4);
|
|
|
|
PROCESS_SRC(src5);
|
|
|
|
PROCESS_SRC(src6);
|
|
|
|
PROCESS_SRC(src7);
|
|
|
|
PROCESS_SRC(src8);
|
|
|
|
PROCESS_SRC(src9);
|
|
|
|
|
|
|
|
size_t size = offsets.size();
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
2014-07-10 00:06:54 +08:00
|
|
|
while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
|
|
|
|
dividers[i] >>= 1, kercns[i] >>= 1;
|
2014-03-08 05:29:27 +08:00
|
|
|
|
|
|
|
// default strategy
|
2014-07-10 00:06:54 +08:00
|
|
|
int kercn = *std::min_element(kercns.begin(), kercns.end());
|
2014-03-08 05:29:27 +08:00
|
|
|
|
2014-05-21 22:12:26 +08:00
|
|
|
return kercn;
|
2014-03-08 05:29:27 +08:00
|
|
|
}
|
|
|
|
|
2014-07-10 00:06:54 +08:00
|
|
|
int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
|
|
|
|
InputArray src4, InputArray src5, InputArray src6,
|
|
|
|
InputArray src7, InputArray src8, InputArray src9)
|
|
|
|
{
|
|
|
|
return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
|
|
|
|
}
|
|
|
|
|
2014-03-08 05:29:27 +08:00
|
|
|
#undef PROCESS_SRC
|
|
|
|
|
2014-02-26 23:02:36 +08:00
|
|
|
|
|
|
|
// TODO Make this as a method of OpenCL "BuildOptions" class
|
|
|
|
void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
|
|
|
|
{
|
|
|
|
if (!buildOptions.empty())
|
|
|
|
buildOptions += " ";
|
|
|
|
int type = _m.type(), depth = CV_MAT_DEPTH(type);
|
|
|
|
buildOptions += format(
|
|
|
|
"-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
|
|
|
|
name.c_str(), ocl::typeToStr(type),
|
|
|
|
name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
|
|
|
|
name.c_str(), (int)CV_MAT_CN(type),
|
|
|
|
name.c_str(), (int)CV_ELEM_SIZE(type),
|
|
|
|
name.c_str(), (int)CV_ELEM_SIZE1(type),
|
|
|
|
name.c_str(), (int)depth
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2014-01-16 15:57:57 +08:00
|
|
|
|
|
|
|
struct Image2D::Impl
|
|
|
|
{
|
2014-04-15 07:09:17 +08:00
|
|
|
Impl(const UMat &src, bool norm, bool alias)
|
2014-01-16 15:57:57 +08:00
|
|
|
{
|
2014-01-16 19:44:35 +08:00
|
|
|
handle = 0;
|
|
|
|
refcount = 1;
|
2014-04-15 07:09:17 +08:00
|
|
|
init(src, norm, alias);
|
2014-01-16 15:57:57 +08:00
|
|
|
}
|
2014-02-01 19:07:03 +08:00
|
|
|
|
2014-01-16 15:57:57 +08:00
|
|
|
~Impl()
|
|
|
|
{
|
|
|
|
if (handle)
|
|
|
|
clReleaseMemObject(handle);
|
|
|
|
}
|
2014-02-01 19:07:03 +08:00
|
|
|
|
2014-04-15 07:09:17 +08:00
|
|
|
static cl_image_format getImageFormat(int depth, int cn, bool norm)
|
2014-01-16 15:57:57 +08:00
|
|
|
{
|
|
|
|
cl_image_format format;
|
2014-02-01 19:07:03 +08:00
|
|
|
static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
|
|
|
|
CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
|
2014-04-15 07:09:17 +08:00
|
|
|
static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
|
|
|
|
CL_SNORM_INT16, -1, -1, -1, -1 };
|
2014-02-01 19:07:03 +08:00
|
|
|
static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
|
|
|
|
|
2014-04-15 07:09:17 +08:00
|
|
|
int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
|
|
|
|
int channelOrder = channelOrders[cn];
|
2014-02-01 19:07:03 +08:00
|
|
|
format.image_channel_data_type = (cl_channel_type)channelType;
|
|
|
|
format.image_channel_order = (cl_channel_order)channelOrder;
|
2014-04-15 07:09:17 +08:00
|
|
|
return format;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool isFormatSupported(cl_image_format format)
|
|
|
|
{
|
2014-10-23 19:23:37 +08:00
|
|
|
if (!haveOpenCL())
|
|
|
|
CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
|
|
|
|
|
2014-04-15 07:09:17 +08:00
|
|
|
cl_context context = (cl_context)Context::getDefault().ptr();
|
|
|
|
// Figure out how many formats are supported by this context.
|
|
|
|
cl_uint numFormats = 0;
|
|
|
|
cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
|
|
|
|
CL_MEM_OBJECT_IMAGE2D, numFormats,
|
|
|
|
NULL, &numFormats);
|
|
|
|
AutoBuffer<cl_image_format> formats(numFormats);
|
|
|
|
err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
|
|
|
|
CL_MEM_OBJECT_IMAGE2D, numFormats,
|
|
|
|
formats, NULL);
|
|
|
|
CV_OclDbgAssert(err == CL_SUCCESS);
|
|
|
|
for (cl_uint i = 0; i < numFormats; ++i)
|
|
|
|
{
|
|
|
|
if (!memcmp(&formats[i], &format, sizeof(format)))
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
void init(const UMat &src, bool norm, bool alias)
|
|
|
|
{
|
2014-10-24 18:55:16 +08:00
|
|
|
if (!haveOpenCL())
|
|
|
|
CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
|
|
|
|
|
|
|
|
CV_Assert(!src.empty());
|
2014-04-15 07:09:17 +08:00
|
|
|
CV_Assert(ocl::Device::getDefault().imageSupport());
|
|
|
|
|
|
|
|
int err, depth = src.depth(), cn = src.channels();
|
|
|
|
CV_Assert(cn <= 4);
|
|
|
|
cl_image_format format = getImageFormat(depth, cn, norm);
|
|
|
|
|
|
|
|
if (!isFormatSupported(format))
|
|
|
|
CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
|
2014-02-01 19:07:03 +08:00
|
|
|
|
2014-10-24 18:55:16 +08:00
|
|
|
if (alias && !src.handle(ACCESS_RW))
|
|
|
|
CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
|
|
|
|
|
2014-02-01 00:23:01 +08:00
|
|
|
cl_context context = (cl_context)Context::getDefault().ptr();
|
2014-02-01 19:07:03 +08:00
|
|
|
cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
|
2014-01-16 15:57:57 +08:00
|
|
|
|
|
|
|
#ifdef CL_VERSION_1_2
|
2014-02-01 19:07:03 +08:00
|
|
|
// this enables backwards portability to
|
|
|
|
// run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
|
|
|
|
const Device & d = ocl::Device::getDefault();
|
|
|
|
int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
|
2014-04-15 07:09:17 +08:00
|
|
|
CV_Assert(!alias || canCreateAlias(src));
|
2014-02-01 19:07:03 +08:00
|
|
|
if (1 < major || (1 == major && 2 <= minor))
|
2014-01-16 15:57:57 +08:00
|
|
|
{
|
|
|
|
cl_image_desc desc;
|
|
|
|
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
|
|
|
|
desc.image_width = src.cols;
|
|
|
|
desc.image_height = src.rows;
|
|
|
|
desc.image_depth = 0;
|
|
|
|
desc.image_array_size = 1;
|
2014-04-15 07:09:17 +08:00
|
|
|
desc.image_row_pitch = alias ? src.step[0] : 0;
|
2014-01-16 15:57:57 +08:00
|
|
|
desc.image_slice_pitch = 0;
|
2014-04-15 07:09:17 +08:00
|
|
|
desc.buffer = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
|
2014-01-16 15:57:57 +08:00
|
|
|
desc.num_mip_levels = 0;
|
|
|
|
desc.num_samples = 0;
|
2014-02-01 19:07:03 +08:00
|
|
|
handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
|
2014-01-16 15:57:57 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
2014-04-10 21:19:02 +08:00
|
|
|
CV_SUPPRESS_DEPRECATED_START
|
2014-04-15 07:09:17 +08:00
|
|
|
CV_Assert(!alias); // This is an OpenCL 1.2 extension
|
2014-02-01 19:07:03 +08:00
|
|
|
handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
|
2014-04-10 21:19:02 +08:00
|
|
|
CV_SUPPRESS_DEPRECATED_END
|
2014-01-16 15:57:57 +08:00
|
|
|
}
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_OclDbgAssert(err == CL_SUCCESS);
|
|
|
|
|
2014-01-16 15:57:57 +08:00
|
|
|
size_t origin[] = { 0, 0, 0 };
|
2014-06-05 23:14:56 +08:00
|
|
|
size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
|
2014-01-16 15:57:57 +08:00
|
|
|
|
|
|
|
cl_mem devData;
|
2014-04-15 07:09:17 +08:00
|
|
|
if (!alias && !src.isContinuous())
|
2014-01-16 15:57:57 +08:00
|
|
|
{
|
2014-02-01 19:07:03 +08:00
|
|
|
devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
|
|
|
|
CV_OclDbgAssert(err == CL_SUCCESS);
|
|
|
|
|
2014-06-05 23:14:56 +08:00
|
|
|
const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
|
|
|
|
roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
|
|
|
|
CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
|
2014-01-16 15:57:57 +08:00
|
|
|
}
|
|
|
|
else
|
2014-04-15 07:09:17 +08:00
|
|
|
{
|
2014-01-16 15:57:57 +08:00
|
|
|
devData = (cl_mem)src.handle(ACCESS_READ);
|
2014-04-15 07:09:17 +08:00
|
|
|
}
|
2014-02-01 19:07:03 +08:00
|
|
|
CV_Assert(devData != NULL);
|
2014-01-16 15:57:57 +08:00
|
|
|
|
2014-04-15 07:09:17 +08:00
|
|
|
if (!alias)
|
2014-01-16 15:57:57 +08:00
|
|
|
{
|
2014-04-15 07:09:17 +08:00
|
|
|
CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
|
|
|
|
if (!src.isContinuous())
|
|
|
|
{
|
|
|
|
CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
|
|
|
|
CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
|
|
|
|
}
|
2014-01-16 15:57:57 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
IMPLEMENT_REFCOUNTABLE();
|
|
|
|
|
|
|
|
cl_mem handle;
|
|
|
|
};
|
|
|
|
|
|
|
|
Image2D::Image2D()
|
|
|
|
{
|
|
|
|
p = NULL;
|
|
|
|
}
|
2014-02-01 19:07:03 +08:00
|
|
|
|
2014-04-15 07:09:17 +08:00
|
|
|
Image2D::Image2D(const UMat &src, bool norm, bool alias)
|
2014-01-16 15:57:57 +08:00
|
|
|
{
|
2014-04-15 07:09:17 +08:00
|
|
|
p = new Impl(src, norm, alias);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Image2D::canCreateAlias(const UMat &m)
|
|
|
|
{
|
|
|
|
bool ret = false;
|
|
|
|
const Device & d = ocl::Device::getDefault();
|
2014-10-24 18:55:16 +08:00
|
|
|
if (d.imageFromBufferSupport() && !m.empty())
|
2014-04-15 07:09:17 +08:00
|
|
|
{
|
|
|
|
// This is the required pitch alignment in pixels
|
|
|
|
uint pitchAlign = d.imagePitchAlignment();
|
|
|
|
if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
|
|
|
|
{
|
|
|
|
// We don't currently handle the case where the buffer was created
|
|
|
|
// with CL_MEM_USE_HOST_PTR
|
|
|
|
if (!m.u->tempUMat())
|
|
|
|
{
|
|
|
|
ret = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Image2D::isFormatSupported(int depth, int cn, bool norm)
|
|
|
|
{
|
|
|
|
cl_image_format format = Impl::getImageFormat(depth, cn, norm);
|
|
|
|
|
|
|
|
return Impl::isFormatSupported(format);
|
2014-01-16 15:57:57 +08:00
|
|
|
}
|
2014-02-01 19:07:03 +08:00
|
|
|
|
|
|
|
Image2D::Image2D(const Image2D & i)
|
|
|
|
{
|
|
|
|
p = i.p;
|
|
|
|
if (p)
|
|
|
|
p->addref();
|
|
|
|
}
|
|
|
|
|
|
|
|
Image2D & Image2D::operator = (const Image2D & i)
|
|
|
|
{
|
|
|
|
if (i.p != p)
|
|
|
|
{
|
|
|
|
if (i.p)
|
|
|
|
i.p->addref();
|
|
|
|
if (p)
|
|
|
|
p->release();
|
|
|
|
p = i.p;
|
|
|
|
}
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
2014-01-16 15:57:57 +08:00
|
|
|
Image2D::~Image2D()
|
|
|
|
{
|
|
|
|
if (p)
|
|
|
|
p->release();
|
|
|
|
}
|
|
|
|
|
|
|
|
void* Image2D::ptr() const
|
|
|
|
{
|
|
|
|
return p ? p->handle : 0;
|
|
|
|
}
|
|
|
|
|
2016-12-04 07:19:38 +08:00
|
|
|
bool internal::isOpenCLForced()
|
|
|
|
{
|
|
|
|
static bool initialized = false;
|
|
|
|
static bool value = false;
|
|
|
|
if (!initialized)
|
|
|
|
{
|
2017-05-25 23:59:01 +08:00
|
|
|
value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
|
2016-12-04 07:19:38 +08:00
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
return value;
|
|
|
|
}
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
bool internal::isPerformanceCheckBypassed()
|
2014-06-19 19:18:52 +08:00
|
|
|
{
|
|
|
|
static bool initialized = false;
|
|
|
|
static bool value = false;
|
|
|
|
if (!initialized)
|
|
|
|
{
|
2017-05-25 23:59:01 +08:00
|
|
|
value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
|
2014-06-19 19:18:52 +08:00
|
|
|
initialized = true;
|
|
|
|
}
|
|
|
|
return value;
|
|
|
|
}
|
|
|
|
|
2015-01-02 08:33:40 +08:00
|
|
|
bool internal::isCLBuffer(UMat& u)
|
|
|
|
{
|
|
|
|
void* h = u.handle(ACCESS_RW);
|
|
|
|
if (!h)
|
|
|
|
return true;
|
|
|
|
CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
|
|
|
|
#if 1
|
|
|
|
if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
|
|
|
|
return false;
|
|
|
|
#else
|
|
|
|
cl_mem_object_type type = 0;
|
|
|
|
cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
|
|
|
|
if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
|
|
|
|
return false;
|
|
|
|
#endif
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
Merge pull request #9114 from pengli:dnn_rebase
add libdnn acceleration to dnn module (#9114)
* import libdnn code
Signed-off-by: Li Peng <peng.li@intel.com>
* add convolution layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add pooling layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add softmax layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add lrn layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add innerproduct layer ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* add HAVE_OPENCL macro
Signed-off-by: Li Peng <peng.li@intel.com>
* fix for convolution ocl
Signed-off-by: Li Peng <peng.li@intel.com>
* enable getUMat() for multi-dimension Mat
Signed-off-by: Li Peng <peng.li@intel.com>
* use getUMat for ocl acceleration
Signed-off-by: Li Peng <peng.li@intel.com>
* use CV_OCL_RUN macro
Signed-off-by: Li Peng <peng.li@intel.com>
* set OPENCL target when it is available
and disable fuseLayer for OCL target for the time being
Signed-off-by: Li Peng <peng.li@intel.com>
* fix innerproduct accuracy test
Signed-off-by: Li Peng <peng.li@intel.com>
* remove trailing space
Signed-off-by: Li Peng <peng.li@intel.com>
* Fixed tensorflow demo bug.
Root cause is that tensorflow has different algorithm with libdnn
to calculate convolution output dimension.
libdnn don't calculate output dimension anymore and just use one
passed in by config.
* split gemm ocl file
split it into gemm_buffer.cl and gemm_image.cl
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix compile failure
Signed-off-by: Li Peng <peng.li@intel.com>
* check env flag for auto tuning
Signed-off-by: Li Peng <peng.li@intel.com>
* switch to new ocl kernels for softmax layer
Signed-off-by: Li Peng <peng.li@intel.com>
* update softmax layer
on some platform subgroup extension may not work well,
fallback to non subgroup ocl acceleration.
Signed-off-by: Li Peng <peng.li@intel.com>
* fallback to cpu path for fc layer with multi output
Signed-off-by: Li Peng <peng.li@intel.com>
* update output message
Signed-off-by: Li Peng <peng.li@intel.com>
* update fully connected layer
fallback to gemm API if libdnn return false
Signed-off-by: Li Peng <peng.li@intel.com>
* Add ReLU OCL implementation
* disable layer fusion for now
Signed-off-by: Li Peng <peng.li@intel.com>
* Add OCL implementation for concat layer
Signed-off-by: Wu Zhiwen <zhiwen.wu@intel.com>
* libdnn: update license and copyrights
Also refine libdnn coding style
Signed-off-by: Wu Zhiwen <zhiwen.wu@intel.com>
Signed-off-by: Li Peng <peng.li@intel.com>
* DNN: Don't link OpenCL library explicitly
* DNN: Make default preferableTarget to DNN_TARGET_CPU
User should set it to DNN_TARGET_OPENCL explicitly if want to
use OpenCL acceleration.
Also don't fusion when using DNN_TARGET_OPENCL
* DNN: refine coding style
* Add getOpenCLErrorString
* DNN: Use int32_t/uint32_t instread of alias
* Use namespace ocl4dnn to include libdnn things
* remove extra copyTo in softmax ocl path
Signed-off-by: Li Peng <peng.li@intel.com>
* update ReLU layer ocl path
Signed-off-by: Li Peng <peng.li@intel.com>
* Add prefer target property for layer class
It is used to indicate the target for layer forwarding,
either the default CPU target or OCL target.
Signed-off-by: Li Peng <peng.li@intel.com>
* Add cl_event based timer for cv::ocl
* Rename libdnn to ocl4dnn
Signed-off-by: Li Peng <peng.li@intel.com>
Signed-off-by: wzw <zhiwen.wu@intel.com>
* use UMat for ocl4dnn internal buffer
Remove allocateMemory which use clCreateBuffer directly
Signed-off-by: Li Peng <peng.li@intel.com>
Signed-off-by: wzw <zhiwen.wu@intel.com>
* enable buffer gemm in ocl4dnn innerproduct
Signed-off-by: Li Peng <peng.li@intel.com>
* replace int_tp globally for ocl4dnn kernels.
Signed-off-by: wzw <zhiwen.wu@intel.com>
Signed-off-by: Li Peng <peng.li@intel.com>
* create UMat for layer params
Signed-off-by: Li Peng <peng.li@intel.com>
* update sign ocl kernel
Signed-off-by: Li Peng <peng.li@intel.com>
* update image based gemm of inner product layer
Signed-off-by: Li Peng <peng.li@intel.com>
* remove buffer gemm of inner product layer
call cv::gemm API instead
Signed-off-by: Li Peng <peng.li@intel.com>
* change ocl4dnn forward parameter to UMat
Signed-off-by: Li Peng <peng.li@intel.com>
* Refine auto-tuning mechanism.
- Use OPENCV_OCL4DNN_KERNEL_CONFIG_PATH to set cache directory
for fine-tuned kernel configuration.
e.g. export OPENCV_OCL4DNN_KERNEL_CONFIG_PATH=/home/tmp,
the cache directory will be /home/tmp/spatialkernels/ on Linux.
- Define environment OPENCV_OCL4DNN_ENABLE_AUTO_TUNING to enable
auto-tuning.
- OPENCV_OPENCL_ENABLE_PROFILING is only used to enable profiling
for OpenCL command queue. This fix basic kernel get wrong running
time, i.e. 0ms.
- If creating cache directory failed, disable auto-tuning.
* Detect and create cache dir on windows
Signed-off-by: Li Peng <peng.li@intel.com>
* Refine gemm like convolution kernel.
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix redundant swizzleWeights calling when use cached kernel config.
* Fix "out of resource" bug when auto-tuning too many kernels.
* replace cl_mem with UMat in ocl4dnnConvSpatial class
* OCL4DNN: reduce the tuning kernel candidate.
This patch could reduce 75% of the tuning candidates with less
than 2% performance impact for the final result.
Signed-off-by: Zhigang Gong <zhigang.gong@intel.com>
* replace cl_mem with umat in ocl4dnn convolution
Signed-off-by: Li Peng <peng.li@intel.com>
* remove weight_image_ of ocl4dnn inner product
Actually it is unused in the computation
Signed-off-by: Li Peng <peng.li@intel.com>
* Various fixes for ocl4dnn
1. OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel())
2. Ptr<OCL4DNNInnerProduct<float> > innerProductOp
3. Code comments cleanup
4. ignore check on OCL cpu device
Signed-off-by: Li Peng <peng.li@intel.com>
* add build option for log softmax
Signed-off-by: Li Peng <peng.li@intel.com>
* remove unused ocl kernels in ocl4dnn
Signed-off-by: Li Peng <peng.li@intel.com>
* replace ocl4dnnSet with opencv setTo
Signed-off-by: Li Peng <peng.li@intel.com>
* replace ALIGN with cv::alignSize
Signed-off-by: Li Peng <peng.li@intel.com>
* check kernel build options
Signed-off-by: Li Peng <peng.li@intel.com>
* Handle program compilation fail properly.
* Use std::numeric_limits<float>::infinity() for large float number
* check ocl4dnn kernel compilation result
Signed-off-by: Li Peng <peng.li@intel.com>
* remove unused ctx_id
Signed-off-by: Li Peng <peng.li@intel.com>
* change clEnqueueNDRangeKernel to kernel.run()
Signed-off-by: Li Peng <peng.li@intel.com>
* change cl_mem to UMat in image based gemm
Signed-off-by: Li Peng <peng.li@intel.com>
* check intel subgroup support for lrn and pooling layer
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix convolution bug if group is greater than 1
Signed-off-by: Li Peng <peng.li@intel.com>
* Set default layer preferableTarget to be DNN_TARGET_CPU
Signed-off-by: Li Peng <peng.li@intel.com>
* Add ocl perf test for convolution
Signed-off-by: Li Peng <peng.li@intel.com>
* Add more ocl accuracy test
Signed-off-by: Li Peng <peng.li@intel.com>
* replace cl_image with ocl::Image2D
Signed-off-by: Li Peng <peng.li@intel.com>
* Fix build failure in elementwise layer
Signed-off-by: Li Peng <peng.li@intel.com>
* use getUMat() to get blob data
Signed-off-by: Li Peng <peng.li@intel.com>
* replace cl_mem handle with ocl::KernelArg
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(build): don't use C++11, OPENCL_LIBRARIES fix
* dnn(ocl4dnn): remove unused OpenCL kernels
* dnn(ocl4dnn): extract OpenCL code into .cl files
* dnn(ocl4dnn): refine auto-tuning
Defaultly disable auto-tuning, set OPENCV_OCL4DNN_ENABLE_AUTO_TUNING
environment variable to enable it.
Use a set of pre-tuned configs as default config if auto-tuning is disabled.
These configs are tuned for Intel GPU with 48/72 EUs, and for googlenet,
AlexNet, ResNet-50
If default config is not suitable, use the first available kernel config
from the candidates. Candidate priority from high to low is gemm like kernel,
IDLF kernel, basick kernel.
* dnn(ocl4dnn): pooling doesn't use OpenCL subgroups
* dnn(ocl4dnn): fix perf test
OpenCV has default 3sec time limit for each performance test.
Warmup OpenCL backend outside of perf measurement loop.
* use ocl::KernelArg as much as possible
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): fix bias bug for gemm like kernel
* dnn(ocl4dnn): wrap cl_mem into UMat
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): Refine signature of kernel config
- Use more readable string as signture of kernel config
- Don't count device name and vendor in signature string
- Default kernel configurations are tuned for Intel GPU with
24/48/72 EUs, and for googlenet, AlexNet, ResNet-50 net model.
* dnn(ocl4dnn): swap width/height in configuration
* dnn(ocl4dnn): enable configs for Intel OpenCL runtime only
* core: make configuration helper functions accessible from non-core modules
* dnn(ocl4dnn): update kernel auto-tuning behavior
Avoid unwanted creation of directories
* dnn(ocl4dnn): simplify kernel to workaround OpenCL compiler crash
* dnn(ocl4dnn): remove redundant code
* dnn(ocl4dnn): Add more clear message for simd size dismatch.
* dnn(ocl4dnn): add const to const argument
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): force compiler use a specific SIMD size for IDLF kernel
* dnn(ocl4dnn): drop unused tuneLocalSize()
* dnn(ocl4dnn): specify OpenCL queue for Timer and convolve() method
* dnn(ocl4dnn): sanitize file names used for cache
* dnn(perf): enable Network tests with OpenCL
* dnn(ocl4dnn/conv): drop computeGlobalSize()
* dnn(ocl4dnn/conv): drop unused fields
* dnn(ocl4dnn/conv): simplify ctor
* dnn(ocl4dnn/conv): refactor kernelConfig localSize=NULL
* dnn(ocl4dnn/conv): drop unsupported double / untested half types
* dnn(ocl4dnn/conv): drop unused variable
* dnn(ocl4dnn/conv): alignSize/divUp
* dnn(ocl4dnn/conv): use enum values
* dnn(ocl4dnn): drop unused innerproduct variable
Signed-off-by: Li Peng <peng.li@intel.com>
* dnn(ocl4dnn): add an generic function to check cl option support
* dnn(ocl4dnn): run softmax subgroup version kernel first
Signed-off-by: Li Peng <peng.li@intel.com>
2017-10-02 20:38:00 +08:00
|
|
|
struct Timer::Impl
|
|
|
|
{
|
|
|
|
const Queue queue;
|
|
|
|
|
|
|
|
Impl(const Queue& q)
|
|
|
|
: queue(q)
|
|
|
|
, initted_(false)
|
|
|
|
, running_(false)
|
|
|
|
, has_run_at_least_once_(false)
|
|
|
|
{
|
|
|
|
init();
|
|
|
|
}
|
|
|
|
|
|
|
|
~Impl()
|
|
|
|
{
|
|
|
|
clWaitForEvents(1, &start_gpu_cl_);
|
|
|
|
clWaitForEvents(1, &stop_gpu_cl_);
|
|
|
|
clReleaseEvent(start_gpu_cl_);
|
|
|
|
clReleaseEvent(stop_gpu_cl_);
|
|
|
|
}
|
|
|
|
|
|
|
|
void start()
|
|
|
|
{
|
|
|
|
#ifdef HAVE_OPENCL
|
|
|
|
if (!running())
|
|
|
|
{
|
|
|
|
clWaitForEvents(1, &start_gpu_cl_);
|
|
|
|
clReleaseEvent(start_gpu_cl_);
|
|
|
|
ocl::Kernel kernel("null_kernel_float", ocl::core::benchmark_oclsrc);
|
|
|
|
float arg = 0;
|
|
|
|
clSetKernelArg((cl_kernel)kernel.ptr(), 0, sizeof(arg), &arg);
|
|
|
|
clEnqueueTask((cl_command_queue)queue.ptr(), (cl_kernel)kernel.ptr(), 0,
|
|
|
|
NULL, &start_gpu_cl_);
|
|
|
|
clFinish((cl_command_queue)queue.ptr());
|
|
|
|
running_ = true;
|
|
|
|
has_run_at_least_once_ = true;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
void stop()
|
|
|
|
{
|
|
|
|
#ifdef HAVE_OPENCL
|
|
|
|
if (running())
|
|
|
|
{
|
|
|
|
clWaitForEvents(1, &stop_gpu_cl_);
|
|
|
|
clReleaseEvent(stop_gpu_cl_);
|
|
|
|
ocl::Kernel kernel("null_kernel_float", ocl::core::benchmark_oclsrc);
|
|
|
|
float arg = 0;
|
|
|
|
clSetKernelArg((cl_kernel)kernel.ptr(), 0, sizeof(arg), &arg);
|
|
|
|
clEnqueueTask((cl_command_queue)queue.ptr(), (cl_kernel)kernel.ptr(), 0,
|
|
|
|
NULL, &stop_gpu_cl_);
|
|
|
|
clFinish((cl_command_queue)queue.ptr());
|
|
|
|
running_ = false;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
float microSeconds()
|
|
|
|
{
|
|
|
|
#ifdef HAVE_OPENCL
|
|
|
|
if (!has_run_at_least_once())
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (running())
|
|
|
|
{
|
|
|
|
stop();
|
|
|
|
}
|
|
|
|
cl_ulong startTime, stopTime;
|
|
|
|
clWaitForEvents(1, &stop_gpu_cl_);
|
|
|
|
clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END,
|
|
|
|
sizeof startTime, &startTime, NULL);
|
|
|
|
clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START,
|
|
|
|
sizeof stopTime, &stopTime, NULL);
|
|
|
|
double us = static_cast<double>(stopTime - startTime) / 1000.0;
|
|
|
|
elapsed_microseconds_ = static_cast<float>(us);
|
|
|
|
return elapsed_microseconds_;
|
|
|
|
#else
|
|
|
|
return 0;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
float milliSeconds()
|
|
|
|
{
|
|
|
|
#ifdef HAVE_OPENCL
|
|
|
|
if (!has_run_at_least_once())
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (running())
|
|
|
|
{
|
|
|
|
stop();
|
|
|
|
}
|
|
|
|
cl_ulong startTime = 0, stopTime = 0;
|
|
|
|
clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END,
|
|
|
|
sizeof startTime, &startTime, NULL);
|
|
|
|
clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START,
|
|
|
|
sizeof stopTime, &stopTime, NULL);
|
|
|
|
double ms = static_cast<double>(stopTime - startTime) / 1000000.0;
|
|
|
|
elapsed_milliseconds_ = static_cast<float>(ms);
|
|
|
|
return elapsed_milliseconds_;
|
|
|
|
#else
|
|
|
|
return 0;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
float seconds()
|
|
|
|
{
|
|
|
|
return milliSeconds() / 1000.f;
|
|
|
|
}
|
|
|
|
|
|
|
|
void init()
|
|
|
|
{
|
|
|
|
CV_Assert(queue.getImpl() && queue.getImpl()->isProfilingQueue_);
|
|
|
|
if (!initted())
|
|
|
|
{
|
|
|
|
start_gpu_cl_ = 0;
|
|
|
|
stop_gpu_cl_ = 0;
|
|
|
|
initted_ = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
inline bool initted() { return initted_; }
|
|
|
|
inline bool running() { return running_; }
|
|
|
|
inline bool has_run_at_least_once() { return has_run_at_least_once_; }
|
|
|
|
|
|
|
|
bool initted_;
|
|
|
|
bool running_;
|
|
|
|
bool has_run_at_least_once_;
|
|
|
|
float elapsed_milliseconds_;
|
|
|
|
float elapsed_microseconds_;
|
|
|
|
cl_event start_gpu_cl_;
|
|
|
|
cl_event stop_gpu_cl_;
|
|
|
|
};
|
|
|
|
|
|
|
|
Timer::Timer(const Queue& q)
|
|
|
|
{
|
|
|
|
p = new Impl(q);
|
|
|
|
}
|
|
|
|
|
|
|
|
Timer::~Timer()
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
{
|
|
|
|
delete p;
|
|
|
|
p = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void Timer::start()
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->start();
|
|
|
|
}
|
|
|
|
|
|
|
|
void Timer::stop()
|
|
|
|
{
|
|
|
|
if(p)
|
|
|
|
p->stop();
|
|
|
|
}
|
|
|
|
|
|
|
|
float Timer::microSeconds()
|
|
|
|
{ return p ? p->microSeconds() : 0; }
|
|
|
|
|
|
|
|
float Timer::milliSeconds()
|
|
|
|
{ return p ? p->milliSeconds() : 0; }
|
|
|
|
|
|
|
|
float Timer::seconds()
|
|
|
|
{ return p ? p->seconds() : 0; }
|
|
|
|
|
2013-10-22 18:04:49 +08:00
|
|
|
}}
|