opencv/modules/core/src/ocl.cpp
2014-04-23 10:20:09 -07:00

4680 lines
146 KiB
C++

/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the OpenCV Foundation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include <list>
#include <map>
#include <string>
#include <sstream>
#include <iostream> // std::cerr
#define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
#include "opencv2/core/bufferpool.hpp"
#ifndef LOG_BUFFER_POOL
# if 0
# define LOG_BUFFER_POOL printf
# else
# define LOG_BUFFER_POOL(...)
# endif
#endif
// TODO Move to some common place
static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
{
const char* envValue = getenv(name);
if (envValue == NULL)
{
return defaultValue;
}
cv::String value = envValue;
size_t pos = 0;
for (; pos < value.size(); pos++)
{
if (!isdigit(value[pos]))
break;
}
cv::String valueStr = value.substr(0, pos);
cv::String suffixStr = value.substr(pos, value.length() - pos);
int v = atoi(valueStr.c_str());
if (suffixStr.length() == 0)
return v;
else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
return v * 1024 * 1024;
else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
return v * 1024;
CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
}
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
#include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
#ifdef HAVE_OPENCL
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
#else
// TODO FIXIT: This file can't be build without OPENCL
/*
Part of the file is an extract from the standard OpenCL headers from Khronos site.
Below is the original copyright.
*/
/*******************************************************************************
* Copyright (c) 2008 - 2012 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
#if 0 //defined __APPLE__
#define HAVE_OPENCL 1
#else
#undef HAVE_OPENCL
#endif
#define OPENCV_CL_NOT_IMPLEMENTED -1000
#ifdef HAVE_OPENCL
#if defined __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
static const bool g_haveOpenCL = true;
#else
extern "C" {
struct _cl_platform_id { int dummy; };
struct _cl_device_id { int dummy; };
struct _cl_context { int dummy; };
struct _cl_command_queue { int dummy; };
struct _cl_mem { int dummy; };
struct _cl_program { int dummy; };
struct _cl_kernel { int dummy; };
struct _cl_event { int dummy; };
struct _cl_sampler { int dummy; };
typedef struct _cl_platform_id * cl_platform_id;
typedef struct _cl_device_id * cl_device_id;
typedef struct _cl_context * cl_context;
typedef struct _cl_command_queue * cl_command_queue;
typedef struct _cl_mem * cl_mem;
typedef struct _cl_program * cl_program;
typedef struct _cl_kernel * cl_kernel;
typedef struct _cl_event * cl_event;
typedef struct _cl_sampler * cl_sampler;
typedef int cl_int;
typedef unsigned cl_uint;
#if defined (_WIN32) && defined(_MSC_VER)
typedef __int64 cl_long;
typedef unsigned __int64 cl_ulong;
#else
typedef long cl_long;
typedef unsigned long cl_ulong;
#endif
typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
typedef cl_ulong cl_bitfield;
typedef cl_bitfield cl_device_type;
typedef cl_uint cl_platform_info;
typedef cl_uint cl_device_info;
typedef cl_bitfield cl_device_fp_config;
typedef cl_uint cl_device_mem_cache_type;
typedef cl_uint cl_device_local_mem_type;
typedef cl_bitfield cl_device_exec_capabilities;
typedef cl_bitfield cl_command_queue_properties;
typedef intptr_t cl_device_partition_property;
typedef cl_bitfield cl_device_affinity_domain;
typedef intptr_t cl_context_properties;
typedef cl_uint cl_context_info;
typedef cl_uint cl_command_queue_info;
typedef cl_uint cl_channel_order;
typedef cl_uint cl_channel_type;
typedef cl_bitfield cl_mem_flags;
typedef cl_uint cl_mem_object_type;
typedef cl_uint cl_mem_info;
typedef cl_bitfield cl_mem_migration_flags;
typedef cl_uint cl_image_info;
typedef cl_uint cl_buffer_create_type;
typedef cl_uint cl_addressing_mode;
typedef cl_uint cl_filter_mode;
typedef cl_uint cl_sampler_info;
typedef cl_bitfield cl_map_flags;
typedef cl_uint cl_program_info;
typedef cl_uint cl_program_build_info;
typedef cl_uint cl_program_binary_type;
typedef cl_int cl_build_status;
typedef cl_uint cl_kernel_info;
typedef cl_uint cl_kernel_arg_info;
typedef cl_uint cl_kernel_arg_address_qualifier;
typedef cl_uint cl_kernel_arg_access_qualifier;
typedef cl_bitfield cl_kernel_arg_type_qualifier;
typedef cl_uint cl_kernel_work_group_info;
typedef cl_uint cl_event_info;
typedef cl_uint cl_command_type;
typedef cl_uint cl_profiling_info;
typedef struct _cl_image_format {
cl_channel_order image_channel_order;
cl_channel_type image_channel_data_type;
} cl_image_format;
typedef struct _cl_image_desc {
cl_mem_object_type image_type;
size_t image_width;
size_t image_height;
size_t image_depth;
size_t image_array_size;
size_t image_row_pitch;
size_t image_slice_pitch;
cl_uint num_mip_levels;
cl_uint num_samples;
cl_mem buffer;
} cl_image_desc;
typedef struct _cl_buffer_region {
size_t origin;
size_t size;
} cl_buffer_region;
//////////////////////////////////////////////////////////
#define CL_SUCCESS 0
#define CL_DEVICE_NOT_FOUND -1
#define CL_DEVICE_NOT_AVAILABLE -2
#define CL_COMPILER_NOT_AVAILABLE -3
#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
#define CL_OUT_OF_RESOURCES -5
#define CL_OUT_OF_HOST_MEMORY -6
#define CL_PROFILING_INFO_NOT_AVAILABLE -7
#define CL_MEM_COPY_OVERLAP -8
#define CL_IMAGE_FORMAT_MISMATCH -9
#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
#define CL_BUILD_PROGRAM_FAILURE -11
#define CL_MAP_FAILURE -12
#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
#define CL_COMPILE_PROGRAM_FAILURE -15
#define CL_LINKER_NOT_AVAILABLE -16
#define CL_LINK_PROGRAM_FAILURE -17
#define CL_DEVICE_PARTITION_FAILED -18
#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19
#define CL_INVALID_VALUE -30
#define CL_INVALID_DEVICE_TYPE -31
#define CL_INVALID_PLATFORM -32
#define CL_INVALID_DEVICE -33
#define CL_INVALID_CONTEXT -34
#define CL_INVALID_QUEUE_PROPERTIES -35
#define CL_INVALID_COMMAND_QUEUE -36
#define CL_INVALID_HOST_PTR -37
#define CL_INVALID_MEM_OBJECT -38
#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
#define CL_INVALID_IMAGE_SIZE -40
#define CL_INVALID_SAMPLER -41
#define CL_INVALID_BINARY -42
#define CL_INVALID_BUILD_OPTIONS -43
#define CL_INVALID_PROGRAM -44
#define CL_INVALID_PROGRAM_EXECUTABLE -45
#define CL_INVALID_KERNEL_NAME -46
#define CL_INVALID_KERNEL_DEFINITION -47
#define CL_INVALID_KERNEL -48
#define CL_INVALID_ARG_INDEX -49
#define CL_INVALID_ARG_VALUE -50
#define CL_INVALID_ARG_SIZE -51
#define CL_INVALID_KERNEL_ARGS -52
#define CL_INVALID_WORK_DIMENSION -53
#define CL_INVALID_WORK_GROUP_SIZE -54
#define CL_INVALID_WORK_ITEM_SIZE -55
#define CL_INVALID_GLOBAL_OFFSET -56
#define CL_INVALID_EVENT_WAIT_LIST -57
#define CL_INVALID_EVENT -58
#define CL_INVALID_OPERATION -59
#define CL_INVALID_GL_OBJECT -60
#define CL_INVALID_BUFFER_SIZE -61
#define CL_INVALID_MIP_LEVEL -62
#define CL_INVALID_GLOBAL_WORK_SIZE -63
#define CL_INVALID_PROPERTY -64
#define CL_INVALID_IMAGE_DESCRIPTOR -65
#define CL_INVALID_COMPILER_OPTIONS -66
#define CL_INVALID_LINKER_OPTIONS -67
#define CL_INVALID_DEVICE_PARTITION_COUNT -68
/*#define CL_VERSION_1_0 1
#define CL_VERSION_1_1 1
#define CL_VERSION_1_2 1*/
#define CL_FALSE 0
#define CL_TRUE 1
#define CL_BLOCKING CL_TRUE
#define CL_NON_BLOCKING CL_FALSE
#define CL_PLATFORM_PROFILE 0x0900
#define CL_PLATFORM_VERSION 0x0901
#define CL_PLATFORM_NAME 0x0902
#define CL_PLATFORM_VENDOR 0x0903
#define CL_PLATFORM_EXTENSIONS 0x0904
#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CL_DEVICE_TYPE_CPU (1 << 1)
#define CL_DEVICE_TYPE_GPU (1 << 2)
#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
#define CL_DEVICE_TYPE 0x1000
#define CL_DEVICE_VENDOR_ID 0x1001
#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
#define CL_DEVICE_ADDRESS_BITS 0x100D
#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
#define CL_DEVICE_IMAGE_SUPPORT 0x1016
#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
#define CL_DEVICE_MAX_SAMPLERS 0x1018
#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
#define CL_DEVICE_ENDIAN_LITTLE 0x1026
#define CL_DEVICE_AVAILABLE 0x1027
#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
#define CL_DEVICE_NAME 0x102B
#define CL_DEVICE_VENDOR 0x102C
#define CL_DRIVER_VERSION 0x102D
#define CL_DEVICE_PROFILE 0x102E
#define CL_DEVICE_VERSION 0x102F
#define CL_DEVICE_EXTENSIONS 0x1030
#define CL_DEVICE_PLATFORM 0x1031
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
#define CL_DEVICE_LINKER_AVAILABLE 0x103E
#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
#define CL_DEVICE_PARENT_DEVICE 0x1042
#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
#define CL_DEVICE_PARTITION_TYPE 0x1046
#define CL_DEVICE_REFERENCE_COUNT 0x1047
#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
#define CL_FP_DENORM (1 << 0)
#define CL_FP_INF_NAN (1 << 1)
#define CL_FP_ROUND_TO_NEAREST (1 << 2)
#define CL_FP_ROUND_TO_ZERO (1 << 3)
#define CL_FP_ROUND_TO_INF (1 << 4)
#define CL_FP_FMA (1 << 5)
#define CL_FP_SOFT_FLOAT (1 << 6)
#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7)
#define CL_NONE 0x0
#define CL_READ_ONLY_CACHE 0x1
#define CL_READ_WRITE_CACHE 0x2
#define CL_LOCAL 0x1
#define CL_GLOBAL 0x2
#define CL_EXEC_KERNEL (1 << 0)
#define CL_EXEC_NATIVE_KERNEL (1 << 1)
#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
#define CL_CONTEXT_REFERENCE_COUNT 0x1080
#define CL_CONTEXT_DEVICES 0x1081
#define CL_CONTEXT_PROPERTIES 0x1082
#define CL_CONTEXT_NUM_DEVICES 0x1083
#define CL_CONTEXT_PLATFORM 0x1084
#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085
#define CL_DEVICE_PARTITION_EQUALLY 0x1086
#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087
#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
#define CL_QUEUE_CONTEXT 0x1090
#define CL_QUEUE_DEVICE 0x1091
#define CL_QUEUE_REFERENCE_COUNT 0x1092
#define CL_QUEUE_PROPERTIES 0x1093
#define CL_MEM_READ_WRITE (1 << 0)
#define CL_MEM_WRITE_ONLY (1 << 1)
#define CL_MEM_READ_ONLY (1 << 2)
#define CL_MEM_USE_HOST_PTR (1 << 3)
#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
#define CL_MEM_COPY_HOST_PTR (1 << 5)
// reserved (1 << 6)
#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
#define CL_MEM_HOST_READ_ONLY (1 << 8)
#define CL_MEM_HOST_NO_ACCESS (1 << 9)
#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1)
#define CL_R 0x10B0
#define CL_A 0x10B1
#define CL_RG 0x10B2
#define CL_RA 0x10B3
#define CL_RGB 0x10B4
#define CL_RGBA 0x10B5
#define CL_BGRA 0x10B6
#define CL_ARGB 0x10B7
#define CL_INTENSITY 0x10B8
#define CL_LUMINANCE 0x10B9
#define CL_Rx 0x10BA
#define CL_RGx 0x10BB
#define CL_RGBx 0x10BC
#define CL_DEPTH 0x10BD
#define CL_DEPTH_STENCIL 0x10BE
#define CL_SNORM_INT8 0x10D0
#define CL_SNORM_INT16 0x10D1
#define CL_UNORM_INT8 0x10D2
#define CL_UNORM_INT16 0x10D3
#define CL_UNORM_SHORT_565 0x10D4
#define CL_UNORM_SHORT_555 0x10D5
#define CL_UNORM_INT_101010 0x10D6
#define CL_SIGNED_INT8 0x10D7
#define CL_SIGNED_INT16 0x10D8
#define CL_SIGNED_INT32 0x10D9
#define CL_UNSIGNED_INT8 0x10DA
#define CL_UNSIGNED_INT16 0x10DB
#define CL_UNSIGNED_INT32 0x10DC
#define CL_HALF_FLOAT 0x10DD
#define CL_FLOAT 0x10DE
#define CL_UNORM_INT24 0x10DF
#define CL_MEM_OBJECT_BUFFER 0x10F0
#define CL_MEM_OBJECT_IMAGE2D 0x10F1
#define CL_MEM_OBJECT_IMAGE3D 0x10F2
#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
#define CL_MEM_TYPE 0x1100
#define CL_MEM_FLAGS 0x1101
#define CL_MEM_SIZE 0x1102
#define CL_MEM_HOST_PTR 0x1103
#define CL_MEM_MAP_COUNT 0x1104
#define CL_MEM_REFERENCE_COUNT 0x1105
#define CL_MEM_CONTEXT 0x1106
#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
#define CL_MEM_OFFSET 0x1108
#define CL_IMAGE_FORMAT 0x1110
#define CL_IMAGE_ELEMENT_SIZE 0x1111
#define CL_IMAGE_ROW_PITCH 0x1112
#define CL_IMAGE_SLICE_PITCH 0x1113
#define CL_IMAGE_WIDTH 0x1114
#define CL_IMAGE_HEIGHT 0x1115
#define CL_IMAGE_DEPTH 0x1116
#define CL_IMAGE_ARRAY_SIZE 0x1117
#define CL_IMAGE_BUFFER 0x1118
#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
#define CL_IMAGE_NUM_SAMPLES 0x111A
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141
#define CL_SAMPLER_REFERENCE_COUNT 0x1150
#define CL_SAMPLER_CONTEXT 0x1151
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154
#define CL_MAP_READ (1 << 0)
#define CL_MAP_WRITE (1 << 1)
#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
#define CL_PROGRAM_REFERENCE_COUNT 0x1160
#define CL_PROGRAM_CONTEXT 0x1161
#define CL_PROGRAM_NUM_DEVICES 0x1162
#define CL_PROGRAM_DEVICES 0x1163
#define CL_PROGRAM_SOURCE 0x1164
#define CL_PROGRAM_BINARY_SIZES 0x1165
#define CL_PROGRAM_BINARIES 0x1166
#define CL_PROGRAM_NUM_KERNELS 0x1167
#define CL_PROGRAM_KERNEL_NAMES 0x1168
#define CL_PROGRAM_BUILD_STATUS 0x1181
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
#define CL_PROGRAM_BINARY_TYPE 0x1184
#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4
#define CL_BUILD_SUCCESS 0
#define CL_BUILD_NONE -1
#define CL_BUILD_ERROR -2
#define CL_BUILD_IN_PROGRESS -3
#define CL_KERNEL_FUNCTION_NAME 0x1190
#define CL_KERNEL_NUM_ARGS 0x1191
#define CL_KERNEL_REFERENCE_COUNT 0x1192
#define CL_KERNEL_CONTEXT 0x1193
#define CL_KERNEL_PROGRAM 0x1194
#define CL_KERNEL_ATTRIBUTES 0x1195
#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
#define CL_KERNEL_ARG_TYPE_NAME 0x1198
#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
#define CL_KERNEL_ARG_NAME 0x119A
#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E
#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3
#define CL_KERNEL_ARG_TYPE_NONE 0
#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
#define CL_EVENT_COMMAND_QUEUE 0x11D0
#define CL_EVENT_COMMAND_TYPE 0x11D1
#define CL_EVENT_REFERENCE_COUNT 0x11D2
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
#define CL_EVENT_CONTEXT 0x11D4
#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
#define CL_COMMAND_TASK 0x11F1
#define CL_COMMAND_NATIVE_KERNEL 0x11F2
#define CL_COMMAND_READ_BUFFER 0x11F3
#define CL_COMMAND_WRITE_BUFFER 0x11F4
#define CL_COMMAND_COPY_BUFFER 0x11F5
#define CL_COMMAND_READ_IMAGE 0x11F6
#define CL_COMMAND_WRITE_IMAGE 0x11F7
#define CL_COMMAND_COPY_IMAGE 0x11F8
#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
#define CL_COMMAND_MAP_BUFFER 0x11FB
#define CL_COMMAND_MAP_IMAGE 0x11FC
#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
#define CL_COMMAND_MARKER 0x11FE
#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
#define CL_COMMAND_READ_BUFFER_RECT 0x1201
#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
#define CL_COMMAND_USER 0x1204
#define CL_COMMAND_BARRIER 0x1205
#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
#define CL_COMMAND_FILL_BUFFER 0x1207
#define CL_COMMAND_FILL_IMAGE 0x1208
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
#define CL_PROFILING_COMMAND_QUEUED 0x1280
#define CL_PROFILING_COMMAND_SUBMIT 0x1281
#define CL_PROFILING_COMMAND_START 0x1282
#define CL_PROFILING_COMMAND_END 0x1283
#define CL_CALLBACK CV_STDCALL
static volatile bool g_haveOpenCL = false;
static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
#if defined(__APPLE__)
#include <dlfcn.h>
static void* initOpenCLAndLoad(const char* funcname)
{
static bool initialized = false;
static void* handle = 0;
if (!handle)
{
if(!initialized)
{
const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
"/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
handle = dlopen(oclpath, RTLD_LAZY);
initialized = true;
g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
if( g_haveOpenCL )
fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
else
fprintf(stderr, "Failed to load OpenCL runtime\n");
}
if(!handle)
return 0;
}
return funcname && handle ? dlsym(handle, funcname) : 0;
}
#elif defined WIN32 || defined _WIN32
#ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
#define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
#endif
#include <windows.h>
#if (_WIN32_WINNT >= 0x0602)
#include <synchapi.h>
#endif
#undef small
#undef min
#undef max
#undef abs
static void* initOpenCLAndLoad(const char* funcname)
{
static bool initialized = false;
static HMODULE handle = 0;
if (!handle)
{
if(!initialized)
{
handle = LoadLibraryA("OpenCL.dll");
initialized = true;
g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
}
if(!handle)
return 0;
}
return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
}
#elif defined(__linux)
#include <dlfcn.h>
#include <stdio.h>
static void* initOpenCLAndLoad(const char* funcname)
{
static bool initialized = false;
static void* handle = 0;
if (!handle)
{
if(!initialized)
{
handle = dlopen("libOpenCL.so", RTLD_LAZY);
if(!handle)
handle = dlopen("libCL.so", RTLD_LAZY);
initialized = true;
g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
}
if(!handle)
return 0;
}
return funcname ? (void*)dlsym(handle, funcname) : 0;
}
#else
static void* initOpenCLAndLoad(const char*)
{
return 0;
}
#endif
#define OCL_FUNC(rettype, funcname, argsdecl, args) \
typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
static rettype funcname argsdecl \
{ \
static funcname##_t funcname##_p = 0; \
if( !funcname##_p ) \
{ \
funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
if( !funcname##_p ) \
return OPENCV_CL_NOT_IMPLEMENTED; \
} \
return funcname##_p args; \
}
#define OCL_FUNC_P(rettype, funcname, argsdecl, args) \
typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
static rettype funcname argsdecl \
{ \
static funcname##_t funcname##_p = 0; \
if( !funcname##_p ) \
{ \
funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
if( !funcname##_p ) \
{ \
if( errcode_ret ) \
*errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \
return 0; \
} \
} \
return funcname##_p args; \
}
OCL_FUNC(cl_int, clGetPlatformIDs,
(cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms),
(num_entries, platforms, num_platforms))
OCL_FUNC(cl_int, clGetPlatformInfo,
(cl_platform_id platform, cl_platform_info param_name,
size_t param_value_size, void * param_value,
size_t * param_value_size_ret),
(platform, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC(cl_int, clGetDeviceInfo,
(cl_device_id device,
cl_device_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(device, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC(cl_int, clGetDeviceIDs,
(cl_platform_id platform,
cl_device_type device_type,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices),
(platform, device_type, num_entries, devices, num_devices))
OCL_FUNC_P(cl_context, clCreateContext,
(const cl_context_properties * properties,
cl_uint num_devices,
const cl_device_id * devices,
void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
void * user_data,
cl_int * errcode_ret),
(properties, num_devices, devices, pfn_notify, user_data, errcode_ret))
OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context))
/*
OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context))
OCL_FUNC_P(cl_context, clCreateContextFromType,
(const cl_context_properties * properties,
cl_device_type device_type,
void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
void * user_data,
cl_int * errcode_ret),
(properties, device_type, pfn_notify, user_data, errcode_ret))
OCL_FUNC(cl_int, clGetContextInfo,
(cl_context context,
cl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(context, param_name, param_value_size,
param_value, param_value_size_ret))
*/
OCL_FUNC_P(cl_command_queue, clCreateCommandQueue,
(cl_context context,
cl_device_id device,
cl_command_queue_properties properties,
cl_int * errcode_ret),
(context, device, properties, errcode_ret))
OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue))
OCL_FUNC_P(cl_mem, clCreateBuffer,
(cl_context context,
cl_mem_flags flags,
size_t size,
void * host_ptr,
cl_int * errcode_ret),
(context, flags, size, host_ptr, errcode_ret))
/*
OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
OCL_FUNC(cl_int, clGetCommandQueueInfo,
(cl_command_queue command_queue,
cl_command_queue_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(command_queue, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC_P(cl_mem, clCreateSubBuffer,
(cl_mem buffer,
cl_mem_flags flags,
cl_buffer_create_type buffer_create_type,
const void * buffer_create_info,
cl_int * errcode_ret),
(buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
*/
OCL_FUNC_P(cl_mem, clCreateImage,
(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
const cl_image_desc * image_desc,
void * host_ptr,
cl_int * errcode_ret),
(context, flags, image_format, image_desc, host_ptr, errcode_ret))
OCL_FUNC_P(cl_mem, clCreateImage2D,
(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
size_t image_width,
size_t image_height,
size_t image_row_pitch,
void * host_ptr,
cl_int *errcode_ret),
(context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
OCL_FUNC(cl_int, clGetSupportedImageFormats,
(cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
cl_image_format * image_formats,
cl_uint * num_image_formats),
(context, flags, image_type, num_entries, image_formats, num_image_formats))
/*
OCL_FUNC(cl_int, clGetMemObjectInfo,
(cl_mem memobj,
cl_mem_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(memobj, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC(cl_int, clGetImageInfo,
(cl_mem image,
cl_image_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(image, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC(cl_int, clCreateKernelsInProgram,
(cl_program program,
cl_uint num_kernels,
cl_kernel * kernels,
cl_uint * num_kernels_ret),
(program, num_kernels, kernels, num_kernels_ret))
OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel))
OCL_FUNC(cl_int, clGetKernelArgInfo,
(cl_kernel kernel,
cl_uint arg_indx,
cl_kernel_arg_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC(cl_int, clEnqueueReadImage,
(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_read,
const size_t * origin[3],
const size_t * region[3],
size_t row_pitch,
size_t slice_pitch,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, image, blocking_read, origin, region,
row_pitch, slice_pitch,
ptr,
num_events_in_wait_list,
event_wait_list,
event))
OCL_FUNC(cl_int, clEnqueueWriteImage,
(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_write,
const size_t * origin[3],
const size_t * region[3],
size_t input_row_pitch,
size_t input_slice_pitch,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, image, blocking_write, origin, region, input_row_pitch,
input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clEnqueueFillImage,
(cl_command_queue command_queue,
cl_mem image,
const void * fill_color,
const size_t * origin[3],
const size_t * region[3],
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, image, fill_color, origin, region,
num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clEnqueueCopyImage,
(cl_command_queue command_queue,
cl_mem src_image,
cl_mem dst_image,
const size_t * src_origin[3],
const size_t * dst_origin[3],
const size_t * region[3],
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, src_image, dst_image, src_origin, dst_origin,
region, num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
(cl_command_queue command_queue,
cl_mem src_image,
cl_mem dst_buffer,
const size_t * src_origin[3],
const size_t * region[3],
size_t dst_offset,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
num_events_in_wait_list, event_wait_list, event))
*/
OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_image,
size_t src_offset,
const size_t dst_origin[3],
const size_t region[3],
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, src_buffer, dst_image, src_offset, dst_origin,
region, num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clFlush,
(cl_command_queue command_queue),
(command_queue))
/*
OCL_FUNC_P(void*, clEnqueueMapImage,
(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_map,
cl_map_flags map_flags,
const size_t * origin[3],
const size_t * region[3],
size_t * image_row_pitch,
size_t * image_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event,
cl_int * errcode_ret),
(command_queue, image, blocking_map, map_flags, origin, region,
image_row_pitch, image_slice_pitch, num_events_in_wait_list,
event_wait_list, event, errcode_ret))
*/
/*
OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
OCL_FUNC(cl_int, clGetKernelInfo,
(cl_kernel kernel,
cl_kernel_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(kernel, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj))
*/
OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj))
OCL_FUNC_P(cl_program, clCreateProgramWithSource,
(cl_context context,
cl_uint count,
const char ** strings,
const size_t * lengths,
cl_int * errcode_ret),
(context, count, strings, lengths, errcode_ret))
OCL_FUNC_P(cl_program, clCreateProgramWithBinary,
(cl_context context,
cl_uint num_devices,
const cl_device_id * device_list,
const size_t * lengths,
const unsigned char ** binaries,
cl_int * binary_status,
cl_int * errcode_ret),
(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret))
OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program))
OCL_FUNC(cl_int, clBuildProgram,
(cl_program program,
cl_uint num_devices,
const cl_device_id * device_list,
const char * options,
void (CL_CALLBACK * pfn_notify)(cl_program, void *),
void * user_data),
(program, num_devices, device_list, options, pfn_notify, user_data))
OCL_FUNC(cl_int, clGetProgramInfo,
(cl_program program,
cl_program_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(program, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC(cl_int, clGetProgramBuildInfo,
(cl_program program,
cl_device_id device,
cl_program_build_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(program, device, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC_P(cl_kernel, clCreateKernel,
(cl_program program,
const char * kernel_name,
cl_int * errcode_ret),
(program, kernel_name, errcode_ret))
OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel))
OCL_FUNC(cl_int, clSetKernelArg,
(cl_kernel kernel,
cl_uint arg_index,
size_t arg_size,
const void * arg_value),
(kernel, arg_index, arg_size, arg_value))
OCL_FUNC(cl_int, clGetKernelWorkGroupInfo,
(cl_kernel kernel,
cl_device_id device,
cl_kernel_work_group_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret),
(kernel, device, param_name, param_value_size, param_value, param_value_size_ret))
OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
OCL_FUNC(cl_int, clEnqueueReadBuffer,
(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_read,
size_t offset,
size_t size,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, buffer, blocking_read, offset, size, ptr,
num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clEnqueueReadBufferRect,
(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_read,
const size_t * buffer_offset,
const size_t * host_offset,
const size_t * region,
size_t buffer_row_pitch,
size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch,
buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list,
event_wait_list, event))
OCL_FUNC(cl_int, clEnqueueWriteBuffer,
(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_write,
size_t offset,
size_t size,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, buffer, blocking_write, offset, size, ptr,
num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clEnqueueWriteBufferRect,
(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_write,
const size_t * buffer_offset,
const size_t * host_offset,
const size_t * region,
size_t buffer_row_pitch,
size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, buffer, blocking_write, buffer_offset, host_offset,
region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
/*OCL_FUNC(cl_int, clEnqueueFillBuffer,
(cl_command_queue command_queue,
cl_mem buffer,
const void * pattern,
size_t pattern_size,
size_t offset,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, buffer, pattern, pattern_size, offset, size,
num_events_in_wait_list, event_wait_list, event))*/
OCL_FUNC(cl_int, clEnqueueCopyBuffer,
(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_buffer,
size_t src_offset,
size_t dst_offset,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, src_buffer, dst_buffer, src_offset, dst_offset,
size, num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clEnqueueCopyBufferRect,
(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_buffer,
const size_t * src_origin,
const size_t * dst_origin,
const size_t * region,
size_t src_row_pitch,
size_t src_slice_pitch,
size_t dst_row_pitch,
size_t dst_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, src_buffer, dst_buffer, src_origin, dst_origin,
region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
num_events_in_wait_list, event_wait_list, event))
OCL_FUNC_P(void*, clEnqueueMapBuffer,
(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_map,
cl_map_flags map_flags,
size_t offset,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event,
cl_int * errcode_ret),
(command_queue, buffer, blocking_map, map_flags, offset, size,
num_events_in_wait_list, event_wait_list, event, errcode_ret))
OCL_FUNC(cl_int, clEnqueueUnmapMemObject,
(cl_command_queue command_queue,
cl_mem memobj,
void * mapped_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clEnqueueNDRangeKernel,
(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint work_dim,
const size_t * global_work_offset,
const size_t * global_work_size,
const size_t * local_work_size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, kernel, work_dim, global_work_offset, global_work_size,
local_work_size, num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clEnqueueTask,
(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event),
(command_queue, kernel, num_events_in_wait_list, event_wait_list, event))
OCL_FUNC(cl_int, clSetEventCallback,
(cl_event event,
cl_int command_exec_callback_type ,
void (CL_CALLBACK *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data),
void *user_data),
(event, command_exec_callback_type, pfn_event_notify, user_data))
OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
}
#endif
#ifndef CL_VERSION_1_2
#define CL_VERSION_1_2
#endif
#endif
#ifdef _DEBUG
#define CV_OclDbgAssert CV_DbgAssert
#else
#define CV_OclDbgAssert(expr) (void)(expr)
#endif
namespace cv { namespace ocl {
struct UMat2D
{
UMat2D(const UMat& m)
{
offset = (int)m.offset;
step = (int)m.step;
rows = m.rows;
cols = m.cols;
}
int offset;
int step;
int rows;
int cols;
};
struct UMat3D
{
UMat3D(const UMat& m)
{
offset = (int)m.offset;
step = (int)m.step.p[1];
slicestep = (int)m.step.p[0];
slices = (int)m.size.p[0];
rows = m.size.p[1];
cols = m.size.p[2];
}
int offset;
int slicestep;
int step;
int slices;
int rows;
int cols;
};
// Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
{
static uint64 table[256];
static bool initialized = false;
if( !initialized )
{
for( int i = 0; i < 256; i++ )
{
uint64 c = i;
for( int j = 0; j < 8; j++ )
c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
table[i] = c;
}
initialized = true;
}
uint64 crc = ~crc0;
for( size_t idx = 0; idx < size; idx++ )
crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
return ~crc;
}
struct HashKey
{
typedef uint64 part;
HashKey(part _a, part _b) : a(_a), b(_b) {}
part a, b;
};
inline bool operator == (const HashKey& h1, const HashKey& h2)
{
return h1.a == h2.a && h1.b == h2.b;
}
inline bool operator < (const HashKey& h1, const HashKey& h2)
{
return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
}
bool haveOpenCL()
{
#ifdef HAVE_OPENCL
static bool g_isOpenCLInitialized = false;
static bool g_isOpenCLAvailable = false;
if (!g_isOpenCLInitialized)
{
try
{
cl_uint n = 0;
g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
}
catch (...)
{
g_isOpenCLAvailable = false;
}
g_isOpenCLInitialized = true;
}
return g_isOpenCLAvailable;
#else
return false;
#endif
}
bool useOpenCL()
{
CoreTLSData* data = coreTlsData.get();
if( data->useOpenCL < 0 )
data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
return data->useOpenCL > 0;
}
void setUseOpenCL(bool flag)
{
if( haveOpenCL() )
{
CoreTLSData* data = coreTlsData.get();
data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
}
}
#ifdef HAVE_CLAMDBLAS
class AmdBlasHelper
{
public:
static AmdBlasHelper & getInstance()
{
static AmdBlasHelper amdBlas;
return amdBlas;
}
bool isAvailable() const
{
return g_isAmdBlasAvailable;
}
~AmdBlasHelper()
{
try
{
clAmdBlasTeardown();
}
catch (...) { }
}
protected:
AmdBlasHelper()
{
if (!g_isAmdBlasInitialized)
{
AutoLock lock(m);
if (!g_isAmdBlasInitialized && haveOpenCL())
{
try
{
g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
}
catch (...)
{
g_isAmdBlasAvailable = false;
}
}
else
g_isAmdBlasAvailable = false;
g_isAmdBlasInitialized = true;
}
}
private:
static Mutex m;
static bool g_isAmdBlasInitialized;
static bool g_isAmdBlasAvailable;
};
bool AmdBlasHelper::g_isAmdBlasAvailable = false;
bool AmdBlasHelper::g_isAmdBlasInitialized = false;
Mutex AmdBlasHelper::m;
bool haveAmdBlas()
{
return AmdBlasHelper::getInstance().isAvailable();
}
#else
bool haveAmdBlas()
{
return false;
}
#endif
#ifdef HAVE_CLAMDFFT
class AmdFftHelper
{
public:
static AmdFftHelper & getInstance()
{
static AmdFftHelper amdFft;
return amdFft;
}
bool isAvailable() const
{
return g_isAmdFftAvailable;
}
~AmdFftHelper()
{
try
{
// clAmdFftTeardown();
}
catch (...) { }
}
protected:
AmdFftHelper()
{
if (!g_isAmdFftInitialized)
{
AutoLock lock(m);
if (!g_isAmdFftInitialized && haveOpenCL())
{
try
{
CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
g_isAmdFftAvailable = true;
}
catch (const Exception &)
{
g_isAmdFftAvailable = false;
}
}
else
g_isAmdFftAvailable = false;
g_isAmdFftInitialized = true;
}
}
private:
static clAmdFftSetupData setupData;
static Mutex m;
static bool g_isAmdFftInitialized;
static bool g_isAmdFftAvailable;
};
clAmdFftSetupData AmdFftHelper::setupData;
bool AmdFftHelper::g_isAmdFftAvailable = false;
bool AmdFftHelper::g_isAmdFftInitialized = false;
Mutex AmdFftHelper::m;
bool haveAmdFft()
{
return AmdFftHelper::getInstance().isAvailable();
}
#else
bool haveAmdFft()
{
return false;
}
#endif
void finish()
{
Queue::getDefault().finish();
}
#define IMPLEMENT_REFCOUNTABLE() \
void addref() { CV_XADD(&refcount, 1); } \
void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
int refcount
/////////////////////////////////////////// Platform /////////////////////////////////////////////
struct Platform::Impl
{
Impl()
{
refcount = 1;
handle = 0;
initialized = false;
}
~Impl() {}
void init()
{
if( !initialized )
{
//cl_uint num_entries
cl_uint n = 0;
if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
handle = 0;
if( handle != 0 )
{
char buf[1000];
size_t len = 0;
CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
buf[len] = '\0';
vendor = String(buf);
}
initialized = true;
}
}
IMPLEMENT_REFCOUNTABLE();
cl_platform_id handle;
String vendor;
bool initialized;
};
Platform::Platform()
{
p = 0;
}
Platform::~Platform()
{
if(p)
p->release();
}
Platform::Platform(const Platform& pl)
{
p = (Impl*)pl.p;
if(p)
p->addref();
}
Platform& Platform::operator = (const Platform& pl)
{
Impl* newp = (Impl*)pl.p;
if(newp)
newp->addref();
if(p)
p->release();
p = newp;
return *this;
}
void* Platform::ptr() const
{
return p ? p->handle : 0;
}
Platform& Platform::getDefault()
{
static Platform p;
if( !p.p )
{
p.p = new Impl;
p.p->init();
}
return p;
}
/////////////////////////////////////// Device ////////////////////////////////////////////
// deviceVersion has format
// OpenCL<space><major_version.minor_version><space><vendor-specific information>
// by specification
// http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
// http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
{
major = minor = 0;
if (10 >= deviceVersion.length())
return;
const char *pstr = deviceVersion.c_str();
if (0 != strncmp(pstr, "OpenCL ", 7))
return;
size_t ppos = deviceVersion.find('.', 7);
if (String::npos == ppos)
return;
String temp = deviceVersion.substr(7, ppos - 7);
major = atoi(temp.c_str());
temp = deviceVersion.substr(ppos + 1);
minor = atoi(temp.c_str());
}
struct Device::Impl
{
Impl(void* d)
{
handle = (cl_device_id)d;
refcount = 1;
name_ = getStrProp(CL_DEVICE_NAME);
version_ = getStrProp(CL_DEVICE_VERSION);
doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
driverVersion_ = getStrProp(CL_DRIVER_VERSION);
String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
vendorName_ = getStrProp(CL_DEVICE_VENDOR);
if (vendorName_ == "Advanced Micro Devices, Inc." ||
vendorName_ == "AMD")
vendorID_ = VENDOR_AMD;
else if (vendorName_ == "Intel(R) Corporation")
vendorID_ = VENDOR_INTEL;
else if (vendorName_ == "NVIDIA Corporation")
vendorID_ = VENDOR_NVIDIA;
else
vendorID_ = UNKNOWN_VENDOR;
}
template<typename _TpCL, typename _TpOut>
_TpOut getProp(cl_device_info prop) const
{
_TpCL temp=_TpCL();
size_t sz = 0;
return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
}
bool getBoolProp(cl_device_info prop) const
{
cl_bool temp = CL_FALSE;
size_t sz = 0;
return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
sz == sizeof(temp) ? temp != 0 : false;
}
String getStrProp(cl_device_info prop) const
{
char buf[1024];
size_t sz=0;
return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
sz < sizeof(buf) ? String(buf) : String();
}
IMPLEMENT_REFCOUNTABLE();
cl_device_id handle;
String name_;
String version_;
int doubleFPConfig_;
bool hostUnifiedMemory_;
int maxComputeUnits_;
size_t maxWorkGroupSize_;
int type_;
int deviceVersionMajor_;
int deviceVersionMinor_;
String driverVersion_;
String vendorName_;
int vendorID_;
};
Device::Device()
{
p = 0;
}
Device::Device(void* d)
{
p = 0;
set(d);
}
Device::Device(const Device& d)
{
p = d.p;
if(p)
p->addref();
}
Device& Device::operator = (const Device& d)
{
Impl* newp = (Impl*)d.p;
if(newp)
newp->addref();
if(p)
p->release();
p = newp;
return *this;
}
Device::~Device()
{
if(p)
p->release();
}
void Device::set(void* d)
{
if(p)
p->release();
p = new Impl(d);
}
void* Device::ptr() const
{
return p ? p->handle : 0;
}
String Device::name() const
{ return p ? p->name_ : String(); }
String Device::extensions() const
{ return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
String Device::version() const
{ return p ? p->version_ : String(); }
String Device::vendorName() const
{ return p ? p->vendorName_ : String(); }
int Device::vendorID() const
{ return p ? p->vendorID_ : 0; }
String Device::OpenCL_C_Version() const
{ return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
String Device::OpenCLVersion() const
{ return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
int Device::deviceVersionMajor() const
{ return p ? p->deviceVersionMajor_ : 0; }
int Device::deviceVersionMinor() const
{ return p ? p->deviceVersionMinor_ : 0; }
String Device::driverVersion() const
{ return p ? p->driverVersion_ : String(); }
int Device::type() const
{ return p ? p->type_ : 0; }
int Device::addressBits() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
bool Device::available() const
{ return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
bool Device::compilerAvailable() const
{ return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
bool Device::linkerAvailable() const
#ifdef CL_VERSION_1_2
{ return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
#else
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
#endif
int Device::doubleFPConfig() const
{ return p ? p->doubleFPConfig_ : 0; }
int Device::singleFPConfig() const
{ return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
int Device::halfFPConfig() const
#ifdef CL_VERSION_1_2
{ return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
#else
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
#endif
bool Device::endianLittle() const
{ return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
bool Device::errorCorrectionSupport() const
{ return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
int Device::executionCapabilities() const
{ return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
size_t Device::globalMemCacheSize() const
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
int Device::globalMemCacheType() const
{ return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
int Device::globalMemCacheLineSize() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
size_t Device::globalMemSize() const
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
size_t Device::localMemSize() const
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
int Device::localMemType() const
{ return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
bool Device::hostUnifiedMemory() const
{ return p ? p->hostUnifiedMemory_ : false; }
bool Device::imageSupport() const
{ return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
bool Device::imageFromBufferSupport() const
{
bool ret = false;
if (p)
{
size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
if (pos != String::npos)
{
ret = true;
}
}
return ret;
}
uint Device::imagePitchAlignment() const
{
#ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
#else
return 0;
#endif
}
uint Device::imageBaseAddressAlignment() const
{
#ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
#else
return 0;
#endif
}
size_t Device::image2DMaxWidth() const
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
size_t Device::image2DMaxHeight() const
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
size_t Device::image3DMaxWidth() const
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
size_t Device::image3DMaxHeight() const
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
size_t Device::image3DMaxDepth() const
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
size_t Device::imageMaxBufferSize() const
#ifdef CL_VERSION_1_2
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
#else
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
#endif
size_t Device::imageMaxArraySize() const
#ifdef CL_VERSION_1_2
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
#else
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
#endif
int Device::maxClockFrequency() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
int Device::maxComputeUnits() const
{ return p ? p->maxComputeUnits_ : 0; }
int Device::maxConstantArgs() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
size_t Device::maxConstantBufferSize() const
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
size_t Device::maxMemAllocSize() const
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
size_t Device::maxParameterSize() const
{ return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
int Device::maxReadImageArgs() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
int Device::maxWriteImageArgs() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
int Device::maxSamplers() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
size_t Device::maxWorkGroupSize() const
{ return p ? p->maxWorkGroupSize_ : 0; }
int Device::maxWorkItemDims() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
void Device::maxWorkItemSizes(size_t* sizes) const
{
if(p)
{
const int MAX_DIMS = 32;
size_t retsz = 0;
CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
}
}
int Device::memBaseAddrAlign() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
int Device::nativeVectorWidthChar() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
int Device::nativeVectorWidthShort() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
int Device::nativeVectorWidthInt() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
int Device::nativeVectorWidthLong() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
int Device::nativeVectorWidthFloat() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
int Device::nativeVectorWidthDouble() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
int Device::nativeVectorWidthHalf() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
int Device::preferredVectorWidthChar() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
int Device::preferredVectorWidthShort() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
int Device::preferredVectorWidthInt() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
int Device::preferredVectorWidthLong() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
int Device::preferredVectorWidthFloat() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
int Device::preferredVectorWidthDouble() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
int Device::preferredVectorWidthHalf() const
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
size_t Device::printfBufferSize() const
#ifdef CL_VERSION_1_2
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
#else
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
#endif
size_t Device::profilingTimerResolution() const
{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
const Device& Device::getDefault()
{
const Context& ctx = Context::getDefault();
int idx = coreTlsData.get()->device;
return ctx.device(idx);
}
////////////////////////////////////// Context ///////////////////////////////////////////////////
template <typename Functor, typename ObjectType>
inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
{
::size_t required;
cl_int err = f(obj, name, 0, NULL, &required);
if (err != CL_SUCCESS)
return err;
param.clear();
if (required > 0)
{
AutoBuffer<char> buf(required + 1);
char* ptr = (char*)buf; // cleanup is not needed
err = f(obj, name, required, ptr, NULL);
if (err != CL_SUCCESS)
return err;
param = ptr;
}
return CL_SUCCESS;
}
static void split(const std::string &s, char delim, std::vector<std::string> &elems)
{
elems.clear();
if (s.size() == 0)
return;
std::istringstream ss(s);
std::string item;
while (!ss.eof())
{
std::getline(ss, item, delim);
elems.push_back(item);
}
}
// Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
// Sample: AMD:GPU:
// Sample: AMD:GPU:Tahiti
// Sample: :GPU|CPU: = '' = ':' = '::'
static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
{
std::vector<std::string> parts;
split(configurationStr, ':', parts);
if (parts.size() > 3)
{
std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
return false;
}
if (parts.size() > 2)
deviceNameOrID = parts[2];
if (parts.size() > 1)
{
split(parts[1], '|', deviceTypes);
}
if (parts.size() > 0)
{
platform = parts[0];
}
return true;
}
static cl_device_id selectOpenCLDevice()
{
std::string platform, deviceName;
std::vector<std::string> deviceTypes;
const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
if (configuration && !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName))
return NULL;
bool isID = false;
int deviceID = -1;
if (deviceName.length() == 1)
// We limit ID range to 0..9, because we want to write:
// - '2500' to mean i5-2500
// - '8350' to mean AMD FX-8350
// - '650' to mean GeForce 650
// To extend ID range change condition to '> 0'
{
isID = true;
for (size_t i = 0; i < deviceName.length(); i++)
{
if (!isdigit(deviceName[i]))
{
isID = false;
break;
}
}
if (isID)
{
deviceID = atoi(deviceName.c_str());
if (deviceID < 0)
return NULL;
}
}
std::vector<cl_platform_id> platforms;
{
cl_uint numPlatforms = 0;
CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
if (numPlatforms == 0)
return NULL;
platforms.resize((size_t)numPlatforms);
CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
platforms.resize(numPlatforms);
}
int selectedPlatform = -1;
if (platform.length() > 0)
{
for (size_t i = 0; i < platforms.size(); i++)
{
std::string name;
CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
if (name.find(platform) != std::string::npos)
{
selectedPlatform = (int)i;
break;
}
}
if (selectedPlatform == -1)
{
std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
goto not_found;
}
}
if (deviceTypes.size() == 0)
{
if (!isID)
{
deviceTypes.push_back("GPU");
deviceTypes.push_back("CPU");
}
else
deviceTypes.push_back("ALL");
}
for (size_t t = 0; t < deviceTypes.size(); t++)
{
int deviceType = 0;
std::string tempStrDeviceType = deviceTypes[t];
std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
deviceType = Device::TYPE_GPU;
else if (tempStrDeviceType == "cpu")
deviceType = Device::TYPE_CPU;
else if (tempStrDeviceType == "accelerator")
deviceType = Device::TYPE_ACCELERATOR;
else if (tempStrDeviceType == "all")
deviceType = Device::TYPE_ALL;
else
{
std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
goto not_found;
}
std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
(selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
i++)
{
cl_uint count = 0;
cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
if (count == 0)
continue;
size_t base = devices.size();
devices.resize(base + count);
status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
}
for (size_t i = (isID ? deviceID : 0);
(isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
i++)
{
std::string name;
CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
cl_bool useGPU = true;
if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
{
cl_bool isIGPU = CL_FALSE;
clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
}
if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
{
// TODO check for OpenCL 1.1
return devices[i];
}
}
}
not_found:
std::cerr << "ERROR: Required OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
<< " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
<< " Device types: ";
for (size_t t = 0; t < deviceTypes.size(); t++)
std::cerr << deviceTypes[t] << " ";
std::cerr << std::endl << " Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
CV_Error(CL_INVALID_DEVICE, "Requested OpenCL device is not found");
return NULL;
}
struct Context::Impl
{
Impl()
{
refcount = 1;
handle = 0;
}
void setDefault()
{
CV_Assert(handle == NULL);
cl_device_id d = selectOpenCLDevice();
if (d == NULL)
return;
cl_platform_id pl = NULL;
CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
cl_context_properties prop[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
0
};
// !!! in the current implementation force the number of devices to 1 !!!
cl_uint nd = 1;
cl_int status;
handle = clCreateContext(prop, nd, &d, 0, 0, &status);
bool ok = handle != 0 && status == CL_SUCCESS;
if( ok )
{
devices.resize(nd);
devices[0].set(d);
}
else
handle = NULL;
}
Impl(int dtype0)
{
refcount = 1;
handle = 0;
cl_int retval = 0;
cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
cl_context_properties prop[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
0
};
cl_uint i, nd0 = 0, nd = 0;
int dtype = dtype0 & 15;
CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
AutoBuffer<void*> dlistbuf(nd0*2+1);
cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
cl_device_id* dlist_new = dlist + nd0;
CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
String name0;
for(i = 0; i < nd0; i++)
{
Device d(dlist[i]);
if( !d.available() || !d.compilerAvailable() )
continue;
if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
continue;
if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
continue;
String name = d.name();
if( nd != 0 && name != name0 )
continue;
name0 = name;
dlist_new[nd++] = dlist[i];
}
if(nd == 0)
return;
// !!! in the current implementation force the number of devices to 1 !!!
nd = 1;
handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
bool ok = handle != 0 && retval == CL_SUCCESS;
if( ok )
{
devices.resize(nd);
for( i = 0; i < nd; i++ )
devices[i].set(dlist_new[i]);
}
}
~Impl()
{
if(handle)
{
clReleaseContext(handle);
handle = NULL;
}
devices.clear();
}
Program getProg(const ProgramSource& src,
const String& buildflags, String& errmsg)
{
String prefix = Program::getPrefix(buildflags);
HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
phash_t::iterator it = phash.find(k);
if( it != phash.end() )
return it->second;
//String filename = format("%08x%08x_%08x%08x.clb2",
Program prog(src, buildflags, errmsg);
if(prog.ptr())
phash.insert(std::pair<HashKey,Program>(k, prog));
return prog;
}
IMPLEMENT_REFCOUNTABLE();
cl_context handle;
std::vector<Device> devices;
typedef ProgramSource::hash_t hash_t;
struct HashKey
{
HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
hash_t a, b;
};
typedef std::map<HashKey, Program> phash_t;
phash_t phash;
};
Context::Context()
{
p = 0;
}
Context::Context(int dtype)
{
p = 0;
create(dtype);
}
bool Context::create()
{
if( !haveOpenCL() )
return false;
if(p)
p->release();
p = new Impl();
if(!p->handle)
{
delete p;
p = 0;
}
return p != 0;
}
bool Context::create(int dtype0)
{
if( !haveOpenCL() )
return false;
if(p)
p->release();
p = new Impl(dtype0);
if(!p->handle)
{
delete p;
p = 0;
}
return p != 0;
}
Context::~Context()
{
if (p)
{
p->release();
p = NULL;
}
}
Context::Context(const Context& c)
{
p = (Impl*)c.p;
if(p)
p->addref();
}
Context& Context::operator = (const Context& c)
{
Impl* newp = (Impl*)c.p;
if(newp)
newp->addref();
if(p)
p->release();
p = newp;
return *this;
}
void* Context::ptr() const
{
return p == NULL ? NULL : p->handle;
}
size_t Context::ndevices() const
{
return p ? p->devices.size() : 0;
}
const Device& Context::device(size_t idx) const
{
static Device dummy;
return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
}
Context& Context::getDefault(bool initialize)
{
static Context* ctx = new Context();
if(!ctx->p && haveOpenCL())
{
if (!ctx->p)
ctx->p = new Impl();
if (initialize)
{
// do not create new Context right away.
// First, try to retrieve existing context of the same type.
// In its turn, Platform::getContext() may call Context::create()
// if there is no such context.
if (ctx->p->handle == NULL)
ctx->p->setDefault();
}
}
return *ctx;
}
Program Context::getProg(const ProgramSource& prog,
const String& buildopts, String& errmsg)
{
return p ? p->getProg(prog, buildopts, errmsg) : Program();
}
void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
{
cl_context context = (cl_context)_context;
cl_device_id device = (cl_device_id)_device;
// cleanup old context
Context::Impl * impl = ctx.p;
if (impl->handle)
{
CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
}
impl->devices.clear();
impl->handle = context;
impl->devices.resize(1);
impl->devices[0].set(device);
Platform& p = Platform::getDefault();
Platform::Impl* pImpl = p.p;
pImpl->handle = (cl_platform_id)platform;
}
/////////////////////////////////////////// Queue /////////////////////////////////////////////
struct Queue::Impl
{
Impl(const Context& c, const Device& d)
{
refcount = 1;
const Context* pc = &c;
cl_context ch = (cl_context)pc->ptr();
if( !ch )
{
pc = &Context::getDefault();
ch = (cl_context)pc->ptr();
}
cl_device_id dh = (cl_device_id)d.ptr();
if( !dh )
dh = (cl_device_id)pc->device(0).ptr();
cl_int retval = 0;
handle = clCreateCommandQueue(ch, dh, 0, &retval);
CV_OclDbgAssert(retval == CL_SUCCESS);
}
~Impl()
{
#ifdef _WIN32
if (!cv::__termination)
#endif
{
if(handle)
{
clFinish(handle);
clReleaseCommandQueue(handle);
handle = NULL;
}
}
}
IMPLEMENT_REFCOUNTABLE();
cl_command_queue handle;
};
Queue::Queue()
{
p = 0;
}
Queue::Queue(const Context& c, const Device& d)
{
p = 0;
create(c, d);
}
Queue::Queue(const Queue& q)
{
p = q.p;
if(p)
p->addref();
}
Queue& Queue::operator = (const Queue& q)
{
Impl* newp = (Impl*)q.p;
if(newp)
newp->addref();
if(p)
p->release();
p = newp;
return *this;
}
Queue::~Queue()
{
if(p)
p->release();
}
bool Queue::create(const Context& c, const Device& d)
{
if(p)
p->release();
p = new Impl(c, d);
return p->handle != 0;
}
void Queue::finish()
{
if(p && p->handle)
{
CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
}
}
void* Queue::ptr() const
{
return p ? p->handle : 0;
}
Queue& Queue::getDefault()
{
Queue& q = coreTlsData.get()->oclQueue;
if( !q.p && haveOpenCL() )
q.create(Context::getDefault());
return q;
}
static cl_command_queue getQueue(const Queue& q)
{
cl_command_queue qq = (cl_command_queue)q.ptr();
if(!qq)
qq = (cl_command_queue)Queue::getDefault().ptr();
return qq;
}
/////////////////////////////////////////// KernelArg /////////////////////////////////////////////
KernelArg::KernelArg()
: flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
{
}
KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
: flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
{
}
KernelArg KernelArg::Constant(const Mat& m)
{
CV_Assert(m.isContinuous());
return KernelArg(CONSTANT, 0, 0, 0, m.data, m.total()*m.elemSize());
}
/////////////////////////////////////////// Kernel /////////////////////////////////////////////
struct Kernel::Impl
{
Impl(const char* kname, const Program& prog) :
refcount(1), e(0), nu(0)
{
cl_program ph = (cl_program)prog.ptr();
cl_int retval = 0;
handle = ph != 0 ?
clCreateKernel(ph, kname, &retval) : 0;
CV_OclDbgAssert(retval == CL_SUCCESS);
for( int i = 0; i < MAX_ARRS; i++ )
u[i] = 0;
haveTempDstUMats = false;
}
void cleanupUMats()
{
for( int i = 0; i < MAX_ARRS; i++ )
if( u[i] )
{
if( CV_XADD(&u[i]->urefcount, -1) == 1 )
u[i]->currAllocator->deallocate(u[i]);
u[i] = 0;
}
nu = 0;
haveTempDstUMats = false;
}
void addUMat(const UMat& m, bool dst)
{
CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
u[nu] = m.u;
CV_XADD(&m.u->urefcount, 1);
nu++;
if(dst && m.u->tempUMat())
haveTempDstUMats = true;
}
void addImage(const Image2D& image)
{
images.push_back(image);
}
void finit()
{
cleanupUMats();
images.clear();
if(e) { clReleaseEvent(e); e = 0; }
release();
}
~Impl()
{
if(handle)
clReleaseKernel(handle);
}
IMPLEMENT_REFCOUNTABLE();
cl_kernel handle;
cl_event e;
enum { MAX_ARRS = 16 };
UMatData* u[MAX_ARRS];
int nu;
std::list<Image2D> images;
bool haveTempDstUMats;
};
}}
extern "C"
{
static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
{
((cv::ocl::Kernel::Impl*)p)->finit();
}
}
namespace cv { namespace ocl {
Kernel::Kernel()
{
p = 0;
}
Kernel::Kernel(const char* kname, const Program& prog)
{
p = 0;
create(kname, prog);
}
Kernel::Kernel(const char* kname, const ProgramSource& src,
const String& buildopts, String* errmsg)
{
p = 0;
create(kname, src, buildopts, errmsg);
}
Kernel::Kernel(const Kernel& k)
{
p = k.p;
if(p)
p->addref();
}
Kernel& Kernel::operator = (const Kernel& k)
{
Impl* newp = (Impl*)k.p;
if(newp)
newp->addref();
if(p)
p->release();
p = newp;
return *this;
}
Kernel::~Kernel()
{
if(p)
p->release();
}
bool Kernel::create(const char* kname, const Program& prog)
{
if(p)
p->release();
p = new Impl(kname, prog);
if(p->handle == 0)
{
p->release();
p = 0;
}
return p != 0;
}
bool Kernel::create(const char* kname, const ProgramSource& src,
const String& buildopts, String* errmsg)
{
if(p)
{
p->release();
p = 0;
}
String tempmsg;
if( !errmsg ) errmsg = &tempmsg;
const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
return create(kname, prog);
}
void* Kernel::ptr() const
{
return p ? p->handle : 0;
}
bool Kernel::empty() const
{
return ptr() == 0;
}
int Kernel::set(int i, const void* value, size_t sz)
{
if (!p || !p->handle)
return -1;
if (i < 0)
return i;
if( i == 0 )
p->cleanupUMats();
cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
CV_OclDbgAssert(retval == CL_SUCCESS);
if (retval != CL_SUCCESS)
return -1;
return i+1;
}
int Kernel::set(int i, const Image2D& image2D)
{
p->addImage(image2D);
cl_mem h = (cl_mem)image2D.ptr();
return set(i, &h, sizeof(h));
}
int Kernel::set(int i, const UMat& m)
{
return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
}
int Kernel::set(int i, const KernelArg& arg)
{
if( !p || !p->handle )
return -1;
if (i < 0)
return i;
if( i == 0 )
p->cleanupUMats();
if( arg.m )
{
int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
cl_mem h = (cl_mem)arg.m->handle(accessFlags);
if (!h)
{
p->release();
p = 0;
return -1;
}
if (ptronly)
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i++, sizeof(h), &h) == CL_SUCCESS);
else if( arg.m->dims <= 2 )
{
UMat2D u2d(*arg.m);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
i += 3;
if( !(arg.flags & KernelArg::NO_SIZE) )
{
int cols = u2d.cols*arg.wscale/arg.iwscale;
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
i += 2;
}
}
else
{
UMat3D u3d(*arg.m);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
i += 4;
if( !(arg.flags & KernelArg::NO_SIZE) )
{
int cols = u3d.cols*arg.wscale/arg.iwscale;
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
i += 3;
}
}
p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
return i;
}
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
return i+1;
}
bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
bool sync, const Queue& q)
{
if(!p || !p->handle || p->e != 0)
return false;
cl_command_queue qq = getQueue(q);
size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
size_t total = 1;
CV_Assert(_globalsize != 0);
for (int i = 0; i < dims; i++)
{
size_t val = _localsize ? _localsize[i] :
dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
CV_Assert( val > 0 );
total *= _globalsize[i];
globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
}
if( total == 0 )
return true;
if( p->haveTempDstUMats )
sync = true;
cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
offset, globalsize, _localsize, 0, 0,
sync ? 0 : &p->e);
if( sync || retval != CL_SUCCESS )
{
CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
p->cleanupUMats();
}
else
{
p->addref();
CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
}
return retval == CL_SUCCESS;
}
bool Kernel::runTask(bool sync, const Queue& q)
{
if(!p || !p->handle || p->e != 0)
return false;
cl_command_queue qq = getQueue(q);
cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
if( sync || retval != CL_SUCCESS )
{
CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
p->cleanupUMats();
}
else
{
p->addref();
CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
}
return retval == CL_SUCCESS;
}
size_t Kernel::workGroupSize() const
{
if(!p || !p->handle)
return 0;
size_t val = 0, retsz = 0;
cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
}
size_t Kernel::preferedWorkGroupSizeMultiple() const
{
if(!p || !p->handle)
return 0;
size_t val = 0, retsz = 0;
cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
}
bool Kernel::compileWorkGroupSize(size_t wsz[]) const
{
if(!p || !p->handle || !wsz)
return 0;
size_t retsz = 0;
cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
sizeof(wsz[0]*3), wsz, &retsz) == CL_SUCCESS;
}
size_t Kernel::localMemSize() const
{
if(!p || !p->handle)
return 0;
size_t retsz = 0;
cl_ulong val = 0;
cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
}
/////////////////////////////////////////// Program /////////////////////////////////////////////
struct Program::Impl
{
Impl(const ProgramSource& _src,
const String& _buildflags, String& errmsg)
{
refcount = 1;
const Context& ctx = Context::getDefault();
src = _src;
buildflags = _buildflags;
const String& srcstr = src.source();
const char* srcptr = srcstr.c_str();
size_t srclen = srcstr.size();
cl_int retval = 0;
handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
if( handle && retval == CL_SUCCESS )
{
int i, n = (int)ctx.ndevices();
AutoBuffer<void*> deviceListBuf(n+1);
void** deviceList = deviceListBuf;
for( i = 0; i < n; i++ )
deviceList[i] = ctx.device(i).ptr();
Device device = Device::getDefault();
if (device.isAMD())
buildflags += " -D AMD_DEVICE";
else if (device.isIntel())
buildflags += " -D INTEL_DEVICE";
retval = clBuildProgram(handle, n,
(const cl_device_id*)deviceList,
buildflags.c_str(), 0, 0);
#if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
if( retval != CL_SUCCESS )
#endif
{
size_t retsz = 0;
cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
if (buildInfo_retval == CL_SUCCESS && retsz > 1)
{
AutoBuffer<char> bufbuf(retsz + 16);
char* buf = bufbuf;
buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
if (buildInfo_retval == CL_SUCCESS)
{
// TODO It is useful to see kernel name & program file name also
errmsg = String(buf);
printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
fflush(stdout);
}
}
if (retval != CL_SUCCESS && handle)
{
clReleaseProgram(handle);
handle = NULL;
}
}
}
}
Impl(const String& _buf, const String& _buildflags)
{
refcount = 1;
handle = 0;
buildflags = _buildflags;
if(_buf.empty())
return;
String prefix0 = Program::getPrefix(buildflags);
const Context& ctx = Context::getDefault();
const Device& dev = Device::getDefault();
const char* pos0 = _buf.c_str();
const char* pos1 = strchr(pos0, '\n');
if(!pos1)
return;
const char* pos2 = strchr(pos1+1, '\n');
if(!pos2)
return;
const char* pos3 = strchr(pos2+1, '\n');
if(!pos3)
return;
size_t prefixlen = (pos3 - pos0)+1;
String prefix(pos0, prefixlen);
if( prefix != prefix0 )
return;
const uchar* bin = (uchar*)(pos3+1);
void* devid = dev.ptr();
size_t codelen = _buf.length() - prefixlen;
cl_int binstatus = 0, retval = 0;
handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
&codelen, &bin, &binstatus, &retval);
CV_OclDbgAssert(retval == CL_SUCCESS);
}
String store()
{
if(!handle)
return String();
size_t progsz = 0, retsz = 0;
String prefix = Program::getPrefix(buildflags);
size_t prefixlen = prefix.length();
if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
return String();
AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
uchar* buf = bufbuf;
memcpy(buf, prefix.c_str(), prefixlen);
buf += prefixlen;
if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
return String();
buf[progsz] = (uchar)'\0';
return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
}
~Impl()
{
if( handle )
{
#ifdef _WIN32
if (!cv::__termination)
#endif
{
clReleaseProgram(handle);
}
handle = NULL;
}
}
IMPLEMENT_REFCOUNTABLE();
ProgramSource src;
String buildflags;
cl_program handle;
};
Program::Program() { p = 0; }
Program::Program(const ProgramSource& src,
const String& buildflags, String& errmsg)
{
p = 0;
create(src, buildflags, errmsg);
}
Program::Program(const Program& prog)
{
p = prog.p;
if(p)
p->addref();
}
Program& Program::operator = (const Program& prog)
{
Impl* newp = (Impl*)prog.p;
if(newp)
newp->addref();
if(p)
p->release();
p = newp;
return *this;
}
Program::~Program()
{
if(p)
p->release();
}
bool Program::create(const ProgramSource& src,
const String& buildflags, String& errmsg)
{
if(p)
p->release();
p = new Impl(src, buildflags, errmsg);
if(!p->handle)
{
p->release();
p = 0;
}
return p != 0;
}
const ProgramSource& Program::source() const
{
static ProgramSource dummy;
return p ? p->src : dummy;
}
void* Program::ptr() const
{
return p ? p->handle : 0;
}
bool Program::read(const String& bin, const String& buildflags)
{
if(p)
p->release();
p = new Impl(bin, buildflags);
return p->handle != 0;
}
bool Program::write(String& bin) const
{
if(!p)
return false;
bin = p->store();
return !bin.empty();
}
String Program::getPrefix() const
{
if(!p)
return String();
return getPrefix(p->buildflags);
}
String Program::getPrefix(const String& buildflags)
{
const Context& ctx = Context::getDefault();
const Device& dev = ctx.device(0);
return format("name=%s\ndriver=%s\nbuildflags=%s\n",
dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
}
///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
struct ProgramSource::Impl
{
Impl(const char* _src)
{
init(String(_src));
}
Impl(const String& _src)
{
init(_src);
}
void init(const String& _src)
{
refcount = 1;
src = _src;
h = crc64((uchar*)src.c_str(), src.size());
}
IMPLEMENT_REFCOUNTABLE();
String src;
ProgramSource::hash_t h;
};
ProgramSource::ProgramSource()
{
p = 0;
}
ProgramSource::ProgramSource(const char* prog)
{
p = new Impl(prog);
}
ProgramSource::ProgramSource(const String& prog)
{
p = new Impl(prog);
}
ProgramSource::~ProgramSource()
{
if(p)
p->release();
}
ProgramSource::ProgramSource(const ProgramSource& prog)
{
p = prog.p;
if(p)
p->addref();
}
ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
{
Impl* newp = (Impl*)prog.p;
if(newp)
newp->addref();
if(p)
p->release();
p = newp;
return *this;
}
const String& ProgramSource::source() const
{
static String dummy;
return p ? p->src : dummy;
}
ProgramSource::hash_t ProgramSource::hash() const
{
return p ? p->h : 0;
}
//////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
class OpenCLBufferPool
{
protected:
~OpenCLBufferPool() { }
public:
virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity) = 0;
virtual void release(cl_mem handle, size_t capacity) = 0;
};
class OpenCLBufferPoolImpl : public BufferPoolController, public OpenCLBufferPool
{
public:
struct BufferEntry
{
cl_mem clBuffer_;
size_t capacity_;
};
protected:
Mutex mutex_;
size_t currentReservedSize;
size_t maxReservedSize;
std::list<BufferEntry> reservedEntries_; // LRU order
// synchronized
bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
{
if (reservedEntries_.empty())
return false;
std::list<BufferEntry>::iterator i = reservedEntries_.begin();
std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
BufferEntry result = {NULL, 0};
size_t minDiff = (size_t)(-1);
for (; i != reservedEntries_.end(); ++i)
{
BufferEntry& e = *i;
if (e.capacity_ >= size)
{
size_t diff = e.capacity_ - size;
if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
{
minDiff = diff;
result_pos = i;
result = e;
if (diff == 0)
break;
}
}
}
if (result_pos != reservedEntries_.end())
{
//CV_DbgAssert(result == *result_pos);
reservedEntries_.erase(result_pos);
entry = result;
currentReservedSize -= entry.capacity_;
return true;
}
return false;
}
// synchronized
void _checkSizeOfReservedEntries()
{
while (currentReservedSize > maxReservedSize)
{
CV_DbgAssert(!reservedEntries_.empty());
const BufferEntry& entry = reservedEntries_.back();
CV_DbgAssert(currentReservedSize >= entry.capacity_);
currentReservedSize -= entry.capacity_;
_releaseBufferEntry(entry);
reservedEntries_.pop_back();
}
}
inline size_t _allocationGranularity(size_t size)
{
// heuristic values
if (size < 1024)
return 16;
else if (size < 64*1024)
return 64;
else if (size < 1024*1024)
return 4096;
else if (size < 16*1024*1024)
return 64*1024;
else
return 1024*1024;
}
void _allocateBufferEntry(BufferEntry& entry, size_t size)
{
CV_DbgAssert(entry.clBuffer_ == NULL);
entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
Context& ctx = Context::getDefault();
cl_int retval = CL_SUCCESS;
entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE, entry.capacity_, 0, &retval);
CV_Assert(retval == CL_SUCCESS);
CV_Assert(entry.clBuffer_ != NULL);
LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
(long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
}
void _releaseBufferEntry(const BufferEntry& entry)
{
CV_Assert(entry.capacity_ != 0);
CV_Assert(entry.clBuffer_ != NULL);
LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
clReleaseMemObject(entry.clBuffer_);
}
public:
OpenCLBufferPoolImpl()
: currentReservedSize(0), maxReservedSize(0)
{
// Note: Buffer pool is disabled by default,
// because we didn't receive significant performance improvement
maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", 0);
}
virtual ~OpenCLBufferPoolImpl()
{
freeAllReservedBuffers();
CV_Assert(reservedEntries_.empty());
}
public:
virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity)
{
BufferEntry entry = {NULL, 0};
if (maxReservedSize > 0)
{
AutoLock locker(mutex_);
if (_findAndRemoveEntryFromReservedList(entry, size))
{
CV_DbgAssert(size <= entry.capacity_);
LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
capacity = entry.capacity_;
return entry.clBuffer_;
}
}
_allocateBufferEntry(entry, size);
capacity = entry.capacity_;
return entry.clBuffer_;
}
virtual void release(cl_mem handle, size_t capacity)
{
BufferEntry entry = {handle, capacity};
if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
{
_releaseBufferEntry(entry);
}
else
{
AutoLock locker(mutex_);
reservedEntries_.push_front(entry);
currentReservedSize += entry.capacity_;
_checkSizeOfReservedEntries();
}
}
virtual size_t getReservedSize() const { return currentReservedSize; }
virtual size_t getMaxReservedSize() const { return maxReservedSize; }
virtual void setMaxReservedSize(size_t size)
{
AutoLock locker(mutex_);
size_t oldMaxReservedSize = maxReservedSize;
maxReservedSize = size;
if (maxReservedSize < oldMaxReservedSize)
{
std::list<BufferEntry>::iterator i = reservedEntries_.begin();
for (; i != reservedEntries_.end();)
{
const BufferEntry& entry = *i;
if (entry.capacity_ > maxReservedSize / 8)
{
CV_DbgAssert(currentReservedSize >= entry.capacity_);
currentReservedSize -= entry.capacity_;
_releaseBufferEntry(entry);
i = reservedEntries_.erase(i);
continue;
}
++i;
}
_checkSizeOfReservedEntries();
}
}
virtual void freeAllReservedBuffers()
{
AutoLock locker(mutex_);
std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
for (; i != reservedEntries_.end(); ++i)
{
const BufferEntry& entry = *i;
_releaseBufferEntry(entry);
}
reservedEntries_.clear();
}
};
#if defined _MSC_VER
#pragma warning(disable:4127) // conditional expression is constant
#endif
template <bool readAccess, bool writeAccess>
class AlignedDataPtr
{
protected:
const size_t size_;
uchar* const originPtr_;
const size_t alignment_;
uchar* ptr_;
uchar* allocatedPtr_;
public:
AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
: size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
{
CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
if (((size_t)ptr_ & (alignment - 1)) != 0)
{
allocatedPtr_ = new uchar[size_ + alignment - 1];
ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
if (readAccess)
{
memcpy(ptr_, originPtr_, size_);
}
}
}
uchar* getAlignedPtr() const
{
CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
return ptr_;
}
~AlignedDataPtr()
{
if (allocatedPtr_)
{
if (writeAccess)
{
memcpy(originPtr_, ptr_, size_);
}
delete[] allocatedPtr_;
allocatedPtr_ = NULL;
}
ptr_ = NULL;
}
private:
AlignedDataPtr(const AlignedDataPtr&); // disabled
AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
};
#if defined _MSC_VER
#pragma warning(default:4127) // conditional expression is constant
#endif
#ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
#define CV_OPENCL_DATA_PTR_ALIGNMENT 16
#endif
class OpenCLAllocator : public MatAllocator
{
mutable OpenCLBufferPoolImpl bufferPool;
enum AllocatorFlags
{
ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0
};
public:
OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); }
UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
int flags, UMatUsageFlags usageFlags) const
{
UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
return u;
}
void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
{
const Device& dev = ctx.device(0);
createFlags = 0;
if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
createFlags |= CL_MEM_ALLOC_HOST_PTR;
if( dev.hostUnifiedMemory() )
flags0 = 0;
else
flags0 = UMatData::COPY_ON_MAP;
}
UMatData* allocate(int dims, const int* sizes, int type,
void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
{
if(!useOpenCL())
return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
CV_Assert(data == 0);
size_t total = CV_ELEM_SIZE(type);
for( int i = dims-1; i >= 0; i-- )
{
if( step )
step[i] = total;
total *= sizes[i];
}
Context& ctx = Context::getDefault();
int createFlags = 0, flags0 = 0;
getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
size_t capacity = 0;
void* handle = NULL;
int allocatorFlags = 0;
if (createFlags == 0)
{
handle = bufferPool.allocate(total, capacity);
if (!handle)
return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
}
else
{
capacity = total;
cl_int retval = 0;
handle = clCreateBuffer((cl_context)ctx.ptr(),
CL_MEM_READ_WRITE|createFlags, total, 0, &retval);
if( !handle || retval != CL_SUCCESS )
return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
}
UMatData* u = new UMatData(this);
u->data = 0;
u->size = total;
u->capacity = capacity;
u->handle = handle;
u->flags = flags0;
u->allocatorFlags_ = allocatorFlags;
CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
return u;
}
bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
{
if(!u)
return false;
UMatDataAutoLock lock(u);
if(u->handle == 0)
{
CV_Assert(u->origdata != 0);
Context& ctx = Context::getDefault();
int createFlags = 0, flags0 = 0;
getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
cl_context ctx_handle = (cl_context)ctx.ptr();
cl_int retval = 0;
int tempUMatFlags = UMatData::TEMP_UMAT;
u->handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,
u->size, u->origdata, &retval);
if((!u->handle || retval != CL_SUCCESS) && !(accessFlags & ACCESS_FAST))
{
u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
u->size, u->origdata, &retval);
tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
}
if(!u->handle || retval != CL_SUCCESS)
return false;
u->prevAllocator = u->currAllocator;
u->currAllocator = this;
u->flags |= tempUMatFlags;
}
if(accessFlags & ACCESS_WRITE)
u->markHostCopyObsolete(true);
return true;
}
/*void sync(UMatData* u) const
{
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
UMatDataAutoLock lock(u);
if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
{
if( u->tempCopiedUMat() )
{
clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
u->size, u->origdata, 0, 0, 0);
}
else
{
cl_int retval = 0;
void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
(CL_MAP_READ | CL_MAP_WRITE),
0, u->size, 0, 0, 0, &retval);
clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
clFinish(q);
}
u->markHostCopyObsolete(false);
}
else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
{
clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
u->size, u->data, 0, 0, 0);
}
}*/
void deallocate(UMatData* u) const
{
if(!u)
return;
CV_Assert(u->urefcount >= 0);
CV_Assert(u->refcount >= 0);
// TODO: !!! when we add Shared Virtual Memory Support,
// this function (as well as the others) should be corrected
CV_Assert(u->handle != 0 && u->urefcount == 0);
if(u->tempUMat())
{
// UMatDataAutoLock lock(u);
if( u->hostCopyObsolete() && u->refcount > 0 )
{
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
if( u->tempCopiedUMat() )
{
AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
}
else
{
cl_int retval = 0;
void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
(CL_MAP_READ | CL_MAP_WRITE),
0, u->size, 0, 0, 0, &retval);
CV_OclDbgAssert(retval == CL_SUCCESS);
CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
}
}
u->markHostCopyObsolete(false);
clReleaseMemObject((cl_mem)u->handle);
u->handle = 0;
u->currAllocator = u->prevAllocator;
if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
fastFree(u->data);
u->data = u->origdata;
if(u->refcount == 0)
u->currAllocator->deallocate(u);
}
else
{
CV_Assert(u->refcount == 0);
if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
{
fastFree(u->data);
u->data = 0;
}
if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
{
bufferPool.release((cl_mem)u->handle, u->capacity);
}
else
{
clReleaseMemObject((cl_mem)u->handle);
}
u->handle = 0;
u->capacity = 0;
delete u;
}
}
void map(UMatData* u, int accessFlags) const
{
if(!u)
return;
CV_Assert( u->handle != 0 );
UMatDataAutoLock autolock(u);
if(accessFlags & ACCESS_WRITE)
u->markDeviceCopyObsolete(true);
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
// FIXIT Workaround for UMat synchronization issue
// if( u->refcount == 0 )
{
if( !u->copyOnMap() )
{
if (u->data) // FIXIT Workaround for UMat synchronization issue
{
//CV_Assert(u->hostCopyObsolete() == false);
return;
}
// because there can be other map requests for the same UMat with different access flags,
// we use the universal (read-write) access mode.
cl_int retval = 0;
u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
(CL_MAP_READ | CL_MAP_WRITE),
0, u->size, 0, 0, 0, &retval);
if(u->data && retval == CL_SUCCESS)
{
u->markHostCopyObsolete(false);
return;
}
// if map failed, switch to copy-on-map mode for the particular buffer
u->flags |= UMatData::COPY_ON_MAP;
}
if(!u->data)
{
u->data = (uchar*)fastMalloc(u->size);
u->markHostCopyObsolete(true);
}
}
if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
{
AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
u->markHostCopyObsolete(false);
}
}
void unmap(UMatData* u) const
{
if(!u)
return;
CV_Assert(u->handle != 0);
UMatDataAutoLock autolock(u);
// FIXIT Workaround for UMat synchronization issue
if(u->refcount > 0)
return;
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
cl_int retval = 0;
if( !u->copyOnMap() && u->data )
{
CV_Assert( (retval = clEnqueueUnmapMemObject(q,
(cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
u->data = 0;
}
else if( u->copyOnMap() && u->deviceCopyObsolete() )
{
AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
}
u->markDeviceCopyObsolete(false);
u->markHostCopyObsolete(false);
}
bool checkContinuous(int dims, const size_t sz[],
const size_t srcofs[], const size_t srcstep[],
const size_t dstofs[], const size_t dststep[],
size_t& total, size_t new_sz[],
size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
{
bool iscontinuous = true;
srcrawofs = srcofs ? srcofs[dims-1] : 0;
dstrawofs = dstofs ? dstofs[dims-1] : 0;
total = sz[dims-1];
for( int i = dims-2; i >= 0; i-- )
{
if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
iscontinuous = false;
total *= sz[i];
if( srcofs )
srcrawofs += srcofs[i]*srcstep[i];
if( dstofs )
dstrawofs += dstofs[i]*dststep[i];
}
if( !iscontinuous )
{
// OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
if( dims == 2 )
{
new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
// we assume that new_... arrays are initialized by caller
// with 0's, so there is no else branch
if( srcofs )
{
new_srcofs[0] = srcofs[1];
new_srcofs[1] = srcofs[0];
new_srcofs[2] = 0;
}
if( dstofs )
{
new_dstofs[0] = dstofs[1];
new_dstofs[1] = dstofs[0];
new_dstofs[2] = 0;
}
new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
new_dststep[0] = dststep[0]; new_dststep[1] = 0;
}
else
{
// we could check for dims == 3 here,
// but from user perspective this one is more informative
CV_Assert(dims <= 3);
new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
if( srcofs )
{
new_srcofs[0] = srcofs[2];
new_srcofs[1] = srcofs[1];
new_srcofs[2] = srcofs[0];
}
if( dstofs )
{
new_dstofs[0] = dstofs[2];
new_dstofs[1] = dstofs[1];
new_dstofs[2] = dstofs[0];
}
new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
}
}
return iscontinuous;
}
void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
const size_t srcofs[], const size_t srcstep[],
const size_t dststep[]) const
{
if(!u)
return;
UMatDataAutoLock autolock(u);
if( u->data && !u->hostCopyObsolete() )
{
Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
return;
}
CV_Assert( u->handle != 0 );
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
size_t total = 0, new_sz[] = {0, 0, 0};
size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
total, new_sz,
srcrawofs, new_srcofs, new_srcstep,
dstrawofs, new_dstofs, new_dststep);
AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
if( iscontinuous )
{
CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
}
else
{
CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
}
}
void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
const size_t dstofs[], const size_t dststep[],
const size_t srcstep[]) const
{
if(!u)
return;
// there should be no user-visible CPU copies of the UMat which we are going to copy to
CV_Assert(u->refcount == 0 || u->tempUMat());
size_t total = 0, new_sz[] = {0, 0, 0};
size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
total, new_sz,
srcrawofs, new_srcofs, new_srcstep,
dstrawofs, new_dstofs, new_dststep);
UMatDataAutoLock autolock(u);
// if there is cached CPU copy of the GPU matrix,
// we could use it as a destination.
// we can do it in 2 cases:
// 1. we overwrite the whole content
// 2. we overwrite part of the matrix, but the GPU copy is out-of-date
if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
{
Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
u->markHostCopyObsolete(false);
u->markDeviceCopyObsolete(true);
return;
}
CV_Assert( u->handle != 0 );
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
if( iscontinuous )
{
CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) == CL_SUCCESS );
}
else
{
CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
new_srcstep[0], new_srcstep[1], srcptr, 0, 0, 0) == CL_SUCCESS );
}
u->markHostCopyObsolete(true);
u->markDeviceCopyObsolete(false);
}
void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
const size_t srcofs[], const size_t srcstep[],
const size_t dstofs[], const size_t dststep[], bool _sync) const
{
if(!src || !dst)
return;
size_t total = 0, new_sz[] = {0, 0, 0};
size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
total, new_sz,
srcrawofs, new_srcofs, new_srcstep,
dstrawofs, new_dstofs, new_dststep);
UMatDataAutoLock src_autolock(src);
UMatDataAutoLock dst_autolock(dst);
if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
{
upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
return;
}
if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
{
download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
dst->markHostCopyObsolete(false);
dst->markDeviceCopyObsolete(true);
return;
}
// there should be no user-visible CPU copies of the UMat which we are going to copy to
CV_Assert(dst->refcount == 0);
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
if( iscontinuous )
{
CV_Assert( clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
srcrawofs, dstrawofs, total, 0, 0, 0) == CL_SUCCESS );
}
else
{
cl_int retval;
CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
new_srcofs, new_dstofs, new_sz,
new_srcstep[0], new_srcstep[1],
new_dststep[0], new_dststep[1],
0, 0, 0)) == CL_SUCCESS );
}
dst->markHostCopyObsolete(true);
dst->markDeviceCopyObsolete(false);
if( _sync )
{
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
}
}
BufferPoolController* getBufferPoolController() const { return &bufferPool; }
MatAllocator* matStdAllocator;
};
MatAllocator* getOpenCLAllocator()
{
static MatAllocator * allocator = new OpenCLAllocator();
return allocator;
}
///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
{
cl_uint numDevices = 0;
CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
0, NULL, &numDevices) == CL_SUCCESS);
if (numDevices == 0)
{
devices.clear();
return;
}
devices.resize((size_t)numDevices);
CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
numDevices, &devices[0], &numDevices) == CL_SUCCESS);
}
struct PlatformInfo::Impl
{
Impl(void* id)
{
refcount = 1;
handle = *(cl_platform_id*)id;
getDevices(devices, handle);
}
String getStrProp(cl_device_info prop) const
{
char buf[1024];
size_t sz=0;
return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
sz < sizeof(buf) ? String(buf) : String();
}
IMPLEMENT_REFCOUNTABLE();
std::vector<cl_device_id> devices;
cl_platform_id handle;
};
PlatformInfo::PlatformInfo()
{
p = 0;
}
PlatformInfo::PlatformInfo(void* platform_id)
{
p = new Impl(platform_id);
}
PlatformInfo::~PlatformInfo()
{
if(p)
p->release();
}
PlatformInfo::PlatformInfo(const PlatformInfo& i)
{
if (i.p)
i.p->addref();
p = i.p;
}
PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
{
if (i.p != p)
{
if (i.p)
i.p->addref();
if (p)
p->release();
p = i.p;
}
return *this;
}
int PlatformInfo::deviceNumber() const
{
return p ? (int)p->devices.size() : 0;
}
void PlatformInfo::getDevice(Device& device, int d) const
{
CV_Assert(p && d < (int)p->devices.size() );
if(p)
device.set(p->devices[d]);
}
String PlatformInfo::name() const
{
return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
}
String PlatformInfo::vendor() const
{
return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
}
String PlatformInfo::version() const
{
return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
}
static void getPlatforms(std::vector<cl_platform_id>& platforms)
{
cl_uint numPlatforms = 0;
CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
if (numPlatforms == 0)
{
platforms.clear();
return;
}
platforms.resize((size_t)numPlatforms);
CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
}
void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
{
std::vector<cl_platform_id> platforms;
getPlatforms(platforms);
for (size_t i = 0; i < platforms.size(); i++)
platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
}
const char* typeToStr(int type)
{
static const char* tab[]=
{
"uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
"char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
"ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
"short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
"int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
"float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
"double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
"?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
};
int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
return cn > 16 ? "?" : tab[depth*16 + cn-1];
}
const char* memopTypeToStr(int type)
{
static const char* tab[] =
{
"uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
"char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
"ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
"short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
"int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
"int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
"ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
"?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
};
int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
return cn > 16 ? "?" : tab[depth*16 + cn-1];
}
const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
{
if( sdepth == ddepth )
return "noconvert";
const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
if( ddepth >= CV_32F ||
(ddepth == CV_32S && sdepth < CV_32S) ||
(ddepth == CV_16S && sdepth <= CV_8S) ||
(ddepth == CV_16U && sdepth == CV_8U))
{
sprintf(buf, "convert_%s", typestr);
}
else if( sdepth >= CV_32F )
sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
else
sprintf(buf, "convert_%s_sat", typestr);
return buf;
}
template <typename T>
static std::string kerToStr(const Mat & k)
{
int width = k.cols - 1, depth = k.depth();
const T * const data = reinterpret_cast<const T *>(k.data);
std::ostringstream stream;
stream.precision(10);
if (depth <= CV_8S)
{
for (int i = 0; i < width; ++i)
stream << "DIG(" << (int)data[i] << ")";
stream << "DIG(" << (int)data[width] << ")";
}
else if (depth == CV_32F)
{
stream.setf(std::ios_base::showpoint);
for (int i = 0; i < width; ++i)
stream << "DIG(" << data[i] << "f)";
stream << "DIG(" << data[width] << "f)";
}
else
{
for (int i = 0; i < width; ++i)
stream << "DIG(" << data[i] << ")";
stream << "DIG(" << data[width] << ")";
}
return stream.str();
}
String kernelToStr(InputArray _kernel, int ddepth, const char * name)
{
Mat kernel = _kernel.getMat().reshape(1, 1);
int depth = kernel.depth();
if (ddepth < 0)
ddepth = depth;
if (ddepth != depth)
kernel.convertTo(kernel, ddepth);
typedef std::string (* func_t)(const Mat &);
static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
const func_t func = funcs[ddepth];
CV_Assert(func != 0);
return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
}
#define PROCESS_SRC(src) \
do \
{ \
if (!src.empty()) \
{ \
CV_Assert(src.isMat() || src.isUMat()); \
int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
Size csize = src.size(); \
cols.push_back(ccn * src.size().width); \
if (ctype != type || csize != ssize) \
return 1; \
offsets.push_back(src.offset()); \
steps.push_back(src.step()); \
} \
} \
while ((void)0, 0)
int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
InputArray src4, InputArray src5, InputArray src6,
InputArray src7, InputArray src8, InputArray src9)
{
int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(depth);
Size ssize = src1.size();
const ocl::Device & d = ocl::Device::getDefault();
int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
d.preferredVectorWidthDouble(), -1 }, width = vectorWidths[depth];
if (ssize.width * cn < width || width <= 0)
return 1;
std::vector<size_t> offsets, steps, cols;
PROCESS_SRC(src1);
PROCESS_SRC(src2);
PROCESS_SRC(src3);
PROCESS_SRC(src4);
PROCESS_SRC(src5);
PROCESS_SRC(src6);
PROCESS_SRC(src7);
PROCESS_SRC(src8);
PROCESS_SRC(src9);
size_t size = offsets.size();
int wsz = width * esz;
std::vector<int> dividers(size, wsz);
for (size_t i = 0; i < size; ++i)
while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0)
dividers[i] >>= 1;
// default strategy
for (size_t i = 0; i < size; ++i)
if (dividers[i] != wsz)
{
width = 1;
break;
}
// another strategy
// width = *std::min_element(dividers.begin(), dividers.end());
return width;
}
#undef PROCESS_SRC
// TODO Make this as a method of OpenCL "BuildOptions" class
void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
{
if (!buildOptions.empty())
buildOptions += " ";
int type = _m.type(), depth = CV_MAT_DEPTH(type);
buildOptions += format(
"-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
name.c_str(), ocl::typeToStr(type),
name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
name.c_str(), (int)CV_MAT_CN(type),
name.c_str(), (int)CV_ELEM_SIZE(type),
name.c_str(), (int)CV_ELEM_SIZE1(type),
name.c_str(), (int)depth
);
}
struct Image2D::Impl
{
Impl(const UMat &src, bool norm, bool alias)
{
handle = 0;
refcount = 1;
init(src, norm, alias);
}
~Impl()
{
if (handle)
clReleaseMemObject(handle);
}
static cl_image_format getImageFormat(int depth, int cn, bool norm)
{
cl_image_format format;
static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
CL_SNORM_INT16, -1, -1, -1, -1 };
static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
int channelOrder = channelOrders[cn];
format.image_channel_data_type = (cl_channel_type)channelType;
format.image_channel_order = (cl_channel_order)channelOrder;
return format;
}
static bool isFormatSupported(cl_image_format format)
{
cl_context context = (cl_context)Context::getDefault().ptr();
// Figure out how many formats are supported by this context.
cl_uint numFormats = 0;
cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
CL_MEM_OBJECT_IMAGE2D, numFormats,
NULL, &numFormats);
AutoBuffer<cl_image_format> formats(numFormats);
err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
CL_MEM_OBJECT_IMAGE2D, numFormats,
formats, NULL);
CV_OclDbgAssert(err == CL_SUCCESS);
for (cl_uint i = 0; i < numFormats; ++i)
{
if (!memcmp(&formats[i], &format, sizeof(format)))
{
return true;
}
}
return false;
}
void init(const UMat &src, bool norm, bool alias)
{
CV_Assert(ocl::Device::getDefault().imageSupport());
int err, depth = src.depth(), cn = src.channels();
CV_Assert(cn <= 4);
cl_image_format format = getImageFormat(depth, cn, norm);
if (!isFormatSupported(format))
CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
cl_context context = (cl_context)Context::getDefault().ptr();
cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
#ifdef CL_VERSION_1_2
// this enables backwards portability to
// run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
const Device & d = ocl::Device::getDefault();
int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
CV_Assert(!alias || canCreateAlias(src));
if (1 < major || (1 == major && 2 <= minor))
{
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = src.cols;
desc.image_height = src.rows;
desc.image_depth = 0;
desc.image_array_size = 1;
desc.image_row_pitch = alias ? src.step[0] : 0;
desc.image_slice_pitch = 0;
desc.buffer = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
desc.num_mip_levels = 0;
desc.num_samples = 0;
handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
}
else
#endif
{
CV_SUPPRESS_DEPRECATED_START
CV_Assert(!alias); // This is an OpenCL 1.2 extension
handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
CV_SUPPRESS_DEPRECATED_END
}
CV_OclDbgAssert(err == CL_SUCCESS);
size_t origin[] = { 0, 0, 0 };
size_t region[] = { src.cols, src.rows, 1 };
cl_mem devData;
if (!alias && !src.isContinuous())
{
devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
CV_OclDbgAssert(err == CL_SUCCESS);
const size_t roi[3] = {src.cols * src.elemSize(), src.rows, 1};
CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
}
else
{
devData = (cl_mem)src.handle(ACCESS_READ);
}
CV_Assert(devData != NULL);
if (!alias)
{
CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
if (!src.isContinuous())
{
CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
}
}
}
IMPLEMENT_REFCOUNTABLE();
cl_mem handle;
};
Image2D::Image2D()
{
p = NULL;
}
Image2D::Image2D(const UMat &src, bool norm, bool alias)
{
p = new Impl(src, norm, alias);
}
bool Image2D::canCreateAlias(const UMat &m)
{
bool ret = false;
const Device & d = ocl::Device::getDefault();
if (d.imageFromBufferSupport())
{
// This is the required pitch alignment in pixels
uint pitchAlign = d.imagePitchAlignment();
if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
{
// We don't currently handle the case where the buffer was created
// with CL_MEM_USE_HOST_PTR
if (!m.u->tempUMat())
{
ret = true;
}
}
}
return ret;
}
bool Image2D::isFormatSupported(int depth, int cn, bool norm)
{
cl_image_format format = Impl::getImageFormat(depth, cn, norm);
return Impl::isFormatSupported(format);
}
Image2D::Image2D(const Image2D & i)
{
p = i.p;
if (p)
p->addref();
}
Image2D & Image2D::operator = (const Image2D & i)
{
if (i.p != p)
{
if (i.p)
i.p->addref();
if (p)
p->release();
p = i.p;
}
return *this;
}
Image2D::~Image2D()
{
if (p)
p->release();
}
void* Image2D::ptr() const
{
return p ? p->handle : 0;
}
}}