opencv/modules/ocl/src/initialization.cpp
2013-09-24 13:17:09 +04:00

1091 lines
42 KiB
C++

/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Guoping Long, longguoping@gmail.com
// Niko Li, newlife20080214@gmail.com
// Yao Wang, bitwangyaoyao@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include <iomanip>
#include <fstream>
#include "binarycaching.hpp"
using namespace cv;
using namespace cv::ocl;
using namespace std;
using std::cout;
using std::endl;
//#define PRINT_KERNEL_RUN_TIME
#define RUN_TIMES 100
#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
#define CL_MEM_USE_PERSISTENT_MEM_AMD 0
#endif
//#define AMD_DOUBLE_DIFFER
namespace cv
{
namespace ocl
{
extern void fft_teardown();
extern void clBlasTeardown();
/*
* The binary caching system to eliminate redundant program source compilation.
* Strictly, this is not a cache because we do not implement evictions right now.
* We shall add such features to trade-off memory consumption and performance when necessary.
*/
auto_ptr<ProgramCache> ProgramCache::programCache;
ProgramCache *programCache = NULL;
DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
int gDevMemTypeValueMap[5] = {0,
CL_MEM_ALLOC_HOST_PTR,
CL_MEM_USE_HOST_PTR,
CL_MEM_COPY_HOST_PTR,
CL_MEM_USE_PERSISTENT_MEM_AMD};
int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
ProgramCache::ProgramCache()
{
codeCache.clear();
cacheSize = 0;
}
ProgramCache::~ProgramCache()
{
releaseProgram();
}
cl_program ProgramCache::progLookup(string srcsign)
{
map<string, cl_program>::iterator iter;
iter = codeCache.find(srcsign);
if(iter != codeCache.end())
return iter->second;
else
return NULL;
}
void ProgramCache::addProgram(string srcsign , cl_program program)
{
if(!progLookup(srcsign))
{
codeCache.insert(map<string, cl_program>::value_type(srcsign, program));
}
}
void ProgramCache::releaseProgram()
{
map<string, cl_program>::iterator iter;
for(iter = codeCache.begin(); iter != codeCache.end(); iter++)
{
openCLSafeCall(clReleaseProgram(iter->second));
}
codeCache.clear();
cacheSize = 0;
}
struct Info::Impl
{
cl_platform_id oclplatform;
std::vector<cl_device_id> devices;
std::vector<std::string> devName;
std::string clVersion;
cl_context oclcontext;
cl_command_queue clCmdQueue;
int devnum;
size_t maxWorkGroupSize;
cl_uint maxDimensions; // == maxWorkItemSizes.size()
std::vector<size_t> maxWorkItemSizes;
cl_uint maxComputeUnits;
char extra_options[512];
int double_support;
int unified_memory; //1 means integrated GPU, otherwise this value is 0
int refcounter;
Impl();
void setDevice(void *ctx, void *q, int devnum);
void release()
{
if(1 == CV_XADD(&refcounter, -1))
{
releaseResources();
delete this;
}
}
Impl* copy()
{
CV_XADD(&refcounter, 1);
return this;
}
private:
Impl(const Impl&);
Impl& operator=(const Impl&);
void releaseResources();
};
// global variables to hold binary cache properties
static int enable_disk_cache =
#ifdef _DEBUG
false;
#else
true;
#endif
static int update_disk_cache = false;
static String binpath = "";
Info::Impl::Impl()
:oclplatform(0),
oclcontext(0),
clCmdQueue(0),
devnum(-1),
maxWorkGroupSize(0),
maxDimensions(0),
maxComputeUnits(0),
double_support(0),
unified_memory(0),
refcounter(1)
{
memset(extra_options, 0, 512);
}
void Info::Impl::releaseResources()
{
devnum = -1;
if(clCmdQueue)
{
//temporarily disable command queue release as it causes program hang at exit
//openCLSafeCall(clReleaseCommandQueue(clCmdQueue));
clCmdQueue = 0;
}
if(oclcontext)
{
openCLSafeCall(clReleaseContext(oclcontext));
oclcontext = 0;
}
}
void Info::Impl::setDevice(void *ctx, void *q, int dnum)
{
if((ctx && q) || devnum != dnum)
releaseResources();
CV_Assert(dnum >= 0 && dnum < (int)devices.size());
devnum = dnum;
if(ctx && q)
{
oclcontext = (cl_context)ctx;
clCmdQueue = (cl_command_queue)q;
clRetainContext(oclcontext);
clRetainCommandQueue(clCmdQueue);
}
else
{
cl_int status = 0;
cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(oclplatform), 0 };
oclcontext = clCreateContext(cps, 1, &devices[devnum], 0, 0, &status);
openCLVerifyCall(status);
clCmdQueue = clCreateCommandQueue(oclcontext, devices[devnum], CL_QUEUE_PROFILING_ENABLE, &status);
openCLVerifyCall(status);
}
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, 0));
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void *)&maxDimensions, 0));
maxWorkItemSizes.resize(maxDimensions);
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)&maxWorkItemSizes[0], 0));
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void *)&maxComputeUnits, 0));
cl_bool unfymem = false;
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), (void *)&unfymem, 0));
unified_memory = unfymem ? 1 : 0;
//initialize extra options for compilation. Currently only fp64 is included.
//Assume 4KB is enough to store all possible extensions.
const int EXT_LEN = 4096 + 1 ;
char extends_set[EXT_LEN];
size_t extends_size;
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
extends_set[EXT_LEN - 1] = 0;
size_t fp64_khr = std::string(extends_set).find("cl_khr_fp64");
if(fp64_khr != std::string::npos)
{
sprintf(extra_options, "-D DOUBLE_SUPPORT");
double_support = 1;
}
else
{
memset(extra_options, 0, 512);
double_support = 0;
}
}
////////////////////////Common OpenCL specific calls///////////////
int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
{
rw_type = gDeviceMemRW;
mem_type = gDeviceMemType;
return Context::getContext()->impl->unified_memory;
}
int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
{
if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
mem_type == DEVICE_MEM_UHP ||
mem_type == DEVICE_MEM_CHP )
return -1;
gDeviceMemRW = rw_type;
gDeviceMemType = mem_type;
return 0;
}
int getDevice(std::vector<Info> &oclinfo, int devicetype)
{
//TODO: cache oclinfo vector
oclinfo.clear();
switch(devicetype)
{
case CVCL_DEVICE_TYPE_DEFAULT:
case CVCL_DEVICE_TYPE_CPU:
case CVCL_DEVICE_TYPE_GPU:
case CVCL_DEVICE_TYPE_ACCELERATOR:
case CVCL_DEVICE_TYPE_ALL:
break;
default:
return 0;
}
// Platform info
cl_uint numPlatforms;
openCLSafeCall(clGetPlatformIDs(0, 0, &numPlatforms));
if(numPlatforms < 1) return 0;
std::vector<cl_platform_id> platforms(numPlatforms);
openCLSafeCall(clGetPlatformIDs(numPlatforms, &platforms[0], 0));
char deviceName[256];
int devcienums = 0;
char clVersion[256];
for (unsigned i = 0; i < numPlatforms; ++i)
{
cl_uint numsdev = 0;
cl_int status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev);
if(status != CL_DEVICE_NOT_FOUND)
openCLVerifyCall(status);
if(numsdev > 0)
{
devcienums += numsdev;
std::vector<cl_device_id> devices(numsdev);
openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, &devices[0], 0));
Info ocltmpinfo;
ocltmpinfo.impl->oclplatform = platforms[i];
openCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(clVersion), clVersion, NULL));
ocltmpinfo.impl->clVersion = clVersion;
for(unsigned j = 0; j < numsdev; ++j)
{
ocltmpinfo.impl->devices.push_back(devices[j]);
openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, 0));
ocltmpinfo.impl->devName.push_back(deviceName);
ocltmpinfo.DeviceName.push_back(deviceName);
}
oclinfo.push_back(ocltmpinfo);
}
}
if(devcienums > 0)
{
setDevice(oclinfo[0]);
}
return devcienums;
}
void setDevice(Info &oclinfo, int devnum)
{
oclinfo.impl->setDevice(0, 0, devnum);
Context::setContext(oclinfo);
}
void setDeviceEx(Info &oclinfo, void *ctx, void *q, int devnum)
{
oclinfo.impl->setDevice(ctx, q, devnum);
Context::setContext(oclinfo);
}
void *getoclContext()
{
return &(Context::getContext()->impl->oclcontext);
}
void *getoclCommandQueue()
{
return &(Context::getContext()->impl->clCmdQueue);
}
void finish()
{
clFinish(Context::getContext()->impl->clCmdQueue);
}
//template specializations of queryDeviceInfo
template<>
bool queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel)
{
Info::Impl* impl = Context::getContext()->impl;
cl_device_type devicetype;
openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum],
CL_DEVICE_TYPE, sizeof(cl_device_type),
&devicetype, NULL));
return (devicetype == CVCL_DEVICE_TYPE_CPU);
}
template<typename _ty>
static _ty queryWavesize(cl_kernel kernel)
{
size_t info = 0;
Info::Impl* impl = Context::getContext()->impl;
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if(is_cpu)
{
return 1;
}
CV_Assert(kernel != NULL);
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &info, NULL));
return static_cast<_ty>(info);
}
template<>
size_t queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel)
{
return queryWavesize<size_t>(kernel);
}
template<>
int queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel)
{
return queryWavesize<int>(kernel);
}
void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
{
cl_int status;
status = clEnqueueReadBuffer(clCxt->impl->clCmdQueue, dst_buffer, CL_TRUE, 0,
size, host_buffer, 0, NULL, NULL);
openCLVerifyCall(status);
}
cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size)
{
cl_int status;
cl_mem buffer = clCreateBuffer(clCxt->impl->oclcontext, (cl_mem_flags)flag, size, NULL, &status);
openCLVerifyCall(status);
return buffer;
}
void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height)
{
openCLMallocPitchEx(clCxt, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
}
void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
{
cl_int status;
*dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
widthInBytes * height, 0, &status);
openCLVerifyCall(status);
*pitch = widthInBytes;
}
void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height, openCLMemcpyKind kind, int channels)
{
size_t buffer_origin[3] = {0, 0, 0};
size_t host_origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
if(kind == clMemcpyHostToDevice)
{
if(dpitch == width || channels == 3 || height == 1)
{
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
0, width * height, src, 0, NULL, NULL));
}
else
{
openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
}
}
else if(kind == clMemcpyDeviceToHost)
{
if(spitch == width || channels == 3 || height == 1)
{
openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
0, width * height, dst, 0, NULL, NULL));
}
else
{
openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
}
}
}
void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
const void *src, size_t spitch,
size_t width, size_t height, int src_offset)
{
size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
size_t region[3] = {width, height, 1};
openCLSafeCall(clEnqueueCopyBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
region, spitch, 0, dpitch, 0, 0, 0, 0));
}
void openCLFree(void *devPtr)
{
openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
}
cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName)
{
return openCLGetKernelFromSource(clCxt, source, kernelName, NULL);
}
void setBinaryDiskCache(int mode, String path)
{
if(mode == CACHE_NONE)
{
update_disk_cache = 0;
enable_disk_cache = 0;
return;
}
update_disk_cache |= (mode & CACHE_UPDATE) == CACHE_UPDATE;
enable_disk_cache |=
#ifdef _DEBUG
(mode & CACHE_DEBUG) == CACHE_DEBUG;
#else
(mode & CACHE_RELEASE) == CACHE_RELEASE;
#endif
if(enable_disk_cache && !path.empty())
{
binpath = path;
}
}
void setBinpath(const char *path)
{
binpath = path;
}
int savetofile(const Context*, cl_program &program, const char *fileName)
{
size_t binarySize;
openCLSafeCall(clGetProgramInfo(program,
CL_PROGRAM_BINARY_SIZES,
sizeof(size_t),
&binarySize, NULL));
char* binary = (char*)malloc(binarySize);
if(binary == NULL)
{
CV_Error(CV_StsNoMem, "Failed to allocate host memory.");
}
openCLSafeCall(clGetProgramInfo(program,
CL_PROGRAM_BINARIES,
sizeof(char *),
&binary,
NULL));
FILE *fp = fopen(fileName, "wb+");
if(fp != NULL)
{
fwrite(binary, binarySize, 1, fp);
free(binary);
fclose(fp);
}
return 1;
}
cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName,
const char *build_options)
{
cl_kernel kernel;
cl_program program ;
cl_int status = 0;
stringstream src_sign;
string srcsign;
string filename;
CV_Assert(programCache != NULL);
if(NULL != build_options)
{
src_sign << (int64)(*source) << clCxt->impl->oclcontext << "_" << build_options;
}
else
{
src_sign << (int64)(*source) << clCxt->impl->oclcontext;
}
srcsign = src_sign.str();
program = NULL;
program = programCache->progLookup(srcsign);
if(!program)
{
//config build programs
char all_build_options[1024];
memset(all_build_options, 0, 1024);
char zeromem[512] = {0};
if(0 != memcmp(clCxt -> impl->extra_options, zeromem, 512))
strcat(all_build_options, clCxt -> impl->extra_options);
strcat(all_build_options, " ");
if(build_options != NULL)
strcat(all_build_options, build_options);
if(all_build_options != NULL)
{
filename = binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + all_build_options + ".clb";
}
else
{
filename = binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb";
}
FILE *fp = enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL;
if(fp == NULL || update_disk_cache)
{
if(fp != NULL)
fclose(fp);
program = clCreateProgramWithSource(
clCxt->impl->oclcontext, 1, source, NULL, &status);
openCLVerifyCall(status);
status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
if(status == CL_SUCCESS && enable_disk_cache)
savetofile(clCxt, program, filename.c_str());
}
else
{
fseek(fp, 0, SEEK_END);
size_t binarySize = ftell(fp);
fseek(fp, 0, SEEK_SET);
char *binary = new char[binarySize];
CV_Assert(1 == fread(binary, binarySize, 1, fp));
fclose(fp);
cl_int status = 0;
program = clCreateProgramWithBinary(clCxt->impl->oclcontext,
1,
&(clCxt->impl->devices[clCxt->impl->devnum]),
(const size_t *)&binarySize,
(const unsigned char **)&binary,
NULL,
&status);
openCLVerifyCall(status);
status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
delete[] binary;
}
if(status != CL_SUCCESS)
{
if(status == CL_BUILD_PROGRAM_FAILURE)
{
cl_int logStatus;
char *buildLog = NULL;
size_t buildLogSize = 0;
logStatus = clGetProgramBuildInfo(program,
clCxt->impl->devices[clCxt->impl->devnum], CL_PROGRAM_BUILD_LOG, buildLogSize,
buildLog, &buildLogSize);
if(logStatus != CL_SUCCESS)
cout << "Failed to build the program and get the build info." << endl;
buildLog = new char[buildLogSize];
CV_DbgAssert(!!buildLog);
memset(buildLog, 0, buildLogSize);
openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices[clCxt->impl->devnum],
CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
cout << "\n\t\t\tBUILD LOG\n";
cout << buildLog << endl;
delete [] buildLog;
}
openCLVerifyCall(status);
}
//Cache the binary for future use if build_options is null
if( (programCache->cacheSize += 1) < programCache->MAX_PROG_CACHE_SIZE)
programCache->addProgram(srcsign, program);
else
cout << "Warning: code cache has been full.\n";
}
kernel = clCreateKernel(program, kernelName.c_str(), &status);
openCLVerifyCall(status);
return kernel;
}
void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads)
{
size_t kernelWorkGroupSize;
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[clCxt->impl->devnum],
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
CV_Assert( localThreads[0] <= clCxt->impl->maxWorkItemSizes[0] );
CV_Assert( localThreads[1] <= clCxt->impl->maxWorkItemSizes[1] );
CV_Assert( localThreads[2] <= clCxt->impl->maxWorkItemSizes[2] );
CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
}
static inline size_t roundUp(size_t sz, size_t n)
{
// we don't assume that n is a power of 2 (see alignSize)
// equal to divUp(sz, n) * n
size_t t = sz + n - 1;
size_t rem = t % n;
size_t result = t - rem;
return result;
}
#ifdef PRINT_KERNEL_RUN_TIME
static double total_execute_time = 0;
static double total_kernel_time = 0;
#endif
void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
int depth, const char *build_options)
{
//construct kernel name
//The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
//for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
stringstream idxStr;
if(channels != -1)
idxStr << "_C" << channels;
if(depth != -1)
idxStr << "_D" << depth;
kernelName += idxStr.str();
cl_kernel kernel;
kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
if ( localThreads != NULL)
{
globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
}
for(size_t i = 0; i < args.size(); i ++)
openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
#ifndef PRINT_KERNEL_RUN_TIME
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
localThreads, 0, NULL, NULL));
#else
cl_event event = NULL;
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
localThreads, 0, NULL, &event));
cl_ulong start_time, end_time, queue_time;
double execute_time = 0;
double total_time = 0;
openCLSafeCall(clWaitForEvents(1, &event));
openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &start_time, 0));
openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end_time, 0));
openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
sizeof(cl_ulong), &queue_time, 0));
execute_time = (double)(end_time - start_time) / (1000 * 1000);
total_time = (double)(end_time - queue_time) / (1000 * 1000);
total_execute_time += execute_time;
total_kernel_time += total_time;
clReleaseEvent(event);
#endif
clFlush(clCxt->impl->clCmdQueue);
openCLSafeCall(clReleaseKernel(kernel));
}
void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth)
{
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args,
channels, depth, NULL);
}
void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
{
#ifndef PRINT_KERNEL_RUN_TIME
openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
build_options);
#else
string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
cout << endl;
cout << "Function Name: " << kernelName;
if(depth >= 0)
cout << " |data type: " << data_type[depth];
cout << " |channels: " << channels;
cout << " |Time Unit: " << "ms" << endl;
total_execute_time = 0;
total_kernel_time = 0;
cout << "-------------------------------------" << endl;
cout << setiosflags(ios::left) << setw(15) << "excute time";
cout << setiosflags(ios::left) << setw(15) << "lauch time";
cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
int i = 0;
for(i = 0; i < RUN_TIMES; i++)
openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
build_options);
cout << "average kernel excute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
cout << "average kernel total time: " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
#endif
}
double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
bool finish, bool measureKernelTime, bool cleanUp)
{
//construct kernel name
//The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
//for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
stringstream idxStr;
if(channels != -1)
idxStr << "_C" << channels;
if(depth != -1)
idxStr << "_D" << depth;
kernelName += idxStr.str();
cl_kernel kernel;
kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
double kernelTime = 0.0;
if( globalThreads != NULL)
{
if ( localThreads != NULL)
{
globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
//size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
}
for(size_t i = 0; i < args.size(); i ++)
openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
if(measureKernelTime == false)
{
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
localThreads, 0, NULL, NULL));
}
else
{
cl_event event = NULL;
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
localThreads, 0, NULL, &event));
cl_ulong end_time, queue_time;
openCLSafeCall(clWaitForEvents(1, &event));
openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end_time, 0));
openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
sizeof(cl_ulong), &queue_time, 0));
kernelTime = (double)(end_time - queue_time) / (1000 * 1000);
clReleaseEvent(event);
}
}
if(finish)
{
clFinish(clCxt->impl->clCmdQueue);
}
if(cleanUp)
{
openCLSafeCall(clReleaseKernel(kernel));
}
return kernelTime;
}
// Converts the contents of a file into a string
static int convertToString(const char *filename, std::string& s)
{
size_t size;
char* str;
std::fstream f(filename, (std::fstream::in | std::fstream::binary));
if(f.is_open())
{
size_t fileSize;
f.seekg(0, std::fstream::end);
size = fileSize = (size_t)f.tellg();
f.seekg(0, std::fstream::beg);
str = new char[size+1];
if(!str)
{
f.close();
return -1;
}
f.read(str, fileSize);
f.close();
str[size] = '\0';
s = str;
delete[] str;
return 0;
}
printf("Error: Failed to open file %s\n", filename);
return -1;
}
double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
bool finish, bool measureKernelTime, bool cleanUp)
{
std::vector<std::string> fsource;
for (int i = 0 ; i < numFiles ; i++)
{
std::string str;
if (convertToString(fileName[i], str) >= 0)
fsource.push_back(str);
}
const char **source = new const char *[numFiles];
for (int i = 0 ; i < numFiles ; i++)
source[i] = fsource[i].c_str();
double kernelTime = openCLExecuteKernelInterop(clCxt ,source, kernelName, globalThreads, localThreads,
args, channels, depth, build_options, finish, measureKernelTime, cleanUp);
fsource.clear();
delete []source;
return kernelTime;
}
cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
const size_t size)
{
int status;
cl_mem con_struct;
con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
openCLSafeCall(status);
openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
value, 0, 0, 0));
return con_struct;
}
/////////////////////////////OpenCL initialization/////////////////
auto_ptr<Context> Context::clCxt;
int Context::val = 0;
static Mutex cs;
static volatile int context_tear_down = 0;
bool initialized()
{
return *((volatile int*)&Context::val) != 0 &&
Context::clCxt->impl->clCmdQueue != NULL&&
Context::clCxt->impl->oclcontext != NULL;
}
Context* Context::getContext()
{
if(*((volatile int*)&val) != 1)
{
AutoLock al(cs);
if(*((volatile int*)&val) != 1)
{
if (context_tear_down)
return clCxt.get();
if( 0 == clCxt.get())
clCxt.reset(new Context);
std::vector<Info> oclinfo;
CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
*((volatile int*)&val) = 1;
}
}
return clCxt.get();
}
void Context::setContext(Info &oclinfo)
{
AutoLock guard(cs);
if(*((volatile int*)&val) != 1)
{
if( 0 == clCxt.get())
clCxt.reset(new Context);
clCxt.get()->impl = oclinfo.impl->copy();
*((volatile int*)&val) = 1;
}
else
{
clCxt.get()->impl->release();
clCxt.get()->impl = oclinfo.impl->copy();
}
}
Context::Context()
{
impl = 0;
programCache = ProgramCache::getProgramCache();
}
Context::~Context()
{
release();
}
void Context::release()
{
if (impl)
impl->release();
programCache->releaseProgram();
}
bool Context::supportsFeature(int ftype) const
{
switch(ftype)
{
case CL_DOUBLE:
return impl->double_support == 1;
case CL_UNIFIED_MEM:
return impl->unified_memory == 1;
case CL_VER_1_2:
return impl->clVersion.find("OpenCL 1.2") != string::npos;
default:
return false;
}
}
size_t Context::computeUnits() const
{
return impl->maxComputeUnits;
}
unsigned long queryLocalMemInfo()
{
Info::Impl* impl = Context::getContext()->impl;
cl_ulong local_memory_size = 0;
clGetDeviceInfo(impl->devices[impl->devnum], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void*)&local_memory_size, 0);
return local_memory_size;
}
void* Context::oclContext()
{
return impl->oclcontext;
}
void* Context::oclCommandQueue()
{
return impl->clCmdQueue;
}
Info::Info()
{
impl = new Impl;
}
void Info::release()
{
fft_teardown();
clBlasTeardown();
impl->release();
impl = new Impl;
DeviceName.clear();
}
Info::~Info()
{
fft_teardown();
clBlasTeardown();
impl->release();
}
Info &Info::operator = (const Info &m)
{
impl->release();
impl = m.impl->copy();
DeviceName = m.DeviceName;
return *this;
}
Info::Info(const Info &m)
{
impl = m.impl->copy();
DeviceName = m.DeviceName;
}
}//namespace ocl
}//namespace cv