Merge remote-tracking branch 'upstream/3.4' into merge-3.4

This commit is contained in:
Alexander Alekhin 2021-09-11 16:32:13 +00:00
commit c3ac834526
35 changed files with 1014 additions and 444 deletions

View File

@ -141,8 +141,8 @@ if(INF_ENGINE_TARGET)
endif() endif()
endif() endif()
if(NOT INF_ENGINE_RELEASE AND NOT INF_ENGINE_RELEASE_INIT) if(NOT INF_ENGINE_RELEASE AND NOT INF_ENGINE_RELEASE_INIT)
message(WARNING "InferenceEngine version has not been set, 2021.4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.") message(WARNING "InferenceEngine version has not been set, 2021.4.1 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
set(INF_ENGINE_RELEASE_INIT "2021040000") set(INF_ENGINE_RELEASE_INIT "2021040100")
elseif(DEFINED INF_ENGINE_RELEASE) elseif(DEFINED INF_ENGINE_RELEASE)
set(INF_ENGINE_RELEASE_INIT "${INF_ENGINE_RELEASE}") set(INF_ENGINE_RELEASE_INIT "${INF_ENGINE_RELEASE}")
endif() endif()

View File

@ -98,7 +98,7 @@ import numpy as np
import cv2 as cv import cv2 as cv
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
img = cv.imread('simple.jpg',0) img = cv.imread('blox.jpg',0) # `<opencv_root>/samples/data/blox.jpg`
# Initiate FAST object with default values # Initiate FAST object with default values
fast = cv.FastFeatureDetector_create() fast = cv.FastFeatureDetector_create()
@ -113,17 +113,17 @@ print( "nonmaxSuppression:{}".format(fast.getNonmaxSuppression()) )
print( "neighborhood: {}".format(fast.getType()) ) print( "neighborhood: {}".format(fast.getType()) )
print( "Total Keypoints with nonmaxSuppression: {}".format(len(kp)) ) print( "Total Keypoints with nonmaxSuppression: {}".format(len(kp)) )
cv.imwrite('fast_true.png',img2) cv.imwrite('fast_true.png', img2)
# Disable nonmaxSuppression # Disable nonmaxSuppression
fast.setNonmaxSuppression(0) fast.setNonmaxSuppression(0)
kp = fast.detect(img,None) kp = fast.detect(img, None)
print( "Total Keypoints without nonmaxSuppression: {}".format(len(kp)) ) print( "Total Keypoints without nonmaxSuppression: {}".format(len(kp)) )
img3 = cv.drawKeypoints(img, kp, None, color=(255,0,0)) img3 = cv.drawKeypoints(img, kp, None, color=(255,0,0))
cv.imwrite('fast_false.png',img3) cv.imwrite('fast_false.png', img3)
@endcode @endcode
See the results. First image shows FAST with nonmaxSuppression and second one without See the results. First image shows FAST with nonmaxSuppression and second one without
nonmaxSuppression: nonmaxSuppression:

View File

@ -499,8 +499,8 @@ public:
template<typename... _Tps> inline template<typename... _Tps> inline
Kernel& args(const _Tps&... kernel_args) { set_args_(0, kernel_args...); return *this; } Kernel& args(const _Tps&... kernel_args) { set_args_(0, kernel_args...); return *this; }
/** @brief Run the OpenCL kernel (globalsize value may be adjusted)
/** @brief Run the OpenCL kernel.
@param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3. @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
@param globalsize work items for each dimension. It is not the final globalsize passed to @param globalsize work items for each dimension. It is not the final globalsize passed to
OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
@ -509,12 +509,26 @@ public:
@param localsize work-group size for each dimension. @param localsize work-group size for each dimension.
@param sync specify whether to wait for OpenCL computation to finish before return. @param sync specify whether to wait for OpenCL computation to finish before return.
@param q command queue @param q command queue
@note Use run_() if your kernel code doesn't support adjusted globalsize.
*/ */
bool run(int dims, size_t globalsize[], bool run(int dims, size_t globalsize[],
size_t localsize[], bool sync, const Queue& q=Queue()); size_t localsize[], bool sync, const Queue& q=Queue());
/** @brief Run the OpenCL kernel
*
* @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
* @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
* @param localsize work-group size for each dimension.
* @param sync specify whether to wait for OpenCL computation to finish before return.
* @param q command queue
*/
bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
bool runTask(bool sync, const Queue& q=Queue()); bool runTask(bool sync, const Queue& q=Queue());
/** @brief Similar to synchronized run() call with returning of kernel execution time /** @brief Similar to synchronized run_() call with returning of kernel execution time
*
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE) * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
* @return Execution time in nanoseconds or negative number on error * @return Execution time in nanoseconds or negative number on error
*/ */

View File

@ -24,11 +24,6 @@
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
#include <sstream>
#include "opencl_kernels_core.hpp"
#include "opencv2/core/opencl/runtime/opencl_clblas.hpp"
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
namespace cv namespace cv
{ {
@ -37,52 +32,75 @@ static bool intel_gpu_gemm(
UMat B, Size sizeB, UMat B, Size sizeB,
UMat D, Size sizeD, UMat D, Size sizeD,
double alpha, double beta, double alpha, double beta,
bool atrans, bool btrans) bool atrans, bool btrans,
bool& isPropagatedC2D
)
{ {
CV_UNUSED(sizeB); CV_UNUSED(sizeB);
int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width); int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width);
std::string kernelName; if (M < 4 || N < 4 || K < 4) // vload4
bool ret = true; return false;
size_t lx = 8, ly = 4; CV_LOG_VERBOSE(NULL, 0, "M=" << M << " N=" << N << " K=" << K);
size_t dx = 4, dy = 8;
std::string kernelName;
unsigned int lx = 8, ly = 4;
unsigned int dx = 4, dy = 8;
if(!atrans && !btrans) if(!atrans && !btrans)
{ {
if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0) if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
{ {
kernelName = "intelblas_gemm_buffer_NN_sp"; kernelName = "intelblas_gemm_buffer_NN_sp";
} }
else else
{ {
if (M % 2 != 0)
return false;
// vload4(0, dst_write0) - 4 cols
// multiply by lx: 8
if (N % (4*8) != 0)
return false;
kernelName = "intelblas_gemm_buffer_NN"; kernelName = "intelblas_gemm_buffer_NN";
} }
} }
else if(atrans && !btrans) else if(atrans && !btrans)
{ {
if (M % 32 != 0)
return false;
if (N % 32 != 0)
return false;
kernelName = "intelblas_gemm_buffer_TN"; kernelName = "intelblas_gemm_buffer_TN";
} }
else if(!atrans && btrans) else if(!atrans && btrans)
{ {
if (K % 4 != 0)
return false;
kernelName = "intelblas_gemm_buffer_NT"; kernelName = "intelblas_gemm_buffer_NT";
ly = 16; ly = 16;
dx = 1; dx = 1;
} }
else else
{ {
if (M % 32 != 0)
return false;
if (N % 32 != 0)
return false;
if (K % 16 != 0)
return false;
kernelName = "intelblas_gemm_buffer_TT"; kernelName = "intelblas_gemm_buffer_TT";
} }
const size_t gx = (size_t)(N + dx - 1) / dx; CV_LOG_DEBUG(NULL, "kernel: " << kernelName << " (M=" << M << " N=" << N << " K=" << K << ")");
const size_t gy = (size_t)(M + dy - 1) / dy;
const size_t gx = divUp((size_t)N, dx);
const size_t gy = divUp((size_t)M, dy);
size_t local[] = {lx, ly, 1}; size_t local[] = {lx, ly, 1};
size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1}; size_t global[] = {roundUp(gx, lx), roundUp(gy, ly), 1};
int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
ocl::Queue q; ocl::Queue q;
String errmsg; String errmsg;
@ -110,10 +128,13 @@ static bool intel_gpu_gemm(
(int)(D.step / sizeof(float)) (int)(D.step / sizeof(float))
); );
ret = k.run(2, global, local, false, q); bool ret = k.run(2, global, local, false, q);
return ret;
} }
else else
{ {
int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
for(int start_index = 0; start_index < K; start_index += stride) for(int start_index = 0; start_index < K; start_index += stride)
{ {
ocl::Kernel k(kernelName.c_str(), program); ocl::Kernel k(kernelName.c_str(), program);
@ -132,12 +153,16 @@ static bool intel_gpu_gemm(
(int) start_index, // 14 start_index (int) start_index, // 14 start_index
stride); stride);
ret = k.run(2, global, local, false, q); bool ret = k.run(2, global, local, false, q);
if (!ret) return ret; if (!ret)
{
if (start_index != 0)
isPropagatedC2D = false; // D array content is changed, need to rewrite
return false;
}
} }
return true;
} }
return ret;
} }
} // namespace cv } // namespace cv

View File

@ -42,6 +42,8 @@
//M*/ //M*/
#include "precomp.hpp" #include "precomp.hpp"
#include <opencv2/core/utils/logger.hpp>
#include "opencl_kernels_core.hpp" #include "opencl_kernels_core.hpp"
#include "opencv2/core/opencl/runtime/opencl_clblas.hpp" #include "opencv2/core/opencl/runtime/opencl_clblas.hpp"
#include "opencv2/core/opencl/runtime/opencl_core.hpp" #include "opencv2/core/opencl/runtime/opencl_core.hpp"
@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
InputArray matC, double beta, OutputArray matD, int flags ) InputArray matC, double beta, OutputArray matD, int flags )
{ {
int depth = matA.depth(), cn = matA.channels(); int type = matA.type();
int type = CV_MAKETYPE(depth, cn); int depth = CV_MAT_DEPTH(type);
int cn = CV_MAT_CN(type);
CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); CV_CheckTypeEQ(type, matB.type(), "");
CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, "");
const ocl::Device & dev = ocl::Device::getDefault(); const ocl::Device & dev = ocl::Device::getDefault();
bool doubleSupport = dev.doubleFPConfig() > 0; bool doubleSupport = dev.doubleFPConfig() > 0;
@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0); Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
CV_Assert( !haveC || matC.type() == type ); if (haveC)
CV_CheckTypeEQ(type, matC.type(), "");
Size sizeD(((btrans) ? sizeB.height : sizeB.width),
((atrans) ? sizeA.width : sizeA.height));
if (atrans)
sizeA = Size(sizeA.height, sizeA.width);
if (btrans)
sizeB = Size(sizeB.height, sizeB.width);
if (haveC && ctrans)
sizeC = Size(sizeC.height, sizeC.width);
CV_CheckEQ(sizeA.width, sizeB.height, "");
if (haveC)
CV_CheckEQ(sizeC, sizeD, "");
UMat A = matA.getUMat();
UMat B = matB.getUMat();
Size sizeD(((btrans)? sizeB.height : sizeB.width),
((atrans)? sizeA.width : sizeA.height));
matD.create(sizeD, type); matD.create(sizeD, type);
UMat D = matD.getUMat();
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); bool isPropagatedC2D = false; // D content is updated with C / C.t()
if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1)
if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
{
String opts;
if (atrans)
sizeA = Size(sizeA.height, sizeA.width);
if (btrans)
sizeB = Size(sizeB.height, sizeB.width);
if (haveC && ctrans)
sizeC = Size(sizeC.height, sizeC.width);
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
int max_wg_size = (int)dev.maxWorkGroupSize();
int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
if (atrans)
A = A.t();
if (btrans)
B = B.t();
if (haveC)
ctrans ? transpose(matC, D) : matC.copyTo(D);
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
cn, kercn, block_size,
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
haveC ? " -D HAVE_C" : "",
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
if (k.empty())
return false;
if (depth == CV_64F)
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, alpha, beta);
else
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, (float)alpha, (float)beta);
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
}
else
{ {
if (haveC && beta != 0.0) if (haveC && beta != 0.0)
{ {
ctrans ? transpose(matC, D) : matC.copyTo(D); ctrans ? transpose(matC, D) : matC.copyTo(D);
isPropagatedC2D = true;
} }
else else
{ {
beta = 0.0; beta = 0.0;
} }
return intel_gpu_gemm(A, sizeA, bool res = intel_gpu_gemm(A, matA.size(),
B, sizeB, B, matB.size(),
D, sizeD, D, sizeD,
alpha, alpha,
beta, beta,
atrans, btrans); atrans, btrans,
isPropagatedC2D);
if (res)
return true;
// fallback on generic OpenCL code
} }
if (sizeD.width < 8 || sizeD.height < 8)
return false;
String opts;
int wg_size = (int)dev.maxWorkGroupSize();
int sizeDmin = std::min(sizeD.width, sizeD.height);
wg_size = std::min(wg_size, sizeDmin * sizeDmin);
int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
if (atrans)
A = A.t();
if (btrans)
B = B.t();
if (haveC && !isPropagatedC2D)
ctrans ? transpose(matC, D) : matC.copyTo(D);
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
cn, kercn, block_size,
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
haveC ? " -D HAVE_C" : "",
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
if (k.empty())
return false;
if (depth == CV_64F)
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, alpha, beta);
else
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, (float)alpha, (float)beta);
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
return k.run(2, globalsize, block_size !=1 ? localsize : NULL, false);
} }
#endif #endif

View File

@ -76,8 +76,11 @@
#undef CV__ALLOCATOR_STATS_LOG #undef CV__ALLOCATOR_STATS_LOG
#define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
#define CV_OPENCL_SHOW_BUILD_OPTIONS 0
#define CV_OPENCL_SHOW_BUILD_KERNELS 0
#define CV_OPENCL_SHOW_RUN_KERNELS 0 #define CV_OPENCL_SHOW_RUN_KERNELS 0
#define CV_OPENCL_SYNC_RUN_KERNELS 0
#define CV_OPENCL_TRACE_CHECK 0 #define CV_OPENCL_TRACE_CHECK 0
#define CV_OPENCL_VALIDATE_BINARY_PROGRAMS 1 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS 1
@ -2155,20 +2158,22 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
platforms.resize(numPlatforms); platforms.resize(numPlatforms);
} }
int selectedPlatform = -1;
if (platform.length() > 0) if (platform.length() > 0)
{ {
for (size_t i = 0; i < platforms.size(); i++) for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end();)
{ {
std::string name; std::string name;
CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name)); CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, *currentPlatform, CL_PLATFORM_NAME, name));
if (name.find(platform) != std::string::npos) if (name.find(platform) != std::string::npos)
{ {
selectedPlatform = (int)i; ++currentPlatform;
break; }
else
{
currentPlatform = platforms.erase(currentPlatform);
} }
} }
if (selectedPlatform == -1) if (platforms.size() == 0)
{ {
CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform); CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
goto not_found; goto not_found;
@ -2205,13 +2210,11 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
goto not_found; goto not_found;
} }
std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup std::vector<cl_device_id> devices;
for (int i = selectedPlatform >= 0 ? selectedPlatform : 0; for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end(); ++currentPlatform)
(selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
i++)
{ {
cl_uint count = 0; cl_uint count = 0;
cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count); cl_int status = clGetDeviceIDs(*currentPlatform, deviceType, 0, NULL, &count);
if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND)) if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
{ {
CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count"); CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
@ -2220,7 +2223,7 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
continue; continue;
size_t base = devices.size(); size_t base = devices.size();
devices.resize(base + count); devices.resize(base + count);
status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count); status = clGetDeviceIDs(*currentPlatform, deviceType, count, &devices[base], &count);
if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND)) if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
{ {
CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs"); CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
@ -3679,6 +3682,8 @@ static cv::String dumpValue(size_t sz, const void* p)
{ {
if (!p) if (!p)
return "NULL"; return "NULL";
if (sz == 2)
return cv::format("%d / %uu / 0x%04x", *(short*)p, *(unsigned short*)p, *(short*)p);
if (sz == 4) if (sz == 4)
return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p); return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
if (sz == 8) if (sz == 8)
@ -3851,6 +3856,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
} }
bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
bool sync, const Queue& q)
{
CV_Assert(p);
return p->run(dims, _globalsize, _localsize, sync, NULL, q);
}
static bool isRaiseErrorOnReuseAsyncKernel() static bool isRaiseErrorOnReuseAsyncKernel()
{ {
static bool initialized = false; static bool initialized = false;
@ -3891,6 +3904,10 @@ bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
return false; // OpenCV 5.0: raise error return false; // OpenCV 5.0: raise error
} }
#if CV_OPENCL_SYNC_RUN_KERNELS
sync = true;
#endif
cl_command_queue qq = getQueue(q); cl_command_queue qq = getQueue(q);
if (haveTempDstUMats) if (haveTempDstUMats)
sync = true; sync = true;
@ -4338,7 +4355,28 @@ struct Program::Impl
if (!param_buildExtraOptions.empty()) if (!param_buildExtraOptions.empty())
buildflags = joinBuildOptions(buildflags, param_buildExtraOptions); buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
} }
#if CV_OPENCL_SHOW_BUILD_OPTIONS
CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' options:" << buildflags);
#endif
compile(ctx, src_, errmsg); compile(ctx, src_, errmsg);
#if CV_OPENCL_SHOW_BUILD_KERNELS
if (handle)
{
size_t retsz = 0;
char kernels_buffer[4096] = {0};
cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
CV_OCL_DBG_CHECK_RESULT(result, cv::format("clGetProgramInfo(CL_PROGRAM_KERNEL_NAMES: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
if (result == CL_SUCCESS && retsz < sizeof(kernels_buffer))
{
kernels_buffer[retsz] = 0;
CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' kernels: '" << kernels_buffer << "'");
}
else
{
CV_LOG_ERROR(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' can't retrieve kernel names!");
}
}
#endif
} }
bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg) bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
@ -4570,7 +4608,6 @@ struct Program::Impl
CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'"); CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
} }
#endif #endif
} }
return handle != NULL; return handle != NULL;
} }

View File

@ -392,6 +392,15 @@ __kernel void intelblas_gemm_buffer_NN(
#define TILE_N 8 #define TILE_N 8
#define SLM_BLOCK 512 #define SLM_BLOCK 512
/*
A K B.t() K D N
----------- ----------- -----------
| | | | | |
M | | x N | | => M | |
| | | | | |
----------- ----------- -----------
*/
__attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1))) __attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1)))
__kernel void intelblas_gemm_buffer_NT( __kernel void intelblas_gemm_buffer_NT(
const __global float *src0, int off0, const __global float *src0, int off0,
@ -422,59 +431,79 @@ __kernel void intelblas_gemm_buffer_NT(
float8 dot06 = 0.f; float8 dot06 = 0.f;
float8 dot07 = 0.f; float8 dot07 = 0.f;
float4 brow0; const int dst_row = (global_y * TILE_M);
float4 brow1; __global float *dst_write0 = dst + global_x + dst_row * ldC + offd;
float4 brow2;
float4 brow3;
float4 brow4;
float4 brow5;
float4 brow6;
float4 brow7;
__global float *dst_write0 = dst + local_x * VEC_SIZE + ( group_x * TILE_N ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * ldC + offd; const __global float *src0_read00 = src0 + off0;
const int a_row_base = global_y * TILE_M;
const int a_col_base = local_x * (TILE_K / 8); // <= TILE_K - 4
const __global float *src0_read = src0 + local_x * ( TILE_K / 8 ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M ) * ldA + off0; const __global float *src1_read00 = src1 + off1;
const int b_row_base = (group_x * TILE_N);
const __global float *src1_read0 = src1 + ( group_x * TILE_N ) * ldB + off1; //const int b_col_base = 0;
__local float slm_brow[8 * SLM_BLOCK]; __local float slm_brow[8 * SLM_BLOCK];
__local float* slm_brow0;
int local_index = mad24(local_y, 8, local_x) * 4; int local_index = mad24(local_y, 8, local_x) * 4;
int w; int w = 0;
for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) { for (int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK)
{
#define UPDATE_BROW(_row) \
{ \
float4 brow; \
int b_row = b_row_base + _row; \
int b_col = b_tile + local_index; \
if (b_row < N && b_col <= K - 4 /*vload4*/) \
brow = vload4(0, src1_read00 + mad24(b_row, ldB, b_col)); \
else \
brow = (float4)0; \
vstore4(brow, 0, slm_brow + mad24(_row, SLM_BLOCK, local_index)); \
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
vstore4(vload4(0, src1_read0 + mad24(0, ldB, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index)); UPDATE_BROW(0);
vstore4(vload4(0, src1_read0 + mad24(1, ldB, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index)); UPDATE_BROW(1);
vstore4(vload4(0, src1_read0 + mad24(2, ldB, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index)); UPDATE_BROW(2);
vstore4(vload4(0, src1_read0 + mad24(3, ldB, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index)); UPDATE_BROW(3);
vstore4(vload4(0, src1_read0 + mad24(4, ldB, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index)); UPDATE_BROW(4);
vstore4(vload4(0, src1_read0 + mad24(5, ldB, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index)); UPDATE_BROW(5);
vstore4(vload4(0, src1_read0 + mad24(6, ldB, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index)); UPDATE_BROW(6);
vstore4(vload4(0, src1_read0 + mad24(7, ldB, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index)); UPDATE_BROW(7);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#undef UPDATE_BROW
slm_brow0 = slm_brow + local_x * (TILE_K / 8); for (int k_tile_offset = 0; k_tile_offset < SLM_BLOCK; k_tile_offset += TILE_K)
w = b_tile; {
int end_w = min(b_tile + SLM_BLOCK, K); int a_col = a_col_base + b_tile + k_tile_offset;
while( w + TILE_K <= end_w ) {
float4 arow;
brow0 = vload4(0, slm_brow0 + 0 * SLM_BLOCK); if (a_col > K - 4 /*vload4*/)
brow1 = vload4(0, slm_brow0 + 1 * SLM_BLOCK); break;
brow2 = vload4(0, slm_brow0 + 2 * SLM_BLOCK);
brow3 = vload4(0, slm_brow0 + 3 * SLM_BLOCK);
brow4 = vload4(0, slm_brow0 + 4 * SLM_BLOCK);
brow5 = vload4(0, slm_brow0 + 5 * SLM_BLOCK);
brow6 = vload4(0, slm_brow0 + 6 * SLM_BLOCK);
brow7 = vload4(0, slm_brow0 + 7 * SLM_BLOCK);
#define MM_DOT_PRODUCT(_row,_dot) \ int slm_brow_col = a_col_base + k_tile_offset; // <= SLM_BLOCK - 4
arow = vload4(0, src0_read + _row * ldA); \ #define READ_SLM_BROW(_row) \
_dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \ float4 brow##_row = vload4(0, slm_brow + mad24(_row, SLM_BLOCK, slm_brow_col));
_dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
_dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \ READ_SLM_BROW(0);
_dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); READ_SLM_BROW(1);
READ_SLM_BROW(2);
READ_SLM_BROW(3);
READ_SLM_BROW(4);
READ_SLM_BROW(5);
READ_SLM_BROW(6);
READ_SLM_BROW(7);
#undef READ_SLM_BROW
#define MM_DOT_PRODUCT(_row,_dot) \
{ \
int a_row = a_row_base + _row; \
if (a_row < M) { \
float4 arow = vload4(0, src0_read00 + mad24(a_row, ldA, a_col)); \
_dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
_dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
_dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
_dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); \
} \
}
MM_DOT_PRODUCT(0,dot00); MM_DOT_PRODUCT(0,dot00);
MM_DOT_PRODUCT(1,dot01); MM_DOT_PRODUCT(1,dot01);
@ -485,53 +514,7 @@ __kernel void intelblas_gemm_buffer_NT(
MM_DOT_PRODUCT(6,dot06); MM_DOT_PRODUCT(6,dot06);
MM_DOT_PRODUCT(7,dot07); MM_DOT_PRODUCT(7,dot07);
#undef MM_DOT_PRODUCT #undef MM_DOT_PRODUCT
src0_read += TILE_K;
slm_brow0 += TILE_K;
w += TILE_K;
} }
src1_read0 += SLM_BLOCK;
}
if(w < K) {
float4 arow;
#define READ_BROW(_brow,_row) \
_brow = vload4(0, slm_brow0 + _row * SLM_BLOCK); \
_brow.x = (mad24(local_x, 4, w) < K) ? _brow.x : 0.0f; \
_brow.y = (mad24(local_x, 4, w + 1) < K) ? _brow.y : 0.0f; \
_brow.z = (mad24(local_x, 4, w + 2) < K) ? _brow.z : 0.0f; \
_brow.w = (mad24(local_x, 4, w + 3) < K) ? _brow.w : 0.0f;
READ_BROW(brow0,0);
READ_BROW(brow1,1);
READ_BROW(brow2,2);
READ_BROW(brow3,3);
READ_BROW(brow4,4);
READ_BROW(brow5,5);
READ_BROW(brow6,6);
READ_BROW(brow7,7);
#define MM_DOT_PRODUCT(_row,_dot) \
arow = vload4(0, src0_read + _row * ldA); \
arow.x = (mad24(local_x, 4, w) < K) ? arow.x : 0.0f; \
arow.y = (mad24(local_x, 4, w + 1) < K) ? arow.y : 0.0f; \
arow.z = (mad24(local_x, 4, w + 2) < K) ? arow.z : 0.0f; \
arow.w = (mad24(local_x, 4, w + 3) < K) ? arow.w : 0.0f; \
_dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
_dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
_dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
_dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );
MM_DOT_PRODUCT(0,dot00);
MM_DOT_PRODUCT(1,dot01);
MM_DOT_PRODUCT(2,dot02);
MM_DOT_PRODUCT(3,dot03);
MM_DOT_PRODUCT(4,dot04);
MM_DOT_PRODUCT(5,dot05);
MM_DOT_PRODUCT(6,dot06);
MM_DOT_PRODUCT(7,dot07);
#undef MM_DOT_PRODUCT
} }
#define REDUCE(_dot) \ #define REDUCE(_dot) \
@ -572,21 +555,22 @@ __kernel void intelblas_gemm_buffer_NT(
output = (local_x == 5) ? _dot.s5 : output; \ output = (local_x == 5) ? _dot.s5 : output; \
output = (local_x == 6) ? _dot.s6 : output; \ output = (local_x == 6) ? _dot.s6 : output; \
output = (local_x == 7) ? _dot.s7 : output; \ output = (local_x == 7) ? _dot.s7 : output; \
if (beta != 0.0) \ if (beta != 0.0f) \
dst_write0[0] = mad(output, (float)alpha, ((float)beta * dst_write0[0])); \ dst_write0[0] = mad(output, (float)alpha, ((float)beta * dst_write0[0])); \
else \ else \
dst_write0[0] = output * (float)alpha; \ dst_write0[0] = output * (float)alpha; \
dst_write0 += ldC; dst_write0 += ldC;
if(global_x < N && global_y * 8 < M) { if (global_x < N && dst_row < M)
OUTPUT(dot00); {
if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); } /*if (dst_row + 0 < M)*/ { OUTPUT(dot00); }
if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); } if (dst_row + 1 < M) { OUTPUT(dot01); }
if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); } if (dst_row + 2 < M) { OUTPUT(dot02); }
if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); } if (dst_row + 3 < M) { OUTPUT(dot03); }
if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); } if (dst_row + 4 < M) { OUTPUT(dot04); }
if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); } if (dst_row + 5 < M) { OUTPUT(dot05); }
if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); } if (dst_row + 6 < M) { OUTPUT(dot06); }
if (dst_row + 7 < M) { OUTPUT(dot07); }
} }
#undef OUTPUT #undef OUTPUT
} }

View File

@ -67,6 +67,8 @@ PARAM_TEST_CASE(Gemm,
double alpha, beta; double alpha, beta;
int M, N, K;
TEST_DECLARE_INPUT_PARAMETER(A); TEST_DECLARE_INPUT_PARAMETER(A);
TEST_DECLARE_INPUT_PARAMETER(B); TEST_DECLARE_INPUT_PARAMETER(B);
TEST_DECLARE_INPUT_PARAMETER(C); TEST_DECLARE_INPUT_PARAMETER(C);
@ -90,30 +92,27 @@ PARAM_TEST_CASE(Gemm,
void generateTestData() void generateTestData()
{ {
// set minimum size to 20, since testing less sizes doesn't make sense M = (int)randomDoubleLog(1, 100);
Size ARoiSize = randomSize(20, MAX_VALUE); N = (int)randomDoubleLog(1, 100);
K = (int)randomDoubleLog(1, 1200);
M = roundUp(M, 1);
N = roundUp(N, 1);
K = roundUp(K, 1);
Size ARoiSize = (atrans) ? Size(M, K) : Size(K, M);
Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0); Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11); randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11);
if (atrans) Size BRoiSize = (btrans) ? Size(K, N) : Size(N, K);
ARoiSize = Size(ARoiSize.height, ARoiSize.width);
Size BRoiSize = randomSize(20, MAX_VALUE);
if (btrans)
BRoiSize.width = ARoiSize.width;
else
BRoiSize.height = ARoiSize.width;
Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11); randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11);
if (btrans) Size CRoiSize = (ctrans) ? Size(M, N) : Size(N, M);
BRoiSize = Size(BRoiSize.height, BRoiSize.width);
Size DRoiSize = Size(BRoiSize.width, ARoiSize.height), CRoiSizeT(DRoiSize.height, DRoiSize.width);
Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(C, C_roi, ctrans ? CRoiSizeT : DRoiSize, CBorder, type, -11, 11); randomSubMat(C, C_roi, CRoiSize, CBorder, type, -11, 11);
Size DRoiSize = Size(N, M);
Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11); randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11);
@ -132,11 +131,12 @@ OCL_TEST_P(Gemm, Accuracy)
for (int i = 0; i < test_loop_times; ++i) for (int i = 0; i < test_loop_times; ++i)
{ {
generateTestData(); generateTestData();
SCOPED_TRACE(cv::format("i=%d: M=%d N=%d K=%d", i, M, N, K));
OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags)); OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags));
OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags)); OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags));
double eps = D_roi.size().area() * 1e-4; double eps = D_roi.size().area() * (1e-5 * K);
OCL_EXPECT_MATS_NEAR(D, eps); OCL_EXPECT_MATS_NEAR(D, eps);
} }
} }

View File

@ -238,7 +238,7 @@ public:
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight)); kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias)); kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst)); kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
bool ret = kernel.run(2, global, NULL, false); bool ret = kernel.run_(2, global, NULL, false);
if (!ret) if (!ret)
return false; return false;
} }

View File

@ -1951,7 +1951,7 @@ Ptr<Layer> ChannelsPReLULayer::create(const LayerParams& params)
if (params.blobs[0].total() == 1) if (params.blobs[0].total() == 1)
{ {
LayerParams reluParams = params; LayerParams reluParams = params;
reluParams.set("negative_slope", params.blobs[0].at<float>(0)); reluParams.set("negative_slope", *params.blobs[0].ptr<float>());
return ReLULayer::create(reluParams); return ReLULayer::create(reluParams);
} }
Ptr<ChannelsPReLULayer> l(new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(params.blobs[0]))); Ptr<ChannelsPReLULayer> l(new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(params.blobs[0])));

View File

@ -200,7 +200,7 @@ public:
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight)); k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias)); k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat)); k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
ret = k1.run(1, globalsize, localsize, false); ret = k1.run_(1, globalsize, localsize, false);
if (!ret) if (!ret)
return false; return false;
} }

View File

@ -120,7 +120,14 @@ public:
internals_arr.getMatVector(internals); internals_arr.getMatVector(internals);
if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3]) if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3])
{
// outputs[0] = inputs[0] doesn't work due to BlobManager optimizations
if (inputs[0].data != outputs[0].data)
{
inputs[0].copyTo(outputs[0]);
}
return; return;
}
Mat& inp = inputs[0]; Mat& inp = inputs[0];
Mat& out = outputs[0]; Mat& out = outputs[0];

View File

@ -64,6 +64,31 @@ namespace cv
namespace dnn namespace dnn
{ {
void sliceRangesFromShape(const MatShape& inpShape, int& axis, std::vector<std::vector<cv::Range> >& sliceRanges)
{
CV_Assert(inpShape.size() > 0);
bool axisNeg = (axis < 0);
axis = (axis + static_cast<int>(inpShape.size())) % inpShape.size();
int n = inpShape[axis];
for (size_t i = 0; i < sliceRanges.size(); ++i){
std::vector<Range>& ranges = sliceRanges[i];
if (axisNeg)
{
ranges.insert(ranges.begin(), axis, Range::all());
}
Range& range = ranges.back();
if (range.start >= 0)
{
continue;
}
CV_Assert(n != 0);
range.start = (n + range.start) % n;
}
}
class SliceLayerImpl : public SliceLayer class SliceLayerImpl : public SliceLayer
{ {
public: public:
@ -75,20 +100,22 @@ public:
num_split = params.get<int>("num_split", 0); num_split = params.get<int>("num_split", 0);
hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false); hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false);
shapesInitialized = !hasDynamicShapes; shapesInitialized = !hasDynamicShapes;
if (params.has("slice_point")) if (params.has("slice_point"))
{ {
CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end")); CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end"));
const DictValue &indicesValue = params.get("slice_point"); const DictValue &indicesValue = params.get("slice_point");
int size = axis > 0 ? axis + 1 : 1;
sliceRanges.resize(indicesValue.size() + 1, sliceRanges.resize(indicesValue.size() + 1,
std::vector<Range>(std::max(axis,0) + 1, Range::all())); std::vector<Range>(size, Range::all()));
int prevSlice = 0; int prevSlice = 0;
for (int i = 0; i < indicesValue.size(); ++i) for (int i = 0; i < indicesValue.size(); ++i)
{ {
sliceRanges[i][axis].start = prevSlice; sliceRanges[i][size - 1].start = prevSlice;
sliceRanges[i][axis].end = indicesValue.get<int>(i); sliceRanges[i][size - 1].end = indicesValue.get<int>(i);
prevSlice = sliceRanges[i][axis].end; prevSlice = sliceRanges[i][size - 1].end;
} }
sliceRanges.back()[axis].start = prevSlice; sliceRanges.back()[size - 1].start = prevSlice;
} }
else if (params.has("begin")) else if (params.has("begin"))
{ {
@ -103,7 +130,6 @@ public:
{ {
int start = begins.get<int>(i); int start = begins.get<int>(i);
int sizeOrEnd = sizesOrEnds.get<int>(i); // It may be negative to reverse indexation. int sizeOrEnd = sizesOrEnds.get<int>(i); // It may be negative to reverse indexation.
CV_Assert(start >= 0);
sliceRanges[0][i].start = start; sliceRanges[0][i].start = start;
if (params.has("size")) if (params.has("size"))
@ -164,16 +190,20 @@ public:
CV_Assert(inputs.size() == 1); CV_Assert(inputs.size() == 1);
MatShape inpShape = inputs[0]; MatShape inpShape = inputs[0];
if (!sliceRanges.empty()) int axis_rw = axis;
std::vector<std::vector<cv::Range> > sliceRanges_rw = sliceRanges;
sliceRangesFromShape(inpShape, axis_rw, sliceRanges_rw);
if (!sliceRanges_rw.empty())
{ {
outputs.resize(sliceRanges.size(), inpShape); outputs.resize(sliceRanges_rw.size(), inpShape);
for (int i = 0; i < outputs.size(); ++i) for (int i = 0; i < outputs.size(); ++i)
{ {
CV_Assert(sliceRanges[i].size() <= inpShape.size()); CV_Assert(sliceRanges_rw[i].size() <= inpShape.size());
for (int j = 0; j < sliceRanges[i].size(); ++j) for (int j = 0; j < sliceRanges_rw[i].size(); ++j)
{ {
if (shapesInitialized || inpShape[j] > 0) if (shapesInitialized || inpShape[j] > 0)
outputs[i][j] = normalize_axis_range(sliceRanges[i][j], inpShape[j]).size(); outputs[i][j] = normalize_axis_range(sliceRanges_rw[i][j], inpShape[j]).size();
if (!sliceSteps.empty() && (i < sliceSteps.size()) && (j < sliceSteps[i].size()) && (sliceSteps[i][j] > 1)) if (!sliceSteps.empty() && (i < sliceSteps.size()) && (j < sliceSteps[i].size()) && (sliceSteps[i][j] > 1))
outputs[i][j] = (outputs[i][j] + sliceSteps[i][j] - 1) / sliceSteps[i][j]; outputs[i][j] = (outputs[i][j] + sliceSteps[i][j] - 1) / sliceSteps[i][j];
@ -182,10 +212,10 @@ public:
} }
else // Divide input blob on equal parts by axis. else // Divide input blob on equal parts by axis.
{ {
CV_Assert(0 <= axis && axis < inpShape.size()); CV_Assert(0 <= axis_rw && axis_rw < inpShape.size());
int splits = num_split ? num_split : requiredOutputs; int splits = num_split ? num_split : requiredOutputs;
CV_Assert(splits > 0 && inpShape[axis] % splits == 0); CV_Assert(splits > 0 && inpShape[axis_rw] % splits == 0);
inpShape[axis] /= splits; inpShape[axis_rw] /= splits;
outputs.resize(splits, inpShape); outputs.resize(splits, inpShape);
} }
return false; return false;
@ -210,6 +240,7 @@ public:
CV_Assert(inputs.size() == 1); CV_Assert(inputs.size() == 1);
const MatSize& inpShape = inputs[0].size; const MatSize& inpShape = inputs[0].size;
sliceRangesFromShape(shape(inputs[0]), axis, sliceRanges);
finalSliceRanges = sliceRanges; finalSliceRanges = sliceRanges;
if (sliceRanges.empty()) if (sliceRanges.empty())
@ -492,7 +523,7 @@ public:
ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrReadOnly(input),
ocl::KernelArg::PtrWriteOnly(output) ocl::KernelArg::PtrWriteOnly(output)
) )
.run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false); .run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
if (!ret) if (!ret)
return false; return false;
} // for outputs.size() } // for outputs.size()

View File

@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems, void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int simd_size); int blockM, int blockK, int simd_size);
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise); void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx); void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);
int32_t group_; int32_t group_;
bool bias_term_; bool bias_term_;

View File

@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
.args( .args(
ocl::KernelArg::PtrReadOnly(buffer), ocl::KernelArg::PtrReadOnly(buffer),
image, offset, image, offset,
padded_width, padded_height,
width, height, width, height,
ld) ld)
.run(2, global_copy, NULL, false); .run(2, global_copy, NULL, false);

View File

@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
} }
template<typename Dtype> template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx) void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
{ {
if (fused_eltwise) if (fused_eltwise)
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ)); {
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
if (fused_eltwise_offset >= 0)
kernel.set(argIdx++, fused_eltwise_offset);
}
switch (fused_activ) { switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU: case OCL4DNN_CONV_FUSED_ACTIV_RELU:
kernel.set(argIdx++, (float)negative_slope_); kernel.set(argIdx++, (float)negative_slope_);
break; break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU: case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
break; break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER: case OCL4DNN_CONV_FUSED_ACTIV_POWER:
kernel.set(argIdx++, (float)power_); kernel.set(argIdx++, (float)power_);
@ -765,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
swizzled_factor swizzled_factor
); );
size_t global_work_size_copy[3] = { size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false)) if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
{ {
std::cout << "Swizzle kernel run failed." << std::endl; CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
return false; return false;
} }
} else { } else {
@ -895,10 +898,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) { if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
if (!swizzleWeight(weight, config->workItem_output[2], false)) if (!swizzleWeight(weight, config->workItem_output[2], false))
return false; return false;
#if 0
size_t total_bottom_size = bottom_dim_ * numImages; size_t total_bottom_size = bottom_dim_ * numImages;
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
size_t total_bias_size = M_ * group_; size_t total_bias_size = M_ * group_;
size_t total_top_size = top_dim_ * numImages; size_t total_top_size = top_dim_ * numImages;
#endif
for (int32_t g = 0; g < group_; ++g) { for (int32_t g = 0; g < group_; ++g) {
bias_offset = M_ * g; bias_offset = M_ * g;
int32_t image_offset = width_ * height_ * (channels_ / group_) * g; int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
@ -910,78 +915,28 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false; return false;
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
UMat img_buffer; kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
if (image_offset) kernel.set(argIdx++, image_offset);
{
CreateSubBuffer(bottom, img_buffer, image_offset,
total_bottom_size - image_offset, false);
if (img_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
} kernel.set(argIdx++, kernel_offset);
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
}
UMat kernel_buffer;
if (kernel_offset)
{
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
total_kernel_size - kernel_offset, false);
if (kernel_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
}
UMat bias_buffer;
if (bias_term_) if (bias_term_)
{ {
if (bias_offset) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
{ kernel.set(argIdx++, bias_offset);
CreateSubBuffer(bias, bias_buffer, bias_offset,
total_bias_size - bias_offset, false);
if (bias_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
}
} }
UMat out_buffer; kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
if (output_image_offset) kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
{
CreateSubBuffer(top, out_buffer, output_image_offset,
total_top_size - output_image_offset, true);
if (out_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size));
}
kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_); kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_); kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_); kernel.set(argIdx++, (uint16_t)output_h_);
if (!kernel.run(3, config->global_work_size, config->local_work_size, false)) if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
{ {
std::cout << "IDLF kernel run failed." << std::endl; std::cout << "IDLF kernel run failed." << std::endl;
return false; return false;
@ -990,9 +945,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
} else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) { } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
if (!swizzleWeight(weight, config->workItem_output[1], true)) if (!swizzleWeight(weight, config->workItem_output[1], true))
return false; return false;
#if 0
size_t total_bottom_size = bottom_dim_ * numImages; size_t total_bottom_size = bottom_dim_ * numImages;
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
size_t total_bias_size = M_ * group_; size_t total_bias_size = M_ * group_;
#endif
size_t total_top_size = top_dim_ * numImages; size_t total_top_size = top_dim_ * numImages;
for (int32_t g = 0; g < group_; ++g) { for (int32_t g = 0; g < group_; ++g) {
bias_offset = M_ * g; bias_offset = M_ * g;
@ -1005,72 +962,25 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false; return false;
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
UMat img_buffer; kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
if (image_offset) kernel.set(argIdx++, (int)image_offset);
{ kernel.set(argIdx++, (int)(bottom.total() - image_offset));
CreateSubBuffer(bottom, img_buffer, image_offset,
total_bottom_size - image_offset, false);
if (img_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
} kernel.set(argIdx++, (int)kernel_offset);
else kernel.set(argIdx++, (int)(swizzled_weights_umat.total() - kernel_offset));
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
}
UMat kernel_buffer;
if (kernel_offset)
{
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
total_kernel_size - kernel_offset, false);
if (kernel_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
}
UMat bias_buffer;
if (bias_term_) if (bias_term_)
{ {
if (bias_offset) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
{ kernel.set(argIdx++, (int)bias_offset);
CreateSubBuffer(bias, bias_buffer, bias_offset,
total_bias_size - bias_offset, false);
if (bias_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
}
} }
UMat out_buffer; kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
if (output_image_offset) kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
{ kernel.set(argIdx++, (int)total_top_size - (int)(top.offset / element_size));
CreateSubBuffer(top, out_buffer, output_image_offset,
total_top_size - output_image_offset, true);
if (out_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size));
}
kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_); kernel.set(argIdx++, (uint16_t)height_);
@ -1100,7 +1010,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
gy = alignSize(gy, blockK); gy = alignSize(gy, blockK);
size_t global_size[3] = { gx, gy, config->global_work_size[2] }; size_t global_size[3] = { gx, gy, config->global_work_size[2] };
if (!kernel.run(3, global_size, config->local_work_size, false)) if (!kernel.run_(3, global_size, config->local_work_size, false))
{ {
std::cout << "GEMM like kernel run failed." << std::endl; std::cout << "GEMM like kernel run failed." << std::endl;
return false; return false;
@ -1112,7 +1022,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false; return false;
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_) if (bias_term_)
@ -1129,9 +1039,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
global_size[1] = output_h_; global_size[1] = output_h_;
global_size[2] = num_output_ * num_; global_size[2] = num_output_ * num_;
if (!kernel.run(3, global_size, NULL, false)) if (!kernel.run_(3, global_size, NULL, false))
{ {
std::cout << "DWCONV kernel run failed." << std::endl; CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
return false; return false;
} }
} else { } else {
@ -1152,7 +1062,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false; return false;
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset); kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
@ -1171,11 +1081,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
kernel.set(argIdx++, (uint16_t)output_h_); kernel.set(argIdx++, (uint16_t)output_h_);
kernel.set(argIdx++, (uint16_t)pad_w_); kernel.set(argIdx++, (uint16_t)pad_w_);
kernel.set(argIdx++, (uint16_t)pad_h_); kernel.set(argIdx++, (uint16_t)pad_h_);
if (!kernel.run(3, config->global_work_size, if (!kernel.run_(3, config->global_work_size,
(config->use_null_local) ? NULL : config->local_work_size, (config->use_null_local) ? NULL : config->local_work_size,
false)) false))
{ {
std::cout << "Basic kernel run failed." << std::endl; CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
return false; return false;
} }
} }

View File

@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
} }
ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false); ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
} }
return ret; return ret;
} }

View File

@ -234,6 +234,27 @@ public:
} }
}; };
class NormalizeSubgraph2_2 : public NormalizeSubgraphBase
{
public:
NormalizeSubgraph2_2()
{
int input = addNodeToMatch("");
int norm = addNodeToMatch("ReduceL2", input);
int min = addNodeToMatch("");
int max = addNodeToMatch("");
int clip = addNodeToMatch("Clip", norm, min, max);
int shape = addNodeToMatch("");
int expand = addNodeToMatch("Expand", clip, shape);
addNodeToMatch("Div", input, expand);
setFusedNode("Normalize", input);
}
};
class NormalizeSubgraph3 : public NormalizeSubgraphBase class NormalizeSubgraph3 : public NormalizeSubgraphBase
{ {
public: public:
@ -558,6 +579,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
subgraphs.push_back(makePtr<SoftMaxSubgraph>()); subgraphs.push_back(makePtr<SoftMaxSubgraph>());
subgraphs.push_back(makePtr<NormalizeSubgraph1>()); subgraphs.push_back(makePtr<NormalizeSubgraph1>());
subgraphs.push_back(makePtr<NormalizeSubgraph2>()); subgraphs.push_back(makePtr<NormalizeSubgraph2>());
subgraphs.push_back(makePtr<NormalizeSubgraph2_2>());
subgraphs.push_back(makePtr<NormalizeSubgraph3>()); subgraphs.push_back(makePtr<NormalizeSubgraph3>());
subgraphs.push_back(makePtr<BatchNormalizationSubgraph1>()); subgraphs.push_back(makePtr<BatchNormalizationSubgraph1>());
subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>()); subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>());

View File

@ -64,6 +64,8 @@ class ONNXImporter
void addLayer(LayerParams& layerParams, void addLayer(LayerParams& layerParams,
const opencv_onnx::NodeProto& node_proto); const opencv_onnx::NodeProto& node_proto);
void expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
const std::string& input, size_t n);
public: public:
ONNXImporter(Net& net, const char *onnxFile); ONNXImporter(Net& net, const char *onnxFile);
ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer); ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer);
@ -486,6 +488,37 @@ void ONNXImporter::addLayer(LayerParams& layerParams,
} }
} }
/** @brief Make N copies of input layer and set them as input to node_proto.
* @param prefix prefix of new layers' names
* @param node_proto node which will contain all copies as inputs
* @param input name of the node to copy
* @param n number of copies
*/
void ONNXImporter::expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
const std::string& input, size_t n)
{
std::vector<std::string> input_names;
input_names.reserve(n);
for (size_t j = 0; j < n; j++)
{
LayerParams copyLP;
copyLP.name = format("%s/copy_%zu", prefix.c_str(), j);
copyLP.type = "Identity";
CV_Assert((layer_id.find(copyLP.name) == layer_id.end()) &&
"Couldn't copy the node: generated name already exists in the graph.");
input_names.push_back(copyLP.name);
node_proto.set_input(0, input);
node_proto.set_output(0, copyLP.name);
addLayer(copyLP, node_proto);
}
node_proto.clear_input();
for (size_t i = 0; i < input_names.size(); i++)
{
node_proto.add_input(input_names[i]);
}
}
void ONNXImporter::addConstant(const std::string& name, const Mat& blob) void ONNXImporter::addConstant(const std::string& name, const Mat& blob)
{ {
constBlobs.insert(std::make_pair(name, blob)); constBlobs.insert(std::make_pair(name, blob));
@ -1422,6 +1455,38 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
addLayer(layerParams, node_proto); addLayer(layerParams, node_proto);
} }
void findBroadAxis(const MatShape& broadShape, const MatShape& outShape, size_t& axis, int& broadAxis)
{
const size_t diff = outShape.size() - broadShape.size();
// find the first non-one element of the broadcasting shape
axis = 0;
for (; axis < broadShape.size() && broadShape[axis] == 1; ++axis) {}
// find the last non-one element of the broadcasting shape
size_t endAxis = broadShape.size();
for (; endAxis > axis && broadShape[endAxis - 1] == 1; --endAxis) {}
// find one between axis and endAxis - as it needs to be broadcasted,
// dimensions from the left of axis and from the right of endAxis will be handled by Scale layer
broadAxis = -1;
for (size_t i = axis; i < endAxis; ++i)
{
size_t outAxis = i + diff;
if (outShape[outAxis] == broadShape[i])
{
continue;
}
// ensure we need to broadcast only 1 dimension in the middle
CV_Assert(broadShape[i] == 1 && broadAxis == -1);
broadAxis = static_cast<int>(outAxis);
}
axis += diff;
}
// "Mul" "Div"
void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_) void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
{ {
opencv_onnx::NodeProto node_proto = node_proto_; opencv_onnx::NodeProto node_proto = node_proto_;
@ -1543,13 +1608,31 @@ void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodePro
} }
const MatShape& broadShape = outShapes[node_proto.input(1)]; const MatShape& broadShape = outShapes[node_proto.input(1)];
const size_t outShapeSize = outShapes[node_proto.input(0)].size(); const MatShape& outShape = outShapes[node_proto.input(0)];
const size_t diff = outShapeSize - broadShape.size();
size_t axis; size_t axis = 0;
for (axis = diff; axis < broadShape.size() && broadShape[axis - diff] == 1; ++axis) {} int broadAxis = -1;
findBroadAxis(broadShape, outShape, axis, broadAxis);
CV_Assert(axis != outShapeSize); // if there is a one dimension in the middle that should be broadcasted, broadcast it
if (broadAxis != -1)
{
opencv_onnx::NodeProto concat_node_proto = node_proto;
const std::string& input1 = concat_node_proto.input(1);
expandMid(layerParams.name, concat_node_proto, input1, outShape[broadAxis]);
LayerParams concatLP;
concatLP.name = layerParams.name + "/concat";
concatLP.set("axis", broadAxis);
concatLP.type = "Concat";
concat_node_proto.set_output(0, concatLP.name);
addLayer(concatLP, concat_node_proto);
node_proto.set_input(1, concatLP.name);
}
CV_Assert(axis != outShape.size());
layerParams.set("axis", static_cast<int>(axis)); layerParams.set("axis", static_cast<int>(axis));
layerParams.type = "Scale"; layerParams.type = "Scale";
} }
@ -1818,12 +1901,11 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
// Unsqueeze and repeat along new axis // Unsqueeze and repeat along new axis
if (targetShape.size() == inpShape.size() + 1) if (targetShape.size() == inpShape.size() + 1)
{ {
inpShape.insert(inpShape.begin(), targetShape.size() - inpShape.size(), 1);
for (int i = 0; i < targetShape.size(); i++) for (int i = 0; i < targetShape.size(); i++)
{ {
if (targetShape[i] == -1 && i < inpShape.size()) if (abs(targetShape[i]) == 1)
targetShape[i] = inpShape[i]; targetShape[i] = inpShape[i];
else if (i < inpShape.size() && targetShape[i] != inpShape[i])
inpShape.insert(inpShape.begin() + i, 1);
} }
if (haveVariables) if (haveVariables)
{ {
@ -1843,14 +1925,19 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims"); CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims");
std::vector<int> broadcast_axes; std::vector<int> broadcast_axes;
// shapes aren't right-aligned here because targetShape.size() == inpShape.size()
for (int i = 0; i < targetShape.size(); i++) for (int i = 0; i < targetShape.size(); i++)
{ {
if (targetShape[i] != inpShape[i]) if (targetShape[i] != inpShape[i])
{ {
if (inpShape[i] == 1) if (inpShape[i] == 1)
{
broadcast_axes.push_back(i); broadcast_axes.push_back(i);
else }
else if (targetShape[i] != 1)
{
CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i)); CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i));
}
} }
} }
@ -1889,31 +1976,16 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
} }
else if (broadcast_axes.size() == 1 && broadcast_axes[0] <= 1) else if (broadcast_axes.size() == 1 && broadcast_axes[0] <= 1)
{ {
String base_name = layerParams.name + "/copy_"; expandMid(layerParams.name, node_proto, srcName, targetShape[broadcast_axes[0]]);
std::vector<std::string> input_names;
for (int j = 0; j < targetShape[broadcast_axes[0]]; j++)
{
std::ostringstream ss;
ss << j;
LayerParams copyLP;
copyLP.name = base_name + ss.str();
copyLP.type = "Identity";
CV_Assert(layer_id.find(copyLP.name) == layer_id.end());
input_names.push_back(copyLP.name);
node_proto.set_input(0, srcName);
node_proto.set_output(0, copyLP.name);
addLayer(copyLP, node_proto);
}
node_proto.clear_input();
for (int i = 0; i < input_names.size(); i++)
{
node_proto.add_input(input_names[i]);
}
layerParams.set("axis", broadcast_axes[0]); layerParams.set("axis", broadcast_axes[0]);
layerParams.type = "Concat"; layerParams.type = "Concat";
node_proto.set_output(0, layerParams.name); node_proto.set_output(0, layerParams.name);
} }
else if (broadcast_axes.empty())
{
layerParams.type = "Identity";
}
else else
CV_Error(Error::StsNotImplemented, "Unsupported Expand op"); CV_Error(Error::StsNotImplemented, "Unsupported Expand op");
addLayer(layerParams, node_proto); addLayer(layerParams, node_proto);

View File

@ -74,18 +74,22 @@
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \ (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0) } while(0)
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data, #define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
#else #else
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \ #define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
const Dtype _x_ = (_data_); \ const Dtype _x_ = (_data_); \
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \ (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0) } while(0)
#define ELTWISE_DATA_ARG #define ELTWISE_DATA_ARG
#define ELTWISE_DATA_ARG_WITH_OFFSET
#endif #endif
#if APPLY_BIAS #if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base, #define BIAS_KERNEL_ARG __global Dtype * biases_base,
#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
#else #else
#define BIAS_KERNEL_ARG #define BIAS_KERNEL_ARG
#define BIAS_KERNEL_ARG_WITH_OFFSET
#endif #endif
#define __CAT(x, y) x##y #define __CAT(x, y) x##y
@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
__kernel void __kernel void
convolve_simd( convolve_simd(
ELTWISE_DATA_ARG ELTWISE_DATA_ARG_WITH_OFFSET
FUSED_ARG FUSED_ARG
__global Dtype* inputs, __global Dtype* inputs_ptr, const int inputs_offset,
__global Dtype* weights, __global Dtype* weights_ptr, const int weights_offset,
BIAS_KERNEL_ARG BIAS_KERNEL_ARG_WITH_OFFSET
__global Dtype* outputs_base, __global Dtype* outputs_base, const int outputs_offset,
const int outputs_offset,
const ushort input_width, const ushort input_width,
const ushort input_height, const ushort input_height,
const ushort output_width, const ushort output_width,
const ushort output_height) const ushort output_height)
{ {
__global Dtype* inputs = inputs_ptr + inputs_offset;
__global Dtype* weights = weights_ptr + weights_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype* outputs = outputs_base + outputs_offset; __global Dtype* outputs = outputs_base + outputs_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
@ -388,13 +401,12 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
#define ROW_PITCH input_width #define ROW_PITCH input_width
#define GEMM_LIKE_KERNEL_ARGS \ #define GEMM_LIKE_KERNEL_ARGS \
ELTWISE_DATA_ARG \ ELTWISE_DATA_ARG_WITH_OFFSET \
FUSED_ARG \ FUSED_ARG \
const __global Dtype *src0, \ const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \
const __global Dtype *src1, \ const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \
BIAS_KERNEL_ARG \ BIAS_KERNEL_ARG_WITH_OFFSET \
__global Dtype *dst_base, \ __global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \
const int dst_offset, \
const ushort input_width, \ const ushort input_width, \
const ushort input_height, \ const ushort input_height, \
const ushort output_width, \ const ushort output_width, \
@ -424,7 +436,17 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{ {
const __global Dtype *src0 = src0_ptr + src0_offset;
const __global Dtype *src1 = src1_ptr + src1_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype *dst = dst_base + dst_offset; __global Dtype *dst = dst_base + dst_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
const int group_x = get_group_id(0); const int group_x = get_group_id(0);
const int group_y = get_group_id(1); const int group_y = get_group_id(1);
const int global_x = get_global_id(0); const int global_x = get_global_id(0);
@ -447,6 +469,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
} }
typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t; typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
// U_GEMM_LIKE_CONV_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32_5_1_8_32_SIMD8 doesn't run properly (src0_read out of bounds)
// Test: DNNTestNetwork.AlexNet/0 (to run all kernels use OPENCV_OCL4DNN_FORCE_AUTO_TUNING=1)
#if 0 // INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#define OPTIMIZE_READ 1
#else
#define OPTIMIZE_READ 0
#endif
// True for all threads if filter_width is multiple of TILE_N // True for all threads if filter_width is multiple of TILE_N
// else, true for all but right-most column of threads. // else, true for all but right-most column of threads.
if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N ) if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
@ -463,7 +493,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// atile is M rows x K columns. // atile is M rows x K columns.
int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_x = ( global_y % output_width ) * STRIDE_X;
int curr_y = ( global_y / output_width ) * STRIDE_Y; int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 #if !OPTIMIZE_READ
int saved_y = curr_y; int saved_y = curr_y;
#endif #endif
const __global Dtype *src0_read = src0 const __global Dtype *src0_read = src0
@ -483,7 +513,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
do do
{ {
int patch_row = 0; int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 #if !OPTIMIZE_READ
curr_y = saved_y; curr_y = saved_y;
#endif #endif
@ -501,11 +531,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// ... // ...
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 #if OPTIMIZE_READ
#if KERNEL_WIDTH == 3 #if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3(0, src0_read); Dtype_t blockA00 = vload3(0, src0_read);
Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA00 = (Dtype*)(&blockA00);
#else #else
#if 0 // debug
if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH)
{
printf("CATCH: src0_read-src0: %d limit=%d curr_y,curr_x=%d,%d\n", (int)(src0_read - src0), src0_limit, curr_y, curr_x);
}
#endif
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA00 = (Dtype*)(&blockA00);
#endif #endif
@ -626,7 +662,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// atile is M rows x K columns. // atile is M rows x K columns.
int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_x = ( global_y % output_width ) * STRIDE_X;
int curr_y = ( global_y / output_width ) * STRIDE_Y; int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 #if !OPTIMIZE_READ
int saved_y = curr_y; int saved_y = curr_y;
#endif #endif
const __global Dtype *src0_read = src0 const __global Dtype *src0_read = src0
@ -646,14 +682,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
do do
{ {
int patch_row = 0; int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 #if !OPTIMIZE_READ
curr_y = saved_y; curr_y = saved_y;
#endif #endif
do do
{ {
// Load atile and interleaved btile. // Load atile and interleaved btile.
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 #if OPTIMIZE_READ
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA00 = (Dtype*)(&blockA00);
#else #else
@ -790,7 +826,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
} }
} }
} }
#endif #endif // TILE_N_LAST > 0
} }
#endif #endif
#ifdef GEMM_LIKE_CONV_32_2 #ifdef GEMM_LIKE_CONV_32_2
@ -813,7 +849,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{ {
const __global Dtype *src0 = src0_ptr + src0_offset;
const __global Dtype *src1 = src1_ptr + src1_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype *dst = dst_base + dst_offset; __global Dtype *dst = dst_base + dst_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
const int group_x = get_group_id(0); const int group_x = get_group_id(0);
const int group_y = get_group_id(1); const int group_y = get_group_id(1);
const int global_x = get_global_id(0); const int global_x = get_global_id(0);
@ -1375,7 +1421,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{ {
const __global Dtype *src0 = src0_ptr + src0_offset;
const __global Dtype *src1 = src1_ptr + src1_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype *dst = dst_base + dst_offset; __global Dtype *dst = dst_base + dst_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
const int group_x = get_group_id(0); const int group_x = get_group_id(0);
const int group_y = get_group_id(1); const int group_y = get_group_id(1);
const int global_x = get_global_id(0); const int global_x = get_global_id(0);
@ -1561,7 +1617,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{ {
const __global Dtype *src0 = src0_ptr + src0_offset;
const __global Dtype *src1 = src1_ptr + src1_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype *dst = dst_base + dst_offset; __global Dtype *dst = dst_base + dst_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
const int group_x = get_group_id(0); const int group_x = get_group_id(0);
const int group_y = get_group_id(1); const int group_y = get_group_id(1);
const int global_x = get_global_id(0); const int global_x = get_global_id(0);

View File

@ -62,8 +62,8 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
//Original location //Original location
//Output location //Output location
int outputSublayer = channels / swizzleFactor; //int outputSublayer = channels / swizzleFactor;
int outputSublayerIndex = channels % swizzleFactor; //int outputSublayerIndex = channels % swizzleFactor;
int filter = sX / (kernel_w*kernel_h*channels); int filter = sX / (kernel_w*kernel_h*channels);
int kernel_X = sX % kernel_w; int kernel_X = sX % kernel_w;
@ -73,6 +73,10 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
int FP = filter / swizzleFactor; int FP = filter / swizzleFactor;
int F1 = filter % swizzleFactor; int F1 = filter % swizzleFactor;
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1] int idxOut = FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1;
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X]; int idxIn = filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X;
// idxIn is not valid if (filter >= outputs) - no data for these elements. Output alignment gaps are filled by zeros
Dtype v = (filter < outputs) ? weightIn[idxIn] : (Dtype)0;
weightOut[idxOut] = v;
} }

View File

@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
{ {
const int gidx = get_global_id(0); const int gidx = get_global_id(0);
const int gidy = get_global_id(1); const int gidy = get_global_id(1);
if (gidx >= width || gidy >= height)
return;
int2 coord_dst = (int2)(gidx, gidy); int2 coord_dst = (int2)(gidx, gidy);
__global Dtype* A_off = A + offA; __global Dtype* A_off = A + offA;
Dtype srcA = A_off[gidy * ldA + gidx]; Dtype srcA = A_off[gidy * ldA + gidx];
@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
__global Dtype* A, __global Dtype* A,
__write_only image2d_t ImA, __write_only image2d_t ImA,
int offA, int offA,
int padded_width,
int padded_height,
int width, int width,
int height, int height,
int ldA) int ldA)
{ {
const int gidx = get_global_id(0); const int gidx = get_global_id(0);
const int gidy = get_global_id(1); const int gidy = get_global_id(1);
if (gidx >= padded_width || gidy >= padded_height)
return;
int2 coord_dst = (int2)(gidx, gidy); int2 coord_dst = (int2)(gidx, gidy);
#if TYPE == TYPE_HALF #if TYPE == TYPE_HALF
if (gidx >= width || gidy >= height) { if (gidx >= width || gidy >= height) {

View File

@ -933,7 +933,12 @@ void TFImporter::parseBias(tensorflow::GraphDef& net, const tensorflow::NodeDef&
layer_id[name] = id; layer_id[name] = id;
// one input only // one input only
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); Pin inp0 = parsePin(layer.input(0));
if (layer_id.find(inp0.name) != layer_id.end())
// First operand is a constant.
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
else
connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
} }
else else
{ {

View File

@ -267,6 +267,11 @@ TEST_P(Test_ONNX_layers, ReLU)
testONNXModels("ReLU"); testONNXModels("ReLU");
} }
TEST_P(Test_ONNX_layers, PReLU)
{
testONNXModels("PReLU_slope");
}
TEST_P(Test_ONNX_layers, Clip) TEST_P(Test_ONNX_layers, Clip)
{ {
testONNXModels("clip", npy); testONNXModels("clip", npy);
@ -302,6 +307,7 @@ TEST_P(Test_ONNX_layers, Scale)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
testONNXModels("scale"); testONNXModels("scale");
testONNXModels("scale_broadcast", npy, 0, 0, false, true, 3); testONNXModels("scale_broadcast", npy, 0, 0, false, true, 3);
testONNXModels("scale_broadcast_mid", npy, 0, 0, false, true, 2);
} }
TEST_P(Test_ONNX_layers, ReduceMean3D) TEST_P(Test_ONNX_layers, ReduceMean3D)
@ -505,6 +511,8 @@ TEST_P(Test_ONNX_layers, MatMulAdd)
TEST_P(Test_ONNX_layers, Expand) TEST_P(Test_ONNX_layers, Expand)
{ {
testONNXModels("expand");
testONNXModels("expand_identity");
testONNXModels("expand_batch"); testONNXModels("expand_batch");
testONNXModels("expand_channels"); testONNXModels("expand_channels");
testONNXModels("expand_neg_batch"); testONNXModels("expand_neg_batch");
@ -646,6 +654,7 @@ TEST_P(Test_ONNX_layers, ReduceL2)
testONNXModels("reduceL2"); testONNXModels("reduceL2");
testONNXModels("reduceL2_subgraph"); testONNXModels("reduceL2_subgraph");
testONNXModels("reduceL2_subgraph_2"); testONNXModels("reduceL2_subgraph_2");
testONNXModels("reduceL2_subgraph2_2");
} }
TEST_P(Test_ONNX_layers, Split) TEST_P(Test_ONNX_layers, Split)
@ -659,6 +668,7 @@ TEST_P(Test_ONNX_layers, Split)
testONNXModels("split_3"); testONNXModels("split_3");
testONNXModels("split_4"); testONNXModels("split_4");
testONNXModels("split_sizes"); testONNXModels("split_sizes");
testONNXModels("split_neg_axis");
} }
TEST_P(Test_ONNX_layers, Slice) TEST_P(Test_ONNX_layers, Slice)
@ -667,6 +677,7 @@ TEST_P(Test_ONNX_layers, Slice)
testONNXModels("slice", npy, 0, 0, false, false); testONNXModels("slice", npy, 0, 0, false, false);
#else #else
testONNXModels("slice"); testONNXModels("slice");
testONNXModels("slice_neg_starts");
testONNXModels("slice_opset_11"); testONNXModels("slice_opset_11");
#endif #endif
} }

View File

@ -581,6 +581,18 @@ TEST_P(Test_TensorFlow_layers, l2_normalize)
runTensorFlowNet("l2_normalize"); runTensorFlowNet("l2_normalize");
} }
TEST_P(Test_TensorFlow_layers, BiasAdd)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
&& getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#endif
runTensorFlowNet("bias_add_1");
}
// TODO: fix it and add to l2_normalize // TODO: fix it and add to l2_normalize
TEST_P(Test_TensorFlow_layers, l2_normalize_3d) TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
{ {
@ -1242,6 +1254,11 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_down)
runTensorFlowNet("resize_bilinear_down"); runTensorFlowNet("resize_bilinear_down");
} }
TEST_P(Test_TensorFlow_layers, resize_concat_optimization)
{
runTensorFlowNet("resize_concat_optimization");
}
TEST_P(Test_TensorFlow_layers, tf2_dense) TEST_P(Test_TensorFlow_layers, tf2_dense)
{ {
runTensorFlowNet("tf2_dense"); runTensorFlowNet("tf2_dense");

View File

@ -1105,7 +1105,7 @@ public:
that is, copies both parameters and train data. If emptyTrainData is true, the method creates an that is, copies both parameters and train data. If emptyTrainData is true, the method creates an
object copy with the current parameters but with empty train data. object copy with the current parameters but with empty train data.
*/ */
CV_WRAP virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0; CV_WRAP CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
/** @brief Creates a descriptor matcher of a given type with the default parameters (using default /** @brief Creates a descriptor matcher of a given type with the default parameters (using default
constructor). constructor).
@ -1165,7 +1165,7 @@ protected:
static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx ); static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx );
static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx ); static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx );
static Mat clone_op( Mat m ) { return m.clone(); } CV_NODISCARD_STD static Mat clone_op( Mat m ) { return m.clone(); }
void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const; void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const;
//! Collection of descriptors from train images. //! Collection of descriptors from train images.
@ -1206,7 +1206,7 @@ public:
*/ */
CV_WRAP static Ptr<BFMatcher> create( int normType=NORM_L2, bool crossCheck=false ) ; CV_WRAP static Ptr<BFMatcher> create( int normType=NORM_L2, bool crossCheck=false ) ;
virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE; CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
protected: protected:
virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k, virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE; InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
@ -1245,7 +1245,7 @@ public:
CV_WRAP static Ptr<FlannBasedMatcher> create(); CV_WRAP static Ptr<FlannBasedMatcher> create();
virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE; CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
protected: protected:
static void convertToDMatches( const DescriptorCollection& descriptors, static void convertToDMatches( const DescriptorCollection& descriptors,
const Mat& indices, const Mat& distances, const Mat& indices, const Mat& distances,

View File

@ -437,11 +437,18 @@ class CppHeaderParser(object):
# filter off some common prefixes, which are meaningless for Python wrappers. # filter off some common prefixes, which are meaningless for Python wrappers.
# note that we do not strip "static" prefix, which does matter; # note that we do not strip "static" prefix, which does matter;
# it means class methods, not instance methods # it means class methods, not instance methods
decl_str = self.batch_replace(decl_str, [("static inline", ""), ("inline", ""), ("explicit ", ""), decl_str = self.batch_replace(decl_str, [("static inline", ""),
("CV_EXPORTS_W", ""), ("CV_EXPORTS", ""), ("CV_CDECL", ""), ("inline", ""),
("CV_WRAP ", " "), ("CV_INLINE", ""), ("explicit ", ""),
("CV_DEPRECATED", ""), ("CV_DEPRECATED_EXTERNAL", "")]).strip() ("CV_EXPORTS_W", ""),
("CV_EXPORTS", ""),
("CV_CDECL", ""),
("CV_WRAP ", " "),
("CV_INLINE", ""),
("CV_DEPRECATED", ""),
("CV_DEPRECATED_EXTERNAL", ""),
("CV_NODISCARD_STD", ""),
("CV_NODISCARD", "")]).strip()
if decl_str.strip().startswith('virtual'): if decl_str.strip().startswith('virtual'):
virtual_method = True virtual_method = True

View File

@ -0,0 +1,16 @@
diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt
index 0ba0dd78..7d34e7cb 100644
--- a/inference-engine/src/CMakeLists.txt
+++ b/inference-engine/src/CMakeLists.txt
@@ -26,9 +26,9 @@ endif()
add_subdirectory(hetero_plugin)
-add_subdirectory(auto_plugin)
+#add_subdirectory(auto_plugin)
-add_subdirectory(multi_device)
+#add_subdirectory(multi_device)
add_subdirectory(transformations)

View File

@ -0,0 +1,219 @@
diff --git a/cmake/developer_package/add_ie_target.cmake b/cmake/developer_package/add_ie_target.cmake
index d49f16a4d..2726ca787 100644
--- a/cmake/developer_package/add_ie_target.cmake
+++ b/cmake/developer_package/add_ie_target.cmake
@@ -92,7 +92,7 @@ function(addIeTarget)
if (ARG_TYPE STREQUAL EXECUTABLE)
add_executable(${ARG_NAME} ${all_sources})
elseif(ARG_TYPE STREQUAL STATIC OR ARG_TYPE STREQUAL SHARED)
- add_library(${ARG_NAME} ${ARG_TYPE} ${all_sources})
+ add_library(${ARG_NAME} ${ARG_TYPE} EXCLUDE_FROM_ALL ${all_sources})
else()
message(SEND_ERROR "Invalid target type ${ARG_TYPE} specified for target name ${ARG_NAME}")
endif()
diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt
index 1ac7fd8bf..df7091e51 100644
--- a/inference-engine/CMakeLists.txt
+++ b/inference-engine/CMakeLists.txt
@@ -39,7 +39,7 @@ if(ENABLE_TESTS)
add_subdirectory(tests)
endif()
-add_subdirectory(tools)
+#add_subdirectory(tools)
function(ie_build_samples)
# samples should be build with the same flags as from OpenVINO package,
@@ -58,7 +58,7 @@ endfunction()
# gflags and format_reader targets are kept inside of samples directory and
# they must be built even if samples build is disabled (required for tests and tools).
-ie_build_samples()
+#ie_build_samples()
if(ENABLE_PYTHON)
add_subdirectory(ie_bridges/python)
@@ -142,7 +142,7 @@ endif()
# Developer package
#
-openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
+#openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
# for Template plugin
if(NGRAPH_INTERPRETER_ENABLE)
@@ -166,7 +166,7 @@ function(ie_generate_dev_package_config)
@ONLY)
endfunction()
-ie_generate_dev_package_config()
+#ie_generate_dev_package_config()
#
# Coverage
diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt
index e8ed1a5c4..1fc9fc3ff 100644
--- a/inference-engine/src/inference_engine/CMakeLists.txt
+++ b/inference-engine/src/inference_engine/CMakeLists.txt
@@ -110,7 +110,7 @@ add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_sr
# Create object library
-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
${LIBRARY_SRC}
${LIBRARY_HEADERS}
${PUBLIC_HEADERS})
@@ -181,7 +181,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
# Static library used for unit tests which are always built
-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
$<TARGET_OBJECTS:${TARGET_NAME}_legacy_obj>
$<TARGET_OBJECTS:${TARGET_NAME}_obj>
${IE_STATIC_DEPENDENT_FILES})
diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt
index 8eae82bd2..e0e6745b1 100644
--- a/inference-engine/src/legacy_api/CMakeLists.txt
+++ b/inference-engine/src/legacy_api/CMakeLists.txt
@@ -26,7 +26,7 @@ endif()
file(TOUCH ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
${LIBRARY_SRC}
${PUBLIC_HEADERS})
diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
index fe57b29dd..07831e2fb 100644
--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -67,7 +67,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
# add test object library
-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS})
target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn)
target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt
index f9548339d..ef962145a 100644
--- a/inference-engine/src/preprocessing/CMakeLists.txt
+++ b/inference-engine/src/preprocessing/CMakeLists.txt
@@ -101,7 +101,7 @@ endif()
# Create object library
-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
${LIBRARY_SRC}
${LIBRARY_HEADERS})
@@ -153,7 +153,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
# Static library used for unit tests which are always built
-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
$<TARGET_OBJECTS:${TARGET_NAME}_obj>)
set_ie_threading_interface_for(${TARGET_NAME}_s)
diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt
index 249e47c28..4ddf63049 100644
--- a/inference-engine/src/vpu/common/CMakeLists.txt
+++ b/inference-engine/src/vpu/common/CMakeLists.txt
@@ -5,7 +5,7 @@
file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h)
function(add_common_target TARGET_NAME STATIC_IE)
- add_library(${TARGET_NAME} STATIC ${SOURCES})
+ add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
ie_faster_build(${TARGET_NAME}
UNITY
@@ -60,7 +60,7 @@ add_common_target("vpu_common_lib" FALSE)
# Unit tests support for graph transformer
if(WIN32)
- add_common_target("vpu_common_lib_test_static" TRUE)
+ #add_common_target("vpu_common_lib_test_static" TRUE)
else()
add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib")
endif()
diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
index bc73ab5b1..b4c1547fc 100644
--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
@@ -5,7 +5,7 @@
file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h *.inc)
function(add_graph_transformer_target TARGET_NAME STATIC_IE)
- add_library(${TARGET_NAME} STATIC ${SOURCES})
+ add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
set_ie_threading_interface_for(${TARGET_NAME})
@@ -70,7 +70,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE)
# Unit tests support for graph transformer
if(WIN32)
- add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
+ #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
else()
add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer")
endif()
diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt
index 8bcb2801a..f7e031c01 100644
--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt
+++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt
@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS)
else()
add_library(pugixml STATIC ${SOURCES})
if (MSVC)
- add_library(pugixml_mt STATIC ${SOURCES})
+ #add_library(pugixml_mt STATIC ${SOURCES})
#if (WIN32)
# set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
diff --git a/ngraph/core/builder/CMakeLists.txt b/ngraph/core/builder/CMakeLists.txt
index ff5c381e7..2797ec9ab 100644
--- a/ngraph/core/builder/CMakeLists.txt
+++ b/ngraph/core/builder/CMakeLists.txt
@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
source_group("include" FILES ${PUBLIC_HEADERS})
# Create shared library
-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
if(COMMAND ie_faster_build)
ie_faster_build(${TARGET_NAME}
diff --git a/ngraph/core/reference/CMakeLists.txt b/ngraph/core/reference/CMakeLists.txt
index ef4a764ab..f6d3172e2 100644
--- a/ngraph/core/reference/CMakeLists.txt
+++ b/ngraph/core/reference/CMakeLists.txt
@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
source_group("include" FILES ${PUBLIC_HEADERS})
# Create shared library
-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
if(COMMAND ie_faster_build)
ie_faster_build(${TARGET_NAME}
diff --git a/openvino/itt/CMakeLists.txt b/openvino/itt/CMakeLists.txt
index e9f880b8c..c63f4df63 100644
--- a/openvino/itt/CMakeLists.txt
+++ b/openvino/itt/CMakeLists.txt
@@ -6,7 +6,7 @@ set(TARGET_NAME itt)
file(GLOB_RECURSE SOURCES "src/*.cpp" "src/*.hpp")
-add_library(${TARGET_NAME} STATIC ${SOURCES})
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
add_library(openvino::itt ALIAS ${TARGET_NAME})

View File

@ -0,0 +1,15 @@
iff --git a/CMakeLists.txt b/CMakeLists.txt
index e0706a72e..9a053b1e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,10 @@ cmake_minimum_required(VERSION 3.13)
project(OpenVINO)
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi /FS")
+set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
+set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
+
set(OpenVINO_MAIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(IE_MAIN_SOURCE_DIR ${OpenVINO_MAIN_SOURCE_DIR}/inference-engine)

View File

@ -0,0 +1,16 @@
diff --git a/cmake/developer_package/vs_version/vs_version.cmake b/cmake/developer_package/vs_version/vs_version.cmake
index 14d4c0e1e..6a44f73b9 100644
--- a/cmake/developer_package/vs_version/vs_version.cmake
+++ b/cmake/developer_package/vs_version/vs_version.cmake
@@ -8,9 +8,9 @@ set(IE_VS_VER_FILEVERSION_STR "${IE_VERSION_MAJOR}.${IE_VERSION_MINOR}.${IE_VERS
set(IE_VS_VER_COMPANY_NAME_STR "Intel Corporation")
set(IE_VS_VER_PRODUCTVERSION_STR "${CI_BUILD_NUMBER}")
-set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit")
+set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit (for OpenCV Windows package)")
set(IE_VS_VER_COPYRIGHT_STR "Copyright (C) 2018-2021, Intel Corporation")
-set(IE_VS_VER_COMMENTS_STR "https://docs.openvinotoolkit.org/")
+set(IE_VS_VER_COMMENTS_STR "https://github.com/opencv/opencv/wiki/Intel%27s-Deep-Learning-Inference-Engine-backend")
#
# ie_add_vs_version_file(NAME <name>

View File

@ -0,0 +1 @@
# Pin the build identifier embedded into the packaged OpenVINO binaries:
# vs_version.cmake uses ${CI_BUILD_NUMBER} as IE_VS_VER_PRODUCTVERSION_STR,
# so the DLLs report this OpenCV-specific version string.
# NOTE(review): snippet is presumably exec()'d by the winpack build driver
# with `os` already in scope — confirm against the caller.
os.environ['CI_BUILD_NUMBER'] = '2021.4.1-opencv_winpack_dldt'

View File

@ -0,0 +1,4 @@
# Patches applied, in order, to the checked-out DLDT/OpenVINO source tree
# before building (see the corresponding diff hunks in this commit).
applyPatch('20210630-dldt-disable-unused-targets.patch')  # mark unused libs EXCLUDE_FROM_ALL
applyPatch('20210630-dldt-pdb.patch')  # /Zi + /DEBUG: produce PDBs in Release builds
applyPatch('20210630-dldt-disable-multidevice-autoplugin.patch')
applyPatch('20210630-dldt-vs-version.patch')  # OpenCV-specific VS version resource strings

View File

@ -0,0 +1,56 @@
# Sysroot assembly for the OpenVINO (DLDT) Windows package.
# NOTE(review): this snippet appears to be exec()'d by the winpack build driver
# with `self` (the builder), `prepare_dir`, `copytree` and `rm_one` already in
# scope — it is not importable standalone; confirm against the caller.

# Runtime binaries go under <sysroot>/bin; the ngraph install tree is copied whole.
sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin')
copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph')
#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll')

build_config = 'Release' if not self.config.build_debug else 'Debug'
build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config

def copy_bin(name):
    # Copy one file from the build output into <sysroot>/bin.
    # `global` works because this snippet runs under exec() and these names
    # live in the shared globals dict rather than a real module scope.
    global build_bin_dir, sysroot_bin_dir
    copytree(build_bin_dir / name, sysroot_bin_dir / name)

# Debug builds of OpenVINO libraries carry a 'd' filename suffix.
dll_suffix = 'd' if self.config.build_debug else ''
def copy_dll(name):
    # Copy both the DLL and its matching PDB (debug symbols).
    global copy_bin, dll_suffix
    copy_bin(name + dll_suffix + '.dll')
    copy_bin(name + dll_suffix + '.pdb')

copy_bin('cache.json')
copy_dll('clDNNPlugin')
copy_dll('HeteroPlugin')
copy_dll('inference_engine')
copy_dll('inference_engine_ir_reader')
#copy_dll('inference_engine_ir_v7_reader')
copy_dll('inference_engine_legacy')
copy_dll('inference_engine_transformations') # runtime
copy_dll('inference_engine_lp_transformations') # runtime
#copy_dll('inference_engine_preproc') # runtime
copy_dll('MKLDNNPlugin') # runtime
copy_dll('myriadPlugin') # runtime
#copy_dll('MultiDevicePlugin') # runtime, not used
copy_dll('ngraph')
copy_bin('plugins.xml')
copy_bin('pcie-ma2x8x.elf')  # firmware blob — presumably for myriadPlugin; verify
copy_bin('usb-ma2x8x.mvcmd')  # firmware blob — presumably for myriadPlugin; verify
copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir)
copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb')

# SDK layout: headers + import libraries under deployment_tools/inference_engine.
sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine')
sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64')

copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include')
# Import (.lib) files follow the same 'd' suffix convention as the DLLs above.
if not self.config.build_debug:
    copytree(build_bin_dir / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib')
    copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib')
    copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib')
    copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib')
else:
    copytree(build_bin_dir / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib')
    copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib')
    copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib')
    copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib')

# Bundle the upstream license texts for redistribution.
sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses')
copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE')
copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE')

View File

@@ -471,7 +471,8 @@ class Builder:
 def main():
     dldt_src_url = 'https://github.com/openvinotoolkit/openvino'
-    dldt_src_commit = '2021.4'
+    dldt_src_commit = '2021.4.1'
+    dldt_config = None
     dldt_release = None
     build_cache_dir_default = os.environ.get('BUILD_CACHE_DIR', '.build_cache')
@@ -505,7 +506,7 @@ def main():
     parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)')
     parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)')
-    parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
+    parser.add_argument('--dldt_config', default=dldt_config, help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
     parser.add_argument('--override_patch_hashsum', default='', help='(script debug mode)')