Merge remote-tracking branch 'upstream/3.4' into merge-3.4

This commit is contained in:
Alexander Alekhin 2021-09-11 16:32:13 +00:00
commit c3ac834526
35 changed files with 1014 additions and 444 deletions

View File

@ -141,8 +141,8 @@ if(INF_ENGINE_TARGET)
endif()
endif()
if(NOT INF_ENGINE_RELEASE AND NOT INF_ENGINE_RELEASE_INIT)
message(WARNING "InferenceEngine version has not been set, 2021.4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
set(INF_ENGINE_RELEASE_INIT "2021040000")
message(WARNING "InferenceEngine version has not been set, 2021.4.1 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
set(INF_ENGINE_RELEASE_INIT "2021040100")
elseif(DEFINED INF_ENGINE_RELEASE)
set(INF_ENGINE_RELEASE_INIT "${INF_ENGINE_RELEASE}")
endif()

View File

@ -98,7 +98,7 @@ import numpy as np
import cv2 as cv
from matplotlib import pyplot as plt
img = cv.imread('simple.jpg',0)
img = cv.imread('blox.jpg',0) # `<opencv_root>/samples/data/blox.jpg`
# Initiate FAST object with default values
fast = cv.FastFeatureDetector_create()
@ -113,17 +113,17 @@ print( "nonmaxSuppression:{}".format(fast.getNonmaxSuppression()) )
print( "neighborhood: {}".format(fast.getType()) )
print( "Total Keypoints with nonmaxSuppression: {}".format(len(kp)) )
cv.imwrite('fast_true.png',img2)
cv.imwrite('fast_true.png', img2)
# Disable nonmaxSuppression
fast.setNonmaxSuppression(0)
kp = fast.detect(img,None)
kp = fast.detect(img, None)
print( "Total Keypoints without nonmaxSuppression: {}".format(len(kp)) )
img3 = cv.drawKeypoints(img, kp, None, color=(255,0,0))
cv.imwrite('fast_false.png',img3)
cv.imwrite('fast_false.png', img3)
@endcode
See the results. First image shows FAST with nonmaxSuppression and second one without
nonmaxSuppression:

View File

@ -499,8 +499,8 @@ public:
template<typename... _Tps> inline
Kernel& args(const _Tps&... kernel_args) { set_args_(0, kernel_args...); return *this; }
/** @brief Run the OpenCL kernel (globalsize value may be adjusted)
/** @brief Run the OpenCL kernel.
@param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
@param globalsize work items for each dimension. It is not the final globalsize passed to
OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
@ -509,12 +509,26 @@ public:
@param localsize work-group size for each dimension.
@param sync specify whether to wait for OpenCL computation to finish before return.
@param q command queue
@note Use run_() if your kernel code doesn't support adjusted globalsize.
*/
bool run(int dims, size_t globalsize[],
size_t localsize[], bool sync, const Queue& q=Queue());
/** @brief Run the OpenCL kernel
*
* @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
* @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
* @param localsize work-group size for each dimension.
* @param sync specify whether to wait for OpenCL computation to finish before return.
* @param q command queue
*/
bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
bool runTask(bool sync, const Queue& q=Queue());
/** @brief Similar to synchronized run() call with returning of kernel execution time
/** @brief Similar to synchronized run_() call with returning of kernel execution time
*
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
* @return Execution time in nanoseconds or negative number on error
*/

View File

@ -24,11 +24,6 @@
#ifdef HAVE_OPENCL
#include <sstream>
#include "opencl_kernels_core.hpp"
#include "opencv2/core/opencl/runtime/opencl_clblas.hpp"
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
namespace cv
{
@ -37,52 +32,75 @@ static bool intel_gpu_gemm(
UMat B, Size sizeB,
UMat D, Size sizeD,
double alpha, double beta,
bool atrans, bool btrans)
bool atrans, bool btrans,
bool& isPropagatedC2D
)
{
CV_UNUSED(sizeB);
int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width);
std::string kernelName;
bool ret = true;
if (M < 4 || N < 4 || K < 4) // vload4
return false;
size_t lx = 8, ly = 4;
size_t dx = 4, dy = 8;
CV_LOG_VERBOSE(NULL, 0, "M=" << M << " N=" << N << " K=" << K);
std::string kernelName;
unsigned int lx = 8, ly = 4;
unsigned int dx = 4, dy = 8;
if(!atrans && !btrans)
{
if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
{
kernelName = "intelblas_gemm_buffer_NN_sp";
}
else
{
if (M % 2 != 0)
return false;
// vload4(0, dst_write0) - 4 cols
// multiply by lx: 8
if (N % (4*8) != 0)
return false;
kernelName = "intelblas_gemm_buffer_NN";
}
}
else if(atrans && !btrans)
{
if (M % 32 != 0)
return false;
if (N % 32 != 0)
return false;
kernelName = "intelblas_gemm_buffer_TN";
}
else if(!atrans && btrans)
{
if (K % 4 != 0)
return false;
kernelName = "intelblas_gemm_buffer_NT";
ly = 16;
dx = 1;
}
else
{
if (M % 32 != 0)
return false;
if (N % 32 != 0)
return false;
if (K % 16 != 0)
return false;
kernelName = "intelblas_gemm_buffer_TT";
}
const size_t gx = (size_t)(N + dx - 1) / dx;
const size_t gy = (size_t)(M + dy - 1) / dy;
CV_LOG_DEBUG(NULL, "kernel: " << kernelName << " (M=" << M << " N=" << N << " K=" << K << ")");
const size_t gx = divUp((size_t)N, dx);
const size_t gy = divUp((size_t)M, dy);
size_t local[] = {lx, ly, 1};
size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1};
int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
size_t global[] = {roundUp(gx, lx), roundUp(gy, ly), 1};
ocl::Queue q;
String errmsg;
@ -110,10 +128,13 @@ static bool intel_gpu_gemm(
(int)(D.step / sizeof(float))
);
ret = k.run(2, global, local, false, q);
bool ret = k.run(2, global, local, false, q);
return ret;
}
else
{
int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
for(int start_index = 0; start_index < K; start_index += stride)
{
ocl::Kernel k(kernelName.c_str(), program);
@ -132,12 +153,16 @@ static bool intel_gpu_gemm(
(int) start_index, // 14 start_index
stride);
ret = k.run(2, global, local, false, q);
if (!ret) return ret;
bool ret = k.run(2, global, local, false, q);
if (!ret)
{
if (start_index != 0)
isPropagatedC2D = false; // D array content is changed, need to rewrite
return false;
}
}
return true;
}
return ret;
}
} // namespace cv

View File

@ -42,6 +42,8 @@
//M*/
#include "precomp.hpp"
#include <opencv2/core/utils/logger.hpp>
#include "opencl_kernels_core.hpp"
#include "opencv2/core/opencl/runtime/opencl_clblas.hpp"
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
InputArray matC, double beta, OutputArray matD, int flags )
{
int depth = matA.depth(), cn = matA.channels();
int type = CV_MAKETYPE(depth, cn);
int type = matA.type();
int depth = CV_MAT_DEPTH(type);
int cn = CV_MAT_CN(type);
CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
CV_CheckTypeEQ(type, matB.type(), "");
CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, "");
const ocl::Device & dev = ocl::Device::getDefault();
bool doubleSupport = dev.doubleFPConfig() > 0;
@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
CV_Assert( !haveC || matC.type() == type );
if (haveC)
CV_CheckTypeEQ(type, matC.type(), "");
Size sizeD(((btrans) ? sizeB.height : sizeB.width),
((atrans) ? sizeA.width : sizeA.height));
if (atrans)
sizeA = Size(sizeA.height, sizeA.width);
if (btrans)
sizeB = Size(sizeB.height, sizeB.width);
if (haveC && ctrans)
sizeC = Size(sizeC.height, sizeC.width);
CV_CheckEQ(sizeA.width, sizeB.height, "");
if (haveC)
CV_CheckEQ(sizeC, sizeD, "");
UMat A = matA.getUMat();
UMat B = matB.getUMat();
Size sizeD(((btrans)? sizeB.height : sizeB.width),
((atrans)? sizeA.width : sizeA.height));
matD.create(sizeD, type);
UMat D = matD.getUMat();
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
bool isPropagatedC2D = false; // D content is updated with C / C.t()
if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
{
String opts;
if (atrans)
sizeA = Size(sizeA.height, sizeA.width);
if (btrans)
sizeB = Size(sizeB.height, sizeB.width);
if (haveC && ctrans)
sizeC = Size(sizeC.height, sizeC.width);
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
int max_wg_size = (int)dev.maxWorkGroupSize();
int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
if (atrans)
A = A.t();
if (btrans)
B = B.t();
if (haveC)
ctrans ? transpose(matC, D) : matC.copyTo(D);
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
cn, kercn, block_size,
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
haveC ? " -D HAVE_C" : "",
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
if (k.empty())
return false;
if (depth == CV_64F)
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, alpha, beta);
else
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, (float)alpha, (float)beta);
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
}
else
if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1)
{
if (haveC && beta != 0.0)
{
ctrans ? transpose(matC, D) : matC.copyTo(D);
isPropagatedC2D = true;
}
else
{
beta = 0.0;
}
return intel_gpu_gemm(A, sizeA,
B, sizeB,
D, sizeD,
alpha,
beta,
atrans, btrans);
bool res = intel_gpu_gemm(A, matA.size(),
B, matB.size(),
D, sizeD,
alpha,
beta,
atrans, btrans,
isPropagatedC2D);
if (res)
return true;
// fallback on generic OpenCL code
}
if (sizeD.width < 8 || sizeD.height < 8)
return false;
String opts;
int wg_size = (int)dev.maxWorkGroupSize();
int sizeDmin = std::min(sizeD.width, sizeD.height);
wg_size = std::min(wg_size, sizeDmin * sizeDmin);
int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
if (atrans)
A = A.t();
if (btrans)
B = B.t();
if (haveC && !isPropagatedC2D)
ctrans ? transpose(matC, D) : matC.copyTo(D);
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
cn, kercn, block_size,
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
haveC ? " -D HAVE_C" : "",
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
if (k.empty())
return false;
if (depth == CV_64F)
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, alpha, beta);
else
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, (float)alpha, (float)beta);
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
return k.run(2, globalsize, block_size !=1 ? localsize : NULL, false);
}
#endif

View File

@ -76,8 +76,11 @@
#undef CV__ALLOCATOR_STATS_LOG
#define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
#define CV_OPENCL_SHOW_BUILD_OPTIONS 0
#define CV_OPENCL_SHOW_BUILD_KERNELS 0
#define CV_OPENCL_SHOW_RUN_KERNELS 0
#define CV_OPENCL_SYNC_RUN_KERNELS 0
#define CV_OPENCL_TRACE_CHECK 0
#define CV_OPENCL_VALIDATE_BINARY_PROGRAMS 1
@ -2155,20 +2158,22 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
platforms.resize(numPlatforms);
}
int selectedPlatform = -1;
if (platform.length() > 0)
{
for (size_t i = 0; i < platforms.size(); i++)
for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end();)
{
std::string name;
CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, *currentPlatform, CL_PLATFORM_NAME, name));
if (name.find(platform) != std::string::npos)
{
selectedPlatform = (int)i;
break;
++currentPlatform;
}
else
{
currentPlatform = platforms.erase(currentPlatform);
}
}
if (selectedPlatform == -1)
if (platforms.size() == 0)
{
CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
goto not_found;
@ -2205,13 +2210,11 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
goto not_found;
}
std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
(selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
i++)
std::vector<cl_device_id> devices;
for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end(); ++currentPlatform)
{
cl_uint count = 0;
cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
cl_int status = clGetDeviceIDs(*currentPlatform, deviceType, 0, NULL, &count);
if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
{
CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
@ -2220,7 +2223,7 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
continue;
size_t base = devices.size();
devices.resize(base + count);
status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
status = clGetDeviceIDs(*currentPlatform, deviceType, count, &devices[base], &count);
if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
{
CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
@ -3679,6 +3682,8 @@ static cv::String dumpValue(size_t sz, const void* p)
{
if (!p)
return "NULL";
if (sz == 2)
return cv::format("%d / %uu / 0x%04x", *(short*)p, *(unsigned short*)p, *(short*)p);
if (sz == 4)
return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
if (sz == 8)
@ -3851,6 +3856,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
}
bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
bool sync, const Queue& q)
{
CV_Assert(p);
return p->run(dims, _globalsize, _localsize, sync, NULL, q);
}
static bool isRaiseErrorOnReuseAsyncKernel()
{
static bool initialized = false;
@ -3891,6 +3904,10 @@ bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
return false; // OpenCV 5.0: raise error
}
#if CV_OPENCL_SYNC_RUN_KERNELS
sync = true;
#endif
cl_command_queue qq = getQueue(q);
if (haveTempDstUMats)
sync = true;
@ -4338,7 +4355,28 @@ struct Program::Impl
if (!param_buildExtraOptions.empty())
buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
}
#if CV_OPENCL_SHOW_BUILD_OPTIONS
CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' options:" << buildflags);
#endif
compile(ctx, src_, errmsg);
#if CV_OPENCL_SHOW_BUILD_KERNELS
if (handle)
{
size_t retsz = 0;
char kernels_buffer[4096] = {0};
cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
CV_OCL_DBG_CHECK_RESULT(result, cv::format("clGetProgramInfo(CL_PROGRAM_KERNEL_NAMES: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
if (result == CL_SUCCESS && retsz < sizeof(kernels_buffer))
{
kernels_buffer[retsz] = 0;
CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' kernels: '" << kernels_buffer << "'");
}
else
{
CV_LOG_ERROR(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' can't retrieve kernel names!");
}
}
#endif
}
bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
@ -4570,7 +4608,6 @@ struct Program::Impl
CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
}
#endif
}
return handle != NULL;
}

View File

@ -392,6 +392,15 @@ __kernel void intelblas_gemm_buffer_NN(
#define TILE_N 8
#define SLM_BLOCK 512
/*
A K B.t() K D N
----------- ----------- -----------
| | | | | |
M | | x N | | => M | |
| | | | | |
----------- ----------- -----------
*/
__attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1)))
__kernel void intelblas_gemm_buffer_NT(
const __global float *src0, int off0,
@ -422,59 +431,79 @@ __kernel void intelblas_gemm_buffer_NT(
float8 dot06 = 0.f;
float8 dot07 = 0.f;
float4 brow0;
float4 brow1;
float4 brow2;
float4 brow3;
float4 brow4;
float4 brow5;
float4 brow6;
float4 brow7;
const int dst_row = (global_y * TILE_M);
__global float *dst_write0 = dst + global_x + dst_row * ldC + offd;
__global float *dst_write0 = dst + local_x * VEC_SIZE + ( group_x * TILE_N ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * ldC + offd;
const __global float *src0_read00 = src0 + off0;
const int a_row_base = global_y * TILE_M;
const int a_col_base = local_x * (TILE_K / 8); // <= TILE_K - 4
const __global float *src0_read = src0 + local_x * ( TILE_K / 8 ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M ) * ldA + off0;
const __global float *src1_read0 = src1 + ( group_x * TILE_N ) * ldB + off1;
const __global float *src1_read00 = src1 + off1;
const int b_row_base = (group_x * TILE_N);
//const int b_col_base = 0;
__local float slm_brow[8 * SLM_BLOCK];
__local float* slm_brow0;
int local_index = mad24(local_y, 8, local_x) * 4;
int w;
for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {
int w = 0;
for (int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK)
{
#define UPDATE_BROW(_row) \
{ \
float4 brow; \
int b_row = b_row_base + _row; \
int b_col = b_tile + local_index; \
if (b_row < N && b_col <= K - 4 /*vload4*/) \
brow = vload4(0, src1_read00 + mad24(b_row, ldB, b_col)); \
else \
brow = (float4)0; \
vstore4(brow, 0, slm_brow + mad24(_row, SLM_BLOCK, local_index)); \
}
barrier(CLK_LOCAL_MEM_FENCE);
vstore4(vload4(0, src1_read0 + mad24(0, ldB, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));
vstore4(vload4(0, src1_read0 + mad24(1, ldB, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));
vstore4(vload4(0, src1_read0 + mad24(2, ldB, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));
vstore4(vload4(0, src1_read0 + mad24(3, ldB, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));
vstore4(vload4(0, src1_read0 + mad24(4, ldB, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));
vstore4(vload4(0, src1_read0 + mad24(5, ldB, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));
vstore4(vload4(0, src1_read0 + mad24(6, ldB, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));
vstore4(vload4(0, src1_read0 + mad24(7, ldB, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));
UPDATE_BROW(0);
UPDATE_BROW(1);
UPDATE_BROW(2);
UPDATE_BROW(3);
UPDATE_BROW(4);
UPDATE_BROW(5);
UPDATE_BROW(6);
UPDATE_BROW(7);
barrier(CLK_LOCAL_MEM_FENCE);
#undef UPDATE_BROW
slm_brow0 = slm_brow + local_x * (TILE_K / 8);
w = b_tile;
int end_w = min(b_tile + SLM_BLOCK, K);
while( w + TILE_K <= end_w ) {
float4 arow;
for (int k_tile_offset = 0; k_tile_offset < SLM_BLOCK; k_tile_offset += TILE_K)
{
int a_col = a_col_base + b_tile + k_tile_offset;
brow0 = vload4(0, slm_brow0 + 0 * SLM_BLOCK);
brow1 = vload4(0, slm_brow0 + 1 * SLM_BLOCK);
brow2 = vload4(0, slm_brow0 + 2 * SLM_BLOCK);
brow3 = vload4(0, slm_brow0 + 3 * SLM_BLOCK);
brow4 = vload4(0, slm_brow0 + 4 * SLM_BLOCK);
brow5 = vload4(0, slm_brow0 + 5 * SLM_BLOCK);
brow6 = vload4(0, slm_brow0 + 6 * SLM_BLOCK);
brow7 = vload4(0, slm_brow0 + 7 * SLM_BLOCK);
if (a_col > K - 4 /*vload4*/)
break;
#define MM_DOT_PRODUCT(_row,_dot) \
arow = vload4(0, src0_read + _row * ldA); \
_dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
_dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
_dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
_dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );
int slm_brow_col = a_col_base + k_tile_offset; // <= SLM_BLOCK - 4
#define READ_SLM_BROW(_row) \
float4 brow##_row = vload4(0, slm_brow + mad24(_row, SLM_BLOCK, slm_brow_col));
READ_SLM_BROW(0);
READ_SLM_BROW(1);
READ_SLM_BROW(2);
READ_SLM_BROW(3);
READ_SLM_BROW(4);
READ_SLM_BROW(5);
READ_SLM_BROW(6);
READ_SLM_BROW(7);
#undef READ_SLM_BROW
#define MM_DOT_PRODUCT(_row,_dot) \
{ \
int a_row = a_row_base + _row; \
if (a_row < M) { \
float4 arow = vload4(0, src0_read00 + mad24(a_row, ldA, a_col)); \
_dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
_dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
_dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
_dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); \
} \
}
MM_DOT_PRODUCT(0,dot00);
MM_DOT_PRODUCT(1,dot01);
@ -485,53 +514,7 @@ __kernel void intelblas_gemm_buffer_NT(
MM_DOT_PRODUCT(6,dot06);
MM_DOT_PRODUCT(7,dot07);
#undef MM_DOT_PRODUCT
src0_read += TILE_K;
slm_brow0 += TILE_K;
w += TILE_K;
}
src1_read0 += SLM_BLOCK;
}
if(w < K) {
float4 arow;
#define READ_BROW(_brow,_row) \
_brow = vload4(0, slm_brow0 + _row * SLM_BLOCK); \
_brow.x = (mad24(local_x, 4, w) < K) ? _brow.x : 0.0f; \
_brow.y = (mad24(local_x, 4, w + 1) < K) ? _brow.y : 0.0f; \
_brow.z = (mad24(local_x, 4, w + 2) < K) ? _brow.z : 0.0f; \
_brow.w = (mad24(local_x, 4, w + 3) < K) ? _brow.w : 0.0f;
READ_BROW(brow0,0);
READ_BROW(brow1,1);
READ_BROW(brow2,2);
READ_BROW(brow3,3);
READ_BROW(brow4,4);
READ_BROW(brow5,5);
READ_BROW(brow6,6);
READ_BROW(brow7,7);
#define MM_DOT_PRODUCT(_row,_dot) \
arow = vload4(0, src0_read + _row * ldA); \
arow.x = (mad24(local_x, 4, w) < K) ? arow.x : 0.0f; \
arow.y = (mad24(local_x, 4, w + 1) < K) ? arow.y : 0.0f; \
arow.z = (mad24(local_x, 4, w + 2) < K) ? arow.z : 0.0f; \
arow.w = (mad24(local_x, 4, w + 3) < K) ? arow.w : 0.0f; \
_dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
_dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
_dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
_dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );
MM_DOT_PRODUCT(0,dot00);
MM_DOT_PRODUCT(1,dot01);
MM_DOT_PRODUCT(2,dot02);
MM_DOT_PRODUCT(3,dot03);
MM_DOT_PRODUCT(4,dot04);
MM_DOT_PRODUCT(5,dot05);
MM_DOT_PRODUCT(6,dot06);
MM_DOT_PRODUCT(7,dot07);
#undef MM_DOT_PRODUCT
}
#define REDUCE(_dot) \
@ -572,21 +555,22 @@ __kernel void intelblas_gemm_buffer_NT(
output = (local_x == 5) ? _dot.s5 : output; \
output = (local_x == 6) ? _dot.s6 : output; \
output = (local_x == 7) ? _dot.s7 : output; \
if (beta != 0.0) \
if (beta != 0.0f) \
dst_write0[0] = mad(output, (float)alpha, ((float)beta * dst_write0[0])); \
else \
dst_write0[0] = output * (float)alpha; \
dst_write0 += ldC;
if(global_x < N && global_y * 8 < M) {
OUTPUT(dot00);
if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); }
if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); }
if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); }
if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); }
if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); }
if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); }
if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); }
if (global_x < N && dst_row < M)
{
/*if (dst_row + 0 < M)*/ { OUTPUT(dot00); }
if (dst_row + 1 < M) { OUTPUT(dot01); }
if (dst_row + 2 < M) { OUTPUT(dot02); }
if (dst_row + 3 < M) { OUTPUT(dot03); }
if (dst_row + 4 < M) { OUTPUT(dot04); }
if (dst_row + 5 < M) { OUTPUT(dot05); }
if (dst_row + 6 < M) { OUTPUT(dot06); }
if (dst_row + 7 < M) { OUTPUT(dot07); }
}
#undef OUTPUT
}

View File

@ -67,6 +67,8 @@ PARAM_TEST_CASE(Gemm,
double alpha, beta;
int M, N, K;
TEST_DECLARE_INPUT_PARAMETER(A);
TEST_DECLARE_INPUT_PARAMETER(B);
TEST_DECLARE_INPUT_PARAMETER(C);
@ -90,30 +92,27 @@ PARAM_TEST_CASE(Gemm,
void generateTestData()
{
// set minimum size to 20, since testing less sizes doesn't make sense
Size ARoiSize = randomSize(20, MAX_VALUE);
M = (int)randomDoubleLog(1, 100);
N = (int)randomDoubleLog(1, 100);
K = (int)randomDoubleLog(1, 1200);
M = roundUp(M, 1);
N = roundUp(N, 1);
K = roundUp(K, 1);
Size ARoiSize = (atrans) ? Size(M, K) : Size(K, M);
Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11);
if (atrans)
ARoiSize = Size(ARoiSize.height, ARoiSize.width);
Size BRoiSize = randomSize(20, MAX_VALUE);
if (btrans)
BRoiSize.width = ARoiSize.width;
else
BRoiSize.height = ARoiSize.width;
Size BRoiSize = (btrans) ? Size(K, N) : Size(N, K);
Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11);
if (btrans)
BRoiSize = Size(BRoiSize.height, BRoiSize.width);
Size DRoiSize = Size(BRoiSize.width, ARoiSize.height), CRoiSizeT(DRoiSize.height, DRoiSize.width);
Size CRoiSize = (ctrans) ? Size(M, N) : Size(N, M);
Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(C, C_roi, ctrans ? CRoiSizeT : DRoiSize, CBorder, type, -11, 11);
randomSubMat(C, C_roi, CRoiSize, CBorder, type, -11, 11);
Size DRoiSize = Size(N, M);
Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11);
@ -132,11 +131,12 @@ OCL_TEST_P(Gemm, Accuracy)
for (int i = 0; i < test_loop_times; ++i)
{
generateTestData();
SCOPED_TRACE(cv::format("i=%d: M=%d N=%d K=%d", i, M, N, K));
OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags));
OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags));
double eps = D_roi.size().area() * 1e-4;
double eps = D_roi.size().area() * (1e-5 * K);
OCL_EXPECT_MATS_NEAR(D, eps);
}
}

View File

@ -238,7 +238,7 @@ public:
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
bool ret = kernel.run(2, global, NULL, false);
bool ret = kernel.run_(2, global, NULL, false);
if (!ret)
return false;
}

View File

@ -1951,7 +1951,7 @@ Ptr<Layer> ChannelsPReLULayer::create(const LayerParams& params)
if (params.blobs[0].total() == 1)
{
LayerParams reluParams = params;
reluParams.set("negative_slope", params.blobs[0].at<float>(0));
reluParams.set("negative_slope", *params.blobs[0].ptr<float>());
return ReLULayer::create(reluParams);
}
Ptr<ChannelsPReLULayer> l(new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(params.blobs[0])));

View File

@ -200,7 +200,7 @@ public:
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
ret = k1.run(1, globalsize, localsize, false);
ret = k1.run_(1, globalsize, localsize, false);
if (!ret)
return false;
}

View File

@ -120,7 +120,14 @@ public:
internals_arr.getMatVector(internals);
if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3])
{
// outputs[0] = inputs[0] doesn't work due to BlobManager optimizations
if (inputs[0].data != outputs[0].data)
{
inputs[0].copyTo(outputs[0]);
}
return;
}
Mat& inp = inputs[0];
Mat& out = outputs[0];

View File

@ -64,6 +64,31 @@ namespace cv
namespace dnn
{
void sliceRangesFromShape(const MatShape& inpShape, int& axis, std::vector<std::vector<cv::Range> >& sliceRanges)
{
CV_Assert(inpShape.size() > 0);
bool axisNeg = (axis < 0);
axis = (axis + static_cast<int>(inpShape.size())) % inpShape.size();
int n = inpShape[axis];
for (size_t i = 0; i < sliceRanges.size(); ++i){
std::vector<Range>& ranges = sliceRanges[i];
if (axisNeg)
{
ranges.insert(ranges.begin(), axis, Range::all());
}
Range& range = ranges.back();
if (range.start >= 0)
{
continue;
}
CV_Assert(n != 0);
range.start = (n + range.start) % n;
}
}
class SliceLayerImpl : public SliceLayer
{
public:
@ -75,20 +100,22 @@ public:
num_split = params.get<int>("num_split", 0);
hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false);
shapesInitialized = !hasDynamicShapes;
if (params.has("slice_point"))
{
CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end"));
const DictValue &indicesValue = params.get("slice_point");
int size = axis > 0 ? axis + 1 : 1;
sliceRanges.resize(indicesValue.size() + 1,
std::vector<Range>(std::max(axis,0) + 1, Range::all()));
std::vector<Range>(size, Range::all()));
int prevSlice = 0;
for (int i = 0; i < indicesValue.size(); ++i)
{
sliceRanges[i][axis].start = prevSlice;
sliceRanges[i][axis].end = indicesValue.get<int>(i);
prevSlice = sliceRanges[i][axis].end;
sliceRanges[i][size - 1].start = prevSlice;
sliceRanges[i][size - 1].end = indicesValue.get<int>(i);
prevSlice = sliceRanges[i][size - 1].end;
}
sliceRanges.back()[axis].start = prevSlice;
sliceRanges.back()[size - 1].start = prevSlice;
}
else if (params.has("begin"))
{
@ -103,7 +130,6 @@ public:
{
int start = begins.get<int>(i);
int sizeOrEnd = sizesOrEnds.get<int>(i); // It may be negative to reverse indexation.
CV_Assert(start >= 0);
sliceRanges[0][i].start = start;
if (params.has("size"))
@ -164,16 +190,20 @@ public:
CV_Assert(inputs.size() == 1);
MatShape inpShape = inputs[0];
if (!sliceRanges.empty())
int axis_rw = axis;
std::vector<std::vector<cv::Range> > sliceRanges_rw = sliceRanges;
sliceRangesFromShape(inpShape, axis_rw, sliceRanges_rw);
if (!sliceRanges_rw.empty())
{
outputs.resize(sliceRanges.size(), inpShape);
outputs.resize(sliceRanges_rw.size(), inpShape);
for (int i = 0; i < outputs.size(); ++i)
{
CV_Assert(sliceRanges[i].size() <= inpShape.size());
for (int j = 0; j < sliceRanges[i].size(); ++j)
CV_Assert(sliceRanges_rw[i].size() <= inpShape.size());
for (int j = 0; j < sliceRanges_rw[i].size(); ++j)
{
if (shapesInitialized || inpShape[j] > 0)
outputs[i][j] = normalize_axis_range(sliceRanges[i][j], inpShape[j]).size();
outputs[i][j] = normalize_axis_range(sliceRanges_rw[i][j], inpShape[j]).size();
if (!sliceSteps.empty() && (i < sliceSteps.size()) && (j < sliceSteps[i].size()) && (sliceSteps[i][j] > 1))
outputs[i][j] = (outputs[i][j] + sliceSteps[i][j] - 1) / sliceSteps[i][j];
@ -182,10 +212,10 @@ public:
}
else // Divide input blob on equal parts by axis.
{
CV_Assert(0 <= axis && axis < inpShape.size());
CV_Assert(0 <= axis_rw && axis_rw < inpShape.size());
int splits = num_split ? num_split : requiredOutputs;
CV_Assert(splits > 0 && inpShape[axis] % splits == 0);
inpShape[axis] /= splits;
CV_Assert(splits > 0 && inpShape[axis_rw] % splits == 0);
inpShape[axis_rw] /= splits;
outputs.resize(splits, inpShape);
}
return false;
@ -210,6 +240,7 @@ public:
CV_Assert(inputs.size() == 1);
const MatSize& inpShape = inputs[0].size;
sliceRangesFromShape(shape(inputs[0]), axis, sliceRanges);
finalSliceRanges = sliceRanges;
if (sliceRanges.empty())
@ -492,7 +523,7 @@ public:
ocl::KernelArg::PtrReadOnly(input),
ocl::KernelArg::PtrWriteOnly(output)
)
.run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
.run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
if (!ret)
return false;
} // for outputs.size()

View File

@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int simd_size);
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);
int32_t group_;
bool bias_term_;

View File

@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
.args(
ocl::KernelArg::PtrReadOnly(buffer),
image, offset,
padded_width, padded_height,
width, height,
ld)
.run(2, global_copy, NULL, false);

View File

@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
{
if (fused_eltwise)
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
if (fused_eltwise_offset >= 0)
kernel.set(argIdx++, fused_eltwise_offset);
}
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
kernel.set(argIdx++, (float)negative_slope_);
break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
kernel.set(argIdx++, (float)power_);
@ -765,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
swizzled_factor
);
size_t global_work_size_copy[3] = {
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };
if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
{
std::cout << "Swizzle kernel run failed." << std::endl;
CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
return false;
}
} else {
@ -895,10 +898,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
if (!swizzleWeight(weight, config->workItem_output[2], false))
return false;
#if 0
size_t total_bottom_size = bottom_dim_ * numImages;
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
size_t total_bias_size = M_ * group_;
size_t total_top_size = top_dim_ * numImages;
#endif
for (int32_t g = 0; g < group_; ++g) {
bias_offset = M_ * g;
int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
@ -910,78 +915,28 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
UMat img_buffer;
if (image_offset)
{
CreateSubBuffer(bottom, img_buffer, image_offset,
total_bottom_size - image_offset, false);
if (img_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
}
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
kernel.set(argIdx++, kernel_offset);
UMat kernel_buffer;
if (kernel_offset)
{
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
total_kernel_size - kernel_offset, false);
if (kernel_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
}
UMat bias_buffer;
if (bias_term_)
{
if (bias_offset)
{
CreateSubBuffer(bias, bias_buffer, bias_offset,
total_bias_size - bias_offset, false);
if (bias_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
}
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, bias_offset);
}
UMat out_buffer;
if (output_image_offset)
{
CreateSubBuffer(top, out_buffer, output_image_offset,
total_top_size - output_image_offset, true);
if (out_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size));
}
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_);
if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
{
std::cout << "IDLF kernel run failed." << std::endl;
return false;
@ -990,9 +945,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
} else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
if (!swizzleWeight(weight, config->workItem_output[1], true))
return false;
#if 0
size_t total_bottom_size = bottom_dim_ * numImages;
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
size_t total_bias_size = M_ * group_;
#endif
size_t total_top_size = top_dim_ * numImages;
for (int32_t g = 0; g < group_; ++g) {
bias_offset = M_ * g;
@ -1005,72 +962,25 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
UMat img_buffer;
if (image_offset)
{
CreateSubBuffer(bottom, img_buffer, image_offset,
total_bottom_size - image_offset, false);
if (img_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, (int)image_offset);
kernel.set(argIdx++, (int)(bottom.total() - image_offset));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
}
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
kernel.set(argIdx++, (int)kernel_offset);
kernel.set(argIdx++, (int)(swizzled_weights_umat.total() - kernel_offset));
UMat kernel_buffer;
if (kernel_offset)
{
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
total_kernel_size - kernel_offset, false);
if (kernel_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
}
UMat bias_buffer;
if (bias_term_)
{
if (bias_offset)
{
CreateSubBuffer(bias, bias_buffer, bias_offset,
total_bias_size - bias_offset, false);
if (bias_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
}
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, (int)bias_offset);
}
UMat out_buffer;
if (output_image_offset)
{
CreateSubBuffer(top, out_buffer, output_image_offset,
total_top_size - output_image_offset, true);
if (out_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size));
}
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
kernel.set(argIdx++, (int)total_top_size - (int)(top.offset / element_size));
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
@ -1100,7 +1010,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
gy = alignSize(gy, blockK);
size_t global_size[3] = { gx, gy, config->global_work_size[2] };
if (!kernel.run(3, global_size, config->local_work_size, false))
if (!kernel.run_(3, global_size, config->local_work_size, false))
{
std::cout << "GEMM like kernel run failed." << std::endl;
return false;
@ -1112,7 +1022,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_)
@ -1129,9 +1039,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
global_size[1] = output_h_;
global_size[2] = num_output_ * num_;
if (!kernel.run(3, global_size, NULL, false))
if (!kernel.run_(3, global_size, NULL, false))
{
std::cout << "DWCONV kernel run failed." << std::endl;
CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
return false;
}
} else {
@ -1152,7 +1062,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
@ -1171,11 +1081,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
kernel.set(argIdx++, (uint16_t)output_h_);
kernel.set(argIdx++, (uint16_t)pad_w_);
kernel.set(argIdx++, (uint16_t)pad_h_);
if (!kernel.run(3, config->global_work_size,
if (!kernel.run_(3, config->global_work_size,
(config->use_null_local) ? NULL : config->local_work_size,
false))
{
std::cout << "Basic kernel run failed." << std::endl;
CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
return false;
}
}

View File

@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
}
ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false);
ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
}
return ret;
}

View File

@ -234,6 +234,27 @@ public:
}
};
class NormalizeSubgraph2_2 : public NormalizeSubgraphBase
{
public:
NormalizeSubgraph2_2()
{
int input = addNodeToMatch("");
int norm = addNodeToMatch("ReduceL2", input);
int min = addNodeToMatch("");
int max = addNodeToMatch("");
int clip = addNodeToMatch("Clip", norm, min, max);
int shape = addNodeToMatch("");
int expand = addNodeToMatch("Expand", clip, shape);
addNodeToMatch("Div", input, expand);
setFusedNode("Normalize", input);
}
};
class NormalizeSubgraph3 : public NormalizeSubgraphBase
{
public:
@ -558,6 +579,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
subgraphs.push_back(makePtr<SoftMaxSubgraph>());
subgraphs.push_back(makePtr<NormalizeSubgraph1>());
subgraphs.push_back(makePtr<NormalizeSubgraph2>());
subgraphs.push_back(makePtr<NormalizeSubgraph2_2>());
subgraphs.push_back(makePtr<NormalizeSubgraph3>());
subgraphs.push_back(makePtr<BatchNormalizationSubgraph1>());
subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>());

View File

@ -64,6 +64,8 @@ class ONNXImporter
void addLayer(LayerParams& layerParams,
const opencv_onnx::NodeProto& node_proto);
void expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
const std::string& input, size_t n);
public:
ONNXImporter(Net& net, const char *onnxFile);
ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer);
@ -486,6 +488,37 @@ void ONNXImporter::addLayer(LayerParams& layerParams,
}
}
/** @brief Make N copies of input layer and set them as input to node_proto.
* @param prefix prefix of new layers' names
* @param node_proto node which will contain all copies as inputs
* @param input name of the node to copy
* @param n number of copies
*/
void ONNXImporter::expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
const std::string& input, size_t n)
{
std::vector<std::string> input_names;
input_names.reserve(n);
for (size_t j = 0; j < n; j++)
{
LayerParams copyLP;
copyLP.name = format("%s/copy_%zu", prefix.c_str(), j);
copyLP.type = "Identity";
CV_Assert((layer_id.find(copyLP.name) == layer_id.end()) &&
"Couldn't copy the node: generated name already exists in the graph.");
input_names.push_back(copyLP.name);
node_proto.set_input(0, input);
node_proto.set_output(0, copyLP.name);
addLayer(copyLP, node_proto);
}
node_proto.clear_input();
for (size_t i = 0; i < input_names.size(); i++)
{
node_proto.add_input(input_names[i]);
}
}
void ONNXImporter::addConstant(const std::string& name, const Mat& blob)
{
constBlobs.insert(std::make_pair(name, blob));
@ -1422,6 +1455,38 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
addLayer(layerParams, node_proto);
}
void findBroadAxis(const MatShape& broadShape, const MatShape& outShape, size_t& axis, int& broadAxis)
{
const size_t diff = outShape.size() - broadShape.size();
// find the first non-one element of the broadcasting shape
axis = 0;
for (; axis < broadShape.size() && broadShape[axis] == 1; ++axis) {}
// find the last non-one element of the broadcasting shape
size_t endAxis = broadShape.size();
for (; endAxis > axis && broadShape[endAxis - 1] == 1; --endAxis) {}
// find one between axis and endAxis - as it needs to be broadcasted,
// dimensions from the left of axis and from the right of endAxis will be handled by Scale layer
broadAxis = -1;
for (size_t i = axis; i < endAxis; ++i)
{
size_t outAxis = i + diff;
if (outShape[outAxis] == broadShape[i])
{
continue;
}
// ensure we need to broadcast only 1 dimension in the middle
CV_Assert(broadShape[i] == 1 && broadAxis == -1);
broadAxis = static_cast<int>(outAxis);
}
axis += diff;
}
// "Mul" "Div"
void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
{
opencv_onnx::NodeProto node_proto = node_proto_;
@ -1543,13 +1608,31 @@ void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodePro
}
const MatShape& broadShape = outShapes[node_proto.input(1)];
const size_t outShapeSize = outShapes[node_proto.input(0)].size();
const size_t diff = outShapeSize - broadShape.size();
const MatShape& outShape = outShapes[node_proto.input(0)];
size_t axis;
for (axis = diff; axis < broadShape.size() && broadShape[axis - diff] == 1; ++axis) {}
size_t axis = 0;
int broadAxis = -1;
findBroadAxis(broadShape, outShape, axis, broadAxis);
CV_Assert(axis != outShapeSize);
// if there is a one dimension in the middle that should be broadcasted, broadcast it
if (broadAxis != -1)
{
opencv_onnx::NodeProto concat_node_proto = node_proto;
const std::string& input1 = concat_node_proto.input(1);
expandMid(layerParams.name, concat_node_proto, input1, outShape[broadAxis]);
LayerParams concatLP;
concatLP.name = layerParams.name + "/concat";
concatLP.set("axis", broadAxis);
concatLP.type = "Concat";
concat_node_proto.set_output(0, concatLP.name);
addLayer(concatLP, concat_node_proto);
node_proto.set_input(1, concatLP.name);
}
CV_Assert(axis != outShape.size());
layerParams.set("axis", static_cast<int>(axis));
layerParams.type = "Scale";
}
@ -1818,12 +1901,11 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
// Unsqueeze and repeat along new axis
if (targetShape.size() == inpShape.size() + 1)
{
inpShape.insert(inpShape.begin(), targetShape.size() - inpShape.size(), 1);
for (int i = 0; i < targetShape.size(); i++)
{
if (targetShape[i] == -1 && i < inpShape.size())
if (abs(targetShape[i]) == 1)
targetShape[i] = inpShape[i];
else if (i < inpShape.size() && targetShape[i] != inpShape[i])
inpShape.insert(inpShape.begin() + i, 1);
}
if (haveVariables)
{
@ -1843,14 +1925,19 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims");
std::vector<int> broadcast_axes;
// shapes aren't right-aligned here because targetShape.size() == inpShape.size()
for (int i = 0; i < targetShape.size(); i++)
{
if (targetShape[i] != inpShape[i])
{
if (inpShape[i] == 1)
{
broadcast_axes.push_back(i);
else
}
else if (targetShape[i] != 1)
{
CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i));
}
}
}
@ -1889,31 +1976,16 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
}
else if (broadcast_axes.size() == 1 && broadcast_axes[0] <= 1)
{
String base_name = layerParams.name + "/copy_";
std::vector<std::string> input_names;
for (int j = 0; j < targetShape[broadcast_axes[0]]; j++)
{
std::ostringstream ss;
ss << j;
LayerParams copyLP;
copyLP.name = base_name + ss.str();
copyLP.type = "Identity";
CV_Assert(layer_id.find(copyLP.name) == layer_id.end());
input_names.push_back(copyLP.name);
expandMid(layerParams.name, node_proto, srcName, targetShape[broadcast_axes[0]]);
node_proto.set_input(0, srcName);
node_proto.set_output(0, copyLP.name);
addLayer(copyLP, node_proto);
}
node_proto.clear_input();
for (int i = 0; i < input_names.size(); i++)
{
node_proto.add_input(input_names[i]);
}
layerParams.set("axis", broadcast_axes[0]);
layerParams.type = "Concat";
node_proto.set_output(0, layerParams.name);
}
else if (broadcast_axes.empty())
{
layerParams.type = "Identity";
}
else
CV_Error(Error::StsNotImplemented, "Unsupported Expand op");
addLayer(layerParams, node_proto);

View File

@ -74,18 +74,22 @@
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0)
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
#else
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
const Dtype _x_ = (_data_); \
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0)
#define ELTWISE_DATA_ARG
#define ELTWISE_DATA_ARG_WITH_OFFSET
#endif
#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
#else
#define BIAS_KERNEL_ARG
#define BIAS_KERNEL_ARG_WITH_OFFSET
#endif
#define __CAT(x, y) x##y
@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
__kernel void
convolve_simd(
ELTWISE_DATA_ARG
ELTWISE_DATA_ARG_WITH_OFFSET
FUSED_ARG
__global Dtype* inputs,
__global Dtype* weights,
BIAS_KERNEL_ARG
__global Dtype* outputs_base,
const int outputs_offset,
__global Dtype* inputs_ptr, const int inputs_offset,
__global Dtype* weights_ptr, const int weights_offset,
BIAS_KERNEL_ARG_WITH_OFFSET
__global Dtype* outputs_base, const int outputs_offset,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height)
{
__global Dtype* inputs = inputs_ptr + inputs_offset;
__global Dtype* weights = weights_ptr + weights_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype* outputs = outputs_base + outputs_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
@ -388,13 +401,12 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
#define ROW_PITCH input_width
#define GEMM_LIKE_KERNEL_ARGS \
ELTWISE_DATA_ARG \
ELTWISE_DATA_ARG_WITH_OFFSET \
FUSED_ARG \
const __global Dtype *src0, \
const __global Dtype *src1, \
BIAS_KERNEL_ARG \
__global Dtype *dst_base, \
const int dst_offset, \
const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \
const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \
BIAS_KERNEL_ARG_WITH_OFFSET \
__global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \
const ushort input_width, \
const ushort input_height, \
const ushort output_width, \
@ -424,7 +436,17 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
const __global Dtype *src0 = src0_ptr + src0_offset;
const __global Dtype *src1 = src1_ptr + src1_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype *dst = dst_base + dst_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
@ -447,6 +469,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
}
typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
// U_GEMM_LIKE_CONV_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32_5_1_8_32_SIMD8 doesn't run properly (src0_read out of bounds)
// Test: DNNTestNetwork.AlexNet/0 (to run all kernels use OPENCV_OCL4DNN_FORCE_AUTO_TUNING=1)
#if 0 // INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#define OPTIMIZE_READ 1
#else
#define OPTIMIZE_READ 0
#endif
// True for all threads if filter_width is multiple of TILE_N
// else, true for all but right-most column of threads.
if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
@ -463,7 +493,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// atile is M rows x K columns.
int curr_x = ( global_y % output_width ) * STRIDE_X;
int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
#if !OPTIMIZE_READ
int saved_y = curr_y;
#endif
const __global Dtype *src0_read = src0
@ -483,7 +513,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
do
{
int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
#if !OPTIMIZE_READ
curr_y = saved_y;
#endif
@ -501,11 +531,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// ...
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#if OPTIMIZE_READ
#if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3(0, src0_read);
Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
#if 0 // debug
if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH)
{
printf("CATCH: src0_read-src0: %d limit=%d curr_y,curr_x=%d,%d\n", (int)(src0_read - src0), src0_limit, curr_y, curr_x);
}
#endif
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
Dtype* pblockA00 = (Dtype*)(&blockA00);
#endif
@ -626,7 +662,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// atile is M rows x K columns.
int curr_x = ( global_y % output_width ) * STRIDE_X;
int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
#if !OPTIMIZE_READ
int saved_y = curr_y;
#endif
const __global Dtype *src0_read = src0
@ -646,14 +682,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
do
{
int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
#if !OPTIMIZE_READ
curr_y = saved_y;
#endif
do
{
// Load atile and interleaved btile.
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#if OPTIMIZE_READ
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
@ -790,7 +826,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
}
}
}
#endif
#endif // TILE_N_LAST > 0
}
#endif
#ifdef GEMM_LIKE_CONV_32_2
@ -813,7 +849,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
const __global Dtype *src0 = src0_ptr + src0_offset;
const __global Dtype *src1 = src1_ptr + src1_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype *dst = dst_base + dst_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
@ -1375,7 +1421,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
const __global Dtype *src0 = src0_ptr + src0_offset;
const __global Dtype *src1 = src1_ptr + src1_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype *dst = dst_base + dst_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
@ -1561,7 +1617,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
const __global Dtype *src0 = src0_ptr + src0_offset;
const __global Dtype *src1 = src1_ptr + src1_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype *dst = dst_base + dst_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);

View File

@ -62,8 +62,8 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
//Original location
//Output location
int outputSublayer = channels / swizzleFactor;
int outputSublayerIndex = channels % swizzleFactor;
//int outputSublayer = channels / swizzleFactor;
//int outputSublayerIndex = channels % swizzleFactor;
int filter = sX / (kernel_w*kernel_h*channels);
int kernel_X = sX % kernel_w;
@ -73,6 +73,10 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
int FP = filter / swizzleFactor;
int F1 = filter % swizzleFactor;
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1]
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X];
int idxOut = FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1;
int idxIn = filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X;
// idxIn is not valid if (filter >= outputs) - no data for these elements. Output alignment gaps are filled by zeros
Dtype v = (filter < outputs) ? weightIn[idxIn] : (Dtype)0;
weightOut[idxOut] = v;
}

View File

@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
{
const int gidx = get_global_id(0);
const int gidy = get_global_id(1);
if (gidx >= width || gidy >= height)
return;
int2 coord_dst = (int2)(gidx, gidy);
__global Dtype* A_off = A + offA;
Dtype srcA = A_off[gidy * ldA + gidx];
@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
__global Dtype* A,
__write_only image2d_t ImA,
int offA,
int padded_width,
int padded_height,
int width,
int height,
int ldA)
{
const int gidx = get_global_id(0);
const int gidy = get_global_id(1);
if (gidx >= padded_width || gidy >= padded_height)
return;
int2 coord_dst = (int2)(gidx, gidy);
#if TYPE == TYPE_HALF
if (gidx >= width || gidy >= height) {

View File

@ -933,7 +933,12 @@ void TFImporter::parseBias(tensorflow::GraphDef& net, const tensorflow::NodeDef&
layer_id[name] = id;
// one input only
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
Pin inp0 = parsePin(layer.input(0));
if (layer_id.find(inp0.name) != layer_id.end())
// First operand is a constant.
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
else
connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
}
else
{

View File

@ -267,6 +267,11 @@ TEST_P(Test_ONNX_layers, ReLU)
testONNXModels("ReLU");
}
TEST_P(Test_ONNX_layers, PReLU)
{
testONNXModels("PReLU_slope");
}
TEST_P(Test_ONNX_layers, Clip)
{
testONNXModels("clip", npy);
@ -302,6 +307,7 @@ TEST_P(Test_ONNX_layers, Scale)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
testONNXModels("scale");
testONNXModels("scale_broadcast", npy, 0, 0, false, true, 3);
testONNXModels("scale_broadcast_mid", npy, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, ReduceMean3D)
@ -505,6 +511,8 @@ TEST_P(Test_ONNX_layers, MatMulAdd)
TEST_P(Test_ONNX_layers, Expand)
{
testONNXModels("expand");
testONNXModels("expand_identity");
testONNXModels("expand_batch");
testONNXModels("expand_channels");
testONNXModels("expand_neg_batch");
@ -646,6 +654,7 @@ TEST_P(Test_ONNX_layers, ReduceL2)
testONNXModels("reduceL2");
testONNXModels("reduceL2_subgraph");
testONNXModels("reduceL2_subgraph_2");
testONNXModels("reduceL2_subgraph2_2");
}
TEST_P(Test_ONNX_layers, Split)
@ -659,6 +668,7 @@ TEST_P(Test_ONNX_layers, Split)
testONNXModels("split_3");
testONNXModels("split_4");
testONNXModels("split_sizes");
testONNXModels("split_neg_axis");
}
TEST_P(Test_ONNX_layers, Slice)
@ -667,6 +677,7 @@ TEST_P(Test_ONNX_layers, Slice)
testONNXModels("slice", npy, 0, 0, false, false);
#else
testONNXModels("slice");
testONNXModels("slice_neg_starts");
testONNXModels("slice_opset_11");
#endif
}

View File

@ -581,6 +581,18 @@ TEST_P(Test_TensorFlow_layers, l2_normalize)
runTensorFlowNet("l2_normalize");
}
TEST_P(Test_TensorFlow_layers, BiasAdd)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
&& getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#endif
runTensorFlowNet("bias_add_1");
}
// TODO: fix it and add to l2_normalize
TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
{
@ -1242,6 +1254,11 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_down)
runTensorFlowNet("resize_bilinear_down");
}
TEST_P(Test_TensorFlow_layers, resize_concat_optimization)
{
runTensorFlowNet("resize_concat_optimization");
}
TEST_P(Test_TensorFlow_layers, tf2_dense)
{
runTensorFlowNet("tf2_dense");

View File

@ -1105,7 +1105,7 @@ public:
that is, copies both parameters and train data. If emptyTrainData is true, the method creates an
object copy with the current parameters but with empty train data.
*/
CV_WRAP virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
CV_WRAP CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
/** @brief Creates a descriptor matcher of a given type with the default parameters (using default
constructor).
@ -1165,7 +1165,7 @@ protected:
static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx );
static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx );
static Mat clone_op( Mat m ) { return m.clone(); }
CV_NODISCARD_STD static Mat clone_op( Mat m ) { return m.clone(); }
void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const;
//! Collection of descriptors from train images.
@ -1206,7 +1206,7 @@ public:
*/
CV_WRAP static Ptr<BFMatcher> create( int normType=NORM_L2, bool crossCheck=false ) ;
virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
protected:
virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
@ -1245,7 +1245,7 @@ public:
CV_WRAP static Ptr<FlannBasedMatcher> create();
virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
protected:
static void convertToDMatches( const DescriptorCollection& descriptors,
const Mat& indices, const Mat& distances,

View File

@ -437,11 +437,18 @@ class CppHeaderParser(object):
# filter off some common prefixes, which are meaningless for Python wrappers.
# note that we do not strip "static" prefix, which does matter;
# it means class methods, not instance methods
decl_str = self.batch_replace(decl_str, [("static inline", ""), ("inline", ""), ("explicit ", ""),
("CV_EXPORTS_W", ""), ("CV_EXPORTS", ""), ("CV_CDECL", ""),
("CV_WRAP ", " "), ("CV_INLINE", ""),
("CV_DEPRECATED", ""), ("CV_DEPRECATED_EXTERNAL", "")]).strip()
decl_str = self.batch_replace(decl_str, [("static inline", ""),
("inline", ""),
("explicit ", ""),
("CV_EXPORTS_W", ""),
("CV_EXPORTS", ""),
("CV_CDECL", ""),
("CV_WRAP ", " "),
("CV_INLINE", ""),
("CV_DEPRECATED", ""),
("CV_DEPRECATED_EXTERNAL", ""),
("CV_NODISCARD_STD", ""),
("CV_NODISCARD", "")]).strip()
if decl_str.strip().startswith('virtual'):
virtual_method = True

View File

@ -0,0 +1,16 @@
diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt
index 0ba0dd78..7d34e7cb 100644
--- a/inference-engine/src/CMakeLists.txt
+++ b/inference-engine/src/CMakeLists.txt
@@ -26,9 +26,9 @@ endif()
add_subdirectory(hetero_plugin)
-add_subdirectory(auto_plugin)
+#add_subdirectory(auto_plugin)
-add_subdirectory(multi_device)
+#add_subdirectory(multi_device)
add_subdirectory(transformations)

View File

@ -0,0 +1,219 @@
diff --git a/cmake/developer_package/add_ie_target.cmake b/cmake/developer_package/add_ie_target.cmake
index d49f16a4d..2726ca787 100644
--- a/cmake/developer_package/add_ie_target.cmake
+++ b/cmake/developer_package/add_ie_target.cmake
@@ -92,7 +92,7 @@ function(addIeTarget)
if (ARG_TYPE STREQUAL EXECUTABLE)
add_executable(${ARG_NAME} ${all_sources})
elseif(ARG_TYPE STREQUAL STATIC OR ARG_TYPE STREQUAL SHARED)
- add_library(${ARG_NAME} ${ARG_TYPE} ${all_sources})
+ add_library(${ARG_NAME} ${ARG_TYPE} EXCLUDE_FROM_ALL ${all_sources})
else()
message(SEND_ERROR "Invalid target type ${ARG_TYPE} specified for target name ${ARG_NAME}")
endif()
diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt
index 1ac7fd8bf..df7091e51 100644
--- a/inference-engine/CMakeLists.txt
+++ b/inference-engine/CMakeLists.txt
@@ -39,7 +39,7 @@ if(ENABLE_TESTS)
add_subdirectory(tests)
endif()
-add_subdirectory(tools)
+#add_subdirectory(tools)
function(ie_build_samples)
# samples should be build with the same flags as from OpenVINO package,
@@ -58,7 +58,7 @@ endfunction()
# gflags and format_reader targets are kept inside of samples directory and
# they must be built even if samples build is disabled (required for tests and tools).
-ie_build_samples()
+#ie_build_samples()
if(ENABLE_PYTHON)
add_subdirectory(ie_bridges/python)
@@ -142,7 +142,7 @@ endif()
# Developer package
#
-openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
+#openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
# for Template plugin
if(NGRAPH_INTERPRETER_ENABLE)
@@ -166,7 +166,7 @@ function(ie_generate_dev_package_config)
@ONLY)
endfunction()
-ie_generate_dev_package_config()
+#ie_generate_dev_package_config()
#
# Coverage
diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt
index e8ed1a5c4..1fc9fc3ff 100644
--- a/inference-engine/src/inference_engine/CMakeLists.txt
+++ b/inference-engine/src/inference_engine/CMakeLists.txt
@@ -110,7 +110,7 @@ add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_sr
# Create object library
-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
${LIBRARY_SRC}
${LIBRARY_HEADERS}
${PUBLIC_HEADERS})
@@ -181,7 +181,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
# Static library used for unit tests which are always built
-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
$<TARGET_OBJECTS:${TARGET_NAME}_legacy_obj>
$<TARGET_OBJECTS:${TARGET_NAME}_obj>
${IE_STATIC_DEPENDENT_FILES})
diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt
index 8eae82bd2..e0e6745b1 100644
--- a/inference-engine/src/legacy_api/CMakeLists.txt
+++ b/inference-engine/src/legacy_api/CMakeLists.txt
@@ -26,7 +26,7 @@ endif()
file(TOUCH ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
${LIBRARY_SRC}
${PUBLIC_HEADERS})
diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
index fe57b29dd..07831e2fb 100644
--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -67,7 +67,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
# add test object library
-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS})
target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn)
target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt
index f9548339d..ef962145a 100644
--- a/inference-engine/src/preprocessing/CMakeLists.txt
+++ b/inference-engine/src/preprocessing/CMakeLists.txt
@@ -101,7 +101,7 @@ endif()
# Create object library
-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
${LIBRARY_SRC}
${LIBRARY_HEADERS})
@@ -153,7 +153,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
# Static library used for unit tests which are always built
-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
$<TARGET_OBJECTS:${TARGET_NAME}_obj>)
set_ie_threading_interface_for(${TARGET_NAME}_s)
diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt
index 249e47c28..4ddf63049 100644
--- a/inference-engine/src/vpu/common/CMakeLists.txt
+++ b/inference-engine/src/vpu/common/CMakeLists.txt
@@ -5,7 +5,7 @@
file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h)
function(add_common_target TARGET_NAME STATIC_IE)
- add_library(${TARGET_NAME} STATIC ${SOURCES})
+ add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
ie_faster_build(${TARGET_NAME}
UNITY
@@ -60,7 +60,7 @@ add_common_target("vpu_common_lib" FALSE)
# Unit tests support for graph transformer
if(WIN32)
- add_common_target("vpu_common_lib_test_static" TRUE)
+ #add_common_target("vpu_common_lib_test_static" TRUE)
else()
add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib")
endif()
diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
index bc73ab5b1..b4c1547fc 100644
--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
@@ -5,7 +5,7 @@
file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h *.inc)
function(add_graph_transformer_target TARGET_NAME STATIC_IE)
- add_library(${TARGET_NAME} STATIC ${SOURCES})
+ add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
set_ie_threading_interface_for(${TARGET_NAME})
@@ -70,7 +70,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE)
# Unit tests support for graph transformer
if(WIN32)
- add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
+ #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
else()
add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer")
endif()
diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt
index 8bcb2801a..f7e031c01 100644
--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt
+++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt
@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS)
else()
add_library(pugixml STATIC ${SOURCES})
if (MSVC)
- add_library(pugixml_mt STATIC ${SOURCES})
+ #add_library(pugixml_mt STATIC ${SOURCES})
#if (WIN32)
# set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
diff --git a/ngraph/core/builder/CMakeLists.txt b/ngraph/core/builder/CMakeLists.txt
index ff5c381e7..2797ec9ab 100644
--- a/ngraph/core/builder/CMakeLists.txt
+++ b/ngraph/core/builder/CMakeLists.txt
@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
source_group("include" FILES ${PUBLIC_HEADERS})
# Create shared library
-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
if(COMMAND ie_faster_build)
ie_faster_build(${TARGET_NAME}
diff --git a/ngraph/core/reference/CMakeLists.txt b/ngraph/core/reference/CMakeLists.txt
index ef4a764ab..f6d3172e2 100644
--- a/ngraph/core/reference/CMakeLists.txt
+++ b/ngraph/core/reference/CMakeLists.txt
@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
source_group("include" FILES ${PUBLIC_HEADERS})
# Create shared library
-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
if(COMMAND ie_faster_build)
ie_faster_build(${TARGET_NAME}
diff --git a/openvino/itt/CMakeLists.txt b/openvino/itt/CMakeLists.txt
index e9f880b8c..c63f4df63 100644
--- a/openvino/itt/CMakeLists.txt
+++ b/openvino/itt/CMakeLists.txt
@@ -6,7 +6,7 @@ set(TARGET_NAME itt)
file(GLOB_RECURSE SOURCES "src/*.cpp" "src/*.hpp")
-add_library(${TARGET_NAME} STATIC ${SOURCES})
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
add_library(openvino::itt ALIAS ${TARGET_NAME})

View File

@ -0,0 +1,15 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e0706a72e..9a053b1e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,10 @@ cmake_minimum_required(VERSION 3.13)
project(OpenVINO)
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi /FS")
+set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
+set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
+
set(OpenVINO_MAIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(IE_MAIN_SOURCE_DIR ${OpenVINO_MAIN_SOURCE_DIR}/inference-engine)

View File

@ -0,0 +1,16 @@
diff --git a/cmake/developer_package/vs_version/vs_version.cmake b/cmake/developer_package/vs_version/vs_version.cmake
index 14d4c0e1e..6a44f73b9 100644
--- a/cmake/developer_package/vs_version/vs_version.cmake
+++ b/cmake/developer_package/vs_version/vs_version.cmake
@@ -8,9 +8,9 @@ set(IE_VS_VER_FILEVERSION_STR "${IE_VERSION_MAJOR}.${IE_VERSION_MINOR}.${IE_VERS
set(IE_VS_VER_COMPANY_NAME_STR "Intel Corporation")
set(IE_VS_VER_PRODUCTVERSION_STR "${CI_BUILD_NUMBER}")
-set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit")
+set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit (for OpenCV Windows package)")
set(IE_VS_VER_COPYRIGHT_STR "Copyright (C) 2018-2021, Intel Corporation")
-set(IE_VS_VER_COMMENTS_STR "https://docs.openvinotoolkit.org/")
+set(IE_VS_VER_COMMENTS_STR "https://github.com/opencv/opencv/wiki/Intel%27s-Deep-Learning-Inference-Engine-backend")
#
# ie_add_vs_version_file(NAME <name>

View File

@ -0,0 +1 @@
os.environ['CI_BUILD_NUMBER'] = '2021.4.1-opencv_winpack_dldt'

View File

@ -0,0 +1,4 @@
applyPatch('20210630-dldt-disable-unused-targets.patch')
applyPatch('20210630-dldt-pdb.patch')
applyPatch('20210630-dldt-disable-multidevice-autoplugin.patch')
applyPatch('20210630-dldt-vs-version.patch')

View File

@ -0,0 +1,56 @@
sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin')
copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph')
#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll')
build_config = 'Release' if not self.config.build_debug else 'Debug'
build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config
def copy_bin(name):
global build_bin_dir, sysroot_bin_dir
copytree(build_bin_dir / name, sysroot_bin_dir / name)
dll_suffix = 'd' if self.config.build_debug else ''
def copy_dll(name):
global copy_bin, dll_suffix
copy_bin(name + dll_suffix + '.dll')
copy_bin(name + dll_suffix + '.pdb')
copy_bin('cache.json')
copy_dll('clDNNPlugin')
copy_dll('HeteroPlugin')
copy_dll('inference_engine')
copy_dll('inference_engine_ir_reader')
#copy_dll('inference_engine_ir_v7_reader')
copy_dll('inference_engine_legacy')
copy_dll('inference_engine_transformations') # runtime
copy_dll('inference_engine_lp_transformations') # runtime
#copy_dll('inference_engine_preproc') # runtime
copy_dll('MKLDNNPlugin') # runtime
copy_dll('myriadPlugin') # runtime
#copy_dll('MultiDevicePlugin') # runtime, not used
copy_dll('ngraph')
copy_bin('plugins.xml')
copy_bin('pcie-ma2x8x.elf')
copy_bin('usb-ma2x8x.mvcmd')
copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir)
copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb')
sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine')
sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64')
copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include')
if not self.config.build_debug:
copytree(build_bin_dir / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib')
copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib')
copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib')
copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib')
else:
copytree(build_bin_dir / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib')
copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib')
copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib')
copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib')
sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses')
copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE')
copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE')

View File

@ -471,7 +471,8 @@ class Builder:
def main():
dldt_src_url = 'https://github.com/openvinotoolkit/openvino'
dldt_src_commit = '2021.4'
dldt_src_commit = '2021.4.1'
dldt_config = None
dldt_release = None
build_cache_dir_default = os.environ.get('BUILD_CACHE_DIR', '.build_cache')
@ -505,7 +506,7 @@ def main():
parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)')
parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)')
parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
parser.add_argument('--dldt_config', default=dldt_config, help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
parser.add_argument('--override_patch_hashsum', default='', help='(script debug mode)')