mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 22:44:02 +08:00
Merge remote-tracking branch 'upstream/3.4' into merge-3.4
This commit is contained in:
commit
c3ac834526
@ -141,8 +141,8 @@ if(INF_ENGINE_TARGET)
|
||||
endif()
|
||||
endif()
|
||||
if(NOT INF_ENGINE_RELEASE AND NOT INF_ENGINE_RELEASE_INIT)
|
||||
message(WARNING "InferenceEngine version has not been set, 2021.4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
|
||||
set(INF_ENGINE_RELEASE_INIT "2021040000")
|
||||
message(WARNING "InferenceEngine version has not been set, 2021.4.1 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
|
||||
set(INF_ENGINE_RELEASE_INIT "2021040100")
|
||||
elseif(DEFINED INF_ENGINE_RELEASE)
|
||||
set(INF_ENGINE_RELEASE_INIT "${INF_ENGINE_RELEASE}")
|
||||
endif()
|
||||
|
@ -98,7 +98,7 @@ import numpy as np
|
||||
import cv2 as cv
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
img = cv.imread('simple.jpg',0)
|
||||
img = cv.imread('blox.jpg',0) # `<opencv_root>/samples/data/blox.jpg`
|
||||
|
||||
# Initiate FAST object with default values
|
||||
fast = cv.FastFeatureDetector_create()
|
||||
@ -113,17 +113,17 @@ print( "nonmaxSuppression:{}".format(fast.getNonmaxSuppression()) )
|
||||
print( "neighborhood: {}".format(fast.getType()) )
|
||||
print( "Total Keypoints with nonmaxSuppression: {}".format(len(kp)) )
|
||||
|
||||
cv.imwrite('fast_true.png',img2)
|
||||
cv.imwrite('fast_true.png', img2)
|
||||
|
||||
# Disable nonmaxSuppression
|
||||
fast.setNonmaxSuppression(0)
|
||||
kp = fast.detect(img,None)
|
||||
kp = fast.detect(img, None)
|
||||
|
||||
print( "Total Keypoints without nonmaxSuppression: {}".format(len(kp)) )
|
||||
|
||||
img3 = cv.drawKeypoints(img, kp, None, color=(255,0,0))
|
||||
|
||||
cv.imwrite('fast_false.png',img3)
|
||||
cv.imwrite('fast_false.png', img3)
|
||||
@endcode
|
||||
See the results. First image shows FAST with nonmaxSuppression and second one without
|
||||
nonmaxSuppression:
|
||||
|
@ -499,8 +499,8 @@ public:
|
||||
template<typename... _Tps> inline
|
||||
Kernel& args(const _Tps&... kernel_args) { set_args_(0, kernel_args...); return *this; }
|
||||
|
||||
/** @brief Run the OpenCL kernel (globalsize value may be adjusted)
|
||||
|
||||
/** @brief Run the OpenCL kernel.
|
||||
@param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
|
||||
@param globalsize work items for each dimension. It is not the final globalsize passed to
|
||||
OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
|
||||
@ -509,12 +509,26 @@ public:
|
||||
@param localsize work-group size for each dimension.
|
||||
@param sync specify whether to wait for OpenCL computation to finish before return.
|
||||
@param q command queue
|
||||
|
||||
@note Use run_() if your kernel code doesn't support adjusted globalsize.
|
||||
*/
|
||||
bool run(int dims, size_t globalsize[],
|
||||
size_t localsize[], bool sync, const Queue& q=Queue());
|
||||
|
||||
/** @brief Run the OpenCL kernel
|
||||
*
|
||||
* @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
|
||||
* @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
|
||||
* @param localsize work-group size for each dimension.
|
||||
* @param sync specify whether to wait for OpenCL computation to finish before return.
|
||||
* @param q command queue
|
||||
*/
|
||||
bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
|
||||
|
||||
bool runTask(bool sync, const Queue& q=Queue());
|
||||
|
||||
/** @brief Similar to synchronized run() call with returning of kernel execution time
|
||||
/** @brief Similar to synchronized run_() call with returning of kernel execution time
|
||||
*
|
||||
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
|
||||
* @return Execution time in nanoseconds or negative number on error
|
||||
*/
|
||||
|
@ -24,11 +24,6 @@
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
#include <sstream>
|
||||
#include "opencl_kernels_core.hpp"
|
||||
#include "opencv2/core/opencl/runtime/opencl_clblas.hpp"
|
||||
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
@ -37,52 +32,75 @@ static bool intel_gpu_gemm(
|
||||
UMat B, Size sizeB,
|
||||
UMat D, Size sizeD,
|
||||
double alpha, double beta,
|
||||
bool atrans, bool btrans)
|
||||
bool atrans, bool btrans,
|
||||
bool& isPropagatedC2D
|
||||
)
|
||||
{
|
||||
CV_UNUSED(sizeB);
|
||||
|
||||
int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width);
|
||||
|
||||
std::string kernelName;
|
||||
bool ret = true;
|
||||
if (M < 4 || N < 4 || K < 4) // vload4
|
||||
return false;
|
||||
|
||||
size_t lx = 8, ly = 4;
|
||||
size_t dx = 4, dy = 8;
|
||||
CV_LOG_VERBOSE(NULL, 0, "M=" << M << " N=" << N << " K=" << K);
|
||||
|
||||
std::string kernelName;
|
||||
|
||||
unsigned int lx = 8, ly = 4;
|
||||
unsigned int dx = 4, dy = 8;
|
||||
|
||||
if(!atrans && !btrans)
|
||||
{
|
||||
|
||||
if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
|
||||
{
|
||||
kernelName = "intelblas_gemm_buffer_NN_sp";
|
||||
}
|
||||
else
|
||||
{
|
||||
if (M % 2 != 0)
|
||||
return false;
|
||||
// vload4(0, dst_write0) - 4 cols
|
||||
// multiply by lx: 8
|
||||
if (N % (4*8) != 0)
|
||||
return false;
|
||||
kernelName = "intelblas_gemm_buffer_NN";
|
||||
}
|
||||
}
|
||||
else if(atrans && !btrans)
|
||||
{
|
||||
if (M % 32 != 0)
|
||||
return false;
|
||||
if (N % 32 != 0)
|
||||
return false;
|
||||
kernelName = "intelblas_gemm_buffer_TN";
|
||||
}
|
||||
else if(!atrans && btrans)
|
||||
{
|
||||
if (K % 4 != 0)
|
||||
return false;
|
||||
kernelName = "intelblas_gemm_buffer_NT";
|
||||
ly = 16;
|
||||
dx = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (M % 32 != 0)
|
||||
return false;
|
||||
if (N % 32 != 0)
|
||||
return false;
|
||||
if (K % 16 != 0)
|
||||
return false;
|
||||
kernelName = "intelblas_gemm_buffer_TT";
|
||||
}
|
||||
|
||||
const size_t gx = (size_t)(N + dx - 1) / dx;
|
||||
const size_t gy = (size_t)(M + dy - 1) / dy;
|
||||
CV_LOG_DEBUG(NULL, "kernel: " << kernelName << " (M=" << M << " N=" << N << " K=" << K << ")");
|
||||
|
||||
const size_t gx = divUp((size_t)N, dx);
|
||||
const size_t gy = divUp((size_t)M, dy);
|
||||
|
||||
size_t local[] = {lx, ly, 1};
|
||||
size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1};
|
||||
|
||||
int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
|
||||
size_t global[] = {roundUp(gx, lx), roundUp(gy, ly), 1};
|
||||
|
||||
ocl::Queue q;
|
||||
String errmsg;
|
||||
@ -110,10 +128,13 @@ static bool intel_gpu_gemm(
|
||||
(int)(D.step / sizeof(float))
|
||||
);
|
||||
|
||||
ret = k.run(2, global, local, false, q);
|
||||
bool ret = k.run(2, global, local, false, q);
|
||||
return ret;
|
||||
}
|
||||
else
|
||||
{
|
||||
int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
|
||||
|
||||
for(int start_index = 0; start_index < K; start_index += stride)
|
||||
{
|
||||
ocl::Kernel k(kernelName.c_str(), program);
|
||||
@ -132,12 +153,16 @@ static bool intel_gpu_gemm(
|
||||
(int) start_index, // 14 start_index
|
||||
stride);
|
||||
|
||||
ret = k.run(2, global, local, false, q);
|
||||
if (!ret) return ret;
|
||||
bool ret = k.run(2, global, local, false, q);
|
||||
if (!ret)
|
||||
{
|
||||
if (start_index != 0)
|
||||
isPropagatedC2D = false; // D array content is changed, need to rewrite
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace cv
|
||||
|
@ -42,6 +42,8 @@
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
|
||||
#include "opencl_kernels_core.hpp"
|
||||
#include "opencv2/core/opencl/runtime/opencl_clblas.hpp"
|
||||
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
|
||||
@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
|
||||
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
|
||||
InputArray matC, double beta, OutputArray matD, int flags )
|
||||
{
|
||||
int depth = matA.depth(), cn = matA.channels();
|
||||
int type = CV_MAKETYPE(depth, cn);
|
||||
int type = matA.type();
|
||||
int depth = CV_MAT_DEPTH(type);
|
||||
int cn = CV_MAT_CN(type);
|
||||
|
||||
CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
|
||||
CV_CheckTypeEQ(type, matB.type(), "");
|
||||
CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, "");
|
||||
|
||||
const ocl::Device & dev = ocl::Device::getDefault();
|
||||
bool doubleSupport = dev.doubleFPConfig() > 0;
|
||||
@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
|
||||
Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
|
||||
bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
|
||||
|
||||
CV_Assert( !haveC || matC.type() == type );
|
||||
if (haveC)
|
||||
CV_CheckTypeEQ(type, matC.type(), "");
|
||||
|
||||
Size sizeD(((btrans) ? sizeB.height : sizeB.width),
|
||||
((atrans) ? sizeA.width : sizeA.height));
|
||||
|
||||
if (atrans)
|
||||
sizeA = Size(sizeA.height, sizeA.width);
|
||||
if (btrans)
|
||||
sizeB = Size(sizeB.height, sizeB.width);
|
||||
if (haveC && ctrans)
|
||||
sizeC = Size(sizeC.height, sizeC.width);
|
||||
|
||||
CV_CheckEQ(sizeA.width, sizeB.height, "");
|
||||
if (haveC)
|
||||
CV_CheckEQ(sizeC, sizeD, "");
|
||||
|
||||
UMat A = matA.getUMat();
|
||||
UMat B = matB.getUMat();
|
||||
|
||||
Size sizeD(((btrans)? sizeB.height : sizeB.width),
|
||||
((atrans)? sizeA.width : sizeA.height));
|
||||
matD.create(sizeD, type);
|
||||
UMat D = matD.getUMat();
|
||||
|
||||
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
|
||||
bool isPropagatedC2D = false; // D content is updated with C / C.t()
|
||||
|
||||
|
||||
if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
|
||||
{
|
||||
String opts;
|
||||
|
||||
if (atrans)
|
||||
sizeA = Size(sizeA.height, sizeA.width);
|
||||
if (btrans)
|
||||
sizeB = Size(sizeB.height, sizeB.width);
|
||||
if (haveC && ctrans)
|
||||
sizeC = Size(sizeC.height, sizeC.width);
|
||||
|
||||
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
|
||||
|
||||
int max_wg_size = (int)dev.maxWorkGroupSize();
|
||||
int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
|
||||
|
||||
if (atrans)
|
||||
A = A.t();
|
||||
|
||||
if (btrans)
|
||||
B = B.t();
|
||||
|
||||
if (haveC)
|
||||
ctrans ? transpose(matC, D) : matC.copyTo(D);
|
||||
|
||||
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
|
||||
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
|
||||
|
||||
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
|
||||
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
|
||||
cn, kercn, block_size,
|
||||
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
|
||||
haveC ? " -D HAVE_C" : "",
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
|
||||
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
if (depth == CV_64F)
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
||||
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
||||
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
||||
sizeA.width, alpha, beta);
|
||||
else
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
||||
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
||||
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
||||
sizeA.width, (float)alpha, (float)beta);
|
||||
|
||||
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
|
||||
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
|
||||
|
||||
return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
|
||||
}
|
||||
else
|
||||
if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1)
|
||||
{
|
||||
if (haveC && beta != 0.0)
|
||||
{
|
||||
ctrans ? transpose(matC, D) : matC.copyTo(D);
|
||||
isPropagatedC2D = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
beta = 0.0;
|
||||
}
|
||||
|
||||
return intel_gpu_gemm(A, sizeA,
|
||||
B, sizeB,
|
||||
D, sizeD,
|
||||
alpha,
|
||||
beta,
|
||||
atrans, btrans);
|
||||
bool res = intel_gpu_gemm(A, matA.size(),
|
||||
B, matB.size(),
|
||||
D, sizeD,
|
||||
alpha,
|
||||
beta,
|
||||
atrans, btrans,
|
||||
isPropagatedC2D);
|
||||
if (res)
|
||||
return true;
|
||||
// fallback on generic OpenCL code
|
||||
}
|
||||
|
||||
if (sizeD.width < 8 || sizeD.height < 8)
|
||||
return false;
|
||||
|
||||
String opts;
|
||||
|
||||
int wg_size = (int)dev.maxWorkGroupSize();
|
||||
int sizeDmin = std::min(sizeD.width, sizeD.height);
|
||||
wg_size = std::min(wg_size, sizeDmin * sizeDmin);
|
||||
int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
|
||||
|
||||
if (atrans)
|
||||
A = A.t();
|
||||
|
||||
if (btrans)
|
||||
B = B.t();
|
||||
|
||||
if (haveC && !isPropagatedC2D)
|
||||
ctrans ? transpose(matC, D) : matC.copyTo(D);
|
||||
|
||||
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
|
||||
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
|
||||
|
||||
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
|
||||
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
|
||||
cn, kercn, block_size,
|
||||
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
|
||||
haveC ? " -D HAVE_C" : "",
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
|
||||
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
if (depth == CV_64F)
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
||||
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
||||
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
||||
sizeA.width, alpha, beta);
|
||||
else
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
||||
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
||||
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
||||
sizeA.width, (float)alpha, (float)beta);
|
||||
|
||||
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
|
||||
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
|
||||
|
||||
return k.run(2, globalsize, block_size !=1 ? localsize : NULL, false);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -76,8 +76,11 @@
|
||||
#undef CV__ALLOCATOR_STATS_LOG
|
||||
|
||||
#define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
|
||||
#define CV_OPENCL_SHOW_BUILD_OPTIONS 0
|
||||
#define CV_OPENCL_SHOW_BUILD_KERNELS 0
|
||||
|
||||
#define CV_OPENCL_SHOW_RUN_KERNELS 0
|
||||
#define CV_OPENCL_SYNC_RUN_KERNELS 0
|
||||
#define CV_OPENCL_TRACE_CHECK 0
|
||||
|
||||
#define CV_OPENCL_VALIDATE_BINARY_PROGRAMS 1
|
||||
@ -2155,20 +2158,22 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
|
||||
platforms.resize(numPlatforms);
|
||||
}
|
||||
|
||||
int selectedPlatform = -1;
|
||||
if (platform.length() > 0)
|
||||
{
|
||||
for (size_t i = 0; i < platforms.size(); i++)
|
||||
for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end();)
|
||||
{
|
||||
std::string name;
|
||||
CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
|
||||
CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, *currentPlatform, CL_PLATFORM_NAME, name));
|
||||
if (name.find(platform) != std::string::npos)
|
||||
{
|
||||
selectedPlatform = (int)i;
|
||||
break;
|
||||
++currentPlatform;
|
||||
}
|
||||
else
|
||||
{
|
||||
currentPlatform = platforms.erase(currentPlatform);
|
||||
}
|
||||
}
|
||||
if (selectedPlatform == -1)
|
||||
if (platforms.size() == 0)
|
||||
{
|
||||
CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
|
||||
goto not_found;
|
||||
@ -2205,13 +2210,11 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
|
||||
goto not_found;
|
||||
}
|
||||
|
||||
std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
|
||||
for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
|
||||
(selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
|
||||
i++)
|
||||
std::vector<cl_device_id> devices;
|
||||
for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end(); ++currentPlatform)
|
||||
{
|
||||
cl_uint count = 0;
|
||||
cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
|
||||
cl_int status = clGetDeviceIDs(*currentPlatform, deviceType, 0, NULL, &count);
|
||||
if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
|
||||
{
|
||||
CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
|
||||
@ -2220,7 +2223,7 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
|
||||
continue;
|
||||
size_t base = devices.size();
|
||||
devices.resize(base + count);
|
||||
status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
|
||||
status = clGetDeviceIDs(*currentPlatform, deviceType, count, &devices[base], &count);
|
||||
if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
|
||||
{
|
||||
CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
|
||||
@ -3679,6 +3682,8 @@ static cv::String dumpValue(size_t sz, const void* p)
|
||||
{
|
||||
if (!p)
|
||||
return "NULL";
|
||||
if (sz == 2)
|
||||
return cv::format("%d / %uu / 0x%04x", *(short*)p, *(unsigned short*)p, *(short*)p);
|
||||
if (sz == 4)
|
||||
return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
|
||||
if (sz == 8)
|
||||
@ -3851,6 +3856,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
|
||||
}
|
||||
|
||||
|
||||
bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
|
||||
bool sync, const Queue& q)
|
||||
{
|
||||
CV_Assert(p);
|
||||
return p->run(dims, _globalsize, _localsize, sync, NULL, q);
|
||||
}
|
||||
|
||||
|
||||
static bool isRaiseErrorOnReuseAsyncKernel()
|
||||
{
|
||||
static bool initialized = false;
|
||||
@ -3891,6 +3904,10 @@ bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
|
||||
return false; // OpenCV 5.0: raise error
|
||||
}
|
||||
|
||||
#if CV_OPENCL_SYNC_RUN_KERNELS
|
||||
sync = true;
|
||||
#endif
|
||||
|
||||
cl_command_queue qq = getQueue(q);
|
||||
if (haveTempDstUMats)
|
||||
sync = true;
|
||||
@ -4338,7 +4355,28 @@ struct Program::Impl
|
||||
if (!param_buildExtraOptions.empty())
|
||||
buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
|
||||
}
|
||||
#if CV_OPENCL_SHOW_BUILD_OPTIONS
|
||||
CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' options:" << buildflags);
|
||||
#endif
|
||||
compile(ctx, src_, errmsg);
|
||||
#if CV_OPENCL_SHOW_BUILD_KERNELS
|
||||
if (handle)
|
||||
{
|
||||
size_t retsz = 0;
|
||||
char kernels_buffer[4096] = {0};
|
||||
cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
|
||||
CV_OCL_DBG_CHECK_RESULT(result, cv::format("clGetProgramInfo(CL_PROGRAM_KERNEL_NAMES: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
|
||||
if (result == CL_SUCCESS && retsz < sizeof(kernels_buffer))
|
||||
{
|
||||
kernels_buffer[retsz] = 0;
|
||||
CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' kernels: '" << kernels_buffer << "'");
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_LOG_ERROR(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' can't retrieve kernel names!");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
|
||||
@ -4570,7 +4608,6 @@ struct Program::Impl
|
||||
CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
return handle != NULL;
|
||||
}
|
||||
|
@ -392,6 +392,15 @@ __kernel void intelblas_gemm_buffer_NN(
|
||||
#define TILE_N 8
|
||||
#define SLM_BLOCK 512
|
||||
|
||||
/*
|
||||
A K B.t() K D N
|
||||
----------- ----------- -----------
|
||||
| | | | | |
|
||||
M | | x N | | => M | |
|
||||
| | | | | |
|
||||
----------- ----------- -----------
|
||||
*/
|
||||
|
||||
__attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1)))
|
||||
__kernel void intelblas_gemm_buffer_NT(
|
||||
const __global float *src0, int off0,
|
||||
@ -422,59 +431,79 @@ __kernel void intelblas_gemm_buffer_NT(
|
||||
float8 dot06 = 0.f;
|
||||
float8 dot07 = 0.f;
|
||||
|
||||
float4 brow0;
|
||||
float4 brow1;
|
||||
float4 brow2;
|
||||
float4 brow3;
|
||||
float4 brow4;
|
||||
float4 brow5;
|
||||
float4 brow6;
|
||||
float4 brow7;
|
||||
const int dst_row = (global_y * TILE_M);
|
||||
__global float *dst_write0 = dst + global_x + dst_row * ldC + offd;
|
||||
|
||||
__global float *dst_write0 = dst + local_x * VEC_SIZE + ( group_x * TILE_N ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * ldC + offd;
|
||||
const __global float *src0_read00 = src0 + off0;
|
||||
const int a_row_base = global_y * TILE_M;
|
||||
const int a_col_base = local_x * (TILE_K / 8); // <= TILE_K - 4
|
||||
|
||||
const __global float *src0_read = src0 + local_x * ( TILE_K / 8 ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M ) * ldA + off0;
|
||||
|
||||
const __global float *src1_read0 = src1 + ( group_x * TILE_N ) * ldB + off1;
|
||||
const __global float *src1_read00 = src1 + off1;
|
||||
const int b_row_base = (group_x * TILE_N);
|
||||
//const int b_col_base = 0;
|
||||
|
||||
__local float slm_brow[8 * SLM_BLOCK];
|
||||
__local float* slm_brow0;
|
||||
|
||||
int local_index = mad24(local_y, 8, local_x) * 4;
|
||||
int w;
|
||||
for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {
|
||||
int w = 0;
|
||||
for (int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK)
|
||||
{
|
||||
#define UPDATE_BROW(_row) \
|
||||
{ \
|
||||
float4 brow; \
|
||||
int b_row = b_row_base + _row; \
|
||||
int b_col = b_tile + local_index; \
|
||||
if (b_row < N && b_col <= K - 4 /*vload4*/) \
|
||||
brow = vload4(0, src1_read00 + mad24(b_row, ldB, b_col)); \
|
||||
else \
|
||||
brow = (float4)0; \
|
||||
vstore4(brow, 0, slm_brow + mad24(_row, SLM_BLOCK, local_index)); \
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
vstore4(vload4(0, src1_read0 + mad24(0, ldB, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));
|
||||
vstore4(vload4(0, src1_read0 + mad24(1, ldB, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));
|
||||
vstore4(vload4(0, src1_read0 + mad24(2, ldB, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));
|
||||
vstore4(vload4(0, src1_read0 + mad24(3, ldB, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));
|
||||
vstore4(vload4(0, src1_read0 + mad24(4, ldB, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));
|
||||
vstore4(vload4(0, src1_read0 + mad24(5, ldB, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));
|
||||
vstore4(vload4(0, src1_read0 + mad24(6, ldB, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));
|
||||
vstore4(vload4(0, src1_read0 + mad24(7, ldB, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));
|
||||
UPDATE_BROW(0);
|
||||
UPDATE_BROW(1);
|
||||
UPDATE_BROW(2);
|
||||
UPDATE_BROW(3);
|
||||
UPDATE_BROW(4);
|
||||
UPDATE_BROW(5);
|
||||
UPDATE_BROW(6);
|
||||
UPDATE_BROW(7);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#undef UPDATE_BROW
|
||||
|
||||
slm_brow0 = slm_brow + local_x * (TILE_K / 8);
|
||||
w = b_tile;
|
||||
int end_w = min(b_tile + SLM_BLOCK, K);
|
||||
while( w + TILE_K <= end_w ) {
|
||||
float4 arow;
|
||||
for (int k_tile_offset = 0; k_tile_offset < SLM_BLOCK; k_tile_offset += TILE_K)
|
||||
{
|
||||
int a_col = a_col_base + b_tile + k_tile_offset;
|
||||
|
||||
brow0 = vload4(0, slm_brow0 + 0 * SLM_BLOCK);
|
||||
brow1 = vload4(0, slm_brow0 + 1 * SLM_BLOCK);
|
||||
brow2 = vload4(0, slm_brow0 + 2 * SLM_BLOCK);
|
||||
brow3 = vload4(0, slm_brow0 + 3 * SLM_BLOCK);
|
||||
brow4 = vload4(0, slm_brow0 + 4 * SLM_BLOCK);
|
||||
brow5 = vload4(0, slm_brow0 + 5 * SLM_BLOCK);
|
||||
brow6 = vload4(0, slm_brow0 + 6 * SLM_BLOCK);
|
||||
brow7 = vload4(0, slm_brow0 + 7 * SLM_BLOCK);
|
||||
if (a_col > K - 4 /*vload4*/)
|
||||
break;
|
||||
|
||||
#define MM_DOT_PRODUCT(_row,_dot) \
|
||||
arow = vload4(0, src0_read + _row * ldA); \
|
||||
_dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
|
||||
_dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
|
||||
_dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
|
||||
_dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );
|
||||
int slm_brow_col = a_col_base + k_tile_offset; // <= SLM_BLOCK - 4
|
||||
#define READ_SLM_BROW(_row) \
|
||||
float4 brow##_row = vload4(0, slm_brow + mad24(_row, SLM_BLOCK, slm_brow_col));
|
||||
|
||||
READ_SLM_BROW(0);
|
||||
READ_SLM_BROW(1);
|
||||
READ_SLM_BROW(2);
|
||||
READ_SLM_BROW(3);
|
||||
READ_SLM_BROW(4);
|
||||
READ_SLM_BROW(5);
|
||||
READ_SLM_BROW(6);
|
||||
READ_SLM_BROW(7);
|
||||
#undef READ_SLM_BROW
|
||||
|
||||
#define MM_DOT_PRODUCT(_row,_dot) \
|
||||
{ \
|
||||
int a_row = a_row_base + _row; \
|
||||
if (a_row < M) { \
|
||||
float4 arow = vload4(0, src0_read00 + mad24(a_row, ldA, a_col)); \
|
||||
_dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
|
||||
_dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
|
||||
_dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
|
||||
_dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); \
|
||||
} \
|
||||
}
|
||||
|
||||
MM_DOT_PRODUCT(0,dot00);
|
||||
MM_DOT_PRODUCT(1,dot01);
|
||||
@ -485,53 +514,7 @@ __kernel void intelblas_gemm_buffer_NT(
|
||||
MM_DOT_PRODUCT(6,dot06);
|
||||
MM_DOT_PRODUCT(7,dot07);
|
||||
#undef MM_DOT_PRODUCT
|
||||
|
||||
src0_read += TILE_K;
|
||||
slm_brow0 += TILE_K;
|
||||
w += TILE_K;
|
||||
}
|
||||
src1_read0 += SLM_BLOCK;
|
||||
}
|
||||
|
||||
if(w < K) {
|
||||
float4 arow;
|
||||
|
||||
#define READ_BROW(_brow,_row) \
|
||||
_brow = vload4(0, slm_brow0 + _row * SLM_BLOCK); \
|
||||
_brow.x = (mad24(local_x, 4, w) < K) ? _brow.x : 0.0f; \
|
||||
_brow.y = (mad24(local_x, 4, w + 1) < K) ? _brow.y : 0.0f; \
|
||||
_brow.z = (mad24(local_x, 4, w + 2) < K) ? _brow.z : 0.0f; \
|
||||
_brow.w = (mad24(local_x, 4, w + 3) < K) ? _brow.w : 0.0f;
|
||||
|
||||
READ_BROW(brow0,0);
|
||||
READ_BROW(brow1,1);
|
||||
READ_BROW(brow2,2);
|
||||
READ_BROW(brow3,3);
|
||||
READ_BROW(brow4,4);
|
||||
READ_BROW(brow5,5);
|
||||
READ_BROW(brow6,6);
|
||||
READ_BROW(brow7,7);
|
||||
|
||||
#define MM_DOT_PRODUCT(_row,_dot) \
|
||||
arow = vload4(0, src0_read + _row * ldA); \
|
||||
arow.x = (mad24(local_x, 4, w) < K) ? arow.x : 0.0f; \
|
||||
arow.y = (mad24(local_x, 4, w + 1) < K) ? arow.y : 0.0f; \
|
||||
arow.z = (mad24(local_x, 4, w + 2) < K) ? arow.z : 0.0f; \
|
||||
arow.w = (mad24(local_x, 4, w + 3) < K) ? arow.w : 0.0f; \
|
||||
_dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
|
||||
_dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
|
||||
_dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
|
||||
_dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );
|
||||
|
||||
MM_DOT_PRODUCT(0,dot00);
|
||||
MM_DOT_PRODUCT(1,dot01);
|
||||
MM_DOT_PRODUCT(2,dot02);
|
||||
MM_DOT_PRODUCT(3,dot03);
|
||||
MM_DOT_PRODUCT(4,dot04);
|
||||
MM_DOT_PRODUCT(5,dot05);
|
||||
MM_DOT_PRODUCT(6,dot06);
|
||||
MM_DOT_PRODUCT(7,dot07);
|
||||
#undef MM_DOT_PRODUCT
|
||||
}
|
||||
|
||||
#define REDUCE(_dot) \
|
||||
@ -572,21 +555,22 @@ __kernel void intelblas_gemm_buffer_NT(
|
||||
output = (local_x == 5) ? _dot.s5 : output; \
|
||||
output = (local_x == 6) ? _dot.s6 : output; \
|
||||
output = (local_x == 7) ? _dot.s7 : output; \
|
||||
if (beta != 0.0) \
|
||||
if (beta != 0.0f) \
|
||||
dst_write0[0] = mad(output, (float)alpha, ((float)beta * dst_write0[0])); \
|
||||
else \
|
||||
dst_write0[0] = output * (float)alpha; \
|
||||
dst_write0 += ldC;
|
||||
|
||||
if(global_x < N && global_y * 8 < M) {
|
||||
OUTPUT(dot00);
|
||||
if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); }
|
||||
if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); }
|
||||
if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); }
|
||||
if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); }
|
||||
if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); }
|
||||
if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); }
|
||||
if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); }
|
||||
if (global_x < N && dst_row < M)
|
||||
{
|
||||
/*if (dst_row + 0 < M)*/ { OUTPUT(dot00); }
|
||||
if (dst_row + 1 < M) { OUTPUT(dot01); }
|
||||
if (dst_row + 2 < M) { OUTPUT(dot02); }
|
||||
if (dst_row + 3 < M) { OUTPUT(dot03); }
|
||||
if (dst_row + 4 < M) { OUTPUT(dot04); }
|
||||
if (dst_row + 5 < M) { OUTPUT(dot05); }
|
||||
if (dst_row + 6 < M) { OUTPUT(dot06); }
|
||||
if (dst_row + 7 < M) { OUTPUT(dot07); }
|
||||
}
|
||||
#undef OUTPUT
|
||||
}
|
||||
|
@ -67,6 +67,8 @@ PARAM_TEST_CASE(Gemm,
|
||||
|
||||
double alpha, beta;
|
||||
|
||||
int M, N, K;
|
||||
|
||||
TEST_DECLARE_INPUT_PARAMETER(A);
|
||||
TEST_DECLARE_INPUT_PARAMETER(B);
|
||||
TEST_DECLARE_INPUT_PARAMETER(C);
|
||||
@ -90,30 +92,27 @@ PARAM_TEST_CASE(Gemm,
|
||||
|
||||
void generateTestData()
|
||||
{
|
||||
// set minimum size to 20, since testing less sizes doesn't make sense
|
||||
Size ARoiSize = randomSize(20, MAX_VALUE);
|
||||
M = (int)randomDoubleLog(1, 100);
|
||||
N = (int)randomDoubleLog(1, 100);
|
||||
K = (int)randomDoubleLog(1, 1200);
|
||||
|
||||
M = roundUp(M, 1);
|
||||
N = roundUp(N, 1);
|
||||
K = roundUp(K, 1);
|
||||
|
||||
Size ARoiSize = (atrans) ? Size(M, K) : Size(K, M);
|
||||
Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11);
|
||||
|
||||
if (atrans)
|
||||
ARoiSize = Size(ARoiSize.height, ARoiSize.width);
|
||||
|
||||
Size BRoiSize = randomSize(20, MAX_VALUE);
|
||||
if (btrans)
|
||||
BRoiSize.width = ARoiSize.width;
|
||||
else
|
||||
BRoiSize.height = ARoiSize.width;
|
||||
|
||||
Size BRoiSize = (btrans) ? Size(K, N) : Size(N, K);
|
||||
Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11);
|
||||
|
||||
if (btrans)
|
||||
BRoiSize = Size(BRoiSize.height, BRoiSize.width);
|
||||
|
||||
Size DRoiSize = Size(BRoiSize.width, ARoiSize.height), CRoiSizeT(DRoiSize.height, DRoiSize.width);
|
||||
Size CRoiSize = (ctrans) ? Size(M, N) : Size(N, M);
|
||||
Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(C, C_roi, ctrans ? CRoiSizeT : DRoiSize, CBorder, type, -11, 11);
|
||||
randomSubMat(C, C_roi, CRoiSize, CBorder, type, -11, 11);
|
||||
|
||||
Size DRoiSize = Size(N, M);
|
||||
Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11);
|
||||
|
||||
@ -132,11 +131,12 @@ OCL_TEST_P(Gemm, Accuracy)
|
||||
for (int i = 0; i < test_loop_times; ++i)
|
||||
{
|
||||
generateTestData();
|
||||
SCOPED_TRACE(cv::format("i=%d: M=%d N=%d K=%d", i, M, N, K));
|
||||
|
||||
OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags));
|
||||
OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags));
|
||||
|
||||
double eps = D_roi.size().area() * 1e-4;
|
||||
double eps = D_roi.size().area() * (1e-5 * K);
|
||||
OCL_EXPECT_MATS_NEAR(D, eps);
|
||||
}
|
||||
}
|
||||
|
@ -238,7 +238,7 @@ public:
|
||||
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
|
||||
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
|
||||
kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
|
||||
bool ret = kernel.run(2, global, NULL, false);
|
||||
bool ret = kernel.run_(2, global, NULL, false);
|
||||
if (!ret)
|
||||
return false;
|
||||
}
|
||||
|
@ -1951,7 +1951,7 @@ Ptr<Layer> ChannelsPReLULayer::create(const LayerParams& params)
|
||||
if (params.blobs[0].total() == 1)
|
||||
{
|
||||
LayerParams reluParams = params;
|
||||
reluParams.set("negative_slope", params.blobs[0].at<float>(0));
|
||||
reluParams.set("negative_slope", *params.blobs[0].ptr<float>());
|
||||
return ReLULayer::create(reluParams);
|
||||
}
|
||||
Ptr<ChannelsPReLULayer> l(new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(params.blobs[0])));
|
||||
|
@ -200,7 +200,7 @@ public:
|
||||
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
|
||||
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
|
||||
k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
|
||||
ret = k1.run(1, globalsize, localsize, false);
|
||||
ret = k1.run_(1, globalsize, localsize, false);
|
||||
if (!ret)
|
||||
return false;
|
||||
}
|
||||
|
@ -120,7 +120,14 @@ public:
|
||||
internals_arr.getMatVector(internals);
|
||||
|
||||
if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3])
|
||||
{
|
||||
// outputs[0] = inputs[0] doesn't work due to BlobManager optimizations
|
||||
if (inputs[0].data != outputs[0].data)
|
||||
{
|
||||
inputs[0].copyTo(outputs[0]);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
Mat& inp = inputs[0];
|
||||
Mat& out = outputs[0];
|
||||
|
@ -64,6 +64,31 @@ namespace cv
|
||||
namespace dnn
|
||||
{
|
||||
|
||||
void sliceRangesFromShape(const MatShape& inpShape, int& axis, std::vector<std::vector<cv::Range> >& sliceRanges)
|
||||
{
|
||||
CV_Assert(inpShape.size() > 0);
|
||||
bool axisNeg = (axis < 0);
|
||||
axis = (axis + static_cast<int>(inpShape.size())) % inpShape.size();
|
||||
int n = inpShape[axis];
|
||||
|
||||
for (size_t i = 0; i < sliceRanges.size(); ++i){
|
||||
std::vector<Range>& ranges = sliceRanges[i];
|
||||
if (axisNeg)
|
||||
{
|
||||
ranges.insert(ranges.begin(), axis, Range::all());
|
||||
}
|
||||
Range& range = ranges.back();
|
||||
|
||||
if (range.start >= 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
CV_Assert(n != 0);
|
||||
range.start = (n + range.start) % n;
|
||||
}
|
||||
}
|
||||
|
||||
class SliceLayerImpl : public SliceLayer
|
||||
{
|
||||
public:
|
||||
@ -75,20 +100,22 @@ public:
|
||||
num_split = params.get<int>("num_split", 0);
|
||||
hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false);
|
||||
shapesInitialized = !hasDynamicShapes;
|
||||
|
||||
if (params.has("slice_point"))
|
||||
{
|
||||
CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end"));
|
||||
const DictValue &indicesValue = params.get("slice_point");
|
||||
int size = axis > 0 ? axis + 1 : 1;
|
||||
sliceRanges.resize(indicesValue.size() + 1,
|
||||
std::vector<Range>(std::max(axis,0) + 1, Range::all()));
|
||||
std::vector<Range>(size, Range::all()));
|
||||
int prevSlice = 0;
|
||||
for (int i = 0; i < indicesValue.size(); ++i)
|
||||
{
|
||||
sliceRanges[i][axis].start = prevSlice;
|
||||
sliceRanges[i][axis].end = indicesValue.get<int>(i);
|
||||
prevSlice = sliceRanges[i][axis].end;
|
||||
sliceRanges[i][size - 1].start = prevSlice;
|
||||
sliceRanges[i][size - 1].end = indicesValue.get<int>(i);
|
||||
prevSlice = sliceRanges[i][size - 1].end;
|
||||
}
|
||||
sliceRanges.back()[axis].start = prevSlice;
|
||||
sliceRanges.back()[size - 1].start = prevSlice;
|
||||
}
|
||||
else if (params.has("begin"))
|
||||
{
|
||||
@ -103,7 +130,6 @@ public:
|
||||
{
|
||||
int start = begins.get<int>(i);
|
||||
int sizeOrEnd = sizesOrEnds.get<int>(i); // It may be negative to reverse indexation.
|
||||
CV_Assert(start >= 0);
|
||||
|
||||
sliceRanges[0][i].start = start;
|
||||
if (params.has("size"))
|
||||
@ -164,16 +190,20 @@ public:
|
||||
CV_Assert(inputs.size() == 1);
|
||||
MatShape inpShape = inputs[0];
|
||||
|
||||
if (!sliceRanges.empty())
|
||||
int axis_rw = axis;
|
||||
std::vector<std::vector<cv::Range> > sliceRanges_rw = sliceRanges;
|
||||
sliceRangesFromShape(inpShape, axis_rw, sliceRanges_rw);
|
||||
|
||||
if (!sliceRanges_rw.empty())
|
||||
{
|
||||
outputs.resize(sliceRanges.size(), inpShape);
|
||||
outputs.resize(sliceRanges_rw.size(), inpShape);
|
||||
for (int i = 0; i < outputs.size(); ++i)
|
||||
{
|
||||
CV_Assert(sliceRanges[i].size() <= inpShape.size());
|
||||
for (int j = 0; j < sliceRanges[i].size(); ++j)
|
||||
CV_Assert(sliceRanges_rw[i].size() <= inpShape.size());
|
||||
for (int j = 0; j < sliceRanges_rw[i].size(); ++j)
|
||||
{
|
||||
if (shapesInitialized || inpShape[j] > 0)
|
||||
outputs[i][j] = normalize_axis_range(sliceRanges[i][j], inpShape[j]).size();
|
||||
outputs[i][j] = normalize_axis_range(sliceRanges_rw[i][j], inpShape[j]).size();
|
||||
|
||||
if (!sliceSteps.empty() && (i < sliceSteps.size()) && (j < sliceSteps[i].size()) && (sliceSteps[i][j] > 1))
|
||||
outputs[i][j] = (outputs[i][j] + sliceSteps[i][j] - 1) / sliceSteps[i][j];
|
||||
@ -182,10 +212,10 @@ public:
|
||||
}
|
||||
else // Divide input blob on equal parts by axis.
|
||||
{
|
||||
CV_Assert(0 <= axis && axis < inpShape.size());
|
||||
CV_Assert(0 <= axis_rw && axis_rw < inpShape.size());
|
||||
int splits = num_split ? num_split : requiredOutputs;
|
||||
CV_Assert(splits > 0 && inpShape[axis] % splits == 0);
|
||||
inpShape[axis] /= splits;
|
||||
CV_Assert(splits > 0 && inpShape[axis_rw] % splits == 0);
|
||||
inpShape[axis_rw] /= splits;
|
||||
outputs.resize(splits, inpShape);
|
||||
}
|
||||
return false;
|
||||
@ -210,6 +240,7 @@ public:
|
||||
CV_Assert(inputs.size() == 1);
|
||||
const MatSize& inpShape = inputs[0].size;
|
||||
|
||||
sliceRangesFromShape(shape(inputs[0]), axis, sliceRanges);
|
||||
finalSliceRanges = sliceRanges;
|
||||
|
||||
if (sliceRanges.empty())
|
||||
@ -492,7 +523,7 @@ public:
|
||||
ocl::KernelArg::PtrReadOnly(input),
|
||||
ocl::KernelArg::PtrWriteOnly(output)
|
||||
)
|
||||
.run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
|
||||
.run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
|
||||
if (!ret)
|
||||
return false;
|
||||
} // for outputs.size()
|
||||
|
@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
|
||||
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
|
||||
int blockM, int blockK, int simd_size);
|
||||
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
|
||||
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
|
||||
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);
|
||||
|
||||
int32_t group_;
|
||||
bool bias_term_;
|
||||
|
@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
|
||||
.args(
|
||||
ocl::KernelArg::PtrReadOnly(buffer),
|
||||
image, offset,
|
||||
padded_width, padded_height,
|
||||
width, height,
|
||||
ld)
|
||||
.run(2, global_copy, NULL, false);
|
||||
|
@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
|
||||
}
|
||||
|
||||
template<typename Dtype>
|
||||
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
|
||||
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
|
||||
{
|
||||
if (fused_eltwise)
|
||||
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
|
||||
if (fused_eltwise_offset >= 0)
|
||||
kernel.set(argIdx++, fused_eltwise_offset);
|
||||
}
|
||||
|
||||
switch (fused_activ) {
|
||||
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
|
||||
kernel.set(argIdx++, (float)negative_slope_);
|
||||
break;
|
||||
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
|
||||
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
|
||||
break;
|
||||
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
|
||||
kernel.set(argIdx++, (float)power_);
|
||||
@ -765,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
|
||||
swizzled_factor
|
||||
);
|
||||
|
||||
size_t global_work_size_copy[3] = {
|
||||
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
|
||||
size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };
|
||||
|
||||
if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
|
||||
if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
|
||||
{
|
||||
std::cout << "Swizzle kernel run failed." << std::endl;
|
||||
CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
@ -895,10 +898,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
|
||||
if (!swizzleWeight(weight, config->workItem_output[2], false))
|
||||
return false;
|
||||
#if 0
|
||||
size_t total_bottom_size = bottom_dim_ * numImages;
|
||||
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
|
||||
size_t total_bias_size = M_ * group_;
|
||||
size_t total_top_size = top_dim_ * numImages;
|
||||
#endif
|
||||
for (int32_t g = 0; g < group_; ++g) {
|
||||
bias_offset = M_ * g;
|
||||
int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
|
||||
@ -910,78 +915,28 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
return false;
|
||||
|
||||
cl_uint argIdx = 0;
|
||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||
setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
|
||||
|
||||
UMat img_buffer;
|
||||
if (image_offset)
|
||||
{
|
||||
CreateSubBuffer(bottom, img_buffer, image_offset,
|
||||
total_bottom_size - image_offset, false);
|
||||
if (img_buffer.empty())
|
||||
return false;
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
kernel.set(argIdx++, image_offset);
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
}
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
|
||||
kernel.set(argIdx++, kernel_offset);
|
||||
|
||||
UMat kernel_buffer;
|
||||
if (kernel_offset)
|
||||
{
|
||||
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
|
||||
total_kernel_size - kernel_offset, false);
|
||||
if (kernel_buffer.empty())
|
||||
return false;
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
|
||||
}
|
||||
|
||||
UMat bias_buffer;
|
||||
if (bias_term_)
|
||||
{
|
||||
if (bias_offset)
|
||||
{
|
||||
CreateSubBuffer(bias, bias_buffer, bias_offset,
|
||||
total_bias_size - bias_offset, false);
|
||||
if (bias_buffer.empty())
|
||||
return false;
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
||||
}
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
||||
kernel.set(argIdx++, bias_offset);
|
||||
}
|
||||
|
||||
UMat out_buffer;
|
||||
if (output_image_offset)
|
||||
{
|
||||
CreateSubBuffer(top, out_buffer, output_image_offset,
|
||||
total_top_size - output_image_offset, true);
|
||||
if (out_buffer.empty())
|
||||
return false;
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
|
||||
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||
kernel.set(argIdx++, (int)(top.offset / element_size));
|
||||
}
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||
kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
|
||||
|
||||
kernel.set(argIdx++, (uint16_t)width_);
|
||||
kernel.set(argIdx++, (uint16_t)height_);
|
||||
kernel.set(argIdx++, (uint16_t)output_w_);
|
||||
kernel.set(argIdx++, (uint16_t)output_h_);
|
||||
if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
|
||||
if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
|
||||
{
|
||||
std::cout << "IDLF kernel run failed." << std::endl;
|
||||
return false;
|
||||
@ -990,9 +945,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
} else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
|
||||
if (!swizzleWeight(weight, config->workItem_output[1], true))
|
||||
return false;
|
||||
#if 0
|
||||
size_t total_bottom_size = bottom_dim_ * numImages;
|
||||
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
|
||||
size_t total_bias_size = M_ * group_;
|
||||
#endif
|
||||
size_t total_top_size = top_dim_ * numImages;
|
||||
for (int32_t g = 0; g < group_; ++g) {
|
||||
bias_offset = M_ * g;
|
||||
@ -1005,72 +962,25 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
return false;
|
||||
|
||||
cl_uint argIdx = 0;
|
||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||
setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
|
||||
|
||||
UMat img_buffer;
|
||||
if (image_offset)
|
||||
{
|
||||
CreateSubBuffer(bottom, img_buffer, image_offset,
|
||||
total_bottom_size - image_offset, false);
|
||||
if (img_buffer.empty())
|
||||
return false;
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
kernel.set(argIdx++, (int)image_offset);
|
||||
kernel.set(argIdx++, (int)(bottom.total() - image_offset));
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
}
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
|
||||
kernel.set(argIdx++, (int)kernel_offset);
|
||||
kernel.set(argIdx++, (int)(swizzled_weights_umat.total() - kernel_offset));
|
||||
|
||||
UMat kernel_buffer;
|
||||
if (kernel_offset)
|
||||
{
|
||||
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
|
||||
total_kernel_size - kernel_offset, false);
|
||||
if (kernel_buffer.empty())
|
||||
return false;
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
|
||||
}
|
||||
|
||||
UMat bias_buffer;
|
||||
if (bias_term_)
|
||||
{
|
||||
if (bias_offset)
|
||||
{
|
||||
CreateSubBuffer(bias, bias_buffer, bias_offset,
|
||||
total_bias_size - bias_offset, false);
|
||||
if (bias_buffer.empty())
|
||||
return false;
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
||||
}
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
|
||||
kernel.set(argIdx++, (int)bias_offset);
|
||||
}
|
||||
|
||||
UMat out_buffer;
|
||||
if (output_image_offset)
|
||||
{
|
||||
CreateSubBuffer(top, out_buffer, output_image_offset,
|
||||
total_top_size - output_image_offset, true);
|
||||
if (out_buffer.empty())
|
||||
return false;
|
||||
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
|
||||
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||
kernel.set(argIdx++, (int)(top.offset / element_size));
|
||||
}
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||
kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
|
||||
kernel.set(argIdx++, (int)total_top_size - (int)(top.offset / element_size));
|
||||
|
||||
kernel.set(argIdx++, (uint16_t)width_);
|
||||
kernel.set(argIdx++, (uint16_t)height_);
|
||||
@ -1100,7 +1010,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
gy = alignSize(gy, blockK);
|
||||
size_t global_size[3] = { gx, gy, config->global_work_size[2] };
|
||||
|
||||
if (!kernel.run(3, global_size, config->local_work_size, false))
|
||||
if (!kernel.run_(3, global_size, config->local_work_size, false))
|
||||
{
|
||||
std::cout << "GEMM like kernel run failed." << std::endl;
|
||||
return false;
|
||||
@ -1112,7 +1022,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
return false;
|
||||
|
||||
cl_uint argIdx = 0;
|
||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||
if (bias_term_)
|
||||
@ -1129,9 +1039,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
global_size[1] = output_h_;
|
||||
global_size[2] = num_output_ * num_;
|
||||
|
||||
if (!kernel.run(3, global_size, NULL, false))
|
||||
if (!kernel.run_(3, global_size, NULL, false))
|
||||
{
|
||||
std::cout << "DWCONV kernel run failed." << std::endl;
|
||||
CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
@ -1152,7 +1062,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
return false;
|
||||
|
||||
cl_uint argIdx = 0;
|
||||
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
|
||||
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
kernel.set(argIdx++, image_offset);
|
||||
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
|
||||
@ -1171,11 +1081,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
|
||||
kernel.set(argIdx++, (uint16_t)output_h_);
|
||||
kernel.set(argIdx++, (uint16_t)pad_w_);
|
||||
kernel.set(argIdx++, (uint16_t)pad_h_);
|
||||
if (!kernel.run(3, config->global_work_size,
|
||||
if (!kernel.run_(3, config->global_work_size,
|
||||
(config->use_null_local) ? NULL : config->local_work_size,
|
||||
false))
|
||||
{
|
||||
std::cout << "Basic kernel run failed." << std::endl;
|
||||
CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
|
||||
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
|
||||
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
|
||||
}
|
||||
ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false);
|
||||
ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -234,6 +234,27 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
class NormalizeSubgraph2_2 : public NormalizeSubgraphBase
|
||||
{
|
||||
public:
|
||||
NormalizeSubgraph2_2()
|
||||
{
|
||||
int input = addNodeToMatch("");
|
||||
int norm = addNodeToMatch("ReduceL2", input);
|
||||
|
||||
int min = addNodeToMatch("");
|
||||
int max = addNodeToMatch("");
|
||||
int clip = addNodeToMatch("Clip", norm, min, max);
|
||||
|
||||
int shape = addNodeToMatch("");
|
||||
int expand = addNodeToMatch("Expand", clip, shape);
|
||||
|
||||
addNodeToMatch("Div", input, expand);
|
||||
|
||||
setFusedNode("Normalize", input);
|
||||
}
|
||||
};
|
||||
|
||||
class NormalizeSubgraph3 : public NormalizeSubgraphBase
|
||||
{
|
||||
public:
|
||||
@ -558,6 +579,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
|
||||
subgraphs.push_back(makePtr<SoftMaxSubgraph>());
|
||||
subgraphs.push_back(makePtr<NormalizeSubgraph1>());
|
||||
subgraphs.push_back(makePtr<NormalizeSubgraph2>());
|
||||
subgraphs.push_back(makePtr<NormalizeSubgraph2_2>());
|
||||
subgraphs.push_back(makePtr<NormalizeSubgraph3>());
|
||||
subgraphs.push_back(makePtr<BatchNormalizationSubgraph1>());
|
||||
subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>());
|
||||
|
@ -64,6 +64,8 @@ class ONNXImporter
|
||||
void addLayer(LayerParams& layerParams,
|
||||
const opencv_onnx::NodeProto& node_proto);
|
||||
|
||||
void expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
|
||||
const std::string& input, size_t n);
|
||||
public:
|
||||
ONNXImporter(Net& net, const char *onnxFile);
|
||||
ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer);
|
||||
@ -486,6 +488,37 @@ void ONNXImporter::addLayer(LayerParams& layerParams,
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Make N copies of input layer and set them as input to node_proto.
|
||||
* @param prefix prefix of new layers' names
|
||||
* @param node_proto node which will contain all copies as inputs
|
||||
* @param input name of the node to copy
|
||||
* @param n number of copies
|
||||
*/
|
||||
void ONNXImporter::expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
|
||||
const std::string& input, size_t n)
|
||||
{
|
||||
std::vector<std::string> input_names;
|
||||
input_names.reserve(n);
|
||||
for (size_t j = 0; j < n; j++)
|
||||
{
|
||||
LayerParams copyLP;
|
||||
copyLP.name = format("%s/copy_%zu", prefix.c_str(), j);
|
||||
copyLP.type = "Identity";
|
||||
CV_Assert((layer_id.find(copyLP.name) == layer_id.end()) &&
|
||||
"Couldn't copy the node: generated name already exists in the graph.");
|
||||
input_names.push_back(copyLP.name);
|
||||
|
||||
node_proto.set_input(0, input);
|
||||
node_proto.set_output(0, copyLP.name);
|
||||
addLayer(copyLP, node_proto);
|
||||
}
|
||||
node_proto.clear_input();
|
||||
for (size_t i = 0; i < input_names.size(); i++)
|
||||
{
|
||||
node_proto.add_input(input_names[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ONNXImporter::addConstant(const std::string& name, const Mat& blob)
|
||||
{
|
||||
constBlobs.insert(std::make_pair(name, blob));
|
||||
@ -1422,6 +1455,38 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
|
||||
addLayer(layerParams, node_proto);
|
||||
}
|
||||
|
||||
void findBroadAxis(const MatShape& broadShape, const MatShape& outShape, size_t& axis, int& broadAxis)
|
||||
{
|
||||
const size_t diff = outShape.size() - broadShape.size();
|
||||
|
||||
// find the first non-one element of the broadcasting shape
|
||||
axis = 0;
|
||||
for (; axis < broadShape.size() && broadShape[axis] == 1; ++axis) {}
|
||||
|
||||
// find the last non-one element of the broadcasting shape
|
||||
size_t endAxis = broadShape.size();
|
||||
for (; endAxis > axis && broadShape[endAxis - 1] == 1; --endAxis) {}
|
||||
|
||||
// find one between axis and endAxis - as it needs to be broadcasted,
|
||||
// dimensions from the left of axis and from the right of endAxis will be handled by Scale layer
|
||||
broadAxis = -1;
|
||||
for (size_t i = axis; i < endAxis; ++i)
|
||||
{
|
||||
size_t outAxis = i + diff;
|
||||
if (outShape[outAxis] == broadShape[i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// ensure we need to broadcast only 1 dimension in the middle
|
||||
CV_Assert(broadShape[i] == 1 && broadAxis == -1);
|
||||
broadAxis = static_cast<int>(outAxis);
|
||||
}
|
||||
|
||||
axis += diff;
|
||||
}
|
||||
|
||||
// "Mul" "Div"
|
||||
void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
|
||||
{
|
||||
opencv_onnx::NodeProto node_proto = node_proto_;
|
||||
@ -1543,13 +1608,31 @@ void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodePro
|
||||
}
|
||||
|
||||
const MatShape& broadShape = outShapes[node_proto.input(1)];
|
||||
const size_t outShapeSize = outShapes[node_proto.input(0)].size();
|
||||
const size_t diff = outShapeSize - broadShape.size();
|
||||
const MatShape& outShape = outShapes[node_proto.input(0)];
|
||||
|
||||
size_t axis;
|
||||
for (axis = diff; axis < broadShape.size() && broadShape[axis - diff] == 1; ++axis) {}
|
||||
size_t axis = 0;
|
||||
int broadAxis = -1;
|
||||
findBroadAxis(broadShape, outShape, axis, broadAxis);
|
||||
|
||||
CV_Assert(axis != outShapeSize);
|
||||
// if there is a one dimension in the middle that should be broadcasted, broadcast it
|
||||
if (broadAxis != -1)
|
||||
{
|
||||
opencv_onnx::NodeProto concat_node_proto = node_proto;
|
||||
const std::string& input1 = concat_node_proto.input(1);
|
||||
|
||||
expandMid(layerParams.name, concat_node_proto, input1, outShape[broadAxis]);
|
||||
|
||||
LayerParams concatLP;
|
||||
concatLP.name = layerParams.name + "/concat";
|
||||
concatLP.set("axis", broadAxis);
|
||||
concatLP.type = "Concat";
|
||||
concat_node_proto.set_output(0, concatLP.name);
|
||||
|
||||
addLayer(concatLP, concat_node_proto);
|
||||
node_proto.set_input(1, concatLP.name);
|
||||
}
|
||||
|
||||
CV_Assert(axis != outShape.size());
|
||||
layerParams.set("axis", static_cast<int>(axis));
|
||||
layerParams.type = "Scale";
|
||||
}
|
||||
@ -1818,12 +1901,11 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
|
||||
// Unsqueeze and repeat along new axis
|
||||
if (targetShape.size() == inpShape.size() + 1)
|
||||
{
|
||||
inpShape.insert(inpShape.begin(), targetShape.size() - inpShape.size(), 1);
|
||||
for (int i = 0; i < targetShape.size(); i++)
|
||||
{
|
||||
if (targetShape[i] == -1 && i < inpShape.size())
|
||||
if (abs(targetShape[i]) == 1)
|
||||
targetShape[i] = inpShape[i];
|
||||
else if (i < inpShape.size() && targetShape[i] != inpShape[i])
|
||||
inpShape.insert(inpShape.begin() + i, 1);
|
||||
}
|
||||
if (haveVariables)
|
||||
{
|
||||
@ -1843,14 +1925,19 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
|
||||
CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims");
|
||||
|
||||
std::vector<int> broadcast_axes;
|
||||
// shapes aren't right-aligned here because targetShape.size() == inpShape.size()
|
||||
for (int i = 0; i < targetShape.size(); i++)
|
||||
{
|
||||
if (targetShape[i] != inpShape[i])
|
||||
{
|
||||
if (inpShape[i] == 1)
|
||||
{
|
||||
broadcast_axes.push_back(i);
|
||||
else
|
||||
}
|
||||
else if (targetShape[i] != 1)
|
||||
{
|
||||
CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1889,31 +1976,16 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
|
||||
}
|
||||
else if (broadcast_axes.size() == 1 && broadcast_axes[0] <= 1)
|
||||
{
|
||||
String base_name = layerParams.name + "/copy_";
|
||||
std::vector<std::string> input_names;
|
||||
for (int j = 0; j < targetShape[broadcast_axes[0]]; j++)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << j;
|
||||
LayerParams copyLP;
|
||||
copyLP.name = base_name + ss.str();
|
||||
copyLP.type = "Identity";
|
||||
CV_Assert(layer_id.find(copyLP.name) == layer_id.end());
|
||||
input_names.push_back(copyLP.name);
|
||||
expandMid(layerParams.name, node_proto, srcName, targetShape[broadcast_axes[0]]);
|
||||
|
||||
node_proto.set_input(0, srcName);
|
||||
node_proto.set_output(0, copyLP.name);
|
||||
addLayer(copyLP, node_proto);
|
||||
}
|
||||
node_proto.clear_input();
|
||||
for (int i = 0; i < input_names.size(); i++)
|
||||
{
|
||||
node_proto.add_input(input_names[i]);
|
||||
}
|
||||
layerParams.set("axis", broadcast_axes[0]);
|
||||
layerParams.type = "Concat";
|
||||
node_proto.set_output(0, layerParams.name);
|
||||
}
|
||||
else if (broadcast_axes.empty())
|
||||
{
|
||||
layerParams.type = "Identity";
|
||||
}
|
||||
else
|
||||
CV_Error(Error::StsNotImplemented, "Unsupported Expand op");
|
||||
addLayer(layerParams, node_proto);
|
||||
|
@ -74,18 +74,22 @@
|
||||
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
|
||||
} while(0)
|
||||
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
|
||||
#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
|
||||
#else
|
||||
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
|
||||
const Dtype _x_ = (_data_); \
|
||||
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
|
||||
} while(0)
|
||||
#define ELTWISE_DATA_ARG
|
||||
#define ELTWISE_DATA_ARG_WITH_OFFSET
|
||||
#endif
|
||||
|
||||
#if APPLY_BIAS
|
||||
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
|
||||
#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
|
||||
#else
|
||||
#define BIAS_KERNEL_ARG
|
||||
#define BIAS_KERNEL_ARG_WITH_OFFSET
|
||||
#endif
|
||||
|
||||
#define __CAT(x, y) x##y
|
||||
@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
|
||||
__kernel void
|
||||
convolve_simd(
|
||||
ELTWISE_DATA_ARG
|
||||
ELTWISE_DATA_ARG_WITH_OFFSET
|
||||
FUSED_ARG
|
||||
__global Dtype* inputs,
|
||||
__global Dtype* weights,
|
||||
BIAS_KERNEL_ARG
|
||||
__global Dtype* outputs_base,
|
||||
const int outputs_offset,
|
||||
__global Dtype* inputs_ptr, const int inputs_offset,
|
||||
__global Dtype* weights_ptr, const int weights_offset,
|
||||
BIAS_KERNEL_ARG_WITH_OFFSET
|
||||
__global Dtype* outputs_base, const int outputs_offset,
|
||||
const ushort input_width,
|
||||
const ushort input_height,
|
||||
const ushort output_width,
|
||||
const ushort output_height)
|
||||
{
|
||||
__global Dtype* inputs = inputs_ptr + inputs_offset;
|
||||
__global Dtype* weights = weights_ptr + weights_offset;
|
||||
#if APPLY_BIAS
|
||||
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
|
||||
#endif
|
||||
|
||||
__global Dtype* outputs = outputs_base + outputs_offset;
|
||||
#ifdef FUSED_CONV_ELTWISE
|
||||
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
|
||||
#endif
|
||||
|
||||
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
|
||||
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
|
||||
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
|
||||
@ -388,13 +401,12 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
|
||||
#define ROW_PITCH input_width
|
||||
|
||||
#define GEMM_LIKE_KERNEL_ARGS \
|
||||
ELTWISE_DATA_ARG \
|
||||
ELTWISE_DATA_ARG_WITH_OFFSET \
|
||||
FUSED_ARG \
|
||||
const __global Dtype *src0, \
|
||||
const __global Dtype *src1, \
|
||||
BIAS_KERNEL_ARG \
|
||||
__global Dtype *dst_base, \
|
||||
const int dst_offset, \
|
||||
const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \
|
||||
const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \
|
||||
BIAS_KERNEL_ARG_WITH_OFFSET \
|
||||
__global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \
|
||||
const ushort input_width, \
|
||||
const ushort input_height, \
|
||||
const ushort output_width, \
|
||||
@ -424,7 +436,17 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
|
||||
__attribute__((intel_reqd_sub_group_size(8)))
|
||||
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
{
|
||||
const __global Dtype *src0 = src0_ptr + src0_offset;
|
||||
const __global Dtype *src1 = src1_ptr + src1_offset;
|
||||
#if APPLY_BIAS
|
||||
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
|
||||
#endif
|
||||
|
||||
__global Dtype *dst = dst_base + dst_offset;
|
||||
#ifdef FUSED_CONV_ELTWISE
|
||||
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
|
||||
#endif
|
||||
|
||||
const int group_x = get_group_id(0);
|
||||
const int group_y = get_group_id(1);
|
||||
const int global_x = get_global_id(0);
|
||||
@ -447,6 +469,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
}
|
||||
typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
|
||||
|
||||
// U_GEMM_LIKE_CONV_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32_5_1_8_32_SIMD8 doesn't run properly (src0_read out of bounds)
|
||||
// Test: DNNTestNetwork.AlexNet/0 (to run all kernels use OPENCV_OCL4DNN_FORCE_AUTO_TUNING=1)
|
||||
#if 0 // INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
|
||||
#define OPTIMIZE_READ 1
|
||||
#else
|
||||
#define OPTIMIZE_READ 0
|
||||
#endif
|
||||
|
||||
// True for all threads if filter_width is multiple of TILE_N
|
||||
// else, true for all but right-most column of threads.
|
||||
if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
|
||||
@ -463,7 +493,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
// atile is M rows x K columns.
|
||||
int curr_x = ( global_y % output_width ) * STRIDE_X;
|
||||
int curr_y = ( global_y / output_width ) * STRIDE_Y;
|
||||
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
|
||||
#if !OPTIMIZE_READ
|
||||
int saved_y = curr_y;
|
||||
#endif
|
||||
const __global Dtype *src0_read = src0
|
||||
@ -483,7 +513,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
do
|
||||
{
|
||||
int patch_row = 0;
|
||||
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
|
||||
#if !OPTIMIZE_READ
|
||||
curr_y = saved_y;
|
||||
#endif
|
||||
|
||||
@ -501,11 +531,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
// ...
|
||||
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
|
||||
|
||||
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
|
||||
#if OPTIMIZE_READ
|
||||
#if KERNEL_WIDTH == 3
|
||||
Dtype_t blockA00 = vload3(0, src0_read);
|
||||
Dtype* pblockA00 = (Dtype*)(&blockA00);
|
||||
#else
|
||||
#if 0 // debug
|
||||
if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH)
|
||||
{
|
||||
printf("CATCH: src0_read-src0: %d limit=%d curr_y,curr_x=%d,%d\n", (int)(src0_read - src0), src0_limit, curr_y, curr_x);
|
||||
}
|
||||
#endif
|
||||
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
|
||||
Dtype* pblockA00 = (Dtype*)(&blockA00);
|
||||
#endif
|
||||
@ -626,7 +662,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
// atile is M rows x K columns.
|
||||
int curr_x = ( global_y % output_width ) * STRIDE_X;
|
||||
int curr_y = ( global_y / output_width ) * STRIDE_Y;
|
||||
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
|
||||
#if !OPTIMIZE_READ
|
||||
int saved_y = curr_y;
|
||||
#endif
|
||||
const __global Dtype *src0_read = src0
|
||||
@ -646,14 +682,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
do
|
||||
{
|
||||
int patch_row = 0;
|
||||
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
|
||||
#if !OPTIMIZE_READ
|
||||
curr_y = saved_y;
|
||||
#endif
|
||||
do
|
||||
{
|
||||
// Load atile and interleaved btile.
|
||||
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
|
||||
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
|
||||
#if OPTIMIZE_READ
|
||||
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
|
||||
Dtype* pblockA00 = (Dtype*)(&blockA00);
|
||||
#else
|
||||
@ -790,7 +826,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // TILE_N_LAST > 0
|
||||
}
|
||||
#endif
|
||||
#ifdef GEMM_LIKE_CONV_32_2
|
||||
@ -813,7 +849,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
__attribute__((intel_reqd_sub_group_size(8)))
|
||||
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
{
|
||||
const __global Dtype *src0 = src0_ptr + src0_offset;
|
||||
const __global Dtype *src1 = src1_ptr + src1_offset;
|
||||
#if APPLY_BIAS
|
||||
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
|
||||
#endif
|
||||
|
||||
__global Dtype *dst = dst_base + dst_offset;
|
||||
#ifdef FUSED_CONV_ELTWISE
|
||||
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
|
||||
#endif
|
||||
|
||||
const int group_x = get_group_id(0);
|
||||
const int group_y = get_group_id(1);
|
||||
const int global_x = get_global_id(0);
|
||||
@ -1375,7 +1421,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
{
|
||||
const __global Dtype *src0 = src0_ptr + src0_offset;
|
||||
const __global Dtype *src1 = src1_ptr + src1_offset;
|
||||
#if APPLY_BIAS
|
||||
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
|
||||
#endif
|
||||
|
||||
__global Dtype *dst = dst_base + dst_offset;
|
||||
#ifdef FUSED_CONV_ELTWISE
|
||||
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
|
||||
#endif
|
||||
|
||||
const int group_x = get_group_id(0);
|
||||
const int group_y = get_group_id(1);
|
||||
const int global_x = get_global_id(0);
|
||||
@ -1561,7 +1617,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
|
||||
{
|
||||
const __global Dtype *src0 = src0_ptr + src0_offset;
|
||||
const __global Dtype *src1 = src1_ptr + src1_offset;
|
||||
#if APPLY_BIAS
|
||||
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
|
||||
#endif
|
||||
|
||||
__global Dtype *dst = dst_base + dst_offset;
|
||||
#ifdef FUSED_CONV_ELTWISE
|
||||
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
|
||||
#endif
|
||||
|
||||
const int group_x = get_group_id(0);
|
||||
const int group_y = get_group_id(1);
|
||||
const int global_x = get_global_id(0);
|
||||
|
@ -62,8 +62,8 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
|
||||
//Original location
|
||||
|
||||
//Output location
|
||||
int outputSublayer = channels / swizzleFactor;
|
||||
int outputSublayerIndex = channels % swizzleFactor;
|
||||
//int outputSublayer = channels / swizzleFactor;
|
||||
//int outputSublayerIndex = channels % swizzleFactor;
|
||||
|
||||
int filter = sX / (kernel_w*kernel_h*channels);
|
||||
int kernel_X = sX % kernel_w;
|
||||
@ -73,6 +73,10 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
|
||||
int FP = filter / swizzleFactor;
|
||||
int F1 = filter % swizzleFactor;
|
||||
|
||||
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1]
|
||||
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X];
|
||||
int idxOut = FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1;
|
||||
int idxIn = filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X;
|
||||
|
||||
// idxIn is not valid if (filter >= outputs) - no data for these elements. Output alignment gaps are filled by zeros
|
||||
Dtype v = (filter < outputs) ? weightIn[idxIn] : (Dtype)0;
|
||||
weightOut[idxOut] = v;
|
||||
}
|
||||
|
@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
|
||||
{
|
||||
const int gidx = get_global_id(0);
|
||||
const int gidy = get_global_id(1);
|
||||
|
||||
if (gidx >= width || gidy >= height)
|
||||
return;
|
||||
|
||||
int2 coord_dst = (int2)(gidx, gidy);
|
||||
__global Dtype* A_off = A + offA;
|
||||
Dtype srcA = A_off[gidy * ldA + gidx];
|
||||
@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
|
||||
__global Dtype* A,
|
||||
__write_only image2d_t ImA,
|
||||
int offA,
|
||||
int padded_width,
|
||||
int padded_height,
|
||||
int width,
|
||||
int height,
|
||||
int ldA)
|
||||
{
|
||||
const int gidx = get_global_id(0);
|
||||
const int gidy = get_global_id(1);
|
||||
|
||||
if (gidx >= padded_width || gidy >= padded_height)
|
||||
return;
|
||||
|
||||
int2 coord_dst = (int2)(gidx, gidy);
|
||||
#if TYPE == TYPE_HALF
|
||||
if (gidx >= width || gidy >= height) {
|
||||
|
@ -933,7 +933,12 @@ void TFImporter::parseBias(tensorflow::GraphDef& net, const tensorflow::NodeDef&
|
||||
layer_id[name] = id;
|
||||
|
||||
// one input only
|
||||
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
|
||||
Pin inp0 = parsePin(layer.input(0));
|
||||
if (layer_id.find(inp0.name) != layer_id.end())
|
||||
// First operand is a constant.
|
||||
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
|
||||
else
|
||||
connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -267,6 +267,11 @@ TEST_P(Test_ONNX_layers, ReLU)
|
||||
testONNXModels("ReLU");
|
||||
}
|
||||
|
||||
TEST_P(Test_ONNX_layers, PReLU)
|
||||
{
|
||||
testONNXModels("PReLU_slope");
|
||||
}
|
||||
|
||||
TEST_P(Test_ONNX_layers, Clip)
|
||||
{
|
||||
testONNXModels("clip", npy);
|
||||
@ -302,6 +307,7 @@ TEST_P(Test_ONNX_layers, Scale)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
|
||||
testONNXModels("scale");
|
||||
testONNXModels("scale_broadcast", npy, 0, 0, false, true, 3);
|
||||
testONNXModels("scale_broadcast_mid", npy, 0, 0, false, true, 2);
|
||||
}
|
||||
|
||||
TEST_P(Test_ONNX_layers, ReduceMean3D)
|
||||
@ -505,6 +511,8 @@ TEST_P(Test_ONNX_layers, MatMulAdd)
|
||||
|
||||
TEST_P(Test_ONNX_layers, Expand)
|
||||
{
|
||||
testONNXModels("expand");
|
||||
testONNXModels("expand_identity");
|
||||
testONNXModels("expand_batch");
|
||||
testONNXModels("expand_channels");
|
||||
testONNXModels("expand_neg_batch");
|
||||
@ -646,6 +654,7 @@ TEST_P(Test_ONNX_layers, ReduceL2)
|
||||
testONNXModels("reduceL2");
|
||||
testONNXModels("reduceL2_subgraph");
|
||||
testONNXModels("reduceL2_subgraph_2");
|
||||
testONNXModels("reduceL2_subgraph2_2");
|
||||
}
|
||||
|
||||
TEST_P(Test_ONNX_layers, Split)
|
||||
@ -659,6 +668,7 @@ TEST_P(Test_ONNX_layers, Split)
|
||||
testONNXModels("split_3");
|
||||
testONNXModels("split_4");
|
||||
testONNXModels("split_sizes");
|
||||
testONNXModels("split_neg_axis");
|
||||
}
|
||||
|
||||
TEST_P(Test_ONNX_layers, Slice)
|
||||
@ -667,6 +677,7 @@ TEST_P(Test_ONNX_layers, Slice)
|
||||
testONNXModels("slice", npy, 0, 0, false, false);
|
||||
#else
|
||||
testONNXModels("slice");
|
||||
testONNXModels("slice_neg_starts");
|
||||
testONNXModels("slice_opset_11");
|
||||
#endif
|
||||
}
|
||||
|
@ -581,6 +581,18 @@ TEST_P(Test_TensorFlow_layers, l2_normalize)
|
||||
runTensorFlowNet("l2_normalize");
|
||||
}
|
||||
|
||||
TEST_P(Test_TensorFlow_layers, BiasAdd)
|
||||
{
|
||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
|
||||
&& getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
|
||||
)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
|
||||
#endif
|
||||
|
||||
runTensorFlowNet("bias_add_1");
|
||||
}
|
||||
|
||||
// TODO: fix it and add to l2_normalize
|
||||
TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
|
||||
{
|
||||
@ -1242,6 +1254,11 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_down)
|
||||
runTensorFlowNet("resize_bilinear_down");
|
||||
}
|
||||
|
||||
TEST_P(Test_TensorFlow_layers, resize_concat_optimization)
|
||||
{
|
||||
runTensorFlowNet("resize_concat_optimization");
|
||||
}
|
||||
|
||||
TEST_P(Test_TensorFlow_layers, tf2_dense)
|
||||
{
|
||||
runTensorFlowNet("tf2_dense");
|
||||
|
@ -1105,7 +1105,7 @@ public:
|
||||
that is, copies both parameters and train data. If emptyTrainData is true, the method creates an
|
||||
object copy with the current parameters but with empty train data.
|
||||
*/
|
||||
CV_WRAP virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
|
||||
CV_WRAP CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
|
||||
|
||||
/** @brief Creates a descriptor matcher of a given type with the default parameters (using default
|
||||
constructor).
|
||||
@ -1165,7 +1165,7 @@ protected:
|
||||
static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx );
|
||||
static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx );
|
||||
|
||||
static Mat clone_op( Mat m ) { return m.clone(); }
|
||||
CV_NODISCARD_STD static Mat clone_op( Mat m ) { return m.clone(); }
|
||||
void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const;
|
||||
|
||||
//! Collection of descriptors from train images.
|
||||
@ -1206,7 +1206,7 @@ public:
|
||||
*/
|
||||
CV_WRAP static Ptr<BFMatcher> create( int normType=NORM_L2, bool crossCheck=false ) ;
|
||||
|
||||
virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
|
||||
CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
|
||||
protected:
|
||||
virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
|
||||
InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
|
||||
@ -1245,7 +1245,7 @@ public:
|
||||
|
||||
CV_WRAP static Ptr<FlannBasedMatcher> create();
|
||||
|
||||
virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
|
||||
CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
|
||||
protected:
|
||||
static void convertToDMatches( const DescriptorCollection& descriptors,
|
||||
const Mat& indices, const Mat& distances,
|
||||
|
@ -437,11 +437,18 @@ class CppHeaderParser(object):
|
||||
# filter off some common prefixes, which are meaningless for Python wrappers.
|
||||
# note that we do not strip "static" prefix, which does matter;
|
||||
# it means class methods, not instance methods
|
||||
decl_str = self.batch_replace(decl_str, [("static inline", ""), ("inline", ""), ("explicit ", ""),
|
||||
("CV_EXPORTS_W", ""), ("CV_EXPORTS", ""), ("CV_CDECL", ""),
|
||||
("CV_WRAP ", " "), ("CV_INLINE", ""),
|
||||
("CV_DEPRECATED", ""), ("CV_DEPRECATED_EXTERNAL", "")]).strip()
|
||||
|
||||
decl_str = self.batch_replace(decl_str, [("static inline", ""),
|
||||
("inline", ""),
|
||||
("explicit ", ""),
|
||||
("CV_EXPORTS_W", ""),
|
||||
("CV_EXPORTS", ""),
|
||||
("CV_CDECL", ""),
|
||||
("CV_WRAP ", " "),
|
||||
("CV_INLINE", ""),
|
||||
("CV_DEPRECATED", ""),
|
||||
("CV_DEPRECATED_EXTERNAL", ""),
|
||||
("CV_NODISCARD_STD", ""),
|
||||
("CV_NODISCARD", "")]).strip()
|
||||
|
||||
if decl_str.strip().startswith('virtual'):
|
||||
virtual_method = True
|
||||
|
@ -0,0 +1,16 @@
|
||||
diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt
|
||||
index 0ba0dd78..7d34e7cb 100644
|
||||
--- a/inference-engine/src/CMakeLists.txt
|
||||
+++ b/inference-engine/src/CMakeLists.txt
|
||||
@@ -26,9 +26,9 @@ endif()
|
||||
|
||||
add_subdirectory(hetero_plugin)
|
||||
|
||||
-add_subdirectory(auto_plugin)
|
||||
+#add_subdirectory(auto_plugin)
|
||||
|
||||
-add_subdirectory(multi_device)
|
||||
+#add_subdirectory(multi_device)
|
||||
|
||||
add_subdirectory(transformations)
|
||||
|
@ -0,0 +1,219 @@
|
||||
diff --git a/cmake/developer_package/add_ie_target.cmake b/cmake/developer_package/add_ie_target.cmake
|
||||
index d49f16a4d..2726ca787 100644
|
||||
--- a/cmake/developer_package/add_ie_target.cmake
|
||||
+++ b/cmake/developer_package/add_ie_target.cmake
|
||||
@@ -92,7 +92,7 @@ function(addIeTarget)
|
||||
if (ARG_TYPE STREQUAL EXECUTABLE)
|
||||
add_executable(${ARG_NAME} ${all_sources})
|
||||
elseif(ARG_TYPE STREQUAL STATIC OR ARG_TYPE STREQUAL SHARED)
|
||||
- add_library(${ARG_NAME} ${ARG_TYPE} ${all_sources})
|
||||
+ add_library(${ARG_NAME} ${ARG_TYPE} EXCLUDE_FROM_ALL ${all_sources})
|
||||
else()
|
||||
message(SEND_ERROR "Invalid target type ${ARG_TYPE} specified for target name ${ARG_NAME}")
|
||||
endif()
|
||||
diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt
|
||||
index 1ac7fd8bf..df7091e51 100644
|
||||
--- a/inference-engine/CMakeLists.txt
|
||||
+++ b/inference-engine/CMakeLists.txt
|
||||
@@ -39,7 +39,7 @@ if(ENABLE_TESTS)
|
||||
add_subdirectory(tests)
|
||||
endif()
|
||||
|
||||
-add_subdirectory(tools)
|
||||
+#add_subdirectory(tools)
|
||||
|
||||
function(ie_build_samples)
|
||||
# samples should be build with the same flags as from OpenVINO package,
|
||||
@@ -58,7 +58,7 @@ endfunction()
|
||||
|
||||
# gflags and format_reader targets are kept inside of samples directory and
|
||||
# they must be built even if samples build is disabled (required for tests and tools).
|
||||
-ie_build_samples()
|
||||
+#ie_build_samples()
|
||||
|
||||
if(ENABLE_PYTHON)
|
||||
add_subdirectory(ie_bridges/python)
|
||||
@@ -142,7 +142,7 @@ endif()
|
||||
# Developer package
|
||||
#
|
||||
|
||||
-openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
|
||||
+#openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
|
||||
|
||||
# for Template plugin
|
||||
if(NGRAPH_INTERPRETER_ENABLE)
|
||||
@@ -166,7 +166,7 @@ function(ie_generate_dev_package_config)
|
||||
@ONLY)
|
||||
endfunction()
|
||||
|
||||
-ie_generate_dev_package_config()
|
||||
+#ie_generate_dev_package_config()
|
||||
|
||||
#
|
||||
# Coverage
|
||||
diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt
|
||||
index e8ed1a5c4..1fc9fc3ff 100644
|
||||
--- a/inference-engine/src/inference_engine/CMakeLists.txt
|
||||
+++ b/inference-engine/src/inference_engine/CMakeLists.txt
|
||||
@@ -110,7 +110,7 @@ add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_sr
|
||||
|
||||
# Create object library
|
||||
|
||||
-add_library(${TARGET_NAME}_obj OBJECT
|
||||
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
|
||||
${LIBRARY_SRC}
|
||||
${LIBRARY_HEADERS}
|
||||
${PUBLIC_HEADERS})
|
||||
@@ -181,7 +181,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
|
||||
|
||||
# Static library used for unit tests which are always built
|
||||
|
||||
-add_library(${TARGET_NAME}_s STATIC
|
||||
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
|
||||
$<TARGET_OBJECTS:${TARGET_NAME}_legacy_obj>
|
||||
$<TARGET_OBJECTS:${TARGET_NAME}_obj>
|
||||
${IE_STATIC_DEPENDENT_FILES})
|
||||
diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt
|
||||
index 8eae82bd2..e0e6745b1 100644
|
||||
--- a/inference-engine/src/legacy_api/CMakeLists.txt
|
||||
+++ b/inference-engine/src/legacy_api/CMakeLists.txt
|
||||
@@ -26,7 +26,7 @@ endif()
|
||||
|
||||
file(TOUCH ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
|
||||
|
||||
-add_library(${TARGET_NAME}_obj OBJECT
|
||||
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
|
||||
${LIBRARY_SRC}
|
||||
${PUBLIC_HEADERS})
|
||||
|
||||
diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
|
||||
index fe57b29dd..07831e2fb 100644
|
||||
--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
|
||||
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
|
||||
@@ -67,7 +67,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
|
||||
|
||||
# add test object library
|
||||
|
||||
-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
|
||||
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS})
|
||||
target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn)
|
||||
|
||||
target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
|
||||
diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt
|
||||
index f9548339d..ef962145a 100644
|
||||
--- a/inference-engine/src/preprocessing/CMakeLists.txt
|
||||
+++ b/inference-engine/src/preprocessing/CMakeLists.txt
|
||||
@@ -101,7 +101,7 @@ endif()
|
||||
|
||||
# Create object library
|
||||
|
||||
-add_library(${TARGET_NAME}_obj OBJECT
|
||||
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
|
||||
${LIBRARY_SRC}
|
||||
${LIBRARY_HEADERS})
|
||||
|
||||
@@ -153,7 +153,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
|
||||
|
||||
# Static library used for unit tests which are always built
|
||||
|
||||
-add_library(${TARGET_NAME}_s STATIC
|
||||
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
|
||||
$<TARGET_OBJECTS:${TARGET_NAME}_obj>)
|
||||
|
||||
set_ie_threading_interface_for(${TARGET_NAME}_s)
|
||||
diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt
|
||||
index 249e47c28..4ddf63049 100644
|
||||
--- a/inference-engine/src/vpu/common/CMakeLists.txt
|
||||
+++ b/inference-engine/src/vpu/common/CMakeLists.txt
|
||||
@@ -5,7 +5,7 @@
|
||||
file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h)
|
||||
|
||||
function(add_common_target TARGET_NAME STATIC_IE)
|
||||
- add_library(${TARGET_NAME} STATIC ${SOURCES})
|
||||
+ add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
|
||||
|
||||
ie_faster_build(${TARGET_NAME}
|
||||
UNITY
|
||||
@@ -60,7 +60,7 @@ add_common_target("vpu_common_lib" FALSE)
|
||||
|
||||
# Unit tests support for graph transformer
|
||||
if(WIN32)
|
||||
- add_common_target("vpu_common_lib_test_static" TRUE)
|
||||
+ #add_common_target("vpu_common_lib_test_static" TRUE)
|
||||
else()
|
||||
add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib")
|
||||
endif()
|
||||
diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
|
||||
index bc73ab5b1..b4c1547fc 100644
|
||||
--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
|
||||
+++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
|
||||
@@ -5,7 +5,7 @@
|
||||
file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h *.inc)
|
||||
|
||||
function(add_graph_transformer_target TARGET_NAME STATIC_IE)
|
||||
- add_library(${TARGET_NAME} STATIC ${SOURCES})
|
||||
+ add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
|
||||
|
||||
set_ie_threading_interface_for(${TARGET_NAME})
|
||||
|
||||
@@ -70,7 +70,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE)
|
||||
|
||||
# Unit tests support for graph transformer
|
||||
if(WIN32)
|
||||
- add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
|
||||
+ #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
|
||||
else()
|
||||
add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer")
|
||||
endif()
|
||||
diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt
|
||||
index 8bcb2801a..f7e031c01 100644
|
||||
--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt
|
||||
+++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt
|
||||
@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS)
|
||||
else()
|
||||
add_library(pugixml STATIC ${SOURCES})
|
||||
if (MSVC)
|
||||
- add_library(pugixml_mt STATIC ${SOURCES})
|
||||
+ #add_library(pugixml_mt STATIC ${SOURCES})
|
||||
#if (WIN32)
|
||||
# set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
|
||||
# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
|
||||
diff --git a/ngraph/core/builder/CMakeLists.txt b/ngraph/core/builder/CMakeLists.txt
|
||||
index ff5c381e7..2797ec9ab 100644
|
||||
--- a/ngraph/core/builder/CMakeLists.txt
|
||||
+++ b/ngraph/core/builder/CMakeLists.txt
|
||||
@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
|
||||
source_group("include" FILES ${PUBLIC_HEADERS})
|
||||
|
||||
# Create shared library
|
||||
-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
|
||||
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
|
||||
|
||||
if(COMMAND ie_faster_build)
|
||||
ie_faster_build(${TARGET_NAME}
|
||||
diff --git a/ngraph/core/reference/CMakeLists.txt b/ngraph/core/reference/CMakeLists.txt
|
||||
index ef4a764ab..f6d3172e2 100644
|
||||
--- a/ngraph/core/reference/CMakeLists.txt
|
||||
+++ b/ngraph/core/reference/CMakeLists.txt
|
||||
@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
|
||||
source_group("include" FILES ${PUBLIC_HEADERS})
|
||||
|
||||
# Create shared library
|
||||
-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
|
||||
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
|
||||
|
||||
if(COMMAND ie_faster_build)
|
||||
ie_faster_build(${TARGET_NAME}
|
||||
diff --git a/openvino/itt/CMakeLists.txt b/openvino/itt/CMakeLists.txt
|
||||
index e9f880b8c..c63f4df63 100644
|
||||
--- a/openvino/itt/CMakeLists.txt
|
||||
+++ b/openvino/itt/CMakeLists.txt
|
||||
@@ -6,7 +6,7 @@ set(TARGET_NAME itt)
|
||||
|
||||
file(GLOB_RECURSE SOURCES "src/*.cpp" "src/*.hpp")
|
||||
|
||||
-add_library(${TARGET_NAME} STATIC ${SOURCES})
|
||||
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
|
||||
|
||||
add_library(openvino::itt ALIAS ${TARGET_NAME})
|
||||
|
15
platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch
Normal file
15
platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch
Normal file
@ -0,0 +1,15 @@
|
||||
iff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index e0706a72e..9a053b1e4 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -6,6 +6,10 @@ cmake_minimum_required(VERSION 3.13)
|
||||
|
||||
project(OpenVINO)
|
||||
|
||||
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi /FS")
|
||||
+set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
|
||||
+set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
|
||||
+
|
||||
set(OpenVINO_MAIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
set(IE_MAIN_SOURCE_DIR ${OpenVINO_MAIN_SOURCE_DIR}/inference-engine)
|
||||
|
@ -0,0 +1,16 @@
|
||||
diff --git a/cmake/developer_package/vs_version/vs_version.cmake b/cmake/developer_package/vs_version/vs_version.cmake
|
||||
index 14d4c0e1e..6a44f73b9 100644
|
||||
--- a/cmake/developer_package/vs_version/vs_version.cmake
|
||||
+++ b/cmake/developer_package/vs_version/vs_version.cmake
|
||||
@@ -8,9 +8,9 @@ set(IE_VS_VER_FILEVERSION_STR "${IE_VERSION_MAJOR}.${IE_VERSION_MINOR}.${IE_VERS
|
||||
|
||||
set(IE_VS_VER_COMPANY_NAME_STR "Intel Corporation")
|
||||
set(IE_VS_VER_PRODUCTVERSION_STR "${CI_BUILD_NUMBER}")
|
||||
-set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit")
|
||||
+set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit (for OpenCV Windows package)")
|
||||
set(IE_VS_VER_COPYRIGHT_STR "Copyright (C) 2018-2021, Intel Corporation")
|
||||
-set(IE_VS_VER_COMMENTS_STR "https://docs.openvinotoolkit.org/")
|
||||
+set(IE_VS_VER_COMMENTS_STR "https://github.com/opencv/opencv/wiki/Intel%27s-Deep-Learning-Inference-Engine-backend")
|
||||
|
||||
#
|
||||
# ie_add_vs_version_file(NAME <name>
|
1
platforms/winpack_dldt/2021.4.1/build.config.py
Normal file
1
platforms/winpack_dldt/2021.4.1/build.config.py
Normal file
@ -0,0 +1 @@
|
||||
os.environ['CI_BUILD_NUMBER'] = '2021.4.1-opencv_winpack_dldt'
|
4
platforms/winpack_dldt/2021.4.1/patch.config.py
Normal file
4
platforms/winpack_dldt/2021.4.1/patch.config.py
Normal file
@ -0,0 +1,4 @@
|
||||
applyPatch('20210630-dldt-disable-unused-targets.patch')
|
||||
applyPatch('20210630-dldt-pdb.patch')
|
||||
applyPatch('20210630-dldt-disable-multidevice-autoplugin.patch')
|
||||
applyPatch('20210630-dldt-vs-version.patch')
|
56
platforms/winpack_dldt/2021.4.1/sysroot.config.py
Normal file
56
platforms/winpack_dldt/2021.4.1/sysroot.config.py
Normal file
@ -0,0 +1,56 @@
|
||||
sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin')
|
||||
copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph')
|
||||
#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll')
|
||||
|
||||
build_config = 'Release' if not self.config.build_debug else 'Debug'
|
||||
build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config
|
||||
|
||||
def copy_bin(name):
|
||||
global build_bin_dir, sysroot_bin_dir
|
||||
copytree(build_bin_dir / name, sysroot_bin_dir / name)
|
||||
|
||||
dll_suffix = 'd' if self.config.build_debug else ''
|
||||
def copy_dll(name):
|
||||
global copy_bin, dll_suffix
|
||||
copy_bin(name + dll_suffix + '.dll')
|
||||
copy_bin(name + dll_suffix + '.pdb')
|
||||
|
||||
copy_bin('cache.json')
|
||||
copy_dll('clDNNPlugin')
|
||||
copy_dll('HeteroPlugin')
|
||||
copy_dll('inference_engine')
|
||||
copy_dll('inference_engine_ir_reader')
|
||||
#copy_dll('inference_engine_ir_v7_reader')
|
||||
copy_dll('inference_engine_legacy')
|
||||
copy_dll('inference_engine_transformations') # runtime
|
||||
copy_dll('inference_engine_lp_transformations') # runtime
|
||||
#copy_dll('inference_engine_preproc') # runtime
|
||||
copy_dll('MKLDNNPlugin') # runtime
|
||||
copy_dll('myriadPlugin') # runtime
|
||||
#copy_dll('MultiDevicePlugin') # runtime, not used
|
||||
copy_dll('ngraph')
|
||||
copy_bin('plugins.xml')
|
||||
copy_bin('pcie-ma2x8x.elf')
|
||||
copy_bin('usb-ma2x8x.mvcmd')
|
||||
|
||||
copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir)
|
||||
copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb')
|
||||
|
||||
sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine')
|
||||
sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64')
|
||||
|
||||
copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include')
|
||||
if not self.config.build_debug:
|
||||
copytree(build_bin_dir / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib')
|
||||
copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib')
|
||||
copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib')
|
||||
copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib')
|
||||
else:
|
||||
copytree(build_bin_dir / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib')
|
||||
copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib')
|
||||
copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib')
|
||||
copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib')
|
||||
|
||||
sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses')
|
||||
copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE')
|
||||
copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE')
|
@ -471,7 +471,8 @@ class Builder:
|
||||
def main():
|
||||
|
||||
dldt_src_url = 'https://github.com/openvinotoolkit/openvino'
|
||||
dldt_src_commit = '2021.4'
|
||||
dldt_src_commit = '2021.4.1'
|
||||
dldt_config = None
|
||||
dldt_release = None
|
||||
|
||||
build_cache_dir_default = os.environ.get('BUILD_CACHE_DIR', '.build_cache')
|
||||
@ -505,7 +506,7 @@ def main():
|
||||
parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)')
|
||||
parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)')
|
||||
|
||||
parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
|
||||
parser.add_argument('--dldt_config', default=dldt_config, help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
|
||||
|
||||
parser.add_argument('--override_patch_hashsum', default='', help='(script debug mode)')
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user