mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
Merge pull request #20670 from alalek:core_ocl_fix_intel_gpu_gemm_requirements
core(OpenCL): fix intel_gpu_gemm kernel requirements
* core(ocl): fix intel_gpu_gemm integration
  - allow bailout to generic OpenCL kernel
* core(ocl): avoid failures of generic OpenCL gemm kernel
* core(ocl): define alignment requirements of intel_gpu_gemm kernels
This commit is contained in:
parent
6ace801418
commit
e3f4f874c5
@ -24,11 +24,6 @@
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
#include <sstream>
|
||||
#include "opencl_kernels_core.hpp"
|
||||
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
|
||||
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
@ -37,52 +32,79 @@ static bool intel_gpu_gemm(
|
||||
UMat B, Size sizeB,
|
||||
UMat D, Size sizeD,
|
||||
double alpha, double beta,
|
||||
bool atrans, bool btrans)
|
||||
bool atrans, bool btrans,
|
||||
bool& isPropagatedC2D
|
||||
)
|
||||
{
|
||||
CV_UNUSED(sizeB);
|
||||
|
||||
int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width);
|
||||
|
||||
std::string kernelName;
|
||||
bool ret = true;
|
||||
if (M < 4 || N < 4 || K < 4) // vload4
|
||||
return false;
|
||||
|
||||
size_t lx = 8, ly = 4;
|
||||
size_t dx = 4, dy = 8;
|
||||
CV_LOG_VERBOSE(NULL, 0, "M=" << M << " N=" << N << " K=" << K);
|
||||
|
||||
std::string kernelName;
|
||||
|
||||
unsigned int lx = 8, ly = 4;
|
||||
unsigned int dx = 4, dy = 8;
|
||||
|
||||
if(!atrans && !btrans)
|
||||
{
|
||||
|
||||
if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
|
||||
{
|
||||
kernelName = "intelblas_gemm_buffer_NN_sp";
|
||||
}
|
||||
else
|
||||
{
|
||||
if (M % 2 != 0)
|
||||
return false;
|
||||
// vload4(0, dst_write0) - 4 cols
|
||||
// multiply by lx: 8
|
||||
if (N % (4*8) != 0)
|
||||
return false;
|
||||
kernelName = "intelblas_gemm_buffer_NN";
|
||||
}
|
||||
}
|
||||
else if(atrans && !btrans)
|
||||
{
|
||||
if (M % 32 != 0)
|
||||
return false;
|
||||
if (N % 32 != 0)
|
||||
return false;
|
||||
kernelName = "intelblas_gemm_buffer_TN";
|
||||
}
|
||||
else if(!atrans && btrans)
|
||||
{
|
||||
if (M % 128 != 0)
|
||||
return false;
|
||||
if (N % 8 != 0)
|
||||
return false;
|
||||
if (K % 512 != 0)
|
||||
return false;
|
||||
kernelName = "intelblas_gemm_buffer_NT";
|
||||
ly = 16;
|
||||
dx = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (M % 32 != 0)
|
||||
return false;
|
||||
if (N % 32 != 0)
|
||||
return false;
|
||||
if (K % 16 != 0)
|
||||
return false;
|
||||
kernelName = "intelblas_gemm_buffer_TT";
|
||||
}
|
||||
|
||||
const size_t gx = (size_t)(N + dx - 1) / dx;
|
||||
const size_t gy = (size_t)(M + dy - 1) / dy;
|
||||
CV_LOG_DEBUG(NULL, "kernel: " << kernelName << " (M=" << M << " N=" << N << " K=" << K << ")");
|
||||
|
||||
const size_t gx = divUp((size_t)N, dx);
|
||||
const size_t gy = divUp((size_t)M, dy);
|
||||
|
||||
size_t local[] = {lx, ly, 1};
|
||||
size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1};
|
||||
|
||||
int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
|
||||
size_t global[] = {roundUp(gx, lx), roundUp(gy, ly), 1};
|
||||
|
||||
ocl::Queue q;
|
||||
String errmsg;
|
||||
@ -110,10 +132,13 @@ static bool intel_gpu_gemm(
|
||||
(int)(D.step / sizeof(float))
|
||||
);
|
||||
|
||||
ret = k.run(2, global, local, false, q);
|
||||
bool ret = k.run(2, global, local, false, q);
|
||||
return ret;
|
||||
}
|
||||
else
|
||||
{
|
||||
int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
|
||||
|
||||
for(int start_index = 0; start_index < K; start_index += stride)
|
||||
{
|
||||
ocl::Kernel k(kernelName.c_str(), program);
|
||||
@ -132,12 +157,16 @@ static bool intel_gpu_gemm(
|
||||
(int) start_index, // 14 start_index
|
||||
stride);
|
||||
|
||||
ret = k.run(2, global, local, false, q);
|
||||
if (!ret) return ret;
|
||||
bool ret = k.run(2, global, local, false, q);
|
||||
if (!ret)
|
||||
{
|
||||
if (start_index != 0)
|
||||
isPropagatedC2D = false; // D array content is changed, need to rewrite
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace cv
|
||||
|
@ -42,6 +42,8 @@
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
|
||||
#include "opencl_kernels_core.hpp"
|
||||
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
|
||||
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
|
||||
@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
|
||||
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
|
||||
InputArray matC, double beta, OutputArray matD, int flags )
|
||||
{
|
||||
int depth = matA.depth(), cn = matA.channels();
|
||||
int type = CV_MAKETYPE(depth, cn);
|
||||
int type = matA.type();
|
||||
int depth = CV_MAT_DEPTH(type);
|
||||
int cn = CV_MAT_CN(type);
|
||||
|
||||
CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
|
||||
CV_CheckTypeEQ(type, matB.type(), "");
|
||||
CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, "");
|
||||
|
||||
const ocl::Device & dev = ocl::Device::getDefault();
|
||||
bool doubleSupport = dev.doubleFPConfig() > 0;
|
||||
@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
|
||||
Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
|
||||
bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
|
||||
|
||||
CV_Assert( !haveC || matC.type() == type );
|
||||
if (haveC)
|
||||
CV_CheckTypeEQ(type, matC.type(), "");
|
||||
|
||||
Size sizeD(((btrans) ? sizeB.height : sizeB.width),
|
||||
((atrans) ? sizeA.width : sizeA.height));
|
||||
|
||||
if (atrans)
|
||||
sizeA = Size(sizeA.height, sizeA.width);
|
||||
if (btrans)
|
||||
sizeB = Size(sizeB.height, sizeB.width);
|
||||
if (haveC && ctrans)
|
||||
sizeC = Size(sizeC.height, sizeC.width);
|
||||
|
||||
CV_CheckEQ(sizeA.width, sizeB.height, "");
|
||||
if (haveC)
|
||||
CV_CheckEQ(sizeC, sizeD, "");
|
||||
|
||||
UMat A = matA.getUMat();
|
||||
UMat B = matB.getUMat();
|
||||
|
||||
Size sizeD(((btrans)? sizeB.height : sizeB.width),
|
||||
((atrans)? sizeA.width : sizeA.height));
|
||||
matD.create(sizeD, type);
|
||||
UMat D = matD.getUMat();
|
||||
|
||||
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
|
||||
bool isPropagatedC2D = false; // D content is updated with C / C.t()
|
||||
|
||||
|
||||
if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
|
||||
{
|
||||
String opts;
|
||||
|
||||
if (atrans)
|
||||
sizeA = Size(sizeA.height, sizeA.width);
|
||||
if (btrans)
|
||||
sizeB = Size(sizeB.height, sizeB.width);
|
||||
if (haveC && ctrans)
|
||||
sizeC = Size(sizeC.height, sizeC.width);
|
||||
|
||||
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
|
||||
|
||||
int max_wg_size = (int)dev.maxWorkGroupSize();
|
||||
int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
|
||||
|
||||
if (atrans)
|
||||
A = A.t();
|
||||
|
||||
if (btrans)
|
||||
B = B.t();
|
||||
|
||||
if (haveC)
|
||||
ctrans ? transpose(matC, D) : matC.copyTo(D);
|
||||
|
||||
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
|
||||
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
|
||||
|
||||
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
|
||||
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
|
||||
cn, kercn, block_size,
|
||||
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
|
||||
haveC ? " -D HAVE_C" : "",
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
|
||||
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
if (depth == CV_64F)
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
||||
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
||||
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
||||
sizeA.width, alpha, beta);
|
||||
else
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
||||
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
||||
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
||||
sizeA.width, (float)alpha, (float)beta);
|
||||
|
||||
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
|
||||
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
|
||||
|
||||
return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
|
||||
}
|
||||
else
|
||||
if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1)
|
||||
{
|
||||
if (haveC && beta != 0.0)
|
||||
{
|
||||
ctrans ? transpose(matC, D) : matC.copyTo(D);
|
||||
isPropagatedC2D = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
beta = 0.0;
|
||||
}
|
||||
|
||||
return intel_gpu_gemm(A, sizeA,
|
||||
B, sizeB,
|
||||
D, sizeD,
|
||||
alpha,
|
||||
beta,
|
||||
atrans, btrans);
|
||||
bool res = intel_gpu_gemm(A, matA.size(),
|
||||
B, matB.size(),
|
||||
D, sizeD,
|
||||
alpha,
|
||||
beta,
|
||||
atrans, btrans,
|
||||
isPropagatedC2D);
|
||||
if (res)
|
||||
return true;
|
||||
// fallback on generic OpenCL code
|
||||
}
|
||||
|
||||
if (sizeD.width < 8 || sizeD.height < 8)
|
||||
return false;
|
||||
|
||||
String opts;
|
||||
|
||||
int wg_size = (int)dev.maxWorkGroupSize();
|
||||
int sizeDmin = std::min(sizeD.width, sizeD.height);
|
||||
wg_size = std::min(wg_size, sizeDmin * sizeDmin);
|
||||
int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
|
||||
|
||||
if (atrans)
|
||||
A = A.t();
|
||||
|
||||
if (btrans)
|
||||
B = B.t();
|
||||
|
||||
if (haveC && !isPropagatedC2D)
|
||||
ctrans ? transpose(matC, D) : matC.copyTo(D);
|
||||
|
||||
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
|
||||
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
|
||||
|
||||
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
|
||||
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
|
||||
cn, kercn, block_size,
|
||||
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
|
||||
haveC ? " -D HAVE_C" : "",
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
|
||||
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
if (depth == CV_64F)
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
||||
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
||||
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
||||
sizeA.width, alpha, beta);
|
||||
else
|
||||
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
|
||||
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
|
||||
ocl::KernelArg::ReadWrite(D, cn, kercn),
|
||||
sizeA.width, (float)alpha, (float)beta);
|
||||
|
||||
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
|
||||
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
|
||||
|
||||
return k.run(2, globalsize, block_size !=1 ? localsize : NULL, false);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -67,6 +67,8 @@ PARAM_TEST_CASE(Gemm,
|
||||
|
||||
double alpha, beta;
|
||||
|
||||
int M, N, K;
|
||||
|
||||
TEST_DECLARE_INPUT_PARAMETER(A);
|
||||
TEST_DECLARE_INPUT_PARAMETER(B);
|
||||
TEST_DECLARE_INPUT_PARAMETER(C);
|
||||
@ -90,30 +92,27 @@ PARAM_TEST_CASE(Gemm,
|
||||
|
||||
void generateTestData()
|
||||
{
|
||||
// set minimum size to 20, since testing less sizes doesn't make sense
|
||||
Size ARoiSize = randomSize(20, MAX_VALUE);
|
||||
M = (int)randomDoubleLog(1, 100);
|
||||
N = (int)randomDoubleLog(1, 100);
|
||||
K = (int)randomDoubleLog(1, 1200);
|
||||
|
||||
M = roundUp(M, 1);
|
||||
N = roundUp(N, 1);
|
||||
K = roundUp(K, 1);
|
||||
|
||||
Size ARoiSize = (atrans) ? Size(M, K) : Size(K, M);
|
||||
Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11);
|
||||
|
||||
if (atrans)
|
||||
ARoiSize = Size(ARoiSize.height, ARoiSize.width);
|
||||
|
||||
Size BRoiSize = randomSize(20, MAX_VALUE);
|
||||
if (btrans)
|
||||
BRoiSize.width = ARoiSize.width;
|
||||
else
|
||||
BRoiSize.height = ARoiSize.width;
|
||||
|
||||
Size BRoiSize = (btrans) ? Size(K, N) : Size(N, K);
|
||||
Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11);
|
||||
|
||||
if (btrans)
|
||||
BRoiSize = Size(BRoiSize.height, BRoiSize.width);
|
||||
|
||||
Size DRoiSize = Size(BRoiSize.width, ARoiSize.height), CRoiSizeT(DRoiSize.height, DRoiSize.width);
|
||||
Size CRoiSize = (ctrans) ? Size(M, N) : Size(N, M);
|
||||
Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(C, C_roi, ctrans ? CRoiSizeT : DRoiSize, CBorder, type, -11, 11);
|
||||
randomSubMat(C, C_roi, CRoiSize, CBorder, type, -11, 11);
|
||||
|
||||
Size DRoiSize = Size(N, M);
|
||||
Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
|
||||
randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11);
|
||||
|
||||
@ -132,11 +131,12 @@ OCL_TEST_P(Gemm, Accuracy)
|
||||
for (int i = 0; i < test_loop_times; ++i)
|
||||
{
|
||||
generateTestData();
|
||||
SCOPED_TRACE(cv::format("i=%d: M=%d N=%d K=%d", i, M, N, K));
|
||||
|
||||
OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags));
|
||||
OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags));
|
||||
|
||||
double eps = D_roi.size().area() * 1e-4;
|
||||
double eps = D_roi.size().area() * (1e-5 * K);
|
||||
OCL_EXPECT_MATS_NEAR(D, eps);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user