mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
Merge pull request #13221 from elatkin:el/gapi_perf_sepfilter
GAPI (fluid): optimization of Separable filter (#13221) * GAPI (fluid): Separable filter: performance test * GAPI (fluid): enable all performance tests * GAPI: separable filters: alternative code for Sobel * GAPI (fluid): hide unused old code for Sobel filter * GAPI (fluid): especial code for Sobel if U8 into S16 * GAPI (fluid): back to old code for Sobel * GAPI (fluid): run_sepfilter3x3_impl() with CPU dispatcher * GAPI (fluid): run_sepfilter3x3_impl(): fix compiler warnings * GAPI (fluid): new engine for separable filters (but Sobel) * GAPI (fluid): new performance engine for Sobel * GAPI (fluid): Sepfilters performance: fixed compilation error
This commit is contained in:
parent
dd952f6d68
commit
f07856eab9
@ -52,7 +52,7 @@ PERF_TEST_P_(SepFilterPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -100,7 +100,7 @@ PERF_TEST_P_(Filter2DPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -145,7 +145,7 @@ PERF_TEST_P_(BoxFilterPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -188,7 +188,7 @@ PERF_TEST_P_(BlurPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -230,7 +230,7 @@ PERF_TEST_P_(GaussianBlurPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -271,7 +271,7 @@ PERF_TEST_P_(MedianBlurPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -314,7 +314,7 @@ PERF_TEST_P_(ErodePerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -357,7 +357,7 @@ PERF_TEST_P_(Erode3x3PerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -400,7 +400,7 @@ PERF_TEST_P_(DilatePerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -443,7 +443,7 @@ PERF_TEST_P_(Dilate3x3PerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -526,7 +526,7 @@ PERF_TEST_P_(CannyPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -564,7 +564,7 @@ PERF_TEST_P_(EqHistPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -830,7 +830,7 @@ PERF_TEST_P_(LUV2BGRPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
|
@ -13,9 +13,101 @@
|
||||
namespace opencv_test
|
||||
{
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest,
|
||||
Combine(Values(AbsExact().to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), // add CV_32FC1 when ready
|
||||
INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_8U, SepFilterPerfTest,
|
||||
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3),
|
||||
Values(3),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(-1, CV_16S, CV_32F),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_other, SepFilterPerfTest,
|
||||
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
|
||||
Values(CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(3),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(-1, CV_32F),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter2DPerfTestFluid, Filter2DPerfTest,
|
||||
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(3), // add 4, 5, 7 when kernel is ready
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::BORDER_DEFAULT),
|
||||
Values(-1, CV_32F),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(BoxFilterPerfTestFluid, BoxFilterPerfTest,
|
||||
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(3), // add size=5, when kernel is ready
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::BORDER_DEFAULT),
|
||||
Values(-1, CV_32F),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(BlurPerfTestFluid, BlurPerfTest,
|
||||
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(3), // add size=5, when kernel is ready
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::BORDER_DEFAULT),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(GaussianBlurPerfTestFluid, GaussianBlurPerfTest,
|
||||
Combine(Values(ToleranceFilter(1e-3f, 0.01).to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(3), // add size=5, when kernel is ready
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(MedianBlurPerfTestFluid, MedianBlurPerfTest,
|
||||
Combine(Values(AbsExact().to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(3), // add size=5, when kernel is ready
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(ErodePerfTestFluid, ErodePerfTest,
|
||||
Combine(Values(AbsExact().to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(3), // add size=5, when kernel is ready
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::MorphShapes::MORPH_RECT,
|
||||
cv::MorphShapes::MORPH_CROSS,
|
||||
cv::MorphShapes::MORPH_ELLIPSE),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
// GAPI/fluid does not support iterations parameter for the Erode kernel
|
||||
INSTANTIATE_TEST_CASE_P(DISABLED_Erode3x3PerfTestFluid, Erode3x3PerfTest,
|
||||
Combine(Values(AbsExact().to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(1, 2, 4),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(DilatePerfTestFluid, DilatePerfTest,
|
||||
Combine(Values(AbsExact().to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(3), // add size=5, when kernel is ready
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::MorphShapes::MORPH_RECT,
|
||||
cv::MorphShapes::MORPH_CROSS,
|
||||
cv::MorphShapes::MORPH_ELLIPSE),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
// GAPI/fluid does not support iterations parameter for the Dilate kernel
|
||||
INSTANTIATE_TEST_CASE_P(DISABLED_Dilate3x3PerfTestFluid, Dilate3x3PerfTest,
|
||||
Combine(Values(AbsExact().to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(1, 2, 4),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest,
|
||||
Combine(Values(AbsExact().to_compare_f()),
|
||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
|
||||
Values(3), // add 5x5 once supported
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(-1, CV_16S, CV_32F),
|
||||
@ -23,8 +115,8 @@ namespace opencv_test
|
||||
Values(1, 2),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest,
|
||||
Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()),
|
||||
INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest,
|
||||
Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()),
|
||||
Values(CV_32FC1),
|
||||
Values(3), // add 5x5 once supported
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
@ -33,44 +125,44 @@ namespace opencv_test
|
||||
Values(1, 2),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest,
|
||||
Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest,
|
||||
Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest,
|
||||
Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest,
|
||||
Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
}
|
||||
|
@ -344,7 +344,7 @@ static const int maxKernelSize = 9;
|
||||
|
||||
template<typename DST, typename SRC>
|
||||
static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSize,
|
||||
const cv::Point& /* anchor */, bool normalize)
|
||||
const cv::Point& /* anchor */, bool normalize, float *buf[])
|
||||
{
|
||||
GAPI_Assert(kernelSize.width <= maxKernelSize);
|
||||
GAPI_Assert(kernelSize.width == kernelSize.height);
|
||||
@ -365,36 +365,53 @@ static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSi
|
||||
int width = dst.length();
|
||||
int chan = dst.meta().chan;
|
||||
|
||||
GAPI_DbgAssert(chan <= 4);
|
||||
|
||||
for (int w=0; w < width; w++)
|
||||
if (kernelSize.width == 3 && kernelSize.height == 3)
|
||||
{
|
||||
float sum[4] = {0, 0, 0, 0};
|
||||
int y = dst.y();
|
||||
int y0 = dst.priv().writeStart();
|
||||
|
||||
for (int i=0; i < kernel; i++)
|
||||
float kx[3] = {1, 1, 1};
|
||||
float *ky = kx;
|
||||
|
||||
float scale=1, delta=0;
|
||||
if (normalize)
|
||||
scale = 1/9.f;
|
||||
|
||||
run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
|
||||
} else
|
||||
{
|
||||
GAPI_DbgAssert(chan <= 4);
|
||||
|
||||
for (int w=0; w < width; w++)
|
||||
{
|
||||
for (int j=0; j < kernel; j++)
|
||||
float sum[4] = {0, 0, 0, 0};
|
||||
|
||||
for (int i=0; i < kernel; i++)
|
||||
{
|
||||
for (int c=0; c < chan; c++)
|
||||
sum[c] += in[i][(w + j - border)*chan + c];
|
||||
for (int j=0; j < kernel; j++)
|
||||
{
|
||||
for (int c=0; c < chan; c++)
|
||||
sum[c] += in[i][(w + j - border)*chan + c];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int c=0; c < chan; c++)
|
||||
{
|
||||
float result = normalize? sum[c]/(kernel * kernel) : sum[c];
|
||||
for (int c=0; c < chan; c++)
|
||||
{
|
||||
float result = normalize? sum[c]/(kernel * kernel) : sum[c];
|
||||
|
||||
out[w*chan + c] = saturate<DST>(result, rintf);
|
||||
out[w*chan + c] = saturate<DST>(result, rintf);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
|
||||
GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, true)
|
||||
{
|
||||
static const int Window = 3;
|
||||
|
||||
static void run(const View &src, const cv::Size& kernelSize, const cv::Point& anchor,
|
||||
int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst)
|
||||
int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst,
|
||||
Buffer& scratch)
|
||||
{
|
||||
// TODO: support sizes 3, 5, 7, 9, ...
|
||||
GAPI_Assert(kernelSize.width == 3 && kernelSize.height == 3);
|
||||
@ -404,14 +421,46 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
|
||||
|
||||
static const bool normalize = true;
|
||||
|
||||
int width = src.length();
|
||||
int chan = src.meta().chan;
|
||||
int length = width * chan;
|
||||
|
||||
float *buf[3];
|
||||
buf[0] = scratch.OutLine<float>();
|
||||
buf[1] = buf[0] + length;
|
||||
buf[2] = buf[1] + length;
|
||||
|
||||
// DST SRC OP __VA_ARGS__
|
||||
UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
|
||||
UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
|
||||
UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
|
||||
UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
UNARY_( float, float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
|
||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||
}
|
||||
|
||||
static void initScratch(const GMatDesc & in,
|
||||
const cv::Size & /* ksize */,
|
||||
const cv::Point & /* anchor */,
|
||||
int /* borderType */,
|
||||
const cv::Scalar & /* borderValue */,
|
||||
Buffer & scratch)
|
||||
{
|
||||
int width = in.size.width;
|
||||
int chan = in.chan;
|
||||
|
||||
int buflen = width * chan * Window; // work buffers
|
||||
|
||||
cv::gapi::own::Size bufsize(buflen, 1);
|
||||
GMatDesc bufdesc = {CV_32F, 1, bufsize};
|
||||
Buffer buffer(bufdesc);
|
||||
scratch = std::move(buffer);
|
||||
}
|
||||
|
||||
static void resetScratch(Buffer& /* scratch */)
|
||||
{
|
||||
}
|
||||
|
||||
static Border getBorder(const cv::GMatDesc& /* src */,
|
||||
const cv::Size & /* kernelSize */,
|
||||
const cv::Point & /* anchor */,
|
||||
@ -422,18 +471,19 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
|
||||
}
|
||||
};
|
||||
|
||||
GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
|
||||
GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, true)
|
||||
{
|
||||
static const int Window = 3;
|
||||
|
||||
static void run(const View & src,
|
||||
int /* ddepth */,
|
||||
const cv::Size & kernelSize,
|
||||
const cv::Point & anchor,
|
||||
const cv::Point & anchor,
|
||||
bool normalize,
|
||||
int /* borderType */,
|
||||
const cv::Scalar& /* borderValue */,
|
||||
Buffer& dst)
|
||||
Buffer& dst,
|
||||
Buffer& scratch)
|
||||
{
|
||||
// TODO: support sizes 3, 5, 7, 9, ...
|
||||
GAPI_Assert(kernelSize.width == 3 && kernelSize.height == 3);
|
||||
@ -441,17 +491,51 @@ GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
|
||||
// TODO: support non-trivial anchor
|
||||
GAPI_Assert(anchor.x == -1 && anchor.y == -1);
|
||||
|
||||
int width = src.length();
|
||||
int chan = src.meta().chan;
|
||||
int length = width * chan;
|
||||
|
||||
float *buf[3];
|
||||
buf[0] = scratch.OutLine<float>();
|
||||
buf[1] = buf[0] + length;
|
||||
buf[2] = buf[1] + length;
|
||||
|
||||
// DST SRC OP __VA_ARGS__
|
||||
UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
|
||||
UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
|
||||
UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
|
||||
UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
|
||||
UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
|
||||
UNARY_( float, short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
|
||||
UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
UNARY_( float, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
UNARY_( float, float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
|
||||
|
||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||
}
|
||||
|
||||
static void initScratch(const GMatDesc & in,
|
||||
int /* ddepth */,
|
||||
const cv::Size & /* kernelSize */,
|
||||
const cv::Point & /* anchor */,
|
||||
bool /* normalize */,
|
||||
int /* borderType */,
|
||||
const cv::Scalar& /* borderValue */,
|
||||
Buffer & scratch)
|
||||
{
|
||||
int width = in.size.width;
|
||||
int chan = in.chan;
|
||||
|
||||
int buflen = width * chan * Window; // work buffers
|
||||
|
||||
cv::gapi::own::Size bufsize(buflen, 1);
|
||||
GMatDesc bufdesc = {CV_32F, 1, bufsize};
|
||||
Buffer buffer(bufdesc);
|
||||
scratch = std::move(buffer);
|
||||
}
|
||||
|
||||
static void resetScratch(Buffer& /* scratch */)
|
||||
{
|
||||
}
|
||||
|
||||
static Border getBorder(const cv::GMatDesc& /* src */,
|
||||
int /* ddepth */,
|
||||
const cv::Size & /* kernelSize */,
|
||||
@ -510,18 +594,21 @@ static void run_sepfilter(Buffer& dst, const View& src,
|
||||
const float kx[], int kxLen,
|
||||
const float ky[], int kyLen,
|
||||
const cv::Point& /* anchor */,
|
||||
float delta=0)
|
||||
float scale, float delta,
|
||||
float *buf[])
|
||||
{
|
||||
static const int maxLines = 9;
|
||||
GAPI_Assert(kyLen <= maxLines);
|
||||
constexpr int kMax = 11;
|
||||
GAPI_Assert(kxLen <= kMax && kyLen <= kMax);
|
||||
|
||||
const SRC *in[ maxLines ];
|
||||
const SRC *in[kMax];
|
||||
DST *out;
|
||||
|
||||
int border = (kyLen - 1) / 2;
|
||||
int xborder = (kxLen - 1) / 2;
|
||||
int yborder = (kyLen - 1) / 2;
|
||||
|
||||
for (int i=0; i < kyLen; i++)
|
||||
{
|
||||
in[i] = src.InLine<SRC>(i - border);
|
||||
in[i] = src.InLine<SRC>(i - yborder);
|
||||
}
|
||||
|
||||
out = dst.OutLine<DST>();
|
||||
@ -529,28 +616,52 @@ static void run_sepfilter(Buffer& dst, const View& src,
|
||||
int width = dst.length();
|
||||
int chan = dst.meta().chan;
|
||||
|
||||
for (int w=0; w < width; w++)
|
||||
// optimized 3x3 vs reference
|
||||
if (kxLen == 3 && kyLen == 3)
|
||||
{
|
||||
// TODO: make this cycle innermost
|
||||
for (int c=0; c < chan; c++)
|
||||
int y = dst.y();
|
||||
int y0 = dst.priv().writeStart();
|
||||
|
||||
int border = xborder;
|
||||
run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
|
||||
}
|
||||
else
|
||||
{
|
||||
int length = chan * width;
|
||||
int xshift = chan * xborder;
|
||||
|
||||
// horizontal pass
|
||||
|
||||
for (int k=0; k < kyLen; k++)
|
||||
{
|
||||
float sum=0;
|
||||
const SRC *inp[kMax] = {nullptr};
|
||||
|
||||
for (int i=0; i < kyLen; i++)
|
||||
for (int j=0; j < kxLen; j++)
|
||||
{
|
||||
float sumi=0;
|
||||
|
||||
for (int j=0; j < kxLen; j++)
|
||||
{
|
||||
sumi += in[i][(w + j - border)*chan + c] * kx[j];
|
||||
}
|
||||
|
||||
sum += sumi * ky[i];
|
||||
inp[j] = in[k] + (j - xborder)*xshift;
|
||||
}
|
||||
|
||||
float result = sum + delta;
|
||||
for (int l=0; l < length; l++)
|
||||
{
|
||||
float sum = 0;
|
||||
for (int j=0; j < kxLen; j++)
|
||||
{
|
||||
sum += inp[j][l] * kx[j];
|
||||
}
|
||||
buf[k][l] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
out[w*chan + c] = saturate<DST>(result, rintf);
|
||||
// vertical pass
|
||||
|
||||
for (int l=0; l < length; l++)
|
||||
{
|
||||
float sum = 0;
|
||||
for (int k=0; k < kyLen; k++)
|
||||
{
|
||||
sum += buf[k][l] * ky[k];
|
||||
}
|
||||
out[l] = saturate<DST>(sum*scale + delta, rintf);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -580,21 +691,37 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true)
|
||||
int kxLen = kernX.rows * kernX.cols;
|
||||
int kyLen = kernY.rows * kernY.cols;
|
||||
|
||||
GAPI_Assert(kyLen == 3);
|
||||
|
||||
float *kx = scratch.OutLine<float>();
|
||||
float *ky = kx + kxLen;
|
||||
|
||||
int width = src.meta().size.width;
|
||||
int chan = src.meta().chan;
|
||||
int length = width * chan;
|
||||
|
||||
float *buf[3];
|
||||
buf[0] = ky + kyLen;
|
||||
buf[1] = buf[0] + length;
|
||||
buf[2] = buf[1] + length;
|
||||
|
||||
float scale = 1;
|
||||
float delta = static_cast<float>(delta_[0]);
|
||||
|
||||
// DST SRC OP __VA_ARGS__
|
||||
UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
|
||||
UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
|
||||
UNARY_( short, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
|
||||
UNARY_( float, float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
|
||||
UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
|
||||
UNARY_( short, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
|
||||
UNARY_( float, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
|
||||
UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
|
||||
UNARY_( float, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
|
||||
UNARY_( short, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
|
||||
UNARY_( float, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
|
||||
UNARY_( float, float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
|
||||
|
||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||
}
|
||||
|
||||
static void initScratch(const GMatDesc& /* in */,
|
||||
static void initScratch(const GMatDesc& in,
|
||||
int /* ddepth */,
|
||||
const Mat & kernX,
|
||||
const Mat & kernY,
|
||||
@ -607,7 +734,13 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true)
|
||||
int kxLen = kernX.rows * kernX.cols;
|
||||
int kyLen = kernY.rows * kernY.cols;
|
||||
|
||||
cv::gapi::own::Size bufsize(kxLen + kyLen, 1);
|
||||
int width = in.size.width;
|
||||
int chan = in.chan;
|
||||
|
||||
int buflen = kxLen + kyLen + // x, y kernels
|
||||
width * chan * Window; // work buffers
|
||||
|
||||
cv::gapi::own::Size bufsize(buflen, 1);
|
||||
GMatDesc bufdesc = {CV_32F, 1, bufsize};
|
||||
Buffer buffer(bufdesc);
|
||||
scratch = std::move(buffer);
|
||||
@ -664,29 +797,47 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
|
||||
auto *kx = scratch.OutLine<float>(); // cached kernX data
|
||||
auto *ky = kx + kxsize; // cached kernY data
|
||||
|
||||
int width = src.meta().size.width;
|
||||
int chan = src.meta().chan;
|
||||
int length = width * chan;
|
||||
|
||||
float *buf[3];
|
||||
buf[0] = ky + kysize;
|
||||
buf[1] = buf[0] + length;
|
||||
buf[2] = buf[1] + length;
|
||||
|
||||
auto anchor = cv::Point(-1, -1);
|
||||
float delta = 0.f;
|
||||
|
||||
float scale = 1;
|
||||
float delta = 0;
|
||||
|
||||
// DST SRC OP __VA_ARGS__
|
||||
UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
|
||||
UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
|
||||
UNARY_( short, short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
|
||||
UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
|
||||
UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
|
||||
UNARY_( short, short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
|
||||
UNARY_( float, float, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
|
||||
|
||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||
}
|
||||
|
||||
static void initScratch(const GMatDesc& /* in */,
|
||||
static void initScratch(const GMatDesc& in,
|
||||
const cv::Size & ksize,
|
||||
double sigmaX,
|
||||
double sigmaY,
|
||||
int /* borderType */,
|
||||
const cv::Scalar & /* borderValue */,
|
||||
int /* borderType */,
|
||||
const cv::Scalar & /* borderValue */,
|
||||
Buffer & scratch)
|
||||
{
|
||||
int kxsize = ksize.width;
|
||||
int kysize = ksize.height;
|
||||
|
||||
cv::gapi::own::Size bufsize(kxsize + kysize, 1);
|
||||
int width = in.size.width;
|
||||
int chan = in.chan;
|
||||
|
||||
int buflen = kxsize + kysize + // x, y kernels
|
||||
width * chan * Window; // work buffers
|
||||
|
||||
cv::gapi::own::Size bufsize(buflen, 1);
|
||||
GMatDesc bufdesc = {CV_32F, 1, bufsize};
|
||||
Buffer buffer(bufdesc);
|
||||
scratch = std::move(buffer);
|
||||
@ -767,7 +918,7 @@ static void run_sobel(Buffer& dst,
|
||||
int y0 = dst.priv().writeStart();
|
||||
// int y1 = dst.priv().writeEnd();
|
||||
|
||||
run_sobel_row(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
|
||||
run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
|
||||
}
|
||||
|
||||
GAPI_FLUID_KERNEL(GFluidSobel, cv::gapi::imgproc::GSobel, true)
|
||||
@ -1102,6 +1253,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
|
||||
UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
|
||||
UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
|
||||
UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
|
||||
UNARY_( float, float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
|
||||
|
||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||
}
|
||||
@ -1109,7 +1261,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
|
||||
static void initScratch(const GMatDesc& /* in */,
|
||||
const Mat & kernel,
|
||||
const Point & /* anchor */,
|
||||
int /* iterations */,
|
||||
int /* iterations */,
|
||||
int /* borderType */,
|
||||
const cv::Scalar & /* borderValue */,
|
||||
Buffer & scratch)
|
||||
@ -1179,6 +1331,7 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true)
|
||||
UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
|
||||
UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
|
||||
UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
|
||||
UNARY_( float, float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
|
||||
|
||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||
}
|
||||
@ -1290,6 +1443,7 @@ GAPI_FLUID_KERNEL(GFluidMedianBlur, cv::gapi::imgproc::GMedianBlur, false)
|
||||
UNARY_(uchar , uchar , run_medianblur, dst, src, ksize);
|
||||
UNARY_(ushort, ushort, run_medianblur, dst, src, ksize);
|
||||
UNARY_( short, short, run_medianblur, dst, src, ksize);
|
||||
UNARY_( float, float, run_medianblur, dst, src, ksize);
|
||||
|
||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||
}
|
||||
|
@ -57,34 +57,34 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
|
||||
CV_CPU_DISPATCH(run_yuv2rgb_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL);
|
||||
}
|
||||
|
||||
//---------------------
|
||||
//-------------------------
|
||||
//
|
||||
// Fluid kernels: Sobel
|
||||
// Fluid kernels: sepFilter
|
||||
//
|
||||
//---------------------
|
||||
//-------------------------
|
||||
|
||||
#define RUN_SOBEL_ROW(DST, SRC) \
|
||||
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
|
||||
const float kx[], const float ky[], int border, \
|
||||
float scale, float delta, float *buf[], \
|
||||
int y, int y0) \
|
||||
{ \
|
||||
CV_CPU_DISPATCH(run_sobel_row, \
|
||||
(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \
|
||||
CV_CPU_DISPATCH_MODES_ALL); \
|
||||
#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \
|
||||
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
|
||||
const float kx[], const float ky[], int border, \
|
||||
float scale, float delta, \
|
||||
float *buf[], int y, int y0) \
|
||||
{ \
|
||||
CV_CPU_DISPATCH(run_sepfilter3x3_impl, \
|
||||
(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \
|
||||
CV_CPU_DISPATCH_MODES_ALL); \
|
||||
}
|
||||
|
||||
RUN_SOBEL_ROW(uchar , uchar )
|
||||
RUN_SOBEL_ROW(ushort, ushort)
|
||||
RUN_SOBEL_ROW( short, uchar )
|
||||
RUN_SOBEL_ROW( short, ushort)
|
||||
RUN_SOBEL_ROW( short, short)
|
||||
RUN_SOBEL_ROW( float, uchar )
|
||||
RUN_SOBEL_ROW( float, ushort)
|
||||
RUN_SOBEL_ROW( float, short)
|
||||
RUN_SOBEL_ROW( float, float)
|
||||
RUN_SEPFILTER3X3_IMPL(uchar , uchar )
|
||||
RUN_SEPFILTER3X3_IMPL( short, uchar )
|
||||
RUN_SEPFILTER3X3_IMPL( float, uchar )
|
||||
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( short, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( float, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( short, short)
|
||||
RUN_SEPFILTER3X3_IMPL( float, short)
|
||||
RUN_SEPFILTER3X3_IMPL( float, float)
|
||||
|
||||
#undef RUN_SOBEL_ROW
|
||||
#undef RUN_SEPFILTER3X3_IMPL
|
||||
|
||||
} // namespace fluid
|
||||
} // namespace gapi
|
||||
|
@ -33,29 +33,29 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
|
||||
|
||||
void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
|
||||
|
||||
//---------------------
|
||||
//-------------------------
|
||||
//
|
||||
// Fluid kernels: Sobel
|
||||
// Fluid kernels: sepFilter
|
||||
//
|
||||
//---------------------
|
||||
//-------------------------
|
||||
|
||||
#define RUN_SOBEL_ROW(DST, SRC) \
|
||||
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
|
||||
const float kx[], const float ky[], int border, \
|
||||
float scale, float delta, float *buf[], \
|
||||
int y, int y0);
|
||||
#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \
|
||||
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
|
||||
const float kx[], const float ky[], int border, \
|
||||
float scale, float delta, \
|
||||
float *buf[], int y, int y0);
|
||||
|
||||
RUN_SOBEL_ROW(uchar , uchar )
|
||||
RUN_SOBEL_ROW(ushort, ushort)
|
||||
RUN_SOBEL_ROW( short, uchar )
|
||||
RUN_SOBEL_ROW( short, ushort)
|
||||
RUN_SOBEL_ROW( short, short)
|
||||
RUN_SOBEL_ROW( float, uchar )
|
||||
RUN_SOBEL_ROW( float, ushort)
|
||||
RUN_SOBEL_ROW( float, short)
|
||||
RUN_SOBEL_ROW( float, float)
|
||||
RUN_SEPFILTER3X3_IMPL(uchar , uchar )
|
||||
RUN_SEPFILTER3X3_IMPL( short, uchar )
|
||||
RUN_SEPFILTER3X3_IMPL( float, uchar )
|
||||
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( short, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( float, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( short, short)
|
||||
RUN_SEPFILTER3X3_IMPL( float, short)
|
||||
RUN_SEPFILTER3X3_IMPL( float, float)
|
||||
|
||||
#undef RUN_SOBEL_ROW
|
||||
#undef RUN_SEPFILTER3X3_IMPL
|
||||
|
||||
} // namespace fluid
|
||||
} // namespace gapi
|
||||
|
@ -9,6 +9,8 @@
|
||||
|
||||
#if !defined(GAPI_STANDALONE)
|
||||
|
||||
#include "gfluidimgproc_func.hpp"
|
||||
|
||||
#include "opencv2/gapi/own/saturate.hpp"
|
||||
|
||||
#include "opencv2/core.hpp"
|
||||
@ -16,6 +18,8 @@
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#ifdef __GNUC__
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wstrict-overflow"
|
||||
@ -48,34 +52,66 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
|
||||
|
||||
void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
|
||||
|
||||
//---------------------
|
||||
//-------------------------
|
||||
//
|
||||
// Fluid kernels: Sobel
|
||||
// Fluid kernels: sepFilter
|
||||
//
|
||||
//---------------------
|
||||
//-------------------------
|
||||
|
||||
#define RUN_SOBEL_ROW(DST, SRC) \
|
||||
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
|
||||
const float kx[], const float ky[], int border, \
|
||||
float scale, float delta, float *buf[], \
|
||||
int y, int y0);
|
||||
#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \
|
||||
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
|
||||
const float kx[], const float ky[], int border, \
|
||||
float scale, float delta, \
|
||||
float *buf[], int y, int y0);
|
||||
|
||||
RUN_SOBEL_ROW(uchar , uchar )
|
||||
RUN_SOBEL_ROW(ushort, ushort)
|
||||
RUN_SOBEL_ROW( short, uchar )
|
||||
RUN_SOBEL_ROW( short, ushort)
|
||||
RUN_SOBEL_ROW( short, short)
|
||||
RUN_SOBEL_ROW( float, uchar )
|
||||
RUN_SOBEL_ROW( float, ushort)
|
||||
RUN_SOBEL_ROW( float, short)
|
||||
RUN_SOBEL_ROW( float, float)
|
||||
RUN_SEPFILTER3X3_IMPL(uchar , uchar )
|
||||
RUN_SEPFILTER3X3_IMPL( short, uchar )
|
||||
RUN_SEPFILTER3X3_IMPL( float, uchar )
|
||||
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( short, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( float, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( short, short)
|
||||
RUN_SEPFILTER3X3_IMPL( float, short)
|
||||
RUN_SEPFILTER3X3_IMPL( float, float)
|
||||
|
||||
#undef RUN_SOBEL_ROW
|
||||
#undef RUN_SEPFILTER3X3_IMPL
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||
|
||||
#if CV_SIMD
|
||||
template<typename SRC>
|
||||
static inline v_float32 vx_load_f32(const SRC* ptr)
|
||||
{
|
||||
if (std::is_same<SRC,uchar>::value)
|
||||
{
|
||||
v_uint32 tmp = vx_load_expand_q(reinterpret_cast<const uchar*>(ptr));
|
||||
return v_cvt_f32(v_reinterpret_as_s32(tmp));
|
||||
}
|
||||
|
||||
if (std::is_same<SRC,ushort>::value)
|
||||
{
|
||||
v_uint32 tmp = vx_load_expand(reinterpret_cast<const ushort*>(ptr));
|
||||
return v_cvt_f32(v_reinterpret_as_s32(tmp));
|
||||
}
|
||||
|
||||
if (std::is_same<SRC,short>::value)
|
||||
{
|
||||
v_int32 tmp = vx_load_expand(reinterpret_cast<const short*>(ptr));
|
||||
return v_cvt_f32(tmp);
|
||||
}
|
||||
|
||||
if (std::is_same<SRC,float>::value)
|
||||
{
|
||||
v_float32 tmp = vx_load(reinterpret_cast<const float*>(ptr));
|
||||
return tmp;
|
||||
}
|
||||
|
||||
CV_Error(cv::Error::StsBadArg, "unsupported type");
|
||||
}
|
||||
#endif // CV_SIMD
|
||||
|
||||
//----------------------------------
|
||||
//
|
||||
// Fluid kernels: RGB2Gray, BGR2Gray
|
||||
@ -309,187 +345,359 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------
|
||||
//-------------------------
|
||||
//
|
||||
// Fluid kernels: Sobel
|
||||
// Fluid kernels: sepFilter
|
||||
//
|
||||
//---------------------
|
||||
|
||||
// Sobel 3x3: vertical pass
|
||||
template<bool noscale, typename DST>
|
||||
static void run_sobel3x3_vert(DST out[], int length, const float ky[],
|
||||
float scale, float delta, const int r[], float *buf[])
|
||||
{
|
||||
float ky0 = ky[0],
|
||||
ky1 = ky[1],
|
||||
ky2 = ky[2];
|
||||
|
||||
int r0 = r[0],
|
||||
r1 = r[1],
|
||||
r2 = r[2];
|
||||
//-------------------------
|
||||
|
||||
#if CV_SIMD
|
||||
// for floating-point output,
|
||||
// manual vectoring may be not better than compiler's optimization
|
||||
#define EXPLICIT_SIMD_32F 0 // 1=vectorize 32f case explicitly, 0=don't
|
||||
#if EXPLICIT_SIMD_32F
|
||||
if (std::is_same<DST, float>::value && length >= v_int16::nlanes)
|
||||
// this variant not using buf[] appears 15% faster than reference any-2-float code below
|
||||
template<bool noscale, typename SRC>
|
||||
static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan,
|
||||
const float kx[], const float ky[], int border,
|
||||
float scale, float delta)
|
||||
{
|
||||
const int length = width * chan;
|
||||
const int shift = border * chan;
|
||||
|
||||
const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
|
||||
const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
|
||||
|
||||
for (int l=0; l < length; )
|
||||
{
|
||||
constexpr static int nlanes = v_float32::nlanes;
|
||||
static const int nlanes = v_float32::nlanes;
|
||||
|
||||
for (int l=0; l < length; )
|
||||
// main part
|
||||
for ( ; l <= length - nlanes; l += nlanes)
|
||||
{
|
||||
for (; l <= length - nlanes; l += nlanes)
|
||||
auto xsum = [l, shift, kx0, kx1, kx2](const SRC i[])
|
||||
{
|
||||
v_float32 sum = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
|
||||
sum = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum);
|
||||
sum = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum);
|
||||
v_float32 t0 = vx_load_f32(&i[l - shift]);
|
||||
v_float32 t1 = vx_load_f32(&i[l ]);
|
||||
v_float32 t2 = vx_load_f32(&i[l + shift]);
|
||||
v_float32 t = t0 * vx_setall_f32(kx0);
|
||||
t = v_fma(t1, vx_setall_f32(kx1), t);
|
||||
t = v_fma(t2, vx_setall_f32(kx2), t);
|
||||
return t;
|
||||
};
|
||||
|
||||
if (!noscale)
|
||||
{
|
||||
sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
}
|
||||
v_float32 s0 = xsum(in[0]);
|
||||
v_float32 s1 = xsum(in[1]);
|
||||
v_float32 s2 = xsum(in[2]);
|
||||
v_float32 s = s0 * vx_setall_f32(ky0);
|
||||
s = v_fma(s1, vx_setall_f32(ky1), s);
|
||||
s = v_fma(s2, vx_setall_f32(ky2), s);
|
||||
|
||||
v_store(reinterpret_cast<float*>(&out[l]), sum);
|
||||
if (!noscale)
|
||||
{
|
||||
s = v_fma(s, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
}
|
||||
|
||||
if (l < length)
|
||||
{
|
||||
// tail: recalculate last pixels
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
l = length - nlanes;
|
||||
}
|
||||
v_store(&out[l], s);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if ((std::is_same<DST, short>::value || std::is_same<DST, ushort>::value)
|
||||
&& length >= v_int16::nlanes)
|
||||
{
|
||||
constexpr static int nlanes = v_int16::nlanes;
|
||||
|
||||
for (int l=0; l < length; )
|
||||
// tail (if any)
|
||||
if (l < length)
|
||||
{
|
||||
for (; l <= length - nlanes; l += nlanes)
|
||||
{
|
||||
v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
|
||||
sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
|
||||
sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
|
||||
|
||||
v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
|
||||
sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1);
|
||||
sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1);
|
||||
|
||||
if (!noscale)
|
||||
{
|
||||
sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
}
|
||||
|
||||
v_int32 isum0 = v_round(sum0),
|
||||
isum1 = v_round(sum1);
|
||||
|
||||
if (std::is_same<DST, short>::value)
|
||||
{
|
||||
// signed short
|
||||
v_int16 res = v_pack(isum0, isum1);
|
||||
v_store(reinterpret_cast<short*>(&out[l]), res);
|
||||
} else
|
||||
{
|
||||
// unsigned short
|
||||
v_uint16 res = v_pack_u(isum0, isum1);
|
||||
v_store(reinterpret_cast<ushort*>(&out[l]), res);
|
||||
}
|
||||
}
|
||||
|
||||
if (l < length)
|
||||
{
|
||||
// tail: recalculate last pixels
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
l = length - nlanes;
|
||||
}
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
l = length - nlanes;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
|
||||
{
|
||||
constexpr static int nlanes = v_uint8::nlanes;
|
||||
|
||||
for (int l=0; l < length; )
|
||||
{
|
||||
for (; l <= length - nlanes; l += nlanes)
|
||||
{
|
||||
v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
|
||||
sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
|
||||
sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
|
||||
|
||||
v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0);
|
||||
sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1);
|
||||
sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1);
|
||||
|
||||
v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
|
||||
sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2);
|
||||
sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2);
|
||||
|
||||
v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
|
||||
sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3);
|
||||
sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3);
|
||||
|
||||
if (!noscale)
|
||||
{
|
||||
sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
}
|
||||
|
||||
v_int32 isum0 = v_round(sum0),
|
||||
isum1 = v_round(sum1),
|
||||
isum2 = v_round(sum2),
|
||||
isum3 = v_round(sum3);
|
||||
|
||||
v_int16 ires0 = v_pack(isum0, isum1),
|
||||
ires1 = v_pack(isum2, isum3);
|
||||
|
||||
v_uint8 res = v_pack_u(ires0, ires1);
|
||||
v_store(reinterpret_cast<uchar*>(&out[l]), res);
|
||||
}
|
||||
|
||||
if (l < length)
|
||||
{
|
||||
// tail: recalculate last pixels
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
l = length - nlanes;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
// reference code
|
||||
for (int l=0; l < length; l++)
|
||||
{
|
||||
float sum = buf[r0][l]*ky0 + buf[r1][l]*ky1 + buf[r2][l]*ky2;
|
||||
|
||||
if (!noscale)
|
||||
{
|
||||
sum = sum*scale + delta;
|
||||
}
|
||||
|
||||
out[l] = cv::gapi::own::saturate<DST>(sum, rintf);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename DST, typename SRC>
|
||||
static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
|
||||
const float kx[], const float ky[], int border,
|
||||
float scale, float delta, float *buf[],
|
||||
int y, int y0)
|
||||
// this variant with manually vectored rounding to short/ushort appears 10-40x faster
|
||||
// than reference code below
|
||||
template<bool noscale, typename DST, typename SRC>
|
||||
static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, int chan,
|
||||
const float kx[], const float ky[], int border,
|
||||
float scale, float delta,
|
||||
float *buf[], int y, int y0)
|
||||
{
|
||||
int r[3];
|
||||
r[0] = (y - y0 ) % 3; // buf[r[0]]: previous
|
||||
r[1] = (y - y0 + 1) % 3; // this
|
||||
r[2] = (y - y0 + 2) % 3; // next row
|
||||
|
||||
const int length = width * chan;
|
||||
const int shift = border * chan;
|
||||
|
||||
const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
|
||||
const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
|
||||
|
||||
// horizontal pass
|
||||
|
||||
int k0 = (y == y0)? 0: 2;
|
||||
|
||||
for (int k = k0; k < 3; k++)
|
||||
{
|
||||
// previous , this , next pixel
|
||||
const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
|
||||
|
||||
// rely on compiler vectoring
|
||||
for (int l=0; l < length; l++)
|
||||
{
|
||||
buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
|
||||
}
|
||||
}
|
||||
|
||||
// vertical pass
|
||||
|
||||
const int r0=r[0], r1=r[1], r2=r[2];
|
||||
|
||||
for (int l=0; l < length;)
|
||||
{
|
||||
constexpr int nlanes = v_int16::nlanes;
|
||||
|
||||
// main part of row
|
||||
for (; l <= length - nlanes; l += nlanes)
|
||||
{
|
||||
v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
|
||||
sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
|
||||
sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
|
||||
|
||||
v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
|
||||
sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1);
|
||||
sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1);
|
||||
|
||||
if (!noscale)
|
||||
{
|
||||
sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
}
|
||||
|
||||
v_int32 isum0 = v_round(sum0),
|
||||
isum1 = v_round(sum1);
|
||||
|
||||
if (std::is_same<DST, short>::value)
|
||||
{
|
||||
// signed short
|
||||
v_int16 res = v_pack(isum0, isum1);
|
||||
v_store(reinterpret_cast<short*>(&out[l]), res);
|
||||
} else
|
||||
{
|
||||
// unsigned short
|
||||
v_uint16 res = v_pack_u(isum0, isum1);
|
||||
v_store(reinterpret_cast<ushort*>(&out[l]), res);
|
||||
}
|
||||
}
|
||||
|
||||
// tail (if any)
|
||||
if (l < length)
|
||||
{
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
l = length - nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// this code with manually vectored rounding to uchar is 10-40x faster than reference
|
||||
template<bool noscale, typename SRC>
|
||||
static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
|
||||
const float kx[], const float ky[], int border,
|
||||
float scale, float delta,
|
||||
float *buf[], int y, int y0)
|
||||
{
|
||||
int r[3];
|
||||
r[0] = (y - y0 ) % 3; // buf[r[0]]: previous
|
||||
r[1] = (y - y0 + 1) % 3; // this
|
||||
r[2] = (y - y0 + 2) % 3; // next row
|
||||
|
||||
const int length = width * chan;
|
||||
const int shift = border * chan;
|
||||
|
||||
const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
|
||||
const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
|
||||
|
||||
// horizontal pass
|
||||
|
||||
int k0 = (y == y0)? 0: 2;
|
||||
|
||||
for (int k = k0; k < 3; k++)
|
||||
{
|
||||
// previous , this , next pixel
|
||||
const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
|
||||
|
||||
// rely on compiler vectoring
|
||||
for (int l=0; l < length; l++)
|
||||
{
|
||||
buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
|
||||
}
|
||||
}
|
||||
|
||||
// vertical pass
|
||||
|
||||
const int r0=r[0], r1=r[1], r2=r[2];
|
||||
|
||||
for (int l=0; l < length;)
|
||||
{
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
|
||||
// main part of row
|
||||
for (; l <= length - nlanes; l += nlanes)
|
||||
{
|
||||
v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
|
||||
sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
|
||||
sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
|
||||
|
||||
v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0);
|
||||
sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1);
|
||||
sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1);
|
||||
|
||||
v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
|
||||
sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2);
|
||||
sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2);
|
||||
|
||||
v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
|
||||
sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3);
|
||||
sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3);
|
||||
|
||||
if (!noscale)
|
||||
{
|
||||
sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
|
||||
}
|
||||
|
||||
v_int32 isum0 = v_round(sum0),
|
||||
isum1 = v_round(sum1),
|
||||
isum2 = v_round(sum2),
|
||||
isum3 = v_round(sum3);
|
||||
|
||||
v_int16 ires0 = v_pack(isum0, isum1),
|
||||
ires1 = v_pack(isum2, isum3);
|
||||
|
||||
v_uint8 res = v_pack_u(ires0, ires1);
|
||||
v_store(reinterpret_cast<uchar*>(&out[l]), res);
|
||||
}
|
||||
|
||||
// tail (if any)
|
||||
if (l < length)
|
||||
{
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
l = length - nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// this code, manually vectorized for int16, is not much faster than the generic any-to-short code above
|
||||
#define USE_SEPFILTER3X3_CHAR2SHORT 1
|
||||
|
||||
#if USE_SEPFILTER3X3_CHAR2SHORT
|
||||
template<bool noscale>
|
||||
static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width, int chan,
|
||||
const float kx[], const float ky[], int border,
|
||||
float scale, float delta,
|
||||
float *buf[], int y, int y0)
|
||||
{
|
||||
const schar ikx0 = saturate<schar>(kx[0], rintf);
|
||||
const schar ikx1 = saturate<schar>(kx[1], rintf);
|
||||
const schar ikx2 = saturate<schar>(kx[2], rintf);
|
||||
|
||||
const schar iky0 = saturate<schar>(ky[0], rintf);
|
||||
const schar iky1 = saturate<schar>(ky[1], rintf);
|
||||
const schar iky2 = saturate<schar>(ky[2], rintf);
|
||||
|
||||
const short iscale = saturate<short>(scale * (1 << 15), rintf);
|
||||
const short idelta = saturate<short>(delta , rintf);
|
||||
|
||||
// check if this code is applicable
|
||||
if (ikx0 != kx[0] || ikx1 != kx[1] || ikx2 != kx[2] ||
|
||||
iky0 != ky[0] || iky1 != ky[1] || iky2 != ky[2] ||
|
||||
idelta != delta ||
|
||||
std::abs(scale) > 1 || std::abs(scale) < 0.01)
|
||||
{
|
||||
run_sepfilter3x3_any2short<noscale>(out, in, width, chan, kx, ky, border, scale, delta,
|
||||
buf, y, y0);
|
||||
return;
|
||||
}
|
||||
|
||||
short *ibuf[3];
|
||||
ibuf[0] = reinterpret_cast<short*>(buf[0]);
|
||||
ibuf[1] = reinterpret_cast<short*>(buf[1]);
|
||||
ibuf[2] = reinterpret_cast<short*>(buf[2]);
|
||||
|
||||
int r[3];
|
||||
r[0] = (y - y0 ) % 3; // buf[r[0]]: previous
|
||||
r[1] = (y - y0 + 1) % 3; // this
|
||||
r[2] = (y - y0 + 2) % 3; // next row
|
||||
|
||||
const int length = width * chan;
|
||||
const int shift = border * chan;
|
||||
|
||||
// horizontal pass
|
||||
|
||||
int k0 = (y == y0)? 0: 2;
|
||||
|
||||
for (int k = k0; k < 3; k++)
|
||||
{
|
||||
for (int l=0; l < length;)
|
||||
{
|
||||
constexpr int nlanes = v_int16::nlanes;
|
||||
|
||||
// main part of output row
|
||||
for (; l <= length - nlanes; l += nlanes)
|
||||
{
|
||||
v_uint16 t0 = vx_load_expand(&in[k][l - shift]); // previous
|
||||
v_uint16 t1 = vx_load_expand(&in[k][l ]); // current
|
||||
v_uint16 t2 = vx_load_expand(&in[k][l + shift]); // next pixel
|
||||
v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) +
|
||||
v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) +
|
||||
v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2);
|
||||
v_store(&ibuf[r[k]][l], t);
|
||||
}
|
||||
|
||||
// tail (if any)
|
||||
if (l < length)
|
||||
{
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
l = length - nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// vertical pass
|
||||
|
||||
for (int l=0; l < length;)
|
||||
{
|
||||
constexpr int nlanes = v_int16::nlanes;
|
||||
|
||||
// main part of output row
|
||||
for (; l <= length - nlanes; l += nlanes)
|
||||
{
|
||||
v_int16 s0 = vx_load(&ibuf[r[0]][l]); // previous
|
||||
v_int16 s1 = vx_load(&ibuf[r[1]][l]); // current
|
||||
v_int16 s2 = vx_load(&ibuf[r[2]][l]); // next row
|
||||
v_int16 s = s0 * vx_setall_s16(iky0) +
|
||||
s1 * vx_setall_s16(iky1) +
|
||||
s2 * vx_setall_s16(iky2);
|
||||
|
||||
if (!noscale)
|
||||
{
|
||||
s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
|
||||
}
|
||||
|
||||
v_store(&out[l], s);
|
||||
}
|
||||
|
||||
// tail (if any)
|
||||
if (l < length)
|
||||
{
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
l = length - nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // CV_SIMD
|
||||
|
||||
template<bool noscale, typename DST, typename SRC>
|
||||
static void run_sepfilter3x3_reference(DST out[], const SRC *in[], int width, int chan,
|
||||
const float kx[], const float ky[], int border,
|
||||
float scale, float delta,
|
||||
float *buf[], int y, int y0)
|
||||
{
|
||||
int r[3];
|
||||
r[0] = (y - y0) % 3; // buf[r[0]]: previous
|
||||
@ -497,19 +705,21 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
|
||||
r[2] = (y - y0 + 2) % 3; // next row
|
||||
|
||||
int length = width * chan;
|
||||
int shift = border * chan;
|
||||
|
||||
// horizontal pass
|
||||
|
||||
// full horizontal pass is needed only if very 1st row in ROI;
|
||||
// for 2nd and further rows, it is enough to convolve only the
|
||||
// "next" row - as we can reuse buffers from previous calls to
|
||||
// this kernel (note that Fluid processes rows consequently)
|
||||
// this kernel (Fluid does rows consecutively: y=y0, y0+1, ...)
|
||||
|
||||
int k0 = (y == y0)? 0: 2;
|
||||
|
||||
for (int k = k0; k < 3; k++)
|
||||
{
|
||||
// previous, this , next pixel
|
||||
const SRC *s[3] = {in[k] - border*chan , in[k], in[k] + border*chan};
|
||||
// previous , this , next pixel
|
||||
const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
|
||||
|
||||
// rely on compiler vectoring
|
||||
for (int l=0; l < length; l++)
|
||||
@ -519,37 +729,121 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
|
||||
}
|
||||
|
||||
// vertical pass
|
||||
if (scale == 1 && delta == 0)
|
||||
|
||||
for (int l=0; l < length; l++)
|
||||
{
|
||||
constexpr static bool noscale = true; // omit scaling
|
||||
run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
|
||||
} else
|
||||
{
|
||||
constexpr static bool noscale = false; // do scaling
|
||||
run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
|
||||
float sum = buf[r[0]][l]*ky[0] + buf[r[1]][l]*ky[1] + buf[r[2]][l]*ky[2];
|
||||
|
||||
if (!noscale)
|
||||
{
|
||||
sum = sum*scale + delta;
|
||||
}
|
||||
|
||||
out[l] = saturate<DST>(sum, rintf);
|
||||
}
|
||||
}
|
||||
|
||||
#define RUN_SOBEL_ROW(DST, SRC) \
|
||||
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
|
||||
const float kx[], const float ky[], int border, \
|
||||
float scale, float delta, float *buf[], \
|
||||
int y, int y0) \
|
||||
{ \
|
||||
run_sobel_impl(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0); \
|
||||
template<bool noscale, typename DST, typename SRC>
|
||||
static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int chan,
|
||||
const float kx[], const float ky[], int border,
|
||||
float scale, float delta,
|
||||
float *buf[], int y, int y0)
|
||||
{
|
||||
#if CV_SIMD
|
||||
int length = width * chan;
|
||||
|
||||
// length variable may be unused if types do not match at 'if' statements below
|
||||
(void) length;
|
||||
|
||||
#if USE_SEPFILTER3X3_CHAR2SHORT
|
||||
if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
|
||||
length >= v_int16::nlanes)
|
||||
{
|
||||
// only slightly faster than more generic any-to-short (see below)
|
||||
run_sepfilter3x3_char2short<noscale>(reinterpret_cast<short*>(out),
|
||||
reinterpret_cast<const uchar**>(in),
|
||||
width, chan, kx, ky, border, scale, delta,
|
||||
buf, y, y0);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
|
||||
length >= v_float32::nlanes)
|
||||
{
|
||||
// appears 15% faster than reference any-to-float code (called below)
|
||||
run_sepfilter3x3_any2float<noscale>(reinterpret_cast<float*>(out), in,
|
||||
width, chan, kx, ky, border, scale, delta);
|
||||
return;
|
||||
}
|
||||
|
||||
if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
|
||||
{
|
||||
// appears 10-40x faster than reference due to much faster rounding
|
||||
run_sepfilter3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
|
||||
width, chan, kx, ky, border, scale, delta,
|
||||
buf, y, y0);
|
||||
return;
|
||||
}
|
||||
|
||||
if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
|
||||
{
|
||||
// appears 10-40x faster than reference due to much faster rounding
|
||||
run_sepfilter3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
|
||||
width, chan, kx, ky, border, scale, delta,
|
||||
buf, y, y0);
|
||||
return;
|
||||
}
|
||||
|
||||
if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
|
||||
{
|
||||
// appears 10-40x faster than reference due to much faster rounding
|
||||
run_sepfilter3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
|
||||
width, chan, kx, ky, border, scale, delta,
|
||||
buf, y, y0);
|
||||
return;
|
||||
}
|
||||
#endif // CV_SIMD
|
||||
|
||||
// reference code is quite fast for any-to-float case,
|
||||
// but not for any-to-integral due to very slow rounding
|
||||
run_sepfilter3x3_reference<noscale>(out, in, width, chan, kx, ky, border,
|
||||
scale, delta, buf, y, y0);
|
||||
}
|
||||
|
||||
RUN_SOBEL_ROW(uchar , uchar )
|
||||
RUN_SOBEL_ROW(ushort, ushort)
|
||||
RUN_SOBEL_ROW( short, uchar )
|
||||
RUN_SOBEL_ROW( short, ushort)
|
||||
RUN_SOBEL_ROW( short, short)
|
||||
RUN_SOBEL_ROW( float, uchar )
|
||||
RUN_SOBEL_ROW( float, ushort)
|
||||
RUN_SOBEL_ROW( float, short)
|
||||
RUN_SOBEL_ROW( float, float)
|
||||
// Generates the run_sepfilter3x3_impl() overload for one (DST, SRC) pair.
// The run-time test for identity scaling (scale == 1 and delta == 0) is turned
// into the compile-time `noscale` argument of run_sepfilter3x3_code().
#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                            \
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan,        \
                           const float kx[], const float ky[], int border,         \
                           float scale, float delta,                               \
                           float *buf[], int y, int y0)                            \
{                                                                                  \
    const bool rescale = !(scale == 1 && delta == 0);                              \
    if (rescale)                                                                   \
    {                                                                              \
        run_sepfilter3x3_code<false>(out, in, width, chan, kx, ky, border,         \
                                     scale, delta, buf, y, y0);                    \
        return;                                                                    \
    }                                                                              \
    run_sepfilter3x3_code<true>(out, in, width, chan, kx, ky, border,              \
                                scale, delta, buf, y, y0);                         \
}
|
||||
|
||||
#undef RUN_SOBEL_ROW
|
||||
RUN_SEPFILTER3X3_IMPL(uchar , uchar )
|
||||
RUN_SEPFILTER3X3_IMPL( short, uchar )
|
||||
RUN_SEPFILTER3X3_IMPL( float, uchar )
|
||||
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( short, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( float, ushort)
|
||||
RUN_SEPFILTER3X3_IMPL( short, short)
|
||||
RUN_SEPFILTER3X3_IMPL( float, short)
|
||||
RUN_SEPFILTER3X3_IMPL( float, float)
|
||||
|
||||
#undef RUN_SEPFILTER3X3_IMPL
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user