diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 9c680bf116..e33269b4b8 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -3559,6 +3559,10 @@ CV_EXPORTS MatExpr operator + (const Mat& m, const MatExpr& e);
 CV_EXPORTS MatExpr operator + (const MatExpr& e, const Scalar& s);
 CV_EXPORTS MatExpr operator + (const Scalar& s, const MatExpr& e);
 CV_EXPORTS MatExpr operator + (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator + (const Mat& a, const Matx<_Tp, m, n>& b) { return a + Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator + (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) + b; }
 
 CV_EXPORTS MatExpr operator - (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator - (const Mat& a, const Scalar& s);
@@ -3568,6 +3572,10 @@ CV_EXPORTS MatExpr operator - (const Mat& m, const MatExpr& e);
 CV_EXPORTS MatExpr operator - (const MatExpr& e, const Scalar& s);
 CV_EXPORTS MatExpr operator - (const Scalar& s, const MatExpr& e);
 CV_EXPORTS MatExpr operator - (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator - (const Mat& a, const Matx<_Tp, m, n>& b) { return a - Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator - (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) - b; }
 
 CV_EXPORTS MatExpr operator - (const Mat& m);
 CV_EXPORTS MatExpr operator - (const MatExpr& e);
@@ -3580,6 +3588,10 @@ CV_EXPORTS MatExpr operator * (const Mat& m, const MatExpr& e);
 CV_EXPORTS MatExpr operator * (const MatExpr& e, double s);
 CV_EXPORTS MatExpr operator * (double s, const MatExpr& e);
 CV_EXPORTS MatExpr operator * (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator * (const Mat& a, const Matx<_Tp, m, n>& b) { return a * Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator * (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) * b; }
 
 CV_EXPORTS MatExpr operator / (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator / (const Mat& a, double s);
@@ -3589,52 +3601,100 @@ CV_EXPORTS MatExpr operator / (const Mat& m, const MatExpr& e);
 CV_EXPORTS MatExpr operator / (const MatExpr& e, double s);
 CV_EXPORTS MatExpr operator / (double s, const MatExpr& e);
 CV_EXPORTS MatExpr operator / (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator / (const Mat& a, const Matx<_Tp, m, n>& b) { return a / Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator / (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) / b; }
 
 CV_EXPORTS MatExpr operator < (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator < (const Mat& a, double s);
 CV_EXPORTS MatExpr operator < (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator < (const Mat& a, const Matx<_Tp, m, n>& b) { return a < Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator < (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) < b; }
 
 CV_EXPORTS MatExpr operator <= (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator <= (const Mat& a, double s);
 CV_EXPORTS MatExpr operator <= (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator <= (const Mat& a, const Matx<_Tp, m, n>& b) { return a <= Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator <= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) <= b; }
 
 CV_EXPORTS MatExpr operator == (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator == (const Mat& a, double s);
 CV_EXPORTS MatExpr operator == (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator == (const Mat& a, const Matx<_Tp, m, n>& b) { return a == Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator == (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) == b; }
 
 CV_EXPORTS MatExpr operator != (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator != (const Mat& a, double s);
 CV_EXPORTS MatExpr operator != (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator != (const Mat& a, const Matx<_Tp, m, n>& b) { return a != Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator != (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) != b; }
 
 CV_EXPORTS MatExpr operator >= (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator >= (const Mat& a, double s);
 CV_EXPORTS MatExpr operator >= (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator >= (const Mat& a, const Matx<_Tp, m, n>& b) { return a >= Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator >= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) >= b; }
 
 CV_EXPORTS MatExpr operator > (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator > (const Mat& a, double s);
 CV_EXPORTS MatExpr operator > (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator > (const Mat& a, const Matx<_Tp, m, n>& b) { return a > Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator > (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) > b; }
 
 CV_EXPORTS MatExpr operator & (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator & (const Mat& a, const Scalar& s);
 CV_EXPORTS MatExpr operator & (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator & (const Mat& a, const Matx<_Tp, m, n>& b) { return a & Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator & (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) & b; }
 
 CV_EXPORTS MatExpr operator | (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator | (const Mat& a, const Scalar& s);
 CV_EXPORTS MatExpr operator | (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator | (const Mat& a, const Matx<_Tp, m, n>& b) { return a | Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator | (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) | b; }
 
 CV_EXPORTS MatExpr operator ^ (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator ^ (const Mat& a, const Scalar& s);
 CV_EXPORTS MatExpr operator ^ (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator ^ (const Mat& a, const Matx<_Tp, m, n>& b) { return a ^ Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator ^ (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) ^ b; }
 
 CV_EXPORTS MatExpr operator ~(const Mat& m);
 
 CV_EXPORTS MatExpr min(const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr min(const Mat& a, double s);
 CV_EXPORTS MatExpr min(double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr min (const Mat& a, const Matx<_Tp, m, n>& b) { return min(a, Mat(b)); }
+template<typename _Tp, int m, int n> static inline
+MatExpr min (const Matx<_Tp, m, n>& a, const Mat& b) { return min(Mat(a), b); }
 
 CV_EXPORTS MatExpr max(const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr max(const Mat& a, double s);
 CV_EXPORTS MatExpr max(double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr max (const Mat& a, const Matx<_Tp, m, n>& b) { return max(a, Mat(b)); }
+template<typename _Tp, int m, int n> static inline
+MatExpr max (const Matx<_Tp, m, n>& a, const Mat& b) { return max(Mat(a), b); }
 
 /** @brief Calculates an absolute value of each matrix element.
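For context, a minimal usage sketch of what the new mixed Mat/Matx overloads above enable (illustrative only, not part of the patch; it also relies on the augmented-assignment overloads added to operations.hpp in the next file of this diff):

    #include <opencv2/core.hpp>
    using namespace cv;

    int main()
    {
        Mat A = Mat::zeros(3, 1, CV_64FC1);
        Matx31d b(1, 2, 3);   // Matx<double, 3, 1>
        Vec3d c(3, 4, 5);     // Vec derives from Matx, so it binds to the same overloads

        Mat sum  = A + b;     // element-wise: the Matx operand is wrapped as Mat(b)
        Mat mask = (A < b);   // per-element comparison, produces an 8-bit mask
        A += c;               // in-place form, from the CV_MAT_AUG_OPERATOR_TN additions below
        return 0;
    }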
diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp
index a352048a49..0e0db4072f 100644
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -258,48 +258,67 @@ Matx<_Tp, n, l> Matx<_Tp, m, n>::solve(const Matx<_Tp, m, l>& rhs, int method) c
     template<typename _Tp> CV_MAT_AUG_OPERATOR1(op, cvop, A, B) \
     template<typename _Tp> CV_MAT_AUG_OPERATOR1(op, cvop, const A, B)
 
+#define CV_MAT_AUG_OPERATOR_TN(op, cvop, A) \
+    template<typename _Tp, int m, int n> static inline A& operator op (A& a, const Matx<_Tp,m,n>& b) { cvop; return a; } \
+    template<typename _Tp, int m, int n> static inline const A& operator op (const A& a, const Matx<_Tp,m,n>& b) { cvop; return a; }
+
 CV_MAT_AUG_OPERATOR (+=, cv::add(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (+=, cv::add(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(+=, cv::add(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(+=, cv::add(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(+=, cv::add(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(+=, cv::add(a,Mat(b),a), Mat)
+CV_MAT_AUG_OPERATOR_TN(+=, cv::add(a,Mat(b),a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (-=, cv::subtract(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (-=, cv::subtract(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(-=, cv::subtract(a,Mat(b),a), Mat)
+CV_MAT_AUG_OPERATOR_TN(-=, cv::subtract(a,Mat(b),a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat, Mat)
 CV_MAT_AUG_OPERATOR_T(*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat_<_Tp>, Mat_<_Tp>)
 CV_MAT_AUG_OPERATOR (*=, a.convertTo(a, -1, b), Mat, double)
 CV_MAT_AUG_OPERATOR_T(*=, a.convertTo(a, -1, b), Mat_<_Tp>, double)
+CV_MAT_AUG_OPERATOR_TN(*=, cv::gemm(a, Mat(b), 1, Mat(), 0, a, 0), Mat)
+CV_MAT_AUG_OPERATOR_TN(*=, cv::gemm(a, Mat(b), 1, Mat(), 0, a, 0), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (/=, cv::divide(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR_T(/=, cv::divide(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(/=, cv::divide(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
 CV_MAT_AUG_OPERATOR (/=, a.convertTo((Mat&)a, -1, 1./b), Mat, double)
 CV_MAT_AUG_OPERATOR_T(/=, a.convertTo((Mat&)a, -1, 1./b), Mat_<_Tp>, double)
+CV_MAT_AUG_OPERATOR_TN(/=, cv::divide(a, Mat(b), a), Mat)
+CV_MAT_AUG_OPERATOR_TN(/=, cv::divide(a, Mat(b), a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (&=, cv::bitwise_and(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (&=, cv::bitwise_and(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(&=, cv::bitwise_and(a, Mat(b), a), Mat)
+CV_MAT_AUG_OPERATOR_TN(&=, cv::bitwise_and(a, Mat(b), a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (|=, cv::bitwise_or(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (|=, cv::bitwise_or(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(|=, cv::bitwise_or(a, Mat(b), a), Mat)
+CV_MAT_AUG_OPERATOR_TN(|=, cv::bitwise_or(a, Mat(b), a), Mat_<_Tp>)
 
 CV_MAT_AUG_OPERATOR (^=, cv::bitwise_xor(a,b,a), Mat, Mat)
 CV_MAT_AUG_OPERATOR (^=, cv::bitwise_xor(a,b,a), Mat, Scalar)
 CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a,b,a), Mat_<_Tp>, Mat)
 CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a,b,a), Mat_<_Tp>, Scalar)
 CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a,b,a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(^=, cv::bitwise_xor(a, Mat(b), a), Mat)
+CV_MAT_AUG_OPERATOR_TN(^=, cv::bitwise_xor(a, Mat(b), a), Mat_<_Tp>)
 
+#undef CV_MAT_AUG_OPERATOR_TN
 #undef CV_MAT_AUG_OPERATOR_T
 #undef CV_MAT_AUG_OPERATOR
 #undef CV_MAT_AUG_OPERATOR1
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index e0a2c99991..aea6f229ac 100644
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@@ -69,6 +69,7 @@ protected:
     bool TestVec();
     bool TestMatxMultiplication();
     bool TestMatxElementwiseDivison();
+    bool TestMatMatxCastSum();
     bool TestSubMatAccess();
     bool TestExp();
     bool TestSVD();
@@ -885,6 +886,74 @@ bool CV_OperationsTest::TestMatxMultiplication()
     return true;
 }
 
+bool CV_OperationsTest::TestMatMatxCastSum()
+{
+    try
+    {
+        Mat ref1 = (Mat_<double>(3, 1) << 1, 2, 3);
+        Mat ref2 = (Mat_<double>(3, 1) << 3, 4, 5);
+        Mat ref3 = Mat::ones(3, 1, CV_64FC1);
+
+        Mat mat = Mat::zeros(3, 1, CV_64FC1);
+
+        Mat tst1 = ref1.clone();
+        Mat_<double> tst2 = ref2.clone();
+        Matx<double, 3, 1> tst3(1, 2, 3);
+        Vec3d tst4(3, 4, 5);
+        Scalar tst5(1, 2, 3);
+        Mat res;
+
+        res = mat + tst1;
+        CHECK_DIFF_FLT(res, ref1);
+        res = mat + tst2;
+        CHECK_DIFF_FLT(res, ref2);
+        res = mat + tst3;
+        CHECK_DIFF_FLT(res, ref1);
+        res = mat + tst4;
+        CHECK_DIFF_FLT(res, ref2);
+
+        res = mat + tst5;
+        CHECK_DIFF_FLT(res, ref3);
+        res = mat + 1;
+        CHECK_DIFF_FLT(res, ref3);
+
+        cv::add(mat, tst1, res);
+        CHECK_DIFF_FLT(res, ref1);
+        cv::add(mat, tst2, res);
+        CHECK_DIFF_FLT(res, ref2);
+        cv::add(mat, tst3, res);
+        CHECK_DIFF_FLT(res, ref1);
+        cv::add(mat, tst4, res);
+        CHECK_DIFF_FLT(res, ref2);
+
+        cv::add(mat, tst5, res);
+        CHECK_DIFF_FLT(res, ref3);
+        cv::add(mat, 1, res);
+        CHECK_DIFF_FLT(res, ref3);
+
+        res = mat.clone(); res += tst1;
+        CHECK_DIFF_FLT(res, ref1);
+        res = mat.clone(); res += tst2;
+        CHECK_DIFF_FLT(res, ref2);
+        res = mat.clone(); res += tst3;
+        CHECK_DIFF_FLT(res, ref1);
+        res = mat.clone(); res += tst4;
+        CHECK_DIFF_FLT(res, ref2);
+
+        res = mat.clone(); res += tst5;
+        CHECK_DIFF_FLT(res, ref3);
+        res = mat.clone(); res += 1;
+        CHECK_DIFF_FLT(res, ref3);
+    }
+    catch (const test_excep& e)
+    {
+        ts->printf(cvtest::TS::LOG, "%s\n", e.s.c_str());
+        ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
+        return false;
+    }
+    return true;
+}
+
 bool CV_OperationsTest::TestMatxElementwiseDivison()
 {
     try
@@ -1135,6 +1204,9 @@ void CV_OperationsTest::run( int /* start_from */)
     if (!TestMatxElementwiseDivison())
         return;
 
+    if (!TestMatMatxCastSum())
+        return;
+
     if (!TestSubMatAccess())
         return;
 
diff --git a/modules/features2d/src/draw.cpp b/modules/features2d/src/draw.cpp
index e791596476..84fb0aca39 100644
--- a/modules/features2d/src/draw.cpp
+++ b/modules/features2d/src/draw.cpp
@@ -95,9 +95,9 @@ void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, In
 
     if( !(flags & DrawMatchesFlags::DRAW_OVER_OUTIMG) )
     {
-        if( image.type() == CV_8UC3 )
+        if (image.type() == CV_8UC3 || image.type() == CV_8UC4)
         {
-            image.copyTo( outImage );
+            image.copyTo(outImage);
         }
         else if( image.type() == CV_8UC1 )
         {
@@ -105,7 +105,7 @@ void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, In
         }
         else
         {
-            CV_Error( Error::StsBadArg, "Incorrect type of input image.\n" );
+            CV_Error( Error::StsBadArg, "Incorrect type of input image: " + typeToString(image.type()) );
         }
     }
 
@@ -122,6
+122,25 @@ void drawKeypoints( InputArray image, const std::vector& keypoints, In } } +static void _prepareImage(InputArray src, const Mat& dst) +{ + CV_CheckType(src.type(), src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4, "Unsupported source image"); + CV_CheckType(dst.type(), dst.type() == CV_8UC3 || dst.type() == CV_8UC4, "Unsupported destination image"); + const int src_cn = src.channels(); + const int dst_cn = dst.channels(); + + if (src_cn == dst_cn) + src.copyTo(dst); + else if (src_cn == 1) + cvtColor(src, dst, dst_cn == 3 ? COLOR_GRAY2BGR : COLOR_GRAY2BGRA); + else if (src_cn == 3 && dst_cn == 4) + cvtColor(src, dst, COLOR_BGR2BGRA); + else if (src_cn == 4 && dst_cn == 3) + cvtColor(src, dst, COLOR_BGRA2BGR); + else + CV_Error(Error::StsInternal, ""); +} + static void _prepareImgAndDrawKeypoints( InputArray img1, const std::vector& keypoints1, InputArray img2, const std::vector& keypoints2, InputOutputArray _outImg, Mat& outImg1, Mat& outImg2, @@ -140,21 +159,16 @@ static void _prepareImgAndDrawKeypoints( InputArray img1, const std::vector(11, 11) << + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 15, 54, 15, 1, 1, 1, 1, + 1, 1, 1, 76, 217, 217, 221, 81, 1, 1, 1, + 1, 1, 100, 224, 111, 57, 115, 225, 101, 1, 1, + 1, 44, 215, 100, 1, 1, 1, 101, 214, 44, 1, + 1, 54, 212, 57, 1, 1, 1, 55, 212, 55, 1, + 1, 40, 215, 104, 1, 1, 1, 105, 215, 40, 1, + 1, 1, 102, 221, 111, 55, 115, 222, 103, 1, 1, + 1, 1, 1, 76, 218, 217, 220, 81, 1, 1, 1, + 1, 1, 1, 1, 15, 55, 15, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + Mat res; + cvtColor(ref, res, (cn == 4) ? COLOR_GRAY2BGRA : COLOR_GRAY2BGR); + return res; +} + +typedef testing::TestWithParam Features2D_drawKeypoints; +TEST_P(Features2D_drawKeypoints, Accuracy) +{ + const int cn = CV_MAT_CN(GetParam()); + Mat inpImg(11, 11, GetParam(), Scalar(1, 1, 1, 255)), outImg; + + std::vector keypoints(1, KeyPoint(5, 5, 1)); + drawKeypoints(inpImg, keypoints, outImg, Scalar::all(255)); + ASSERT_EQ(outImg.channels(), (cn == 4) ? 4 : 3); + + Mat ref_ = getReference_DrawKeypoint(cn); + EXPECT_EQ(0, cv::norm(outImg, ref_, NORM_INF)); +} +INSTANTIATE_TEST_CASE_P(/**/, Features2D_drawKeypoints, Values(CV_8UC1, CV_8UC3, CV_8UC4)); + +typedef testing::TestWithParam > Features2D_drawMatches; +TEST_P(Features2D_drawMatches, Accuracy) +{ + Mat inpImg1(11, 11, get<0>(GetParam()), Scalar(1, 1, 1, 255)); + Mat inpImg2(11, 11, get<1>(GetParam()), Scalar(2, 2, 2, 255)), outImg2, outImg; + + std::vector keypoints(1, KeyPoint(5, 5, 1)); + + // Get outImg2 using drawKeypoints assuming that it works correctly (see the test above). + drawKeypoints(inpImg2, keypoints, outImg2, Scalar::all(255)); + ASSERT_EQ(outImg2.channels(), (inpImg2.channels() == 4) ? 4 : 3); + + // Merge both references. 
+ const int cn = max(3, max(inpImg1.channels(), inpImg2.channels())); + if (cn == 4 && outImg2.channels() == 3) + cvtColor(outImg2, outImg2, COLOR_BGR2BGRA); + Mat ref_ = getReference_DrawKeypoint(cn); + Mat concattedRef; + hconcat(ref_, outImg2, concattedRef); + + std::vector matches; + drawMatches(inpImg1, keypoints, inpImg2, keypoints, matches, outImg, + Scalar::all(255), Scalar::all(255)); + ASSERT_EQ(outImg.channels(), cn); + + EXPECT_EQ(0, cv::norm(outImg, concattedRef, NORM_INF)); +} +INSTANTIATE_TEST_CASE_P(/**/, Features2D_drawMatches, Combine( + Values(CV_8UC1, CV_8UC3, CV_8UC4), + Values(CV_8UC1, CV_8UC3, CV_8UC4) +)); + +}} // namespace diff --git a/modules/highgui/src/window_w32.cpp b/modules/highgui/src/window_w32.cpp index 0a386e50f3..81f7eb6ab2 100644 --- a/modules/highgui/src/window_w32.cpp +++ b/modules/highgui/src/window_w32.cpp @@ -66,6 +66,7 @@ #include #include "opencv2/highgui.hpp" #include +#include "opencv2/core/opengl.hpp" #endif static const char* trackbar_text = @@ -1144,20 +1145,20 @@ static void icvUpdateWindowPos( CvWindow* window ) { RECT rmw, rw = icvCalcWindowRect(window ); MoveWindow(window->hwnd, rw.left, rw.top, - rw.right - rw.left + 1, rw.bottom - rw.top + 1, FALSE); + rw.right - rw.left, rw.bottom - rw.top, FALSE); GetClientRect(window->hwnd, &rw); GetWindowRect(window->frame, &rmw); // Resize the mainhWnd window in order to make the bitmap fit into the child window MoveWindow(window->frame, rmw.left, rmw.top, - rmw.right - rmw.left + size.cx - rw.right + rw.left, - rmw.bottom - rmw.top + size.cy - rw.bottom + rw.top, TRUE ); + size.cx + (rmw.right - rmw.left) - (rw.right - rw.left), + size.cy + (rmw.bottom - rmw.top) - (rw.bottom - rw.top), TRUE ); } } rect = icvCalcWindowRect(window); MoveWindow(window->hwnd, rect.left, rect.top, - rect.right - rect.left + 1, - rect.bottom - rect.top + 1, TRUE ); + rect.right - rect.left, + rect.bottom - rect.top, TRUE ); } CV_IMPL void @@ -1263,18 +1264,18 @@ CV_IMPL void cvResizeWindow(const char* name, int width, int height ) { rw = icvCalcWindowRect(window); MoveWindow(window->hwnd, rw.left, rw.top, - rw.right - rw.left + 1, rw.bottom - rw.top + 1, FALSE); + rw.right - rw.left, rw.bottom - rw.top, FALSE); GetClientRect(window->hwnd, &rw); GetWindowRect(window->frame, &rmw); // Resize the mainhWnd window in order to make the bitmap fit into the child window MoveWindow(window->frame, rmw.left, rmw.top, - rmw.right - rmw.left + width - rw.right + rw.left, - rmw.bottom - rmw.top + height - rw.bottom + rw.top, TRUE); + width + (rmw.right - rmw.left) - (rw.right - rw.left), + height + (rmw.bottom - rmw.top) - (rw.bottom - rw.top), TRUE); } rect = icvCalcWindowRect(window); MoveWindow(window->hwnd, rect.left, rect.top, - rect.right - rect.left + 1, rect.bottom - rect.top + 1, TRUE); + rect.right - rect.left, rect.bottom - rect.top, TRUE); __END__; } @@ -1421,7 +1422,20 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam ) GetClientRect( window->hwnd, &rect ); SIZE size = {0,0}; - icvGetBitmapData( window, &size, 0, 0 ); +#ifdef HAVE_OPENGL + if (window->useGl) + { + cv::ogl::Texture2D* texObj = static_cast(window->glDrawData); + size.cx = texObj->cols(); + size.cy = texObj->rows(); + } + else + { + icvGetBitmapData(window, &size, 0, 0); + } +#else + icvGetBitmapData(window, &size, 0, 0); +#endif window->on_mouse( event, pt.x*size.cx/MAX(rect.right - rect.left,1), pt.y*size.cy/MAX(rect.bottom - rect.top,1), flags, @@ -1561,8 +1575,8 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM 
wParam, LPARAM RECT rect = icvCalcWindowRect(window); pos->x = rect.left; pos->y = rect.top; - pos->cx = rect.right - rect.left + 1; - pos->cy = rect.bottom - rect.top + 1; + pos->cx = rect.right - rect.left; + pos->cy = rect.bottom - rect.top; } break; @@ -1615,7 +1629,21 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM SIZE size = {0, 0}; GetClientRect( window->hwnd, &rect ); + +#ifdef HAVE_OPENGL + if (window->useGl) + { + cv::ogl::Texture2D* texObj = static_cast(window->glDrawData); + size.cx = texObj->cols(); + size.cy = texObj->rows(); + } + else + { + icvGetBitmapData(window, &size, 0, 0); + } +#else icvGetBitmapData( window, &size, 0, 0 ); +#endif window->on_mouse( event, pt.x*size.cx/MAX(rect.right - rect.left,1), pt.y*size.cy/MAX(rect.bottom - rect.top,1), flags, diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 342421e134..550fdffdb9 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -587,6 +587,7 @@ struct RowVec_8u32s i += v_uint32::nlanes; } } + vx_cleanup(); return i; } @@ -1083,6 +1084,7 @@ struct SymmRowSmallVec_8u32s } } + vx_cleanup(); return i; } @@ -1106,6 +1108,8 @@ struct SymmColumnVec_32s8u int operator()(const uchar** _src, uchar* dst, int width) const { int _ksize = kernel.rows + kernel.cols - 1; + if( _ksize == 1 ) + return 0; int ksize2 = _ksize/2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; @@ -1115,9 +1119,8 @@ struct SymmColumnVec_32s8u v_float32 d4 = vx_setall_f32(delta); if( symmetrical ) { - if (_ksize == 1) - return 0; v_float32 f0 = vx_setall_f32(ky[0]); + v_float32 f1 = vx_setall_f32(ky[1]); for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { const int* S = src[0] + i; @@ -1125,11 +1128,17 @@ struct SymmColumnVec_32s8u v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4); v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4); - for( k = 1; k <= ksize2; k++ ) + const int* S0 = src[1] + i; + const int* S1 = src[-1] + i; + s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); + s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); + s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) + vx_load(S1 + 2 * v_int32::nlanes)), f1, s2); + s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) + vx_load(S1 + 3 * v_int32::nlanes)), f1, s3); + for( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); - const int* S0 = src[k] + i; - const int* S1 = src[-k] + i; + S0 = src[k] + i; + S1 = src[-k] + i; s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2); @@ -1142,11 +1151,15 @@ struct SymmColumnVec_32s8u const int* S = src[0] + i; v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); - for( k = 1; k <= ksize2; k++ ) + const int* S0 = src[1] + i; + const int* S1 = src[-1] + i; + s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); + s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); + for( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); - const int* S0 = src[k] + i; - const int* S1 = src[-k] + i; + S0 = src[k] + i; + S1 = 
src[-k] + i; s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); } @@ -1160,7 +1173,8 @@ struct SymmColumnVec_32s8u #endif { v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta)); - for( k = 1; k <= ksize2; k++ ) + s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0); + for( k = 2; k <= ksize2; k++ ) s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); v_int32x4 s32 = v_round(s0); v_int16x8 s16 = v_pack(s32, s32); @@ -1170,17 +1184,20 @@ struct SymmColumnVec_32s8u } else { + v_float32 f1 = vx_setall_f32(ky[1]); for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { - v_float32 s0 = d4; - v_float32 s1 = d4; - v_float32 s2 = d4; - v_float32 s3 = d4; - for ( k = 1; k <= ksize2; k++ ) + const int* S0 = src[1] + i; + const int* S1 = src[-1] + i; + v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); + v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) - vx_load(S1 + 2 * v_int32::nlanes)), f1, d4); + v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) - vx_load(S1 + 3 * v_int32::nlanes)), f1, d4); + for ( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); - const int* S0 = src[k] + i; - const int* S1 = src[-k] + i; + S0 = src[k] + i; + S1 = src[-k] + i; s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2); @@ -1190,13 +1207,15 @@ struct SymmColumnVec_32s8u } if( i <= width - v_uint16::nlanes ) { - v_float32 s0 = d4; - v_float32 s1 = d4; - for ( k = 1; k <= ksize2; k++ ) + const int* S0 = src[1] + i; + const int* S1 = src[-1] + i; + v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); + for ( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); - const int* S0 = src[k] + i; - const int* S1 = src[-k] + i; + S0 = src[k] + i; + S1 = src[-k] + i; s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); } @@ -1209,8 +1228,8 @@ struct SymmColumnVec_32s8u if( i <= width - v_int32x4::nlanes ) #endif { - v_float32x4 s0 = v_setall_f32(delta); - for (k = 1; k <= ksize2; k++) + v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta)); + for (k = 2; k <= ksize2; k++) s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); v_int32x4 s32 = v_round(s0); v_int16x8 s16 = v_pack(s32, s32); @@ -1219,6 +1238,7 @@ struct SymmColumnVec_32s8u } } + vx_cleanup(); return i; } @@ -1250,57 +1270,104 @@ struct SymmColumnSmallVec_32s16s short* dst = (short*)_dst; v_float32 df4 = vx_setall_f32(delta); - v_int32 d4 = v_round(df4); + int d = cvRound(delta); + v_int16 d8 = vx_setall_s16((short)d); if( symmetrical ) { if( ky[0] == 2 && ky[1] == 1 ) { - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { + v_int32 s0 = 
vx_load(S1 + i); + v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); + v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); + v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8); + v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2), + vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8); + } + if( i <= width - v_int16::nlanes ) { v_int32 sl = vx_load(S1 + i); v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh))); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8); + i += v_int16::nlanes; } if( i <= width - v_int32::nlanes ) { v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s)); + v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s)); i += v_int32::nlanes; } } else if( ky[0] == -2 && ky[1] == 1 ) { - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { + v_int32 s0 = vx_load(S1 + i); + v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); + v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); + v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0), + vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8); + v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2), + vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8); + } + if( i <= width - v_int16::nlanes ) { v_int32 sl = vx_load(S1 + i); v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh))); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8); + i += v_int16::nlanes; } if( i <= width - v_int32::nlanes ) { v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s)); + v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s)); i += v_int32::nlanes; } } +#if CV_NEON else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) ) { v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_int32 d4 = vx_setall_s32(d); + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); + v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)), + 
v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4)))); + } + if( i <= width - v_int16::nlanes ) + { + v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), + v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); + i += v_int16::nlanes; + } if( i <= width - v_int32::nlanes ) { v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); i += v_int32::nlanes; } } +#endif else { v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); + v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4))))); + } + if( i <= width - v_int16::nlanes ) + { + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); + i += v_int16::nlanes; + } if( i <= width - v_int32::nlanes ) { v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); @@ -1314,20 +1381,38 @@ struct SymmColumnSmallVec_32s16s { if( ky[1] < 0 ) std::swap(S0, S2); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4)); + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { + v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); + v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8); + } + if( i <= width - v_int16::nlanes ) + { + v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); + i += v_int16::nlanes; + } if( i <= width - v_int32::nlanes ) { - v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4); + v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d)); i += v_int32::nlanes; } } else { v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), 
v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); + v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)), + v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4)))); + } + if( i <= width - v_int16::nlanes ) + { + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), + v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); + i += v_int16::nlanes; + } if( i <= width - v_int32::nlanes ) { v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4))); @@ -1336,6 +1421,7 @@ struct SymmColumnSmallVec_32s16s } } + vx_cleanup(); return i; } @@ -1362,19 +1448,43 @@ struct RowVec_16s32f const float* _kx = kernel.ptr(); width *= cn; - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + { + const short* src = (const short*)_src + i; + v_float32 s0 = vx_setzero_f32(); + v_float32 s1 = vx_setzero_f32(); + v_float32 s2 = vx_setzero_f32(); + v_float32 s3 = vx_setzero_f32(); + for( k = 0; k < _ksize; k++, src += cn ) + { + v_float32 f = vx_setall_f32(_kx[k]); + v_int16 xl = vx_load(src); + v_int16 xh = vx_load(src + v_int16::nlanes); + s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0); + s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1); + s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2); + s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - v_int16::nlanes ) { const short* src = (const short*)_src + i; v_float32 s0 = vx_setzero_f32(); v_float32 s1 = vx_setzero_f32(); for( k = 0; k < _ksize; k++, src += cn ) { + v_float32 f = vx_setall_f32(_kx[k]); v_int16 x = vx_load(src); - s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0); - s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1); + s0 = v_muladd(v_cvt_f32(v_expand_low(x)), f, s0); + s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1); } v_store(dst + i, s0); v_store(dst + i + v_float32::nlanes, s1); + i += v_int16::nlanes; } if( i <= width - v_float32::nlanes ) { @@ -1385,6 +1495,7 @@ struct RowVec_16s32f v_store(dst + i, s0); i += v_float32::nlanes; } + vx_cleanup(); return i; } @@ -1406,6 +1517,8 @@ struct SymmColumnVec_32f16s int operator()(const uchar** _src, uchar* _dst, int width) const { int _ksize = kernel.rows + kernel.cols - 1; + if( _ksize == 1 ) + return 0; int ksize2 = _ksize / 2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; @@ -1416,25 +1529,49 @@ struct SymmColumnVec_32f16s v_float32 d4 = vx_setall_f32(delta); if( symmetrical ) { - if (_ksize == 1) - return 0; v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_float32 k1 = vx_setall_f32(ky[1]); + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - for( k = 1; k <= ksize2; k++ ) + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); + s0 = 
v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); + s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); + s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2); + s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3); + for( k = 2; k <= ksize2; k++ ) { - v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + } + if( i <= width - v_int16::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); + s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); + s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); + for( k = 2; k <= ksize2; k++ ) + { + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + } + v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + i += v_int16::nlanes; } if( i <= width - v_float32::nlanes ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - for( k = 1; k <= ksize2; k++ ) + s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); + for( k = 2; k <= ksize2; k++ ) s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); v_pack_store(dst + i, v_round(s0)); i += v_float32::nlanes; @@ -1442,28 +1579,48 @@ struct SymmColumnVec_32f16s } else { - for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_float32 k1 = vx_setall_f32(ky[1]); + for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) { - v_float32 s0 = d4; - v_float32 s1 = d4; - for( k = 1; k <= ksize2; k++ ) + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); + v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); + for( k = 2; k <= ksize2; k++ ) { - v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s2 = v_muladd(vx_load(src[k] + i + 
2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + } + if( i <= width - v_int16::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + for( k = 2; k <= ksize2; k++ ) + { + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + } + v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + i += v_int16::nlanes; } if( i <= width - v_float32::nlanes ) { - v_float32 s0 = d4; - for( k = 1; k <= ksize2; k++ ) + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + for( k = 2; k <= ksize2; k++ ) s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); v_pack_store(dst + i, v_round(s0)); i += v_float32::nlanes; } } + vx_cleanup(); return i; } @@ -1505,6 +1662,7 @@ struct RowVec_32f } #endif int _ksize = kernel.rows + kernel.cols - 1; + CV_DbgAssert(_ksize > 0); const float* src0 = (const float*)_src; float* dst = (float*)_dst; const float* _kx = kernel.ptr(); @@ -1516,14 +1674,55 @@ struct RowVec_32f if (haveAVX2) return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); #endif - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + v_float32 k0 = vx_setall_f32(_kx[0]); + for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) { const float* src = src0 + i; - v_float32 s0 = vx_setzero_f32(); - for( k = 0; k < _ksize; k++, src += cn ) + v_float32 s0 = vx_load(src) * k0; + v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; + v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0; + v_float32 s3 = vx_load(src + 3*v_float32::nlanes) * k0; + src += cn; + for( k = 1; k < _ksize; k++, src += cn ) + { + v_float32 k1 = vx_setall_f32(_kx[k]); + s0 = v_muladd(vx_load(src), k1, s0); + s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); + s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2); + s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - 2*v_float32::nlanes ) + { + const float* src = src0 + i; + v_float32 s0 = vx_load(src) * k0; + v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; + src += cn; + for( k = 1; k < _ksize; k++, src += cn ) + { + v_float32 k1 = vx_setall_f32(_kx[k]); + s0 = v_muladd(vx_load(src), k1, s0); + s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + i += 2*v_float32::nlanes; + } + if( i <= width - v_float32::nlanes ) + { + const float* src = src0 + i; + v_float32 s0 = vx_load(src) * k0; + src += cn; + for( k = 1; k < _ksize; k++, src += cn ) s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0); v_store(dst + i, s0); + i += v_float32::nlanes; } + vx_cleanup(); return i; } @@ -1584,6 +1783,8 @@ struct SymmRowSmallVec_32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { int i = 0, _ksize = kernel.rows + kernel.cols - 1; + if( 
_ksize == 1 ) + return 0; float* dst = (float*)_dst; const float* src = (const float*)_src + (_ksize/2)*cn; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; @@ -1592,15 +1793,28 @@ struct SymmRowSmallVec_32f if( symmetrical ) { - if( _ksize == 1 ) - return 0; if( _ksize == 3 ) { if( fabs(kx[0]) == 2 && kx[1] == 1 ) { +#if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(kx[0]); for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn))); +#else + if( kx[0] > 0 ) + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + { + v_float32 x = vx_load(src); + v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x)); + } + else + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + { + v_float32 x = vx_load(src); + v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x)); + } +#endif } else { @@ -1613,9 +1827,17 @@ struct SymmRowSmallVec_32f { if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) { +#if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(-2); for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn))); +#else + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + { + v_float32 x = vx_load(src); + v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x)); + } +#endif } else { @@ -1647,6 +1869,7 @@ struct SymmRowSmallVec_32f } } + vx_cleanup(); return i; } @@ -1688,12 +1911,47 @@ struct SymmColumnVec_32f return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); #endif const v_float32 d4 = vx_setall_f32(delta); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + const v_float32 k0 = vx_setall_f32(ky[0]); + for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) { - v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4); + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); + for( k = 1; k <= ksize2; k++ ) + { + v_float32 k1 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - 2*v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); + for( k = 1; k <= ksize2; k++ ) + { + v_float32 k1 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + i += 2*v_float32::nlanes; 
+ } + if( i <= width - v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); for( k = 1; k <= ksize2; k++ ) s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); v_store(dst + i, s0); + i += v_float32::nlanes; } } else @@ -1702,16 +1960,53 @@ struct SymmColumnVec_32f if (haveAVX2) return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); #endif + CV_DbgAssert(ksize2 > 0); const v_float32 d4 = vx_setall_f32(delta); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + const v_float32 k1 = vx_setall_f32(ky[1]); + for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) { - v_float32 s0 = d4; - for( k = 1; k <= ksize2; k++ ) + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); + v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); + for( k = 2; k <= ksize2; k++ ) + { + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - 2*v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + for( k = 2; k <= ksize2; k++ ) + { + v_float32 k2 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + i += 2*v_float32::nlanes; + } + if( i <= width - v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + for( k = 2; k <= ksize2; k++ ) s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); v_store(dst + i, s0); + i += v_float32::nlanes; } } + vx_cleanup(); return i; } @@ -1748,9 +2043,24 @@ struct SymmColumnSmallVec_32f { if( fabs(ky[0]) == 2 && ky[1] == 1 ) { +#if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(ky[0]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4)); +#else + if(ky[0] > 0) + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + { + v_float32 x = vx_load(S1 + i); + v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x)); + } + else + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + { + v_float32 x = vx_load(S1 + i); + v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x)); + } +#endif } else { @@ -1776,6 +2086,7 @@ struct 
SymmColumnSmallVec_32f } } + vx_cleanup(); return i; } @@ -1804,19 +2115,27 @@ struct FilterVec_8u int operator()(const uchar** src, uchar* dst, int width) const { + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; int i = 0, k, nz = _nz; v_float32 d4 = vx_setall_f32(delta); + v_float32 f0 = vx_setall_f32(kf[0]); for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { - v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4; - for( k = 0; k < nz; k++ ) + v_uint16 xl, xh; + v_expand(vx_load(src[0] + i), xl, xh); + v_uint32 x0, x1, x2, x3; + v_expand(xl, x0, x1); + v_expand(xh, x2, x3); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); + v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f0, d4); + v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f0, d4); + for( k = 1; k < nz; k++ ) { v_float32 f = vx_setall_f32(kf[k]); - v_uint16 xl, xh; v_expand(vx_load(src[k] + i), xl, xh); - v_uint32 x0, x1, x2, x3; v_expand(xl, x0, x1); v_expand(xh, x2, x3); s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); @@ -1828,11 +2147,13 @@ struct FilterVec_8u } if( i <= width - v_uint16::nlanes ) { - v_float32 s0 = d4, s1 = d4; - for( k = 0; k < nz; k++ ) + v_uint32 x0, x1; + v_expand(vx_load_expand(src[0] + i), x0, x1); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); + for( k = 1; k < nz; k++ ) { v_float32 f = vx_setall_f32(kf[k]); - v_uint32 x0, x1; v_expand(vx_load_expand(src[k] + i), x0, x1); s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); @@ -1846,8 +2167,8 @@ struct FilterVec_8u if( i <= width - v_int32x4::nlanes ) #endif { - v_float32x4 s0 = v_setall_f32(delta); - for( k = 0; k < nz; k++ ) + v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta)); + for( k = 1; k < nz; k++ ) s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0); v_int32x4 s32 = v_round(s0); v_int16x8 s16 = v_pack(s32, s32); @@ -1855,6 +2176,7 @@ struct FilterVec_8u i += v_int32x4::nlanes; } + vx_cleanup(); return i; } @@ -1879,18 +2201,24 @@ struct FilterVec_8u16s int operator()(const uchar** src, uchar* _dst, int width) const { + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; short* dst = (short*)_dst; int i = 0, k, nz = _nz; v_float32 d4 = vx_setall_f32(delta); + v_float32 f0 = vx_setall_f32(kf[0]); for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { - v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4; - for( k = 0; k < nz; k++ ) + v_uint16 xl, xh; + v_expand(vx_load(src[0] + i), xl, xh); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f0, d4); + v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f0, d4); + v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f0, d4); + for( k = 1; k < nz; k++ ) { v_float32 f = vx_setall_f32(kf[k]); - v_uint16 xl, xh; v_expand(vx_load(src[k] + i), xl, xh); s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0); s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1); @@ -1902,11 +2230,13 @@ struct FilterVec_8u16s } if( i <= width - v_uint16::nlanes ) { - v_float32 s0 
= d4, s1 = d4; - for( k = 0; k < nz; k++ ) + v_uint16 x = vx_load_expand(src[0] + i); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f0, d4); + for( k = 1; k < nz; k++ ) { v_float32 f = vx_setall_f32(kf[k]); - v_uint16 x = vx_load_expand(src[k] + i); + x = vx_load_expand(src[k] + i); s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0); s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1); } @@ -1915,13 +2245,14 @@ struct FilterVec_8u16s } if( i <= width - v_int32::nlanes ) { - v_float32 s0 = d4; - for( k = 0; k < nz; k++ ) + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4); + for( k = 1; k < nz; k++ ) s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); v_pack_store(dst + i, v_round(s0)); i += v_int32::nlanes; } + vx_cleanup(); return i; } @@ -1950,14 +2281,50 @@ struct FilterVec_32f int i = 0, k, nz = _nz; v_float32 d4 = vx_setall_f32(delta); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + v_float32 f0 = vx_setall_f32(kf[0]); + for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) { - v_float32 s0 = d4; - for( k = 0; k < nz; k++ ) + v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4); + for( k = 1; k < nz; k++ ) + { + v_float32 f1 = vx_setall_f32(kf[k]); + s0 = v_muladd(vx_load(src[k] + i), f1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + v_store(dst + i + 2*v_float32::nlanes, s2); + v_store(dst + i + 3*v_float32::nlanes, s3); + } + if( i <= width - 2*v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); + for( k = 1; k < nz; k++ ) + { + v_float32 f1 = vx_setall_f32(kf[k]); + s0 = v_muladd(vx_load(src[k] + i), f1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); + } + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + i += 2*v_float32::nlanes; + } + if( i <= width - v_float32::nlanes ) + { + v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); + for( k = 1; k < nz; k++ ) s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0); v_store(dst + i, s0); + i += v_float32::nlanes; } + vx_cleanup(); return i; } diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp index 155c62f342..7749e4a59a 100644 --- a/modules/imgproc/test/test_filter.cpp +++ b/modules/imgproc/test/test_filter.cpp @@ -403,9 +403,9 @@ void CV_FilterTest::get_test_array_types_and_sizes( int test_case_idx, { CV_FilterBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); RNG& rng = ts->get_rng(); - int depth = cvtest::randInt(rng)%3; + int depth = cvtest::randInt(rng)%4; int cn = CV_MAT_CN(types[INPUT][0]); - depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : CV_32F; + depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? 
diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp
index 155c62f342..7749e4a59a 100644
--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -403,9 +403,9 @@ void CV_FilterTest::get_test_array_types_and_sizes( int test_case_idx,
 {
     CV_FilterBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
     RNG& rng = ts->get_rng();
-    int depth = cvtest::randInt(rng)%3;
+    int depth = cvtest::randInt(rng)%4;
     int cn = CV_MAT_CN(types[INPUT][0]);
-    depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : CV_32F;
+    depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_16S : CV_32F;
     types[INPUT][0] = types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_MAKETYPE(depth, cn);
 }
@@ -457,10 +457,11 @@ void CV_DerivBaseTest::get_test_array_types_and_sizes( int test_case_idx,
 {
     RNG& rng = ts->get_rng();
     CV_FilterBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
-    int depth = cvtest::randInt(rng) % 2;
-    depth = depth == 0 ? CV_8U : CV_32F;
+    int depth = cvtest::randInt(rng) % 4;
+    depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_16S : CV_32F;
     types[INPUT][0] = CV_MAKETYPE(depth,1);
-    types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_MAKETYPE(depth==CV_8U?CV_16S:CV_32F,1);
+    int sameDepth = cvtest::randInt(rng) % 2;
+    types[OUTPUT][0] = types[REF_OUTPUT][0] = sameDepth ? depth : CV_MAKETYPE(depth==CV_8U?CV_16S:CV_32F,1);
     _aperture_size = (cvtest::randInt(rng)%5)*2 - 1;
     sizes[INPUT][1] = aperture_size = cvSize(_aperture_size, _aperture_size);
 }
@@ -2211,4 +2212,27 @@ TEST(Imgproc_MedianBlur, hires_regression_13409)
     ASSERT_EQ(0.0, cvtest::norm(dst_hires(Rect(516, 516, 1016, 1016)), dst_ref(Rect(4, 4, 1016, 1016)), NORM_INF));
 }
+
+TEST(Imgproc_Sobel, s16_regression_13506)
+{
+    Mat src = (Mat_<uchar>(8, 16) << 127, 138, 130, 102, 118, 97, 76, 84, 124, 90, 146, 63, 130, 87, 212, 85,
+                                     164, 3, 51, 124, 151, 89, 154, 117, 36, 88, 116, 117, 180, 112, 147, 124,
+                                     63, 50, 115, 103, 83, 148, 106, 79, 213, 106, 135, 53, 79, 106, 122, 112,
+                                     218, 107, 81, 126, 78, 138, 85, 142, 151, 108, 104, 158, 155, 81, 112, 178,
+                                     184, 96, 187, 148, 150, 112, 138, 162, 222, 146, 128, 49, 124, 46, 165, 104,
+                                     119, 164, 77, 144, 186, 98, 106, 148, 155, 157, 160, 151, 156, 149, 43, 122,
+                                     106, 155, 120, 132, 159, 115, 126, 188, 44, 79, 164, 201, 153, 97, 139, 133,
+                                     133, 98, 111, 165, 66, 106, 131, 85, 176, 156, 67, 108, 142, 91, 74, 137);
+    Mat ref = (Mat_<short>(8, 16) << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                     -1020, -796, -489, -469, -247, 317, 760, 1429, 1983, 1384, 254, -459, -899, -1197, -1172, -1058,
+                                     2552, 2340, 1617, 591, 9, 96, 722, 1985, 2746, 1916, 676, 9, -635, -1115, -779, -380,
+                                     3546, 3349, 2838, 2206, 1388, 669, 938, 1880, 2252, 1785, 1083, 606, 180, -298, -464, -418,
+                                     816, 966, 1255, 1652, 1619, 924, 535, 288, 5, 601, 1581, 1870, 1520, 625, -627, -1260,
+                                     -782, -610, -395, -267, -122, -42, -317, -1378, -2293, -1451, 596, 1870, 1679, 763, -69, -394,
+                                     -882, -681, -463, -818, -1167, -732, -463, -1042, -1604, -1592, -1047, -334, -104, -117, 229, 512,
+                                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+    Mat dst;
+    Sobel(src, dst, CV_16S, 0, 1, 5);
+    ASSERT_EQ(0.0, cvtest::norm(dst, ref, NORM_INF));
+}
 }} // namespace
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index cabe515932..5e96822e81 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -351,6 +351,7 @@ void FeaturesMatcher::operator ()(const std::vector<ImageFeatures> &features, std::vector<MatchesInfo> &pairwise_matches,
             if (features[i].keypoints.size() > 0 && features[j].keypoints.size() > 0 && mask_(i, j))
                 near_pairs.push_back(std::make_pair(i, j));
+    pairwise_matches.clear(); // clear history values
     pairwise_matches.resize(num_images * num_images);
     MatchPairsBody body(*this, features, pairwise_matches, near_pairs);
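
The one-line stitching fix above is about std::vector reuse semantics: resize() keeps whatever elements already exist, so when the same FeaturesMatcher is invoked a second time the stale MatchesInfo entries from the first run would survive into the new pass; clearing first guarantees freshly constructed entries. A standalone illustration of just that behaviour, using plain ints instead of MatchesInfo:

    #include <cassert>
    #include <vector>

    int main()
    {
        std::vector<int> v(4, 7);   // pretend these are results of a previous run
        v.resize(4);                // same size: resize() alone keeps the stale values
        assert(v[0] == 7);
        v.clear();                  // clear first ...
        v.resize(4);                // ... and every element is value-initialized again
        assert(v[0] == 0);
        return 0;
    }
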
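
With the 16-bit filter paths in place, Sobel can be asked for a CV_16S result on an 8-bit input directly, which is what the s16_regression_13506 test added earlier pins down against hand-computed values. A small usage sketch (the file name is a placeholder, not taken from this patch):

    #include <opencv2/imgproc.hpp>
    #include <opencv2/imgcodecs.hpp>
    using namespace cv;

    int main()
    {
        Mat gray = imread("input.png", IMREAD_GRAYSCALE);  // 8-bit, single channel
        Mat dy;
        // A 16-bit signed destination keeps the negative responses that CV_8U would clip.
        Sobel(gray, dy, CV_16S, 0, 1, 5);
        return 0;
    }
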
diff --git a/samples/dnn/openpose.cpp b/samples/dnn/openpose.cpp
index b4934d76e4..48e2dc0475 100644
--- a/samples/dnn/openpose.cpp
+++ b/samples/dnn/openpose.cpp
@@ -57,21 +57,26 @@ const int POSE_PAIRS[3][20][2] = {
 int main(int argc, char **argv)
 {
     CommandLineParser parser(argc, argv,
-        "{ h help | false | print this help message }"
-        "{ p proto | | (required) model configuration, e.g. hand/pose.prototxt }"
-        "{ m model | | (required) model weights, e.g. hand/pose_iter_102000.caffemodel }"
-        "{ i image | | (required) path to image file (containing a single person, or hand) }"
-        "{ width | 368 | Preprocess input image by resizing to a specific width. }"
-        "{ height | 368 | Preprocess input image by resizing to a specific height. }"
-        "{ t threshold | 0.1 | threshold or confidence value for the heatmap }"
+        "{ h help | false | print this help message }"
+        "{ p proto | | (required) model configuration, e.g. hand/pose.prototxt }"
+        "{ m model | | (required) model weights, e.g. hand/pose_iter_102000.caffemodel }"
+        "{ i image | | (required) path to image file (containing a single person, or hand) }"
+        "{ d dataset | | specify which dataset the model was trained on (COCO, MPI, HAND) }"
+        "{ width | 368 | Preprocess input image by resizing to a specific width. }"
+        "{ height | 368 | Preprocess input image by resizing to a specific height. }"
+        "{ t threshold | 0.1 | threshold or confidence value for the heatmap }"
+        "{ s scale | 0.003922 | scale for blob }"
     );
     String modelTxt = samples::findFile(parser.get<String>("proto"));
     String modelBin = samples::findFile(parser.get<String>("model"));
     String imageFile = samples::findFile(parser.get<String>("image"));
+    String dataset = parser.get<String>("dataset");
     int W_in = parser.get<int>("width");
     int H_in = parser.get<int>("height");
     float thresh = parser.get<float>("threshold");
+    float scale = parser.get<float>("scale");
+
     if (parser.get<bool>("help") || modelTxt.empty() || modelBin.empty() || imageFile.empty())
     {
         cout << "A sample app to demonstrate human or hand pose detection with a pretrained OpenPose dnn." << endl;
@@ -79,9 +84,18 @@ int main(int argc, char **argv)
         return 0;
     }
-    // read the network model
-    Net net = readNetFromCaffe(modelTxt, modelBin);
+    int midx, npairs, nparts;
+    if (!dataset.compare("COCO")) { midx = 0; npairs = 17; nparts = 18; }
+    else if (!dataset.compare("MPI")) { midx = 1; npairs = 14; nparts = 16; }
+    else if (!dataset.compare("HAND")) { midx = 2; npairs = 20; nparts = 22; }
+    else
+    {
+        std::cerr << "Can't interpret dataset parameter: " << dataset << std::endl;
+        exit(-1);
+    }
+    // read the network model
+    Net net = readNet(modelBin, modelTxt);
     // and the image
     Mat img = imread(imageFile);
     if (img.empty())
@@ -91,39 +105,14 @@ int main(int argc, char **argv)
     }
     // send it through the network
-    Mat inputBlob = blobFromImage(img, 1.0 / 255, Size(W_in, H_in), Scalar(0, 0, 0), false, false);
+    Mat inputBlob = blobFromImage(img, scale, Size(W_in, H_in), Scalar(0, 0, 0), false, false);
     net.setInput(inputBlob);
     Mat result = net.forward();
     // the result is an array of "heatmaps", the probability of a body part being in location x,y
-    int midx, npairs;
-    int nparts = result.size[1];
     int H = result.size[2];
     int W = result.size[3];
-    // find out, which model we have
-    if (nparts == 19)
-    { // COCO body
-        midx = 0;
-        npairs = 17;
-        nparts = 18; // skip background
-    }
-    else if (nparts == 16)
-    { // MPI body
-        midx = 1;
-        npairs = 14;
-    }
-    else if (nparts == 22)
-    { // hand
-        midx = 2;
-        npairs = 20;
-    }
-    else
-    {
-        cerr << "there should be 19 parts for the COCO model, 16 for MPI, or 22 for the hand one, but this model has " << nparts << " parts." << endl;
-        return (0);
-    }
-
     // find the position of the body parts
     vector<Point> points(22);
     for (int n=0; n
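
For reference, the reworked sample now needs the dataset named explicitly on the command line (e.g. --dataset=COCO together with --proto, --model and --image) instead of guessing it from the network's output shape, and the blob scale became the -s parameter. A stripped-down sketch of the same dnn flow outside the sample, with placeholder file names and the sample's default scale:

    #include <opencv2/dnn.hpp>
    #include <opencv2/imgcodecs.hpp>
    using namespace cv;

    int main()
    {
        // readNet() infers the framework (here Caffe) from the file extensions.
        dnn::Net net = dnn::readNet("pose_iter_102000.caffemodel", "pose.prototxt");
        Mat img = imread("person.jpg");                       // placeholder input image
        const float scale = 0.003922f;                        // ~1/255, the -s default
        Mat blob = dnn::blobFromImage(img, scale, Size(368, 368), Scalar(0, 0, 0), false, false);
        net.setInput(blob);
        Mat heatmaps = net.forward();                         // one heat map per body part
        return 0;
    }
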